GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/amd/vulkan/radv_nir_to_llvm.c
/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir/nir.h"
#include "radv_debug.h"
#include "radv_private.h"
#include "radv_shader.h"
#include "radv_shader_args.h"
#include "radv_shader_helper.h"

#include "ac_binary.h"
#include "ac_exp_param.h"
#include "ac_llvm_build.h"
#include "ac_nir_to_llvm.h"
#include "ac_shader_abi.h"
#include "ac_shader_util.h"
#include "sid.h"

#define RADEON_LLVM_MAX_INPUTS (VARYING_SLOT_VAR31 + 1)

struct radv_shader_context {
   struct ac_llvm_context ac;
   const struct nir_shader *shader;
   struct ac_shader_abi abi;
   const struct radv_shader_args *args;

   gl_shader_stage stage;

   unsigned max_workgroup_size;
   LLVMContextRef context;
   LLVMValueRef main_function;

   LLVMValueRef descriptor_sets[MAX_SETS];

   LLVMValueRef ring_offsets;

   LLVMValueRef vs_rel_patch_id;

   LLVMValueRef gs_wave_id;
   LLVMValueRef gs_vtx_offset[6];

   LLVMValueRef esgs_ring;
   LLVMValueRef gsvs_ring[4];
   LLVMValueRef hs_ring_tess_offchip;
   LLVMValueRef hs_ring_tess_factor;

   LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS * 4];

   uint64_t output_mask;

   LLVMValueRef gs_next_vertex[4];
   LLVMValueRef gs_curprim_verts[4];
   LLVMValueRef gs_generated_prims[4];
   LLVMValueRef gs_ngg_emit;
   LLVMValueRef gs_ngg_scratch;

   LLVMValueRef vertexptr; /* GFX10 only */
};

struct radv_shader_output_values {
   LLVMValueRef values[4];
   unsigned slot_name;
   unsigned slot_index;
   unsigned usage_mask;
};

static inline struct radv_shader_context *
radv_shader_context_from_abi(struct ac_shader_abi *abi)
{
   return container_of(abi, struct radv_shader_context, abi);
}

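/* Build the bare LLVM entry point for a shader stage: an amdgpu function with
 * the requested calling convention, tagged with the 32-bit address high bits
 * and the workgroup size, which the callers then fill with IR. */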
static LLVMValueRef
create_llvm_function(struct ac_llvm_context *ctx, LLVMModuleRef module, LLVMBuilderRef builder,
                     const struct ac_shader_args *args, enum ac_llvm_calling_convention convention,
                     unsigned max_workgroup_size, const struct radv_nir_compiler_options *options)
{
   LLVMValueRef main_function = ac_build_main(args, ctx, convention, "main", ctx->voidt, module);

   if (options->address32_hi) {
      ac_llvm_add_target_dep_function_attr(main_function, "amdgpu-32bit-address-high-bits",
                                           options->address32_hi);
   }

   ac_llvm_set_workgroup_size(main_function, max_workgroup_size);
   ac_llvm_set_target_features(main_function, ctx);

   return main_function;
}

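/* Load the descriptor-set pointers into ctx->descriptor_sets. If they don't
 * all fit in user SGPRs (need_indirect_descriptor_sets), a single pointer to
 * an array of sets is passed instead and each used set is fetched from it
 * with a scalar load. */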
static void
load_descriptor_sets(struct radv_shader_context *ctx)
{
   uint32_t mask = ctx->args->shader_info->desc_set_used_mask;
   if (ctx->args->shader_info->need_indirect_descriptor_sets) {
      LLVMValueRef desc_sets = ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[0]);
      while (mask) {
         int i = u_bit_scan(&mask);

         ctx->descriptor_sets[i] =
            ac_build_load_to_sgpr(&ctx->ac, desc_sets, LLVMConstInt(ctx->ac.i32, i, false));
         LLVMSetAlignment(ctx->descriptor_sets[i], 4);
      }
   } else {
      while (mask) {
         int i = u_bit_scan(&mask);

         ctx->descriptor_sets[i] = ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[i]);
      }
   }
}

static enum ac_llvm_calling_convention
get_llvm_calling_convention(LLVMValueRef func, gl_shader_stage stage)
{
   switch (stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_TESS_EVAL:
      return AC_LLVM_AMDGPU_VS;
   case MESA_SHADER_GEOMETRY:
      return AC_LLVM_AMDGPU_GS;
   case MESA_SHADER_TESS_CTRL:
      return AC_LLVM_AMDGPU_HS;
   case MESA_SHADER_FRAGMENT:
      return AC_LLVM_AMDGPU_PS;
   case MESA_SHADER_COMPUTE:
      return AC_LLVM_AMDGPU_CS;
   default:
      unreachable("Unhandled shader type");
   }
}

/* Returns whether the stage is a stage that can be directly before the GS */
static bool
is_pre_gs_stage(gl_shader_stage stage)
{
   return stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL;
}

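/* Create the LLVM main function and set up state common to all stages:
 * ring offsets (via the implicit buffer pointer), descriptor sets, and an
 * LDS pointer for the stages that communicate through LDS. On GFX10+, an
 * NGG VS/TES is compiled with the GS calling convention. */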
static void
create_function(struct radv_shader_context *ctx, gl_shader_stage stage, bool has_previous_stage)
{
   if (ctx->ac.chip_class >= GFX10) {
      if (is_pre_gs_stage(stage) && ctx->args->options->key.vs_common_out.as_ngg) {
         /* On GFX10, VS is merged into GS for NGG. */
         stage = MESA_SHADER_GEOMETRY;
         has_previous_stage = true;
      }
   }

   ctx->main_function =
      create_llvm_function(&ctx->ac, ctx->ac.module, ctx->ac.builder, &ctx->args->ac,
                           get_llvm_calling_convention(ctx->main_function, stage),
                           ctx->max_workgroup_size, ctx->args->options);

   ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr",
                                          LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_CONST), NULL, 0,
                                          AC_FUNC_ATTR_READNONE);
   ctx->ring_offsets = LLVMBuildBitCast(ctx->ac.builder, ctx->ring_offsets,
                                        ac_array_in_const_addr_space(ctx->ac.v4i32), "");

   load_descriptor_sets(ctx);

   if (stage == MESA_SHADER_TESS_CTRL ||
       (stage == MESA_SHADER_VERTEX && ctx->args->options->key.vs_common_out.as_ls) ||
       /* GFX9 has the ESGS ring buffer in LDS. */
       (stage == MESA_SHADER_GEOMETRY && has_previous_stage)) {
      ac_declare_lds_as_pointer(&ctx->ac);
   }
}

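/* Resolve a (set, binding, index) triple to a buffer descriptor "address",
 * returned as a v3i32 of (set pointer, byte offset, 0). Dynamic buffer
 * descriptors live right after the push constants, so they are addressed
 * relative to the push-constant pointer instead of a descriptor set. */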
static LLVMValueRef
radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index, unsigned desc_set,
                   unsigned binding)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
   LLVMValueRef desc_ptr = ctx->descriptor_sets[desc_set];
   struct radv_pipeline_layout *pipeline_layout = ctx->args->options->layout;
   struct radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
   unsigned base_offset = layout->binding[binding].offset;
   LLVMValueRef offset, stride;

   if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
       layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
      unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start +
                     layout->binding[binding].dynamic_offset_offset;
      desc_ptr = ac_get_arg(&ctx->ac, ctx->args->ac.push_constants);
      base_offset = pipeline_layout->push_constant_size + 16 * idx;
      stride = LLVMConstInt(ctx->ac.i32, 16, false);
   } else
      stride = LLVMConstInt(ctx->ac.i32, layout->binding[binding].size, false);

   offset = LLVMConstInt(ctx->ac.i32, base_offset, false);

   if (layout->binding[binding].type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
      offset = ac_build_imad(&ctx->ac, index, stride, offset);
   }

   desc_ptr = LLVMBuildPtrToInt(ctx->ac.builder, desc_ptr, ctx->ac.i32, "");

   LLVMValueRef res[] = {desc_ptr, offset, ctx->ac.i32_0};
   return ac_build_gather_values(&ctx->ac, res, 3);
}

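/* Offset into the sample-positions ring for a given sample count. The ring
 * appears to store the 1x positions first, then 2x, 4x and 8x, so each
 * offset is the sum of the preceding counts (0, 1, 3, 7). */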
static uint32_t
radv_get_sample_pos_offset(uint32_t num_samples)
{
   uint32_t sample_pos_offset = 0;

   switch (num_samples) {
   case 2:
      sample_pos_offset = 1;
      break;
   case 4:
      sample_pos_offset = 3;
      break;
   case 8:
      sample_pos_offset = 7;
      break;
   default:
      break;
   }
   return sample_pos_offset;
}

static LLVMValueRef
load_sample_position(struct ac_shader_abi *abi, LLVMValueRef sample_id)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);

   LLVMValueRef result;
   LLVMValueRef index = LLVMConstInt(ctx->ac.i32, RING_PS_SAMPLE_POSITIONS, false);
   LLVMValueRef ptr = LLVMBuildGEP(ctx->ac.builder, ctx->ring_offsets, &index, 1, "");

   ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ac_array_in_const_addr_space(ctx->ac.v2f32), "");

   uint32_t sample_pos_offset = radv_get_sample_pos_offset(ctx->args->options->key.fs.num_samples);

   sample_id = LLVMBuildAdd(ctx->ac.builder, sample_id,
                            LLVMConstInt(ctx->ac.i32, sample_pos_offset, false), "");
   result = ac_build_load_invariant(&ctx->ac, ptr, sample_id);

   return result;
}

static LLVMValueRef
load_sample_mask_in(struct ac_shader_abi *abi)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
   uint8_t log2_ps_iter_samples;

   if (ctx->args->shader_info->ps.uses_sample_shading) {
      log2_ps_iter_samples = util_logbase2(ctx->args->options->key.fs.num_samples);
   } else {
      log2_ps_iter_samples = ctx->args->options->key.fs.log2_ps_iter_samples;
   }

   LLVMValueRef result, sample_id;
   if (log2_ps_iter_samples) {
      /* gl_SampleMaskIn[0] = (SampleCoverage & (1 << gl_SampleID)). */
      sample_id = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.ancillary), 8, 4);
      sample_id = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, 1, false), sample_id, "");
      result = LLVMBuildAnd(ctx->ac.builder, sample_id,
                            ac_get_arg(&ctx->ac, ctx->args->ac.sample_coverage), "");
   } else {
      result = ac_get_arg(&ctx->ac, ctx->args->ac.sample_coverage);
   }

   return result;
}

static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, unsigned stream,
                                     LLVMValueRef vertexidx, LLVMValueRef *addrs);

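/* EmitVertex() for the legacy (non-NGG) GS path: store every enabled
 * component of the current vertex to the GSVS ring buffer, then signal the
 * vertex emission to the hardware with a sendmsg. */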
static void
visit_emit_vertex_with_counter(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef vertexidx,
                               LLVMValueRef *addrs)
{
   unsigned offset = 0;
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);

   if (ctx->args->options->key.vs_common_out.as_ngg) {
      gfx10_ngg_gs_emit_vertex(ctx, stream, vertexidx, addrs);
      return;
   }

   for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
      unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];
      uint8_t output_stream = ctx->args->shader_info->gs.output_streams[i];
      LLVMValueRef *out_ptr = &addrs[i * 4];
      int length = util_last_bit(output_usage_mask);

      if (!(ctx->output_mask & (1ull << i)) || output_stream != stream)
         continue;

      for (unsigned j = 0; j < length; j++) {
         if (!(output_usage_mask & (1 << j)))
            continue;

         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], "");
         LLVMValueRef voffset =
            LLVMConstInt(ctx->ac.i32, offset * ctx->shader->info.gs.vertices_out, false);

         offset++;

         voffset = LLVMBuildAdd(ctx->ac.builder, voffset, vertexidx, "");
         voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, false), "");

         out_val = ac_to_integer(&ctx->ac, out_val);
         out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");

         ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, 1, voffset,
                                     ac_get_arg(&ctx->ac, ctx->args->ac.gs2vs_offset), 0,
                                     ac_glc | ac_slc | ac_swizzled);
      }
   }

   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
                    ctx->gs_wave_id);
}

static void
visit_end_primitive(struct ac_shader_abi *abi, unsigned stream)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);

   if (ctx->args->options->key.vs_common_out.as_ngg) {
      LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
      return;
   }

   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
                    ctx->gs_wave_id);
}

static LLVMValueRef
load_tess_coord(struct ac_shader_abi *abi)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);

   LLVMValueRef coord[4] = {
      ac_get_arg(&ctx->ac, ctx->args->ac.tes_u),
      ac_get_arg(&ctx->ac, ctx->args->ac.tes_v),
      ctx->ac.f32_0,
      ctx->ac.f32_0,
   };

   if (ctx->shader->info.tess.primitive_mode == GL_TRIANGLES)
      coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1,
                               LLVMBuildFAdd(ctx->ac.builder, coord[0], coord[1], ""), "");

   return ac_build_gather_values(&ctx->ac, coord, 3);
}

static LLVMValueRef
load_ring_tess_factors(struct ac_shader_abi *abi)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
   assert(ctx->stage == MESA_SHADER_TESS_CTRL);

   return ctx->hs_ring_tess_factor;
}

static LLVMValueRef
load_ring_tess_offchip(struct ac_shader_abi *abi)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
   assert(ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_TESS_EVAL);

   return ctx->hs_ring_tess_offchip;
}

static LLVMValueRef
load_ring_esgs(struct ac_shader_abi *abi)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
   assert(ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL ||
          ctx->stage == MESA_SHADER_GEOMETRY);

   return ctx->esgs_ring;
}

static LLVMValueRef
radv_load_base_vertex(struct ac_shader_abi *abi, bool non_indexed_is_zero)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
   return ac_get_arg(&ctx->ac, ctx->args->ac.base_vertex);
}

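/* Turn the (set pointer, offset) pair produced by radv_load_resource into an
 * actual v4i32 descriptor pointer. Non-uniform descriptors are widened to a
 * 64-bit address because, per the comment below, 32-bit pointers always go
 * through scalar (SMEM) loads. */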
static LLVMValueRef
get_desc_ptr(struct radv_shader_context *ctx, LLVMValueRef ptr, bool non_uniform)
{
   LLVMValueRef set_ptr = ac_llvm_extract_elem(&ctx->ac, ptr, 0);
   LLVMValueRef offset = ac_llvm_extract_elem(&ctx->ac, ptr, 1);
   ptr = LLVMBuildNUWAdd(ctx->ac.builder, set_ptr, offset, "");

   unsigned addr_space = AC_ADDR_SPACE_CONST_32BIT;
   if (non_uniform) {
      /* 32-bit seems to always use SMEM. addrspacecast from 32-bit -> 64-bit is broken. */
      LLVMValueRef dwords[] = {ptr,
                               LLVMConstInt(ctx->ac.i32, ctx->args->options->address32_hi, false)};
      ptr = ac_build_gather_values(&ctx->ac, dwords, 2);
      ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->ac.i64, "");
      addr_space = AC_ADDR_SPACE_CONST;
   }
   return LLVMBuildIntToPtr(ctx->ac.builder, ptr, LLVMPointerType(ctx->ac.v4i32, addr_space), "");
}

static LLVMValueRef
radv_load_ssbo(struct ac_shader_abi *abi, LLVMValueRef buffer_ptr, bool write, bool non_uniform)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
   LLVMValueRef result;

   buffer_ptr = get_desc_ptr(ctx, buffer_ptr, non_uniform);
   if (!non_uniform)
      LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);

   result = LLVMBuildLoad(ctx->ac.builder, buffer_ptr, "");
   LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md);
   LLVMSetAlignment(result, 4);

   return result;
}

static LLVMValueRef
radv_load_ubo(struct ac_shader_abi *abi, unsigned desc_set, unsigned binding, bool valid_binding,
              LLVMValueRef buffer_ptr)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
   LLVMValueRef result;

   if (valid_binding) {
      struct radv_pipeline_layout *pipeline_layout = ctx->args->options->layout;
      struct radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;

      if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
         LLVMValueRef set_ptr = ac_llvm_extract_elem(&ctx->ac, buffer_ptr, 0);
         LLVMValueRef offset = ac_llvm_extract_elem(&ctx->ac, buffer_ptr, 1);
         buffer_ptr = LLVMBuildNUWAdd(ctx->ac.builder, set_ptr, offset, "");

         uint32_t desc_type =
            S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
            S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);

         if (ctx->ac.chip_class >= GFX10) {
            desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
                         S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
         } else {
            desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
                         S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
         }

         LLVMValueRef desc_components[4] = {
            LLVMBuildPtrToInt(ctx->ac.builder, buffer_ptr, ctx->ac.intptr, ""),
            LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->args->options->address32_hi),
                         false),
            LLVMConstInt(ctx->ac.i32, 0xffffffff, false),
            LLVMConstInt(ctx->ac.i32, desc_type, false),
         };

         return ac_build_gather_values(&ctx->ac, desc_components, 4);
      }
   }

   buffer_ptr = get_desc_ptr(ctx, buffer_ptr, false);
   LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);

   result = LLVMBuildLoad(ctx->ac.builder, buffer_ptr, "");
   LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md);
   LLVMSetAlignment(result, 4);

   return result;
}

static LLVMValueRef
radv_get_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set, unsigned base_index,
                      unsigned constant_index, LLVMValueRef index,
                      enum ac_descriptor_type desc_type, bool image, bool write, bool bindless)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
   LLVMValueRef list = ctx->descriptor_sets[descriptor_set];
   struct radv_descriptor_set_layout *layout =
      ctx->args->options->layout->set[descriptor_set].layout;
   struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
   unsigned offset = binding->offset;
   unsigned stride = binding->size;
   unsigned type_size;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMTypeRef type;

   assert(base_index < layout->binding_count);

   switch (desc_type) {
   case AC_DESC_IMAGE:
      type = ctx->ac.v8i32;
      type_size = 32;
      break;
   case AC_DESC_FMASK:
      type = ctx->ac.v8i32;
      offset += 32;
      type_size = 32;
      break;
   case AC_DESC_SAMPLER:
      type = ctx->ac.v4i32;
      if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) {
         offset += radv_combined_image_descriptor_sampler_offset(binding);
      }

      type_size = 16;
      break;
   case AC_DESC_BUFFER:
      type = ctx->ac.v4i32;
      type_size = 16;
      break;
   case AC_DESC_PLANE_0:
   case AC_DESC_PLANE_1:
   case AC_DESC_PLANE_2:
      type = ctx->ac.v8i32;
      type_size = 32;
      offset += 32 * (desc_type - AC_DESC_PLANE_0);
      break;
   default:
      unreachable("invalid desc_type\n");
   }

   offset += constant_index * stride;

   if (desc_type == AC_DESC_SAMPLER && binding->immutable_samplers_offset &&
       (!index || binding->immutable_samplers_equal)) {
      if (binding->immutable_samplers_equal)
         constant_index = 0;

      const uint32_t *samplers = radv_immutable_samplers(layout, binding);

      LLVMValueRef constants[] = {
         LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 0], 0),
         LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 1], 0),
         LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 2], 0),
         LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 3], 0),
      };
      return ac_build_gather_values(&ctx->ac, constants, 4);
   }

   assert(stride % type_size == 0);

   LLVMValueRef adjusted_index = index;
   if (!adjusted_index)
      adjusted_index = ctx->ac.i32_0;

   adjusted_index =
      LLVMBuildMul(builder, adjusted_index, LLVMConstInt(ctx->ac.i32, stride / type_size, 0), "");

   LLVMValueRef val_offset = LLVMConstInt(ctx->ac.i32, offset, 0);
   list = LLVMBuildGEP(builder, list, &val_offset, 1, "");
   list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(type), "");

   LLVMValueRef descriptor = ac_build_load_to_sgpr(&ctx->ac, list, adjusted_index);

   /* 3 plane formats always have same size and format for plane 1 & 2, so
    * use the tail from plane 1 so that we can store only the first 16 bytes
    * of the last plane. */
   if (desc_type == AC_DESC_PLANE_2) {
      LLVMValueRef descriptor2 =
         radv_get_sampler_desc(abi, descriptor_set, base_index, constant_index, index,
                               AC_DESC_PLANE_1, image, write, bindless);

      LLVMValueRef components[8];
      for (unsigned i = 0; i < 4; ++i)
         components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor, i);

      for (unsigned i = 4; i < 8; ++i)
         components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor2, i);
      descriptor = ac_build_gather_values(&ctx->ac, components, 8);
   } else if (desc_type == AC_DESC_IMAGE &&
              ctx->args->options->has_image_load_dcc_bug &&
              image && !write) {
      LLVMValueRef components[8];

      for (unsigned i = 0; i < 8; i++)
         components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor, i);

      /* WRITE_COMPRESS_ENABLE must be 0 for all image loads to work around a hardware bug. */
      components[6] = LLVMBuildAnd(ctx->ac.builder, components[6],
                                   LLVMConstInt(ctx->ac.i32, C_00A018_WRITE_COMPRESS_ENABLE, false), "");

      descriptor = ac_build_gather_values(&ctx->ac, components, 8);
   }

   return descriptor;
}

/* For 2_10_10_10 formats the alpha is handled as unsigned by pre-Vega HW,
 * so we may need to fix it up. */
static LLVMValueRef
adjust_vertex_fetch_alpha(struct radv_shader_context *ctx, unsigned adjustment, LLVMValueRef alpha)
{
   if (adjustment == AC_FETCH_FORMAT_NONE)
      return alpha;

   LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);

   alpha = LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.f32, "");

   if (adjustment == AC_FETCH_FORMAT_SSCALED)
      alpha = LLVMBuildFPToUI(ctx->ac.builder, alpha, ctx->ac.i32, "");
   else
      alpha = ac_to_integer(&ctx->ac, alpha);

   /* For the integer-like cases, do a natural sign extension.
    *
    * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
    * and happen to contain 0, 1, 2, 3 as the two LSBs of the
    * exponent.
    */
   alpha =
      LLVMBuildShl(ctx->ac.builder, alpha,
                   adjustment == AC_FETCH_FORMAT_SNORM ? LLVMConstInt(ctx->ac.i32, 7, 0) : c30, "");
   alpha = LLVMBuildAShr(ctx->ac.builder, alpha, c30, "");

   /* Convert back to the right type. */
   if (adjustment == AC_FETCH_FORMAT_SNORM) {
      LLVMValueRef clamp;
      LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0);
      alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, "");
      clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, alpha, neg_one, "");
      alpha = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, alpha, "");
   } else if (adjustment == AC_FETCH_FORMAT_SSCALED) {
      alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, "");
   }

   return LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.i32, "");
}

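/* Expand a fetched vertex attribute to a full vec4, filling the missing
 * components with the usual (0, 0, 0, 1) defaults. */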
static LLVMValueRef
radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx, LLVMValueRef value,
                                unsigned num_channels, bool is_float)
{
   LLVMValueRef zero = is_float ? ctx->ac.f32_0 : ctx->ac.i32_0;
   LLVMValueRef one = is_float ? ctx->ac.f32_1 : ctx->ac.i32_1;
   LLVMValueRef chan[4];

   if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
      unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));

      if (num_channels == 4 && num_channels == vec_size)
         return value;

      num_channels = MIN2(num_channels, vec_size);

      for (unsigned i = 0; i < num_channels; i++)
         chan[i] = ac_llvm_extract_elem(&ctx->ac, value, i);
   } else {
      assert(num_channels == 1);
      chan[0] = value;
   }

   for (unsigned i = num_channels; i < 4; i++) {
      chan[i] = i == 3 ? one : zero;
      chan[i] = ac_to_integer(&ctx->ac, chan[i]);
   }

   return ac_build_gather_values(&ctx->ac, chan, 4);
}

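/* Lower one VS input variable: compute the vertex or instance index, fetch
 * the attribute from its vertex-buffer descriptor using the format encoded
 * in the pipeline key, and store the result per channel into ctx->inputs. */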
static void
handle_vs_input_decl(struct radv_shader_context *ctx, struct nir_variable *variable)
{
   LLVMValueRef t_list_ptr = ac_get_arg(&ctx->ac, ctx->args->ac.vertex_buffers);
   LLVMValueRef t_offset;
   LLVMValueRef t_list;
   LLVMValueRef input;
   LLVMValueRef buffer_index;
   unsigned attrib_count = glsl_count_attribute_slots(variable->type, true);

   enum glsl_base_type type = glsl_get_base_type(variable->type);
   for (unsigned i = 0; i < attrib_count; ++i) {
      LLVMValueRef output[4];
      unsigned attrib_index = variable->data.location + i - VERT_ATTRIB_GENERIC0;
      unsigned attrib_format = ctx->args->options->key.vs.vertex_attribute_formats[attrib_index];
      unsigned data_format = attrib_format & 0x0f;
      unsigned num_format = (attrib_format >> 4) & 0x07;
      bool is_float =
         num_format != V_008F0C_BUF_NUM_FORMAT_UINT && num_format != V_008F0C_BUF_NUM_FORMAT_SINT;
      uint8_t input_usage_mask =
         ctx->args->shader_info->vs.input_usage_mask[variable->data.location + i];
      unsigned num_input_channels = util_last_bit(input_usage_mask);

      if (num_input_channels == 0)
         continue;

      if (ctx->args->options->key.vs.instance_rate_inputs & (1u << attrib_index)) {
         uint32_t divisor = ctx->args->options->key.vs.instance_rate_divisors[attrib_index];

         if (divisor) {
            buffer_index = ctx->abi.instance_id;

            if (divisor != 1) {
               buffer_index = LLVMBuildUDiv(ctx->ac.builder, buffer_index,
                                            LLVMConstInt(ctx->ac.i32, divisor, 0), "");
            }
         } else {
            buffer_index = ctx->ac.i32_0;
         }

         buffer_index = LLVMBuildAdd(
            ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->ac.start_instance), buffer_index, "");
      } else {
         buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,
                                     ac_get_arg(&ctx->ac, ctx->args->ac.base_vertex), "");
      }

      const struct ac_data_format_info *vtx_info = ac_get_data_format_info(data_format);

      /* Adjust the number of channels to load based on the vertex
       * attribute format.
       */
      unsigned num_channels = MIN2(num_input_channels, vtx_info->num_channels);
      unsigned attrib_binding = ctx->args->options->key.vs.vertex_attribute_bindings[attrib_index];
      unsigned attrib_offset = ctx->args->options->key.vs.vertex_attribute_offsets[attrib_index];
      unsigned attrib_stride = ctx->args->options->key.vs.vertex_attribute_strides[attrib_index];
      unsigned alpha_adjust = ctx->args->options->key.vs.alpha_adjust[attrib_index];

      if (ctx->args->options->key.vs.post_shuffle & (1 << attrib_index)) {
         /* Always load at least 3 channels for formats that
          * need to be shuffled because X<->Z.
          */
         num_channels = MAX2(num_channels, 3);
      }

      unsigned desc_index =
         ctx->args->shader_info->vs.use_per_attribute_vb_descs ? attrib_index : attrib_binding;
      desc_index = util_bitcount(ctx->args->shader_info->vs.vb_desc_usage_mask &
                                 u_bit_consecutive(0, desc_index));
      t_offset = LLVMConstInt(ctx->ac.i32, desc_index, false);
      t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);

      /* Always split typed vertex buffer loads on GFX6 and GFX10+
       * to avoid any alignment issues that trigger memory
       * violations and eventually a GPU hang. This can happen if
       * the stride (static or dynamic) is unaligned and also if the
       * VBO offset is aligned to a scalar (e.g. stride is 8 and VBO
       * offset is 2 for R16G16B16A16_SNORM).
       */
if (ctx->ac.chip_class == GFX6 || ctx->ac.chip_class >= GFX10) {
775
unsigned chan_format = vtx_info->chan_format;
776
LLVMValueRef values[4];
777
778
assert(ctx->ac.chip_class == GFX6 || ctx->ac.chip_class >= GFX10);
779
780
for (unsigned chan = 0; chan < num_channels; chan++) {
781
unsigned chan_offset = attrib_offset + chan * vtx_info->chan_byte_size;
782
LLVMValueRef chan_index = buffer_index;
783
784
if (attrib_stride != 0 && chan_offset > attrib_stride) {
785
LLVMValueRef buffer_offset =
786
LLVMConstInt(ctx->ac.i32, chan_offset / attrib_stride, false);
787
788
chan_index = LLVMBuildAdd(ctx->ac.builder, buffer_index, buffer_offset, "");
789
790
chan_offset = chan_offset % attrib_stride;
791
}
792
793
values[chan] = ac_build_struct_tbuffer_load(
794
&ctx->ac, t_list, chan_index, LLVMConstInt(ctx->ac.i32, chan_offset, false),
795
ctx->ac.i32_0, ctx->ac.i32_0, 1, chan_format, num_format, 0, true);
796
}
797
798
input = ac_build_gather_values(&ctx->ac, values, num_channels);
799
} else {
800
if (attrib_stride != 0 && attrib_offset > attrib_stride) {
801
LLVMValueRef buffer_offset =
802
LLVMConstInt(ctx->ac.i32, attrib_offset / attrib_stride, false);
803
804
buffer_index = LLVMBuildAdd(ctx->ac.builder, buffer_index, buffer_offset, "");
805
806
attrib_offset = attrib_offset % attrib_stride;
807
}
808
809
input = ac_build_struct_tbuffer_load(
810
&ctx->ac, t_list, buffer_index, LLVMConstInt(ctx->ac.i32, attrib_offset, false),
811
ctx->ac.i32_0, ctx->ac.i32_0, num_channels, data_format, num_format, 0, true);
812
}
813
814
if (ctx->args->options->key.vs.post_shuffle & (1 << attrib_index)) {
815
LLVMValueRef c[4];
816
c[0] = ac_llvm_extract_elem(&ctx->ac, input, 2);
817
c[1] = ac_llvm_extract_elem(&ctx->ac, input, 1);
818
c[2] = ac_llvm_extract_elem(&ctx->ac, input, 0);
819
c[3] = ac_llvm_extract_elem(&ctx->ac, input, 3);
820
821
input = ac_build_gather_values(&ctx->ac, c, 4);
822
}
823
824
input = radv_fixup_vertex_input_fetches(ctx, input, num_channels, is_float);
825
826
for (unsigned chan = 0; chan < 4; chan++) {
827
LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);
828
output[chan] = LLVMBuildExtractElement(ctx->ac.builder, input, llvm_chan, "");
829
if (type == GLSL_TYPE_FLOAT16) {
830
output[chan] = LLVMBuildBitCast(ctx->ac.builder, output[chan], ctx->ac.f32, "");
831
output[chan] = LLVMBuildFPTrunc(ctx->ac.builder, output[chan], ctx->ac.f16, "");
832
}
833
}
834
835
output[3] = adjust_vertex_fetch_alpha(ctx, alpha_adjust, output[3]);
836
837
for (unsigned chan = 0; chan < 4; chan++) {
838
output[chan] = ac_to_integer(&ctx->ac, output[chan]);
839
if (type == GLSL_TYPE_UINT16 || type == GLSL_TYPE_INT16)
840
output[chan] = LLVMBuildTrunc(ctx->ac.builder, output[chan], ctx->ac.i16, "");
841
842
ctx->inputs[ac_llvm_reg_index_soa(variable->data.location + i, chan)] = output[chan];
843
}
844
}
845
}
846
847
static void
848
handle_vs_inputs(struct radv_shader_context *ctx, struct nir_shader *nir)
849
{
850
nir_foreach_shader_in_variable (variable, nir)
851
handle_vs_input_decl(ctx, variable);
852
}
853
854
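/* Set up the centroid barycentrics. When a shader mixes center and centroid
 * interpolation, the centroid values are replaced by the center ones
 * whenever the sign bit of prim_mask is set (which the driver presumably
 * uses to signal that centroid would be redundant, e.g. without MSAA). */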
static void
prepare_interp_optimize(struct radv_shader_context *ctx, struct nir_shader *nir)
{
   bool uses_center = false;
   bool uses_centroid = false;
   nir_foreach_shader_in_variable (variable, nir) {
      if (glsl_get_base_type(glsl_without_array(variable->type)) != GLSL_TYPE_FLOAT ||
          variable->data.sample)
         continue;

      if (variable->data.centroid)
         uses_centroid = true;
      else
         uses_center = true;
   }

   ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.persp_centroid);
   ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.linear_centroid);

   if (uses_center && uses_centroid) {
      LLVMValueRef sel =
         LLVMBuildICmp(ctx->ac.builder, LLVMIntSLT, ac_get_arg(&ctx->ac, ctx->args->ac.prim_mask),
                       ctx->ac.i32_0, "");
      ctx->abi.persp_centroid =
         LLVMBuildSelect(ctx->ac.builder, sel, ac_get_arg(&ctx->ac, ctx->args->ac.persp_center),
                         ctx->abi.persp_centroid, "");
      ctx->abi.linear_centroid =
         LLVMBuildSelect(ctx->ac.builder, sel, ac_get_arg(&ctx->ac, ctx->args->ac.linear_center),
                         ctx->abi.linear_centroid, "");
   }
}

static void
scan_shader_output_decl(struct radv_shader_context *ctx, struct nir_variable *variable,
                        struct nir_shader *shader, gl_shader_stage stage)
{
   int idx = variable->data.driver_location;
   unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
   uint64_t mask_attribs;

   if (variable->data.compact) {
      unsigned component_count = variable->data.location_frac + glsl_get_length(variable->type);
      attrib_count = (component_count + 3) / 4;
   }

   mask_attribs = ((1ull << attrib_count) - 1) << idx;

   ctx->output_mask |= mask_attribs;
}

/* Initialize arguments for the shader export intrinsic */
static void
si_llvm_init_export_args(struct radv_shader_context *ctx, LLVMValueRef *values,
                         unsigned enabled_channels, unsigned target, struct ac_export_args *args)
{
   /* Specify the channels that are enabled. */
   args->enabled_channels = enabled_channels;

   /* Specify whether the EXEC mask represents the valid mask */
   args->valid_mask = 0;

   /* Specify whether this is the last export */
   args->done = 0;

   /* Specify the target we are exporting */
   args->target = target;

   args->compr = false;
   args->out[0] = LLVMGetUndef(ctx->ac.f32);
   args->out[1] = LLVMGetUndef(ctx->ac.f32);
   args->out[2] = LLVMGetUndef(ctx->ac.f32);
   args->out[3] = LLVMGetUndef(ctx->ac.f32);

   if (!values)
      return;

   bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2;
   if (ctx->stage == MESA_SHADER_FRAGMENT) {
      unsigned index = target - V_008DFC_SQ_EXP_MRT;
      unsigned col_format = (ctx->args->options->key.fs.col_format >> (4 * index)) & 0xf;
      bool is_int8 = (ctx->args->options->key.fs.is_int8 >> index) & 1;
      bool is_int10 = (ctx->args->options->key.fs.is_int10 >> index) & 1;

      LLVMValueRef (*packf)(struct ac_llvm_context * ctx, LLVMValueRef args[2]) = NULL;
      LLVMValueRef (*packi)(struct ac_llvm_context * ctx, LLVMValueRef args[2], unsigned bits,
                            bool hi) = NULL;

      switch (col_format) {
      case V_028714_SPI_SHADER_ZERO:
         args->enabled_channels = 0; /* writemask */
         args->target = V_008DFC_SQ_EXP_NULL;
         break;

      case V_028714_SPI_SHADER_32_R:
         args->enabled_channels = 1;
         args->out[0] = values[0];
         break;

      case V_028714_SPI_SHADER_32_GR:
         args->enabled_channels = 0x3;
         args->out[0] = values[0];
         args->out[1] = values[1];
         break;

      case V_028714_SPI_SHADER_32_AR:
         if (ctx->ac.chip_class >= GFX10) {
            args->enabled_channels = 0x3;
            args->out[0] = values[0];
            args->out[1] = values[3];
         } else {
            args->enabled_channels = 0x9;
            args->out[0] = values[0];
            args->out[3] = values[3];
         }
         break;

      case V_028714_SPI_SHADER_FP16_ABGR:
         args->enabled_channels = 0xf;
         packf = ac_build_cvt_pkrtz_f16;
         if (is_16bit) {
            for (unsigned chan = 0; chan < 4; chan++)
               values[chan] = LLVMBuildFPExt(ctx->ac.builder, values[chan], ctx->ac.f32, "");
         }
         break;

      case V_028714_SPI_SHADER_UNORM16_ABGR:
         args->enabled_channels = 0xf;
         packf = ac_build_cvt_pknorm_u16;
         break;

      case V_028714_SPI_SHADER_SNORM16_ABGR:
         args->enabled_channels = 0xf;
         packf = ac_build_cvt_pknorm_i16;
         break;

      case V_028714_SPI_SHADER_UINT16_ABGR:
         args->enabled_channels = 0xf;
         packi = ac_build_cvt_pk_u16;
         if (is_16bit) {
            for (unsigned chan = 0; chan < 4; chan++)
               values[chan] = LLVMBuildZExt(ctx->ac.builder, ac_to_integer(&ctx->ac, values[chan]),
                                            ctx->ac.i32, "");
         }
         break;

      case V_028714_SPI_SHADER_SINT16_ABGR:
         args->enabled_channels = 0xf;
         packi = ac_build_cvt_pk_i16;
         if (is_16bit) {
            for (unsigned chan = 0; chan < 4; chan++)
               values[chan] = LLVMBuildSExt(ctx->ac.builder, ac_to_integer(&ctx->ac, values[chan]),
                                            ctx->ac.i32, "");
         }
         break;

      default:
      case V_028714_SPI_SHADER_32_ABGR:
         memcpy(&args->out[0], values, sizeof(values[0]) * 4);
         break;
      }

      /* Replace NaN by zero (only 32-bit) to fix game bugs if
       * requested.
       */
      if (ctx->args->options->enable_mrt_output_nan_fixup && !is_16bit &&
          (col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR ||
           col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR ||
           col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
         for (unsigned i = 0; i < 4; i++) {
            LLVMValueRef class_args[2] = {values[i],
                                          LLVMConstInt(ctx->ac.i32, S_NAN | Q_NAN, false)};
            LLVMValueRef isnan = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.class.f32", ctx->ac.i1,
                                                    class_args, 2, AC_FUNC_ATTR_READNONE);
            values[i] = LLVMBuildSelect(ctx->ac.builder, isnan, ctx->ac.f32_0, values[i], "");
         }
      }

      /* Pack f16 or norm_i16/u16. */
      if (packf) {
         for (unsigned chan = 0; chan < 2; chan++) {
            LLVMValueRef pack_args[2] = {values[2 * chan], values[2 * chan + 1]};
            LLVMValueRef packed;

            packed = packf(&ctx->ac, pack_args);
            args->out[chan] = ac_to_float(&ctx->ac, packed);
         }
         args->compr = 1; /* COMPR flag */
      }

      /* Pack i16/u16. */
      if (packi) {
         for (unsigned chan = 0; chan < 2; chan++) {
            LLVMValueRef pack_args[2] = {ac_to_integer(&ctx->ac, values[2 * chan]),
                                         ac_to_integer(&ctx->ac, values[2 * chan + 1])};
            LLVMValueRef packed;

            packed = packi(&ctx->ac, pack_args, is_int8 ? 8 : is_int10 ? 10 : 16, chan == 1);
            args->out[chan] = ac_to_float(&ctx->ac, packed);
         }
         args->compr = 1; /* COMPR flag */
      }
      return;
   }

   if (is_16bit) {
      for (unsigned chan = 0; chan < 4; chan++) {
         values[chan] = LLVMBuildBitCast(ctx->ac.builder, values[chan], ctx->ac.i16, "");
         args->out[chan] = LLVMBuildZExt(ctx->ac.builder, values[chan], ctx->ac.i32, "");
      }
   } else
      memcpy(&args->out[0], values, sizeof(values[0]) * 4);

   for (unsigned i = 0; i < 4; ++i)
      args->out[i] = ac_to_float(&ctx->ac, args->out[i]);
}

static void
radv_export_param(struct radv_shader_context *ctx, unsigned index, LLVMValueRef *values,
                  unsigned enabled_channels)
{
   struct ac_export_args args;

   si_llvm_init_export_args(ctx, values, enabled_channels, V_008DFC_SQ_EXP_PARAM + index, &args);
   ac_build_export(&ctx->ac, &args);
}

static LLVMValueRef
radv_load_output(struct radv_shader_context *ctx, unsigned index, unsigned chan)
{
   LLVMValueRef output = ctx->abi.outputs[ac_llvm_reg_index_soa(index, chan)];
   return LLVMBuildLoad(ctx->ac.builder, output, "");
}

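/* Store one transform-feedback output: gather the enabled components, pack
 * them into an i32 vector and write it to the streamout buffer at the
 * per-buffer offset computed in radv_emit_streamout(). */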
static void
radv_emit_stream_output(struct radv_shader_context *ctx, LLVMValueRef const *so_buffers,
                        LLVMValueRef const *so_write_offsets,
                        const struct radv_stream_output *output,
                        struct radv_shader_output_values *shader_out)
{
   unsigned num_comps = util_bitcount(output->component_mask);
   unsigned buf = output->buffer;
   unsigned offset = output->offset;
   unsigned start;
   LLVMValueRef out[4];

   assert(num_comps && num_comps <= 4);
   if (!num_comps || num_comps > 4)
      return;

   /* Get the first component. */
   start = ffs(output->component_mask) - 1;

   /* Load the output as int. */
   for (int i = 0; i < num_comps; i++) {
      out[i] = ac_to_integer(&ctx->ac, shader_out->values[start + i]);
   }

   /* Pack the output. */
   LLVMValueRef vdata = NULL;

   switch (num_comps) {
   case 1: /* as i32 */
      vdata = out[0];
      break;
   case 2: /* as v2i32 */
   case 3: /* as v4i32 (aligned to 4) */
      out[3] = LLVMGetUndef(ctx->ac.i32);
      FALLTHROUGH;
   case 4: /* as v4i32 */
      vdata = ac_build_gather_values(&ctx->ac, out,
                                     !ac_has_vec3_support(ctx->ac.chip_class, false)
                                        ? util_next_power_of_two(num_comps)
                                        : num_comps);
      break;
   }

   ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf], vdata, num_comps, so_write_offsets[buf],
                               ctx->ac.i32_0, offset, ac_glc | ac_slc);
}

static void
radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream)
{
   int i;

   /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
   assert(ctx->args->ac.streamout_config.used);
   LLVMValueRef so_vtx_count = ac_build_bfe(
      &ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.streamout_config),
      LLVMConstInt(ctx->ac.i32, 16, false), LLVMConstInt(ctx->ac.i32, 7, false), false);

   LLVMValueRef tid = ac_get_thread_id(&ctx->ac);

   /* can_emit = tid < so_vtx_count; */
   LLVMValueRef can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid, so_vtx_count, "");

   /* Emit the streamout code conditionally. This actually avoids
    * out-of-bounds buffer access. The hw tells us via the SGPR
    * (so_vtx_count) which threads are allowed to emit streamout data.
    */
   ac_build_ifcc(&ctx->ac, can_emit, 6501);
   {
      /* The buffer offset is computed as follows:
       *   ByteOffset = streamout_offset[buffer_id]*4 +
       *                (streamout_write_index + thread_id)*stride[buffer_id] +
       *                attrib_offset
       */
      LLVMValueRef so_write_index = ac_get_arg(&ctx->ac, ctx->args->ac.streamout_write_index);

      /* Compute (streamout_write_index + thread_id). */
      so_write_index = LLVMBuildAdd(ctx->ac.builder, so_write_index, tid, "");

      /* Load the descriptor and compute the write offset for each
       * enabled buffer.
       */
      LLVMValueRef so_write_offset[4] = {0};
      LLVMValueRef so_buffers[4] = {0};
      LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->args->streamout_buffers);

      for (i = 0; i < 4; i++) {
         uint16_t stride = ctx->args->shader_info->so.strides[i];

         if (!stride)
            continue;

         LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, i, false);

         so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);

         LLVMValueRef so_offset = ac_get_arg(&ctx->ac, ctx->args->ac.streamout_offset[i]);

         so_offset =
            LLVMBuildMul(ctx->ac.builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, false), "");

         so_write_offset[i] = ac_build_imad(
            &ctx->ac, so_write_index, LLVMConstInt(ctx->ac.i32, stride * 4, false), so_offset);
      }

      /* Write streamout data. */
      for (i = 0; i < ctx->args->shader_info->so.num_outputs; i++) {
         struct radv_shader_output_values shader_out = {0};
         struct radv_stream_output *output = &ctx->args->shader_info->so.outputs[i];

         if (stream != output->stream)
            continue;

         for (int j = 0; j < 4; j++) {
            shader_out.values[j] = radv_load_output(ctx, output->location, j);
         }

         radv_emit_stream_output(ctx, so_buffers, so_write_offset, output, &shader_out);
      }
   }
   ac_build_endif(&ctx->ac, 6501);
}

static void
radv_build_param_exports(struct radv_shader_context *ctx, struct radv_shader_output_values *outputs,
                         unsigned noutput, struct radv_vs_output_info *outinfo,
                         bool export_clip_dists)
{
   unsigned param_count = 0;

   for (unsigned i = 0; i < noutput; i++) {
      unsigned slot_name = outputs[i].slot_name;
      unsigned usage_mask = outputs[i].usage_mask;

      if (slot_name != VARYING_SLOT_LAYER && slot_name != VARYING_SLOT_PRIMITIVE_ID &&
          slot_name != VARYING_SLOT_VIEWPORT && slot_name != VARYING_SLOT_CLIP_DIST0 &&
          slot_name != VARYING_SLOT_CLIP_DIST1 && slot_name < VARYING_SLOT_VAR0)
         continue;

      if ((slot_name == VARYING_SLOT_CLIP_DIST0 || slot_name == VARYING_SLOT_CLIP_DIST1) &&
          !export_clip_dists)
         continue;

      radv_export_param(ctx, param_count, outputs[i].values, usage_mask);

      assert(i < ARRAY_SIZE(outinfo->vs_output_param_offset));
      outinfo->vs_output_param_offset[slot_name] = param_count++;
   }

   outinfo->param_exports = param_count;
}

/* Generate export instructions for hardware VS shader stage or NGG GS stage
 * (position and parameter data only).
 */
static void
radv_llvm_export_vs(struct radv_shader_context *ctx, struct radv_shader_output_values *outputs,
                    unsigned noutput, struct radv_vs_output_info *outinfo, bool export_clip_dists)
{
   LLVMValueRef psize_value = NULL, layer_value = NULL, viewport_value = NULL;
   LLVMValueRef primitive_shading_rate = NULL;
   struct ac_export_args pos_args[4] = {0};
   unsigned pos_idx, index;
   int i;

   /* Build position exports */
   for (i = 0; i < noutput; i++) {
      switch (outputs[i].slot_name) {
      case VARYING_SLOT_POS:
         si_llvm_init_export_args(ctx, outputs[i].values, 0xf, V_008DFC_SQ_EXP_POS, &pos_args[0]);
         break;
      case VARYING_SLOT_PSIZ:
         psize_value = outputs[i].values[0];
         break;
      case VARYING_SLOT_LAYER:
         layer_value = outputs[i].values[0];
         break;
      case VARYING_SLOT_VIEWPORT:
         viewport_value = outputs[i].values[0];
         break;
      case VARYING_SLOT_PRIMITIVE_SHADING_RATE:
         primitive_shading_rate = outputs[i].values[0];
         break;
      case VARYING_SLOT_CLIP_DIST0:
      case VARYING_SLOT_CLIP_DIST1:
         index = 2 + outputs[i].slot_index;
         si_llvm_init_export_args(ctx, outputs[i].values, 0xf, V_008DFC_SQ_EXP_POS + index,
                                  &pos_args[index]);
         break;
      default:
         break;
      }
   }

   /* We need to add the position output manually if it's missing. */
   if (!pos_args[0].out[0]) {
      pos_args[0].enabled_channels = 0xf; /* writemask */
      pos_args[0].valid_mask = 0;         /* EXEC mask */
      pos_args[0].done = 0;               /* last export? */
      pos_args[0].target = V_008DFC_SQ_EXP_POS;
      pos_args[0].compr = 0;              /* COMPR flag */
      pos_args[0].out[0] = ctx->ac.f32_0; /* X */
      pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
      pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
      pos_args[0].out[3] = ctx->ac.f32_1; /* W */
   }

   bool writes_primitive_shading_rate = outinfo->writes_primitive_shading_rate ||
                                        ctx->args->options->force_vrs_rates;

   if (outinfo->writes_pointsize || outinfo->writes_layer ||
       outinfo->writes_viewport_index || writes_primitive_shading_rate) {
      pos_args[1].enabled_channels = ((outinfo->writes_pointsize == true ? 1 : 0) |
                                      (writes_primitive_shading_rate == true ? 2 : 0) |
                                      (outinfo->writes_layer == true ? 4 : 0));
      pos_args[1].valid_mask = 0;
      pos_args[1].done = 0;
      pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
      pos_args[1].compr = 0;
      pos_args[1].out[0] = ctx->ac.f32_0; /* X */
      pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
      pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
      pos_args[1].out[3] = ctx->ac.f32_0; /* W */

      if (outinfo->writes_pointsize == true)
         pos_args[1].out[0] = psize_value;
      if (outinfo->writes_layer == true)
         pos_args[1].out[2] = layer_value;
      if (outinfo->writes_viewport_index == true) {
         if (ctx->args->options->chip_class >= GFX9) {
            /* GFX9 has the layer in out.z[10:0] and the viewport
             * index in out.z[19:16].
             */
            LLVMValueRef v = viewport_value;
            v = ac_to_integer(&ctx->ac, v);
            v = LLVMBuildShl(ctx->ac.builder, v, LLVMConstInt(ctx->ac.i32, 16, false), "");
            v = LLVMBuildOr(ctx->ac.builder, v, ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");

            pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
            pos_args[1].enabled_channels |= 1 << 2;
         } else {
            pos_args[1].out[3] = viewport_value;
            pos_args[1].enabled_channels |= 1 << 3;
         }
      }

      if (outinfo->writes_primitive_shading_rate) {
         pos_args[1].out[1] = primitive_shading_rate;
      } else if (ctx->args->options->force_vrs_rates) {
         /* Bits [2:3] = VRS rate X
          * Bits [4:5] = VRS rate Y
          *
          * The range is [-2, 1]. Values:
          *   1: 2x coarser shading rate in that direction.
          *   0: normal shading rate
          *  -1: 2x finer shading rate (sample shading, not directional)
          *  -2: 4x finer shading rate (sample shading, not directional)
          *
          * Sample shading can't go above 8 samples, so both numbers can't be -2 at the same time.
          */
         LLVMValueRef rates = LLVMConstInt(ctx->ac.i32, ctx->args->options->force_vrs_rates, false);
         LLVMValueRef cond;
         LLVMValueRef v;

         /* If Pos.W != 1 (typical for non-GUI elements), use 2x2 coarse shading. */
         cond = LLVMBuildFCmp(ctx->ac.builder, LLVMRealUNE, pos_args[0].out[3], ctx->ac.f32_1, "");
         v = LLVMBuildSelect(ctx->ac.builder, cond, rates, ctx->ac.i32_0, "");

         pos_args[1].out[1] = ac_to_float(&ctx->ac, v);
      }
   }

   for (i = 0; i < 4; i++) {
      if (pos_args[i].out[0])
         outinfo->pos_exports++;
   }

   /* GFX10 skips POS0 exports if EXEC=0 and DONE=0, causing a hang.
    * Setting valid_mask=1 prevents it and has no other effect.
    */
   if (ctx->ac.chip_class == GFX10)
      pos_args[0].valid_mask = 1;

   pos_idx = 0;
   for (i = 0; i < 4; i++) {
      if (!pos_args[i].out[0])
         continue;

      /* Specify the target we are exporting */
      pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;

      if (pos_idx == outinfo->pos_exports)
         /* Specify that this is the last export */
         pos_args[i].done = 1;

      ac_build_export(&ctx->ac, &pos_args[i]);
   }

   /* Build parameter exports */
   radv_build_param_exports(ctx, outputs, noutput, outinfo, export_clip_dists);
}

static void
handle_vs_outputs_post(struct radv_shader_context *ctx, bool export_prim_id, bool export_clip_dists,
                       struct radv_vs_output_info *outinfo)
{
   struct radv_shader_output_values *outputs;
   unsigned noutput = 0;

   if (ctx->args->options->key.has_multiview_view_index) {
      LLVMValueRef *tmp_out = &ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)];
      if (!*tmp_out) {
         for (unsigned i = 0; i < 4; ++i)
            ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, i)] =
               ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "");
      }

      LLVMValueRef view_index = ac_get_arg(&ctx->ac, ctx->args->ac.view_index);
      LLVMBuildStore(ctx->ac.builder, ac_to_float(&ctx->ac, view_index), *tmp_out);
      ctx->output_mask |= 1ull << VARYING_SLOT_LAYER;
   }

   memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
          sizeof(outinfo->vs_output_param_offset));
   outinfo->pos_exports = 0;

   if (!ctx->args->options->use_ngg_streamout && ctx->args->shader_info->so.num_outputs &&
       !ctx->args->is_gs_copy_shader) {
      /* The GS copy shader emission already emits streamout. */
      radv_emit_streamout(ctx, 0);
   }

   /* Allocate a temporary array for the output values. */
   unsigned num_outputs = util_bitcount64(ctx->output_mask) + export_prim_id;
   outputs = malloc(num_outputs * sizeof(outputs[0]));

   for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
      if (!(ctx->output_mask & (1ull << i)))
         continue;

      outputs[noutput].slot_name = i;
      outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;

      if (ctx->stage == MESA_SHADER_VERTEX && !ctx->args->is_gs_copy_shader) {
         outputs[noutput].usage_mask = ctx->args->shader_info->vs.output_usage_mask[i];
      } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
         outputs[noutput].usage_mask = ctx->args->shader_info->tes.output_usage_mask[i];
      } else {
         assert(ctx->args->is_gs_copy_shader);
         outputs[noutput].usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];
      }

      for (unsigned j = 0; j < 4; j++) {
         outputs[noutput].values[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, i, j));
      }

      noutput++;
   }

   /* Export PrimitiveID. */
   if (export_prim_id) {
      outputs[noutput].slot_name = VARYING_SLOT_PRIMITIVE_ID;
      outputs[noutput].slot_index = 0;
      outputs[noutput].usage_mask = 0x1;
      if (ctx->stage == MESA_SHADER_TESS_EVAL)
         outputs[noutput].values[0] = ac_get_arg(&ctx->ac, ctx->args->ac.tes_patch_id);
      else
         outputs[noutput].values[0] = ac_get_arg(&ctx->ac, ctx->args->ac.vs_prim_id);
      for (unsigned j = 1; j < 4; j++)
         outputs[noutput].values[j] = ctx->ac.f32_0;
      noutput++;
   }

   radv_llvm_export_vs(ctx, outputs, noutput, outinfo, export_clip_dists);

   free(outputs);
}

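/* merged_wave_info packs several fields into one SGPR; per the unpack calls
 * below, the wave index within the threadgroup is in bits [24:27] and the
 * number of waves in bits [28:31]. */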
static LLVMValueRef
get_wave_id_in_tg(struct radv_shader_context *ctx)
{
   return ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 24, 4);
}

static LLVMValueRef
get_tgsize(struct radv_shader_context *ctx)
{
   return ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 28, 4);
}

static LLVMValueRef
get_thread_id_in_tg(struct radv_shader_context *ctx)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef tmp;
   tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
                      LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), "");
   return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), "");
}

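/* gs_tg_info is likewise a packed SGPR; judging by the bitfield extracts
 * below, the ordered wave ID is in bits [0:11], the vertex count in bits
 * [12:20] and the primitive count in bits [22:30]. */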
static LLVMValueRef
1488
ngg_get_vtx_cnt(struct radv_shader_context *ctx)
1489
{
1490
return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_tg_info),
1491
LLVMConstInt(ctx->ac.i32, 12, false), LLVMConstInt(ctx->ac.i32, 9, false),
1492
false);
1493
}
1494
1495
static LLVMValueRef
1496
ngg_get_prim_cnt(struct radv_shader_context *ctx)
1497
{
1498
return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_tg_info),
1499
LLVMConstInt(ctx->ac.i32, 22, false), LLVMConstInt(ctx->ac.i32, 9, false),
1500
false);
1501
}
1502
1503
static LLVMValueRef
1504
ngg_get_ordered_id(struct radv_shader_context *ctx)
1505
{
1506
return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_tg_info), ctx->ac.i32_0,
1507
LLVMConstInt(ctx->ac.i32, 12, false), false);
1508
}
1509

static LLVMValueRef
ngg_gs_get_vertex_storage(struct radv_shader_context *ctx)
{
   unsigned num_outputs = util_bitcount64(ctx->output_mask);

   if (ctx->args->options->key.has_multiview_view_index)
      num_outputs++;

   LLVMTypeRef elements[2] = {
      LLVMArrayType(ctx->ac.i32, 4 * num_outputs),
      LLVMArrayType(ctx->ac.i8, 4),
   };
   LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false);
   type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS);
   return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, "");
}

/**
 * Return a pointer to the LDS storage reserved for the N'th vertex, where N
 * is in emit order; that is:
 * - during the epilogue, N is the threadidx (relative to the entire threadgroup)
 * - during vertex emit, i.e. while the API GS shader invocation is running,
 *   N = threadidx * gs_max_out_vertices + emitidx
 *
 * Goals of the LDS memory layout:
 * 1. Eliminate bank conflicts on write for geometry shaders that have all emits
 *    in uniform control flow
 * 2. Eliminate bank conflicts on read for export if, additionally, there is no
 *    culling
 * 3. Agnostic to the number of waves (since we don't know it before compiling)
 * 4. Allow coalescing of LDS instructions (ds_write_b128 etc.)
 * 5. Avoid wasting memory.
 *
 * We use an AoS layout due to point 4 (this also helps point 3). In an AoS
 * layout, elimination of bank conflicts requires that each vertex occupy an
 * odd number of dwords. We use the additional dword to store the output stream
 * index as well as a flag to indicate whether this vertex ends a primitive
 * for rasterization.
 *
 * Swizzling is required to satisfy points 1 and 2 simultaneously.
 *
 * Vertices are stored in export order (gsthread * gs_max_out_vertices + emitidx).
 * Indices are swizzled in groups of 32, which ensures point 1 without
 * disturbing point 2.
 *
 * \return an LDS pointer to type {[N x i32], [4 x i8]}
 */
static LLVMValueRef
ngg_gs_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexidx)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx);

   /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */
   unsigned write_stride_2exp = ffs(MAX2(ctx->shader->info.gs.vertices_out, 1)) - 1;
   if (write_stride_2exp) {
      LLVMValueRef row = LLVMBuildLShr(builder, vertexidx, LLVMConstInt(ctx->ac.i32, 5, false), "");
      LLVMValueRef swizzle = LLVMBuildAnd(
         builder, row, LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1, false), "");
      vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, "");
   }

   return ac_build_gep0(&ctx->ac, storage, vertexidx);
}
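
/* Worked example of the swizzle above, assuming gs.vertices_out == 4
 * (write_stride_2exp == 2): vertices 0..31 sit in row 0 and are untouched,
 * vertices 32..63 (row 1) have their index XORed with 1 (32 <-> 33, ...),
 * row 2 XORs with 2, and so on, staggering where each vertex record starts
 * relative to the LDS banks.
 */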

static LLVMValueRef
ngg_gs_emit_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef gsthread, LLVMValueRef emitidx)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef tmp;

   tmp = LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false);
   tmp = LLVMBuildMul(builder, tmp, gsthread, "");
   const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, "");
   return ngg_gs_vertex_ptr(ctx, vertexidx);
}

static LLVMValueRef
ngg_gs_get_emit_output_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexptr,
                           unsigned out_idx)
{
   LLVMValueRef gep_idx[3] = {
      ctx->ac.i32_0, /* implied C-style array */
      ctx->ac.i32_0, /* first struct entry */
      LLVMConstInt(ctx->ac.i32, out_idx, false),
   };
   return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
}

static LLVMValueRef
ngg_gs_get_emit_primflag_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexptr,
                             unsigned stream)
{
   LLVMValueRef gep_idx[3] = {
      ctx->ac.i32_0, /* implied C-style array */
      ctx->ac.i32_1, /* second struct entry */
      LLVMConstInt(ctx->ac.i32, stream, false),
   };
   return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
}
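
/* Both helpers above index the per-vertex LDS record returned by
 * ngg_gs_vertex_ptr: struct entry 0 is the [4 * num_outputs x i32]
 * attribute storage, entry 1 the [4 x i8] primitive flags, one byte
 * per vertex stream.
 */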

static struct radv_stream_output *
radv_get_stream_output_by_loc(struct radv_streamout_info *so, unsigned location)
{
   for (unsigned i = 0; i < so->num_outputs; ++i) {
      if (so->outputs[i].location == location)
         return &so->outputs[i];
   }

   return NULL;
}

static void
build_streamout_vertex(struct radv_shader_context *ctx, LLVMValueRef *so_buffer,
                       LLVMValueRef *wg_offset_dw, unsigned stream, LLVMValueRef offset_vtx,
                       LLVMValueRef vertexptr)
{
   struct radv_streamout_info *so = &ctx->args->shader_info->so;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef offset[4] = {0};
   LLVMValueRef tmp;

   for (unsigned buffer = 0; buffer < 4; ++buffer) {
      if (!wg_offset_dw[buffer])
         continue;

      tmp = LLVMBuildMul(builder, offset_vtx, LLVMConstInt(ctx->ac.i32, so->strides[buffer], false),
                         "");
      tmp = LLVMBuildAdd(builder, wg_offset_dw[buffer], tmp, "");
      offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 2, false), "");
   }

   if (ctx->stage == MESA_SHADER_GEOMETRY) {
      struct radv_shader_output_values outputs[AC_LLVM_MAX_OUTPUTS];
      unsigned noutput = 0;
      unsigned out_idx = 0;

      for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
         unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];
         uint8_t output_stream = ctx->args->shader_info->gs.output_streams[i];

         if (!(ctx->output_mask & (1ull << i)) || output_stream != stream)
            continue;

         outputs[noutput].slot_name = i;
         outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;
         outputs[noutput].usage_mask = output_usage_mask;

         int length = util_last_bit(output_usage_mask);

         for (unsigned j = 0; j < length; j++, out_idx++) {
            if (!(output_usage_mask & (1 << j)))
               continue;

            tmp = ac_build_gep0(&ctx->ac, vertexptr, LLVMConstInt(ctx->ac.i32, out_idx, false));
            outputs[noutput].values[j] = LLVMBuildLoad(builder, tmp, "");
         }

         for (unsigned j = length; j < 4; j++)
            outputs[noutput].values[j] = LLVMGetUndef(ctx->ac.f32);

         noutput++;
      }

      for (unsigned i = 0; i < noutput; i++) {
         struct radv_stream_output *output =
            radv_get_stream_output_by_loc(so, outputs[i].slot_name);

         if (!output || output->stream != stream)
            continue;

         struct radv_shader_output_values out = {0};

         for (unsigned j = 0; j < 4; j++) {
            out.values[j] = outputs[i].values[j];
         }

         radv_emit_stream_output(ctx, so_buffer, offset, output, &out);
      }
   } else {
      for (unsigned i = 0; i < so->num_outputs; ++i) {
         struct radv_stream_output *output = &ctx->args->shader_info->so.outputs[i];

         if (stream != output->stream)
            continue;

         struct radv_shader_output_values out = {0};

         for (unsigned comp = 0; comp < 4; comp++) {
            if (!(output->component_mask & (1 << comp)))
               continue;

            tmp =
               ac_build_gep0(&ctx->ac, vertexptr, LLVMConstInt(ctx->ac.i32, 4 * i + comp, false));
            out.values[comp] = LLVMBuildLoad(builder, tmp, "");
         }

         radv_emit_stream_output(ctx, so_buffer, offset, output, &out);
      }
   }
}

struct ngg_streamout {
   LLVMValueRef num_vertices;

   /* per-thread data */
   LLVMValueRef prim_enable[4]; /* i1 per stream */
   LLVMValueRef vertices[3];    /* [N x i32] addrspace(LDS)* */

   /* Output */
   LLVMValueRef emit[4]; /* per-stream emitted primitives (only valid for used streams) */
};

/**
 * Build streamout logic.
 *
 * Implies a barrier.
 *
 * Writes number of emitted primitives to gs_ngg_scratch[4:7].
 *
 * Clobbers gs_ngg_scratch[8:].
 */
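/* A sketch of the gs_ngg_scratch dword map implied by the bases and GEPs
 * below: with a GS, dwords [0:3] accumulate the per-stream generated-prim
 * counts, [4:7] hold the per-stream emit counts, [8:11] the per-buffer
 * streamout offsets and [12+] the workgroup-scan scratch; without a GS the
 * emit counts start at dword 0 and the offsets at dword 4.
 */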
static void
build_streamout(struct radv_shader_context *ctx, struct ngg_streamout *nggso)
{
   struct radv_streamout_info *so = &ctx->args->shader_info->so;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->args->streamout_buffers);
   LLVMValueRef tid = get_thread_id_in_tg(ctx);
   LLVMValueRef cond, tmp, tmp2;
   LLVMValueRef i32_2 = LLVMConstInt(ctx->ac.i32, 2, false);
   LLVMValueRef i32_4 = LLVMConstInt(ctx->ac.i32, 4, false);
   LLVMValueRef i32_8 = LLVMConstInt(ctx->ac.i32, 8, false);
   LLVMValueRef so_buffer[4] = {0};
   unsigned max_num_vertices = 1 + (nggso->vertices[1] ? 1 : 0) + (nggso->vertices[2] ? 1 : 0);
   LLVMValueRef prim_stride_dw[4] = {0};
   LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->ac.i32);
   int stream_for_buffer[4] = {-1, -1, -1, -1};
   unsigned bufmask_for_stream[4] = {0};
   bool isgs = ctx->stage == MESA_SHADER_GEOMETRY;
   unsigned scratch_emit_base = isgs ? 4 : 0;
   LLVMValueRef scratch_emit_basev = isgs ? i32_4 : ctx->ac.i32_0;
   unsigned scratch_offset_base = isgs ? 8 : 4;
   LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4;

   ac_llvm_add_target_dep_function_attr(ctx->main_function, "amdgpu-gds-size", 256);

   /* Determine the mapping of streamout buffers to vertex streams. */
   for (unsigned i = 0; i < so->num_outputs; ++i) {
      unsigned buf = so->outputs[i].buffer;
      unsigned stream = so->outputs[i].stream;
      assert(stream_for_buffer[buf] < 0 || stream_for_buffer[buf] == stream);
      stream_for_buffer[buf] = stream;
      bufmask_for_stream[stream] |= 1 << buf;
   }

   for (unsigned buffer = 0; buffer < 4; ++buffer) {
      if (stream_for_buffer[buffer] == -1)
         continue;

      assert(so->strides[buffer]);

      LLVMValueRef stride_for_buffer = LLVMConstInt(ctx->ac.i32, so->strides[buffer], false);
      prim_stride_dw[buffer] = LLVMBuildMul(builder, stride_for_buffer, nggso->num_vertices, "");
      prim_stride_dw_vgpr =
         ac_build_writelane(&ctx->ac, prim_stride_dw_vgpr, prim_stride_dw[buffer],
                            LLVMConstInt(ctx->ac.i32, buffer, false));

      LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, buffer, false);
      so_buffer[buffer] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
   }

   cond = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
   ac_build_ifcc(&ctx->ac, cond, 5200);
   {
      LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
      LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");

      /* Advance the streamout offsets in GDS. */
      LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
      LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");

      cond = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
      ac_build_ifcc(&ctx->ac, cond, 5210);
      {
         /* Fetch the number of generated primitives and store
          * it in GDS for later use.
          */
         if (isgs) {
            tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid);
            tmp = LLVMBuildLoad(builder, tmp, "");
         } else {
            tmp = ac_build_writelane(&ctx->ac, ctx->ac.i32_0, ngg_get_prim_cnt(ctx), ctx->ac.i32_0);
         }
         LLVMBuildStore(builder, tmp, generated_by_stream_vgpr);

         unsigned swizzle[4];
         int unused_stream = -1;
         for (unsigned stream = 0; stream < 4; ++stream) {
            if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) {
               unused_stream = stream;
               break;
            }
         }
         for (unsigned buffer = 0; buffer < 4; ++buffer) {
            if (stream_for_buffer[buffer] >= 0) {
               swizzle[buffer] = stream_for_buffer[buffer];
            } else {
               assert(unused_stream >= 0);
               swizzle[buffer] = unused_stream;
            }
         }

         tmp = ac_build_quad_swizzle(&ctx->ac, tmp, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
         tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");

         LLVMValueRef args[] = {
            LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""),
            tmp,
            ctx->ac.i32_0,                             // ordering
            ctx->ac.i32_0,                             // scope
            ctx->ac.i1false,                           // isVolatile
            LLVMConstInt(ctx->ac.i32, 4 << 24, false), // OA index
            ctx->ac.i1true,                            // wave release
            ctx->ac.i1true,                            // wave done
         };

         tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32, args,
                                  ARRAY_SIZE(args), 0);

         /* Keep offsets in a VGPR for quick retrieval via readlane by
          * the first wave for bounds checking, and also store in LDS
          * for retrieval by all waves later. */
         LLVMBuildStore(builder, tmp, offsets_vgpr);

         tmp2 = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_offset_basev, "");
         tmp2 = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp2);
         LLVMBuildStore(builder, tmp, tmp2);
      }
      ac_build_endif(&ctx->ac, 5210);

      /* Determine the max emit per buffer. This is done via the SALU, in part
       * because LLVM can't generate divide-by-multiply if we try to do this
       * via VALU with one lane per buffer.
       */
      LLVMValueRef max_emit[4] = {0};
      for (unsigned buffer = 0; buffer < 4; ++buffer) {
         if (stream_for_buffer[buffer] == -1)
            continue;

         /* Compute the streamout buffer size in DWORD. */
         LLVMValueRef bufsize_dw = LLVMBuildLShr(
            builder, LLVMBuildExtractElement(builder, so_buffer[buffer], i32_2, ""), i32_2, "");

         /* Load the streamout buffer offset from GDS. */
         tmp = LLVMBuildLoad(builder, offsets_vgpr, "");
         LLVMValueRef offset_dw =
            ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, buffer, false));

         /* Compute the remaining size to emit. */
         LLVMValueRef remaining_dw = LLVMBuildSub(builder, bufsize_dw, offset_dw, "");
         tmp = LLVMBuildUDiv(builder, remaining_dw, prim_stride_dw[buffer], "");

         cond = LLVMBuildICmp(builder, LLVMIntULT, bufsize_dw, offset_dw, "");
         max_emit[buffer] = LLVMBuildSelect(builder, cond, ctx->ac.i32_0, tmp, "");
      }

      /* Determine the number of emitted primitives per stream and fixup the
       * GDS counter if necessary.
       *
       * This is complicated by the fact that a single stream can emit to
       * multiple buffers (but luckily not vice versa).
       */
      LLVMValueRef emit_vgpr = ctx->ac.i32_0;

      for (unsigned stream = 0; stream < 4; ++stream) {
         if (!ctx->args->shader_info->gs.num_stream_output_components[stream])
            continue;

         /* Load the number of generated primitives from GDS and
          * determine that number for the given stream.
          */
         tmp = LLVMBuildLoad(builder, generated_by_stream_vgpr, "");
         LLVMValueRef generated =
            ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, stream, false));

         /* Compute the number of emitted primitives. */
         LLVMValueRef emit = generated;
         for (unsigned buffer = 0; buffer < 4; ++buffer) {
            if (stream_for_buffer[buffer] == stream)
               emit = ac_build_umin(&ctx->ac, emit, max_emit[buffer]);
         }

         /* Store the number of emitted primitives for that
          * stream.
          */
         emit_vgpr =
            ac_build_writelane(&ctx->ac, emit_vgpr, emit, LLVMConstInt(ctx->ac.i32, stream, false));

         /* Fixup the offset using a plain GDS atomic if we overflowed. */
         cond = LLVMBuildICmp(builder, LLVMIntULT, emit, generated, "");
         ac_build_ifcc(&ctx->ac, cond, 5221); /* scalar branch */
         tmp = LLVMBuildLShr(builder, LLVMConstInt(ctx->ac.i32, bufmask_for_stream[stream], false),
                             ac_get_thread_id(&ctx->ac), "");
         tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
         ac_build_ifcc(&ctx->ac, tmp, 5222);
         {
            tmp = LLVMBuildSub(builder, generated, emit, "");
            tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
            tmp2 = LLVMBuildGEP(builder, gdsbase, &tid, 1, "");
            LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, tmp2, tmp,
                               LLVMAtomicOrderingMonotonic, false);
         }
         ac_build_endif(&ctx->ac, 5222);
         ac_build_endif(&ctx->ac, 5221);
      }

      /* Store the number of emitted primitives to LDS for later use. */
      cond = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
      ac_build_ifcc(&ctx->ac, cond, 5225);
      {
         tmp = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_emit_basev, "");
         tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp);
         LLVMBuildStore(builder, emit_vgpr, tmp);
      }
      ac_build_endif(&ctx->ac, 5225);
   }
   ac_build_endif(&ctx->ac, 5200);

   /* Determine the workgroup-relative per-thread / primitive offset into
    * the streamout buffers */
   struct ac_wg_scan primemit_scan[4] = {0};

   if (isgs) {
      for (unsigned stream = 0; stream < 4; ++stream) {
         if (!ctx->args->shader_info->gs.num_stream_output_components[stream])
            continue;

         primemit_scan[stream].enable_exclusive = true;
         primemit_scan[stream].op = nir_op_iadd;
         primemit_scan[stream].src = nggso->prim_enable[stream];
         primemit_scan[stream].scratch = ac_build_gep0(
            &ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, 12 + 8 * stream, false));
         primemit_scan[stream].waveidx = get_wave_id_in_tg(ctx);
         primemit_scan[stream].numwaves = get_tgsize(ctx);
         primemit_scan[stream].maxwaves = 8;
         ac_build_wg_scan_top(&ctx->ac, &primemit_scan[stream]);
      }
   }

   ac_build_s_barrier(&ctx->ac);

   /* Fetch the per-buffer offsets and per-stream emit counts in all waves. */
   LLVMValueRef wgoffset_dw[4] = {0};

   {
      LLVMValueRef scratch_vgpr;

      tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ac_get_thread_id(&ctx->ac));
      scratch_vgpr = LLVMBuildLoad(builder, tmp, "");

      for (unsigned buffer = 0; buffer < 4; ++buffer) {
         if (stream_for_buffer[buffer] >= 0) {
            wgoffset_dw[buffer] =
               ac_build_readlane(&ctx->ac, scratch_vgpr,
                                 LLVMConstInt(ctx->ac.i32, scratch_offset_base + buffer, false));
         }
      }

      for (unsigned stream = 0; stream < 4; ++stream) {
         if (ctx->args->shader_info->gs.num_stream_output_components[stream]) {
            nggso->emit[stream] =
               ac_build_readlane(&ctx->ac, scratch_vgpr,
                                 LLVMConstInt(ctx->ac.i32, scratch_emit_base + stream, false));
         }
      }
   }

   /* Write out primitive data */
   for (unsigned stream = 0; stream < 4; ++stream) {
      if (!ctx->args->shader_info->gs.num_stream_output_components[stream])
         continue;

      if (isgs) {
         ac_build_wg_scan_bottom(&ctx->ac, &primemit_scan[stream]);
      } else {
         primemit_scan[stream].result_exclusive = tid;
      }

      cond = LLVMBuildICmp(builder, LLVMIntULT, primemit_scan[stream].result_exclusive,
                           nggso->emit[stream], "");
      cond = LLVMBuildAnd(builder, cond, nggso->prim_enable[stream], "");
      ac_build_ifcc(&ctx->ac, cond, 5240);
      {
         LLVMValueRef offset_vtx =
            LLVMBuildMul(builder, primemit_scan[stream].result_exclusive, nggso->num_vertices, "");

         for (unsigned i = 0; i < max_num_vertices; ++i) {
            cond = LLVMBuildICmp(builder, LLVMIntULT, LLVMConstInt(ctx->ac.i32, i, false),
                                 nggso->num_vertices, "");
            ac_build_ifcc(&ctx->ac, cond, 5241);
            build_streamout_vertex(ctx, so_buffer, wgoffset_dw, stream, offset_vtx,
                                   nggso->vertices[i]);
            ac_build_endif(&ctx->ac, 5241);
            offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->ac.i32_1, "");
         }
      }
      ac_build_endif(&ctx->ac, 5240);
   }
}

static unsigned
ngg_nogs_vertex_size(struct radv_shader_context *ctx)
{
   unsigned lds_vertex_size = 0;

   if (ctx->args->shader_info->so.num_outputs)
      lds_vertex_size = 4 * ctx->args->shader_info->so.num_outputs + 1;

   return lds_vertex_size;
}
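
/* For example, with two streamout outputs each vertex occupies
 * 4 * 2 + 1 = 9 dwords; the odd stride is what keeps the LDS accesses in
 * ngg_nogs_vertex_ptr below from repeatedly hitting the same bank.
 */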

/**
 * Returns an `[N x i32] addrspace(LDS)*` pointing at contiguous LDS storage
 * for the vertex outputs.
 */
static LLVMValueRef
ngg_nogs_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef vtxid)
{
   /* The extra dword is used to avoid LDS bank conflicts. */
   unsigned vertex_size = ngg_nogs_vertex_size(ctx);
   LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, vertex_size);
   LLVMTypeRef pai32 = LLVMPointerType(ai32, AC_ADDR_SPACE_LDS);
   LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, ctx->esgs_ring, pai32, "");
   return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, "");
}

static void
handle_ngg_outputs_post_1(struct radv_shader_context *ctx)
{
   struct radv_streamout_info *so = &ctx->args->shader_info->so;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef vertex_ptr = NULL;
   LLVMValueRef tmp, tmp2;

   assert((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
          !ctx->args->is_gs_copy_shader);

   if (!ctx->args->shader_info->so.num_outputs)
      return;

   vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));

   for (unsigned i = 0; i < so->num_outputs; ++i) {
      struct radv_stream_output *output = &ctx->args->shader_info->so.outputs[i];

      unsigned loc = output->location;

      for (unsigned comp = 0; comp < 4; comp++) {
         if (!(output->component_mask & (1 << comp)))
            continue;

         tmp = ac_build_gep0(&ctx->ac, vertex_ptr, LLVMConstInt(ctx->ac.i32, 4 * i + comp, false));
         tmp2 = LLVMBuildLoad(builder, ctx->abi.outputs[4 * loc + comp], "");
         tmp2 = ac_to_integer(&ctx->ac, tmp2);
         LLVMBuildStore(builder, tmp2, tmp);
      }
   }
}

static void
handle_ngg_outputs_post_2(struct radv_shader_context *ctx)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef tmp;

   assert((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
          !ctx->args->is_gs_copy_shader);

   LLVMValueRef prims_in_wave =
      ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 8, 8);
   LLVMValueRef vtx_in_wave =
      ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 0, 8);
   LLVMValueRef is_gs_thread =
      LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), prims_in_wave, "");
   LLVMValueRef is_es_thread =
      LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), vtx_in_wave, "");
   LLVMValueRef vtxindex[] = {
      ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[0]), 0, 16),
      ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[0]), 16, 16),
      ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[2]), 0, 16),
   };
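
   /* Note that the NGG input VGPRs pack two 16-bit vertex indices per
    * register: indices 0 and 1 share gs_vtx_offset[0] and index 2 sits in
    * the low half of gs_vtx_offset[2], hence only those two arguments are
    * unpacked above.
    */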

   /* Determine the number of vertices per primitive. */
   unsigned num_vertices;
   LLVMValueRef num_vertices_val;

   if (ctx->stage == MESA_SHADER_VERTEX) {
      LLVMValueRef outprim_val =
         LLVMConstInt(ctx->ac.i32, ctx->args->options->key.vs.outprim, false);
      num_vertices_val = LLVMBuildAdd(builder, outprim_val, ctx->ac.i32_1, "");
      num_vertices = 3; /* TODO: optimize for points & lines */
   } else {
      assert(ctx->stage == MESA_SHADER_TESS_EVAL);

      if (ctx->shader->info.tess.point_mode)
         num_vertices = 1;
      else if (ctx->shader->info.tess.primitive_mode == GL_ISOLINES)
         num_vertices = 2;
      else
         num_vertices = 3;

      num_vertices_val = LLVMConstInt(ctx->ac.i32, num_vertices, false);
   }

   /* Streamout */
   if (ctx->args->shader_info->so.num_outputs) {
      struct ngg_streamout nggso = {0};

      nggso.num_vertices = num_vertices_val;
      nggso.prim_enable[0] = is_gs_thread;

      for (unsigned i = 0; i < num_vertices; ++i)
         nggso.vertices[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);

      build_streamout(ctx, &nggso);
   }

   /* Copy Primitive IDs from GS threads to the LDS address corresponding
    * to the ES thread of the provoking vertex.
    */
   if (ctx->stage == MESA_SHADER_VERTEX && ctx->args->options->key.vs_common_out.export_prim_id) {
      if (ctx->args->shader_info->so.num_outputs)
         ac_build_s_barrier(&ctx->ac);

      ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);

      LLVMValueRef provoking_vtx_in_prim = LLVMConstInt(ctx->ac.i32, 0, false);

      /* For provoking vertex last mode, use num_vtx_in_prim - 1. */
      if (ctx->args->options->key.vs.provoking_vtx_last)
         provoking_vtx_in_prim = LLVMConstInt(ctx->ac.i32, ctx->args->options->key.vs.outprim, false);

      /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */
      LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3);
      LLVMValueRef provoking_vtx_index =
         LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, "");

      LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args->ac.gs_prim_id),
                     ac_build_gep0(&ctx->ac, ctx->esgs_ring, provoking_vtx_index));
      ac_build_endif(&ctx->ac, 5400);
   }

   /* TODO: primitive culling */

   ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), ngg_get_vtx_cnt(ctx),
                                 ngg_get_prim_cnt(ctx));

   /* TODO: streamout queries */
   /* Export primitive data to the index buffer.
    *
    * For the first version, we will always build up all three indices
    * independent of the primitive type. The additional garbage data
    * shouldn't hurt.
    *
    * TODO: culling depends on the primitive type, so can have some
    * interaction here.
    */
   ac_build_ifcc(&ctx->ac, is_gs_thread, 6001);
   {
      struct ac_ngg_prim prim = {0};

      if (ctx->args->options->key.vs_common_out.as_ngg_passthrough) {
         prim.passthrough = ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[0]);
      } else {
         prim.num_vertices = num_vertices;
         prim.isnull = ctx->ac.i1false;
         memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3);

         for (unsigned i = 0; i < num_vertices; ++i) {
            tmp = LLVMBuildLShr(builder, ac_get_arg(&ctx->ac, ctx->args->ac.gs_invocation_id),
                                LLVMConstInt(ctx->ac.i32, 8 + i, false), "");
            prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
         }
      }

      ac_build_export_prim(&ctx->ac, &prim);
   }
   ac_build_endif(&ctx->ac, 6001);

   /* Export per-vertex data (positions and parameters). */
   ac_build_ifcc(&ctx->ac, is_es_thread, 6002);
   {
      struct radv_vs_output_info *outinfo = ctx->stage == MESA_SHADER_TESS_EVAL
                                               ? &ctx->args->shader_info->tes.outinfo
                                               : &ctx->args->shader_info->vs.outinfo;

      /* Exporting the primitive ID is handled below. */
      /* TODO: use the new VS export path */
      handle_vs_outputs_post(ctx, false, ctx->args->options->key.vs_common_out.export_clip_dists,
                             outinfo);

      if (ctx->args->options->key.vs_common_out.export_prim_id) {
         unsigned param_count = outinfo->param_exports;
         LLVMValueRef values[4];

         if (ctx->stage == MESA_SHADER_VERTEX) {
            /* Wait for GS stores to finish. */
            ac_build_s_barrier(&ctx->ac);

            tmp = ac_build_gep0(&ctx->ac, ctx->esgs_ring, get_thread_id_in_tg(ctx));
            values[0] = LLVMBuildLoad(builder, tmp, "");
         } else {
            assert(ctx->stage == MESA_SHADER_TESS_EVAL);
            values[0] = ac_get_arg(&ctx->ac, ctx->args->ac.tes_patch_id);
         }

         values[0] = ac_to_float(&ctx->ac, values[0]);
         for (unsigned j = 1; j < 4; j++)
            values[j] = ctx->ac.f32_0;

         radv_export_param(ctx, param_count, values, 0x1);

         outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = param_count++;
         outinfo->param_exports = param_count;
      }
   }
   ac_build_endif(&ctx->ac, 6002);
}

static void
gfx10_ngg_gs_emit_prologue(struct radv_shader_context *ctx)
{
   /* Zero out the part of LDS scratch that is used to accumulate the
    * per-stream generated primitive count.
    */
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef scratchptr = ctx->gs_ngg_scratch;
   LLVMValueRef tid = get_thread_id_in_tg(ctx);
   LLVMBasicBlockRef merge_block;
   LLVMValueRef cond;

   LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->ac.builder));
   LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx->ac.context, fn, "");
   merge_block = LLVMAppendBasicBlockInContext(ctx->ac.context, fn, "");

   cond = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), "");
   LLVMBuildCondBr(ctx->ac.builder, cond, then_block, merge_block);
   LLVMPositionBuilderAtEnd(ctx->ac.builder, then_block);

   LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid);
   LLVMBuildStore(builder, ctx->ac.i32_0, ptr);

   LLVMBuildBr(ctx->ac.builder, merge_block);
   LLVMPositionBuilderAtEnd(ctx->ac.builder, merge_block);

   ac_build_s_barrier(&ctx->ac);
}

static void
gfx10_ngg_gs_emit_epilogue_1(struct radv_shader_context *ctx)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false);
   LLVMValueRef tmp;

   /* Zero out remaining (non-emitted) primitive flags.
    *
    * Note: Alternatively, we could pass the relevant gs_next_vertex to
    * the emit threads via LDS. This is likely worse in the expected
    * typical case where each GS thread emits the full set of
    * vertices.
    */
   for (unsigned stream = 0; stream < 4; ++stream) {
      unsigned num_components;

      num_components = ctx->args->shader_info->gs.num_stream_output_components[stream];
      if (!num_components)
         continue;

      const LLVMValueRef gsthread = get_thread_id_in_tg(ctx);

      ac_build_bgnloop(&ctx->ac, 5100);

      const LLVMValueRef vertexidx = LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
      tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx,
                          LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), "");
      ac_build_ifcc(&ctx->ac, tmp, 5101);
      ac_build_break(&ctx->ac);
      ac_build_endif(&ctx->ac, 5101);

      tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
      LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);

      tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx);
      LLVMBuildStore(builder, i8_0, ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream));

      ac_build_endloop(&ctx->ac, 5100);
   }

   /* Accumulate generated primitives counts across the entire threadgroup. */
   for (unsigned stream = 0; stream < 4; ++stream) {
      unsigned num_components;

      num_components = ctx->args->shader_info->gs.num_stream_output_components[stream];
      if (!num_components)
         continue;

      LLVMValueRef numprims = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
      numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size);

      tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, "");
      ac_build_ifcc(&ctx->ac, tmp, 5105);
      {
         LLVMBuildAtomicRMW(
            builder, LLVMAtomicRMWBinOpAdd,
            ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, stream, false)),
            numprims, LLVMAtomicOrderingMonotonic, false);
      }
      ac_build_endif(&ctx->ac, 5105);
   }
}
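
/* The NGG GS epilogue is split in two: the half above runs at the end of
 * the API GS invocation and only tidies per-thread state, while
 * gfx10_ngg_gs_emit_epilogue_2 below starts with a workgroup barrier and
 * does the streamout, vertex compaction and actual exports.
 */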

static void
gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx)
{
   const unsigned verts_per_prim =
      si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive);
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef tmp, tmp2;

   ac_build_s_barrier(&ctx->ac);

   const LLVMValueRef tid = get_thread_id_in_tg(ctx);
   LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx);

   /* Streamout */
   if (ctx->args->shader_info->so.num_outputs) {
      struct ngg_streamout nggso = {0};

      nggso.num_vertices = LLVMConstInt(ctx->ac.i32, verts_per_prim, false);

      LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid);
      for (unsigned stream = 0; stream < 4; ++stream) {
         if (!ctx->args->shader_info->gs.num_stream_output_components[stream])
            continue;

         tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), "");
         tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
         tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
         nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, "");
      }

      for (unsigned i = 0; i < verts_per_prim; ++i) {
         tmp = LLVMBuildSub(builder, tid, LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false),
                            "");
         tmp = ngg_gs_vertex_ptr(ctx, tmp);
         nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);
      }

      build_streamout(ctx, &nggso);
   }

   /* Write shader query data. */
   tmp = ac_get_arg(&ctx->ac, ctx->args->ngg_gs_state);
   tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
   ac_build_ifcc(&ctx->ac, tmp, 5109);
   tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), "");
   ac_build_ifcc(&ctx->ac, tmp, 5110);
   {
      tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");

      ac_llvm_add_target_dep_function_attr(ctx->main_function, "amdgpu-gds-size", 256);

      LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
      LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");

      const char *sync_scope = "workgroup-one-as";

      /* Use a plain GDS atomic to accumulate the number of generated
       * primitives.
       */
      ac_build_atomic_rmw(&ctx->ac, LLVMAtomicRMWBinOpAdd, gdsbase, tmp, sync_scope);
   }
   ac_build_endif(&ctx->ac, 5110);
   ac_build_endif(&ctx->ac, 5109);

   /* TODO: culling */

   /* Determine vertex liveness. */
   LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive");

   tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
   ac_build_ifcc(&ctx->ac, tmp, 5120);
   {
      for (unsigned i = 0; i < verts_per_prim; ++i) {
         const LLVMValueRef primidx =
            LLVMBuildAdd(builder, tid, LLVMConstInt(ctx->ac.i32, i, false), "");

         if (i > 0) {
            tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, "");
            ac_build_ifcc(&ctx->ac, tmp, 5121 + i);
         }

         /* Load primitive liveness */
         tmp = ngg_gs_vertex_ptr(ctx, primidx);
         tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
         const LLVMValueRef primlive = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");

         tmp = LLVMBuildLoad(builder, vertliveptr, "");
         tmp = LLVMBuildOr(builder, tmp, primlive, "");
         LLVMBuildStore(builder, tmp, vertliveptr);

         if (i > 0)
            ac_build_endif(&ctx->ac, 5121 + i);
      }
   }
   ac_build_endif(&ctx->ac, 5120);

   /* Inclusive scan addition across the current wave. */
   LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, "");
   struct ac_wg_scan vertlive_scan = {0};
   vertlive_scan.op = nir_op_iadd;
   vertlive_scan.enable_reduce = true;
   vertlive_scan.enable_exclusive = true;
   vertlive_scan.src = vertlive;
   vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->ac.i32_0);
   vertlive_scan.waveidx = get_wave_id_in_tg(ctx);
   vertlive_scan.numwaves = get_tgsize(ctx);
   vertlive_scan.maxwaves = 8;

   ac_build_wg_scan(&ctx->ac, &vertlive_scan);

   /* Skip all exports (including index exports) when possible. At least on
    * early gfx10 revisions this is also to avoid hangs.
    */
   LLVMValueRef have_exports =
      LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, "");
   num_emit_threads = LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, "");

   /* Allocate export space. Send this message as early as possible, to
    * hide the latency of the SQ <-> SPI roundtrip.
    *
    * Note: We could consider compacting primitives for export as well.
    * PA processes 1 non-null prim / clock, but it fetches 4 DW of
    * prim data per clock and skips null primitives at no additional
    * cost. So compacting primitives can only be beneficial when
    * there are 4 or more contiguous null primitives in the export
    * (in the common case of single-dword prim exports).
    */
   ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), vertlive_scan.result_reduce,
                                 num_emit_threads);

   /* Setup the reverse vertex compaction permutation. We re-use stream 1
    * of the primitive liveness flags, relying on the fact that each
    * threadgroup can have at most 256 threads. */
   ac_build_ifcc(&ctx->ac, vertlive, 5130);
   {
      tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive);
      tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, "");
      LLVMBuildStore(builder, tmp2, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1));
   }
   ac_build_endif(&ctx->ac, 5130);

   ac_build_s_barrier(&ctx->ac);

   /* Export primitive data */
   tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
   ac_build_ifcc(&ctx->ac, tmp, 5140);
   {
      LLVMValueRef flags;
      struct ac_ngg_prim prim = {0};
      prim.num_vertices = verts_per_prim;

      tmp = ngg_gs_vertex_ptr(ctx, tid);
      flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
      prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), "");

      for (unsigned i = 0; i < verts_per_prim; ++i) {
         prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive,
                                      LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
         prim.edgeflag[i] = ctx->ac.i1false;
      }

      /* Geometry shaders output triangle strips, but NGG expects triangles. */
      if (verts_per_prim == 3) {
         LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, "");
         is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, "");

         LLVMValueRef flatshade_first =
            LLVMConstInt(ctx->ac.i32, !ctx->args->options->key.vs.provoking_vtx_last, false);

         ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, prim.index);
      }

      ac_build_export_prim(&ctx->ac, &prim);
   }
   ac_build_endif(&ctx->ac, 5140);

   /* Export position and parameter data */
   tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, "");
   ac_build_ifcc(&ctx->ac, tmp, 5145);
   {
      struct radv_vs_output_info *outinfo = &ctx->args->shader_info->vs.outinfo;
      bool export_view_index = ctx->args->options->key.has_multiview_view_index;
      struct radv_shader_output_values *outputs;
      unsigned noutput = 0;

      /* Allocate a temporary array for the output values. */
      unsigned num_outputs = util_bitcount64(ctx->output_mask) + export_view_index;
      outputs = calloc(num_outputs, sizeof(outputs[0]));

      memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
             sizeof(outinfo->vs_output_param_offset));
      outinfo->pos_exports = 0;

      tmp = ngg_gs_vertex_ptr(ctx, tid);
      tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), "");
      tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
      const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp);

      unsigned out_idx = 0;
      for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
         unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];
         int length = util_last_bit(output_usage_mask);

         if (!(ctx->output_mask & (1ull << i)))
            continue;

         outputs[noutput].slot_name = i;
         outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;
         outputs[noutput].usage_mask = output_usage_mask;

         for (unsigned j = 0; j < length; j++, out_idx++) {
            if (!(output_usage_mask & (1 << j)))
               continue;

            tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx);
            tmp = LLVMBuildLoad(builder, tmp, "");

            LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
            if (ac_get_type_size(type) == 2) {
               tmp = ac_to_integer(&ctx->ac, tmp);
               tmp = LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i16, "");
            }

            outputs[noutput].values[j] = ac_to_float(&ctx->ac, tmp);
         }

         for (unsigned j = length; j < 4; j++)
            outputs[noutput].values[j] = LLVMGetUndef(ctx->ac.f32);

         noutput++;
      }

      /* Export ViewIndex. */
      if (export_view_index) {
         outputs[noutput].slot_name = VARYING_SLOT_LAYER;
         outputs[noutput].slot_index = 0;
         outputs[noutput].usage_mask = 0x1;
         outputs[noutput].values[0] =
            ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.view_index));
         for (unsigned j = 1; j < 4; j++)
            outputs[noutput].values[j] = ctx->ac.f32_0;
         noutput++;
      }

      radv_llvm_export_vs(ctx, outputs, noutput, outinfo,
                          ctx->args->options->key.vs_common_out.export_clip_dists);
      FREE(outputs);
   }
   ac_build_endif(&ctx->ac, 5145);
}

static void
gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, unsigned stream, LLVMValueRef vertexidx,
                         LLVMValueRef *addrs)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef tmp;

   const LLVMValueRef vertexptr = ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx);
   unsigned out_idx = 0;
   for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
      unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];
      uint8_t output_stream = ctx->args->shader_info->gs.output_streams[i];
      LLVMValueRef *out_ptr = &addrs[i * 4];
      int length = util_last_bit(output_usage_mask);

      if (!(ctx->output_mask & (1ull << i)) || output_stream != stream)
         continue;

      for (unsigned j = 0; j < length; j++, out_idx++) {
         if (!(output_usage_mask & (1 << j)))
            continue;

         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], "");
         out_val = ac_to_integer(&ctx->ac, out_val);
         out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");

         LLVMBuildStore(builder, out_val, ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx));
      }
   }
   assert(out_idx * 4 <= ctx->args->shader_info->gs.gsvs_vertex_size);

   /* Store the current number of emitted vertices to zero out remaining
    * primitive flags in case the geometry shader doesn't emit the maximum
    * number of vertices.
    */
   tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
   LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);

   /* Determine and store whether this vertex completed a primitive. */
   const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], "");

   tmp = LLVMConstInt(
      ctx->ac.i32, si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive) - 1, false);
   const LLVMValueRef iscompleteprim = LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, "");

   /* Since the geometry shader emits triangle strips, we need to
    * track which primitive is odd and swap vertex indices to get
    * the correct vertex order.
    */
   LLVMValueRef is_odd = ctx->ac.i1false;
   if (stream == 0 && si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive) == 3) {
      tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, "");
      is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, "");
   }

   tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, "");
   LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]);

   /* The per-vertex primitive flag encoding:
    * bit 0: whether this vertex finishes a primitive
    * bit 1: whether the primitive is odd (if we are emitting triangle strips)
    */
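   /* For example, the third vertex of a strip completes the first (even)
    * triangle and stores 0b01, while the fourth completes an odd one and
    * stores 0b11.
    */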
   tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");
   tmp = LLVMBuildOr(
      builder, tmp,
      LLVMBuildShl(builder, LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""), ctx->ac.i8_1, ""), "");
   LLVMBuildStore(builder, tmp, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream));

   tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
   tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
   LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
}

static bool
si_export_mrt_color(struct radv_shader_context *ctx, LLVMValueRef *color, unsigned index,
                    struct ac_export_args *args)
{
   /* Export */
   si_llvm_init_export_args(ctx, color, 0xf, V_008DFC_SQ_EXP_MRT + index, args);
   if (!args->enabled_channels)
      return false; /* unnecessary NULL export */

   return true;
}

static void
radv_export_mrt_z(struct radv_shader_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,
                  LLVMValueRef samplemask)
{
   struct ac_export_args args;

   ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args);

   ac_build_export(&ctx->ac, &args);
}

static void
handle_fs_outputs_post(struct radv_shader_context *ctx)
{
   unsigned index = 0;
   LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
   struct ac_export_args color_args[8];

   for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
      LLVMValueRef values[4];

      if (!(ctx->output_mask & (1ull << i)))
         continue;

      if (i < FRAG_RESULT_DATA0)
         continue;

      for (unsigned j = 0; j < 4; j++)
         values[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, i, j));

      bool ret = si_export_mrt_color(ctx, values, i - FRAG_RESULT_DATA0, &color_args[index]);
      if (ret)
         index++;
   }

   /* Process depth, stencil, samplemask. */
   if (ctx->args->shader_info->ps.writes_z) {
      depth = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_DEPTH, 0));
   }
   if (ctx->args->shader_info->ps.writes_stencil) {
      stencil = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_STENCIL, 0));
   }
   if (ctx->args->shader_info->ps.writes_sample_mask) {
      samplemask = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_SAMPLE_MASK, 0));
   }

   /* Set the DONE bit on last non-null color export only if Z isn't
    * exported.
    */
   if (index > 0 && !ctx->args->shader_info->ps.writes_z &&
       !ctx->args->shader_info->ps.writes_stencil &&
       !ctx->args->shader_info->ps.writes_sample_mask) {
      unsigned last = index - 1;

      color_args[last].valid_mask = 1; /* whether the EXEC mask is valid */
      color_args[last].done = 1;       /* DONE bit */
   }

   /* Export PS outputs. */
   for (unsigned i = 0; i < index; i++)
      ac_build_export(&ctx->ac, &color_args[i]);

   if (depth || stencil || samplemask)
      radv_export_mrt_z(ctx, depth, stencil, samplemask);
   else if (!index)
      ac_build_export_null(&ctx->ac);
}

static void
emit_gs_epilogue(struct radv_shader_context *ctx)
{
   if (ctx->args->options->key.vs_common_out.as_ngg) {
      gfx10_ngg_gs_emit_epilogue_1(ctx);
      return;
   }

   if (ctx->ac.chip_class >= GFX10)
      LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");

   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, ctx->gs_wave_id);
}

static void
handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);

   switch (ctx->stage) {
   case MESA_SHADER_VERTEX:
      if (ctx->args->options->key.vs_common_out.as_ls)
         break; /* Lowered in NIR */
      else if (ctx->args->options->key.vs_common_out.as_es)
         break; /* Lowered in NIR */
      else if (ctx->args->options->key.vs_common_out.as_ngg)
         handle_ngg_outputs_post_1(ctx);
      else
         handle_vs_outputs_post(ctx, ctx->args->options->key.vs_common_out.export_prim_id,
                                ctx->args->options->key.vs_common_out.export_clip_dists,
                                &ctx->args->shader_info->vs.outinfo);
      break;
   case MESA_SHADER_FRAGMENT:
      handle_fs_outputs_post(ctx);
      break;
   case MESA_SHADER_GEOMETRY:
      emit_gs_epilogue(ctx);
      break;
   case MESA_SHADER_TESS_CTRL:
      break; /* Lowered in NIR */
   case MESA_SHADER_TESS_EVAL:
      if (ctx->args->options->key.vs_common_out.as_es)
         break; /* Lowered in NIR */
      else if (ctx->args->options->key.vs_common_out.as_ngg)
         handle_ngg_outputs_post_1(ctx);
      else
         handle_vs_outputs_post(ctx, ctx->args->options->key.vs_common_out.export_prim_id,
                                ctx->args->options->key.vs_common_out.export_clip_dists,
                                &ctx->args->shader_info->tes.outinfo);
      break;
   default:
      break;
   }
}

static void
ac_llvm_finalize_module(struct radv_shader_context *ctx, LLVMPassManagerRef passmgr,
                        const struct radv_nir_compiler_options *options)
{
   LLVMRunPassManager(passmgr, ctx->ac.module);
   LLVMDisposeBuilder(ctx->ac.builder);

   ac_llvm_context_dispose(&ctx->ac);
}

static void
ac_nir_eliminate_const_vs_outputs(struct radv_shader_context *ctx)
{
   struct radv_vs_output_info *outinfo;

   switch (ctx->stage) {
   case MESA_SHADER_FRAGMENT:
   case MESA_SHADER_COMPUTE:
   case MESA_SHADER_TESS_CTRL:
   case MESA_SHADER_GEOMETRY:
      return;
   case MESA_SHADER_VERTEX:
      if (ctx->args->options->key.vs_common_out.as_ls ||
          ctx->args->options->key.vs_common_out.as_es)
         return;
      outinfo = &ctx->args->shader_info->vs.outinfo;
      break;
   case MESA_SHADER_TESS_EVAL:
      if (ctx->args->options->key.vs_common_out.as_es)
         return;
      outinfo = &ctx->args->shader_info->tes.outinfo;
      break;
   default:
      unreachable("Unhandled shader type");
   }

   ac_optimize_vs_outputs(&ctx->ac, ctx->main_function, outinfo->vs_output_param_offset,
                          VARYING_SLOT_MAX, 0, &outinfo->param_exports);
}

static void
ac_setup_rings(struct radv_shader_context *ctx)
{
   if (ctx->args->options->chip_class <= GFX8 &&
       (ctx->stage == MESA_SHADER_GEOMETRY || ctx->args->options->key.vs_common_out.as_es)) {
      unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? RING_ESGS_GS : RING_ESGS_VS;
      LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, false);

      ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, offset);
   }

   if (ctx->args->is_gs_copy_shader) {
      ctx->gsvs_ring[0] = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets,
                                                LLVMConstInt(ctx->ac.i32, RING_GSVS_VS, false));
   }

   if (ctx->stage == MESA_SHADER_GEOMETRY) {
      /* The conceptual layout of the GSVS ring is
       *   v0c0 .. vLc0 v0c1 .. vLc1 ..
       * but the real memory layout is swizzled across
       * threads:
       *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
       *   t16v0c0 ..
       * Override the buffer descriptor accordingly.
       */
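      /* Each used stream gets its own slice of the ring: the base address
       * below is advanced by the accumulated stream_offset (stride *
       * wave_size bytes per stream) and the descriptor's stride field is
       * patched to the stream's per-vertex stride.
       */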
      LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
      uint64_t stream_offset = 0;
      unsigned num_records = ctx->ac.wave_size;
      LLVMValueRef base_ring;

      base_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets,
                                        LLVMConstInt(ctx->ac.i32, RING_GSVS_GS, false));

      for (unsigned stream = 0; stream < 4; stream++) {
         unsigned num_components, stride;
         LLVMValueRef ring, tmp;

         num_components = ctx->args->shader_info->gs.num_stream_output_components[stream];

         if (!num_components)
            continue;

         stride = 4 * num_components * ctx->shader->info.gs.vertices_out;

         /* Limit on the stride field for <= GFX7. */
         assert(stride < (1 << 14));

         ring = LLVMBuildBitCast(ctx->ac.builder, base_ring, v2i64, "");
         tmp = LLVMBuildExtractElement(ctx->ac.builder, ring, ctx->ac.i32_0, "");
         tmp = LLVMBuildAdd(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i64, stream_offset, 0), "");
         ring = LLVMBuildInsertElement(ctx->ac.builder, ring, tmp, ctx->ac.i32_0, "");

         stream_offset += stride * ctx->ac.wave_size;

         ring = LLVMBuildBitCast(ctx->ac.builder, ring, ctx->ac.v4i32, "");

         tmp = LLVMBuildExtractElement(ctx->ac.builder, ring, ctx->ac.i32_1, "");
         tmp = LLVMBuildOr(ctx->ac.builder, tmp,
                           LLVMConstInt(ctx->ac.i32, S_008F04_STRIDE(stride), false), "");
         ring = LLVMBuildInsertElement(ctx->ac.builder, ring, tmp, ctx->ac.i32_1, "");

         ring = LLVMBuildInsertElement(ctx->ac.builder, ring,
                                       LLVMConstInt(ctx->ac.i32, num_records, false),
                                       LLVMConstInt(ctx->ac.i32, 2, false), "");

         ctx->gsvs_ring[stream] = ring;
      }
   }

   if (ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_TESS_EVAL) {
      ctx->hs_ring_tess_offchip = ac_build_load_to_sgpr(
         &ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_OFFCHIP, false));
      ctx->hs_ring_tess_factor = ac_build_load_to_sgpr(
         &ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_FACTOR, false));
   }
}

unsigned
radv_nir_get_max_workgroup_size(enum chip_class chip_class, gl_shader_stage stage,
                                const struct nir_shader *nir)
{
   const unsigned backup_sizes[] = {chip_class >= GFX9 ? 128 : 64, 1, 1};
   unsigned sizes[3];
   for (unsigned i = 0; i < 3; i++)
      sizes[i] = nir ? nir->info.workgroup_size[i] : backup_sizes[i];
   return radv_get_max_workgroup_size(chip_class, stage, sizes);
}

/* Fixup the HW not emitting the TCS regs if there are no HS threads. */
static void
ac_nir_fixup_ls_hs_input_vgprs(struct radv_shader_context *ctx)
{
   LLVMValueRef count =
      ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 8, 8);
   LLVMValueRef hs_empty = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, count, ctx->ac.i32_0, "");
   ctx->abi.instance_id =
      LLVMBuildSelect(ctx->ac.builder, hs_empty, ac_get_arg(&ctx->ac, ctx->args->ac.vertex_id),
                      ctx->abi.instance_id, "");
   ctx->vs_rel_patch_id =
      LLVMBuildSelect(ctx->ac.builder, hs_empty, ac_get_arg(&ctx->ac, ctx->args->ac.tcs_rel_ids),
                      ctx->vs_rel_patch_id, "");
   ctx->abi.vertex_id =
      LLVMBuildSelect(ctx->ac.builder, hs_empty, ac_get_arg(&ctx->ac, ctx->args->ac.tcs_patch_id),
                      ctx->abi.vertex_id, "");
}

static void
prepare_gs_input_vgprs(struct radv_shader_context *ctx, bool merged)
{
   if (merged) {
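      /* In merged ES+GS waves each input VGPR carries two packed 16-bit
       * vertex offsets: i & ~1 picks the register and (i & 1) * 16 the
       * half to unpack.
       */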
      for (int i = 5; i >= 0; --i) {
         ctx->gs_vtx_offset[i] = ac_unpack_param(
            &ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[i & ~1]), (i & 1) * 16, 16);
      }

      ctx->gs_wave_id =
         ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 16, 8);
   } else {
      for (int i = 0; i < 6; i++)
         ctx->gs_vtx_offset[i] = ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[i]);
      ctx->gs_wave_id = ac_get_arg(&ctx->ac, ctx->args->ac.gs_wave_id);
   }
}

/* Ensure that the esgs ring is declared.
 *
 * We declare it with 64KB alignment as a hint that the
 * pointer value will always be 0.
 */
static void
declare_esgs_ring(struct radv_shader_context *ctx)
{
   if (ctx->esgs_ring)
      return;

   assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring"));

   ctx->esgs_ring = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
                                                "esgs_ring", AC_ADDR_SPACE_LDS);
   LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage);
   LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
}
2974
static LLVMModuleRef
2975
ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, struct nir_shader *const *shaders,
2976
int shader_count, const struct radv_shader_args *args)
2977
{
2978
struct radv_shader_context ctx = {0};
2979
ctx.args = args;
2980
2981
enum ac_float_mode float_mode = AC_FLOAT_MODE_DEFAULT;
2982
2983
if (args->shader_info->float_controls_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) {
2984
float_mode = AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO;
2985
}
2986
2987
ac_llvm_context_init(&ctx.ac, ac_llvm, args->options->chip_class, args->options->family,
2988
args->options->info, float_mode, args->shader_info->wave_size,
2989
args->shader_info->ballot_bit_size);
2990
ctx.context = ctx.ac.context;
2991
2992
ctx.max_workgroup_size = 0;
2993
for (int i = 0; i < shader_count; ++i) {
2994
ctx.max_workgroup_size = MAX2(
2995
ctx.max_workgroup_size, radv_nir_get_max_workgroup_size(
2996
args->options->chip_class, shaders[i]->info.stage, shaders[i]));
2997
}
2998
2999
if (ctx.ac.chip_class >= GFX10) {
3000
if (is_pre_gs_stage(shaders[0]->info.stage) && args->options->key.vs_common_out.as_ngg) {
3001
ctx.max_workgroup_size = 128;
3002
}
3003
}
3004
3005
create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2);
3006
3007
ctx.abi.inputs = &ctx.inputs[0];
3008
ctx.abi.emit_outputs = handle_shader_outputs_post;
3009
ctx.abi.emit_vertex_with_counter = visit_emit_vertex_with_counter;
3010
ctx.abi.load_ubo = radv_load_ubo;
3011
ctx.abi.load_ssbo = radv_load_ssbo;
3012
ctx.abi.load_sampler_desc = radv_get_sampler_desc;
3013
ctx.abi.load_resource = radv_load_resource;
3014
ctx.abi.load_ring_tess_factors = load_ring_tess_factors;
3015
ctx.abi.load_ring_tess_offchip = load_ring_tess_offchip;
3016
ctx.abi.load_ring_esgs = load_ring_esgs;
3017
ctx.abi.clamp_shadow_reference = false;
3018
ctx.abi.adjust_frag_coord_z = args->options->adjust_frag_coord_z;
3019
ctx.abi.robust_buffer_access = args->options->robust_buffer_access;
3020
3021
bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && args->options->key.vs_common_out.as_ngg;
3022
if (shader_count >= 2 || is_ngg)
3023
ac_init_exec_full_mask(&ctx.ac);
3024
3025
if (args->ac.vertex_id.used)
3026
ctx.abi.vertex_id = ac_get_arg(&ctx.ac, args->ac.vertex_id);
3027
if (args->ac.vs_rel_patch_id.used)
3028
ctx.vs_rel_patch_id = ac_get_arg(&ctx.ac, args->ac.vs_rel_patch_id);
3029
if (args->ac.instance_id.used)
3030
ctx.abi.instance_id = ac_get_arg(&ctx.ac, args->ac.instance_id);
   if (args->options->has_ls_vgpr_init_bug &&
       shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL)
      ac_nir_fixup_ls_hs_input_vgprs(&ctx);

   if (is_ngg) {
      /* Declare scratch space base for streamout and vertex
       * compaction. Whether space is actually allocated is
       * determined during linking / PM4 creation.
       *
       * Add an extra dword per vertex to ensure an odd stride, which
       * avoids bank conflicts for SoA accesses.
       */
      if (!args->options->key.vs_common_out.as_ngg_passthrough)
         declare_esgs_ring(&ctx);

      /* This is really only needed when streamout and/or vertex
       * compaction is enabled.
       */
      if (args->shader_info->so.num_outputs) {
         LLVMTypeRef asi32 = LLVMArrayType(ctx.ac.i32, 8);
         ctx.gs_ngg_scratch =
            LLVMAddGlobalInAddressSpace(ctx.ac.module, asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
         LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(asi32));
         LLVMSetAlignment(ctx.gs_ngg_scratch, 4);
      }

      /* GFX10 hang workaround - an s_barrier is always required before gs_alloc_req. */
      if (ctx.ac.chip_class == GFX10 && shader_count == 1)
         ac_build_s_barrier(&ctx.ac);
   }
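
   /* Emit each NIR shader in pipeline order. For merged shaders both stages
    * land in the same LLVM function, each wrapped in its own thread-count
    * guard (see below).
    */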
   for (int shader_idx = 0; shader_idx < shader_count; ++shader_idx) {
      ctx.stage = shaders[shader_idx]->info.stage;
      ctx.shader = shaders[shader_idx];
      ctx.output_mask = 0;

      if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY) {
         for (int i = 0; i < 4; i++) {
            ctx.gs_next_vertex[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, "");
         }
         if (args->options->key.vs_common_out.as_ngg) {
            for (unsigned i = 0; i < 4; ++i) {
               ctx.gs_curprim_verts[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, "");
               ctx.gs_generated_prims[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, "");
            }
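
            /* Eight dwords of LDS scratch cover the NGG GS vertex/primitive
             * accounting; with streamout enabled the bookkeeping also lives
             * here, hence the larger size.
             */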
            unsigned scratch_size = 8;
            if (args->shader_info->so.num_outputs)
               scratch_size = 44;

            LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, scratch_size);
            ctx.gs_ngg_scratch =
               LLVMAddGlobalInAddressSpace(ctx.ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
            LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(ai32));
            LLVMSetAlignment(ctx.gs_ngg_scratch, 4);

            ctx.gs_ngg_emit = LLVMAddGlobalInAddressSpace(
               ctx.ac.module, LLVMArrayType(ctx.ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
            LLVMSetLinkage(ctx.gs_ngg_emit, LLVMExternalLinkage);
            LLVMSetAlignment(ctx.gs_ngg_emit, 4);
         }

         ctx.abi.emit_primitive = visit_end_primitive;
      } else if (shaders[shader_idx]->info.stage == MESA_SHADER_TESS_EVAL) {
         ctx.abi.load_tess_coord = load_tess_coord;
      } else if (shaders[shader_idx]->info.stage == MESA_SHADER_VERTEX) {
         ctx.abi.load_base_vertex = radv_load_base_vertex;
      } else if (shaders[shader_idx]->info.stage == MESA_SHADER_FRAGMENT) {
         ctx.abi.load_sample_position = load_sample_position;
         ctx.abi.load_sample_mask_in = load_sample_mask_in;
      }

      if (shaders[shader_idx]->info.stage == MESA_SHADER_VERTEX &&
          args->options->key.vs_common_out.as_ngg &&
          args->options->key.vs_common_out.export_prim_id) {
         declare_esgs_ring(&ctx);
      }

      bool nested_barrier = false;

      if (shader_idx) {
         if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY &&
             args->options->key.vs_common_out.as_ngg) {
            gfx10_ngg_gs_emit_prologue(&ctx);
            nested_barrier = false;
         } else {
            nested_barrier = true;
         }
      }

      if (nested_barrier) {
         /* Execute a barrier before the second shader in
          * a merged shader.
          *
          * Execute the barrier inside the conditional block,
          * so that empty waves can jump directly to s_endpgm,
          * which will also signal the barrier.
          *
          * This is possible in gfx9, because an empty wave
          * for the second shader does not participate in
          * the epilogue. With NGG, empty waves may still
          * be required to export data (e.g. GS output vertices),
          * so we cannot let them exit early.
          *
          * If the shader is TCS and the TCS epilog is present
          * and contains a barrier, it will wait there and then
          * reach s_endpgm.
          */
         ac_emit_barrier(&ctx.ac, ctx.stage);
      }

      nir_foreach_shader_out_variable(variable, shaders[shader_idx])
         scan_shader_output_decl(&ctx, variable, shaders[shader_idx],
                                 shaders[shader_idx]->info.stage);

      ac_setup_rings(&ctx);
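
      /* Only the first "count" lanes of the wave carry threads for this
       * stage; count is the 8-bit field of merged_wave_info that belongs to
       * this shader part. All other lanes skip the shader body and fall
       * through to merge_block.
       */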
      LLVMBasicBlockRef merge_block = NULL;
      if (shader_count >= 2 || is_ngg) {
         LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
         LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
         merge_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");

         LLVMValueRef count = ac_unpack_param(
            &ctx.ac, ac_get_arg(&ctx.ac, args->ac.merged_wave_info), 8 * shader_idx, 8);
         LLVMValueRef thread_id = ac_get_thread_id(&ctx.ac);
         LLVMValueRef cond = LLVMBuildICmp(ctx.ac.builder, LLVMIntULT, thread_id, count, "");
         LLVMBuildCondBr(ctx.ac.builder, cond, then_block, merge_block);

         LLVMPositionBuilderAtEnd(ctx.ac.builder, then_block);
      }

      if (shaders[shader_idx]->info.stage == MESA_SHADER_FRAGMENT)
         prepare_interp_optimize(&ctx, shaders[shader_idx]);
      else if (shaders[shader_idx]->info.stage == MESA_SHADER_VERTEX)
         handle_vs_inputs(&ctx, shaders[shader_idx]);
      else if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY)
         prepare_gs_input_vgprs(&ctx, shader_count >= 2);

      ac_nir_translate(&ctx.ac, &ctx.abi, &args->ac, shaders[shader_idx]);

      if (shader_count >= 2 || is_ngg) {
         LLVMBuildBr(ctx.ac.builder, merge_block);
         LLVMPositionBuilderAtEnd(ctx.ac.builder, merge_block);
      }

      /* This needs to be outside the if wrapping the shader body, as sometimes
       * the HW generates waves with 0 es/vs threads. */
      if (is_pre_gs_stage(shaders[shader_idx]->info.stage) &&
          args->options->key.vs_common_out.as_ngg && shader_idx == shader_count - 1) {
         handle_ngg_outputs_post_2(&ctx);
      } else if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY &&
                 args->options->key.vs_common_out.as_ngg) {
         gfx10_ngg_gs_emit_epilogue_2(&ctx);
      }
   }

   LLVMBuildRetVoid(ctx.ac.builder);

   if (args->options->dump_preoptir) {
      fprintf(stderr, "%s LLVM IR:\n\n",
              radv_get_shader_name(args->shader_info, shaders[shader_count - 1]->info.stage));
      ac_dump_module(ctx.ac.module);
      fprintf(stderr, "\n");
   }

   ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, args->options);

   if (shader_count == 1)
      ac_nir_eliminate_const_vs_outputs(&ctx);

   if (args->options->dump_shader) {
      args->shader_info->private_mem_vgprs = ac_count_scratch_private_memory(ctx.main_function);
   }

   return ctx.ac.module;
}
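
/* Errors reported through LLVM's diagnostic handler mark the compile as
 * failed: "context" points at the retval owned by radv_llvm_compile, which
 * is set to 1 on any LLVMDSError diagnostic.
 */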
static void
ac_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context)
{
   unsigned *retval = (unsigned *)context;
   LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di);
   char *description = LLVMGetDiagInfoDescription(di);

   if (severity == LLVMDSError) {
      *retval = 1;
      fprintf(stderr, "LLVM triggered Diagnostic Handler: %s\n", description);
   }

   LLVMDisposeMessage(description);
}

static unsigned
radv_llvm_compile(LLVMModuleRef M, char **pelf_buffer, size_t *pelf_size,
                  struct ac_llvm_compiler *ac_llvm)
{
   unsigned retval = 0;
   LLVMContextRef llvm_ctx;

   /* Set up the diagnostic handler. */
   llvm_ctx = LLVMGetModuleContext(M);

   LLVMContextSetDiagnosticHandler(llvm_ctx, ac_diagnostic_handler, &retval);

   /* Compile the IR. */
   if (!radv_compile_to_elf(ac_llvm, M, pelf_buffer, pelf_size))
      retval = 1;
   return retval;
}

static void
ac_compile_llvm_module(struct ac_llvm_compiler *ac_llvm, LLVMModuleRef llvm_module,
                       struct radv_shader_binary **rbinary, gl_shader_stage stage, const char *name,
                       const struct radv_nir_compiler_options *options)
{
   char *elf_buffer = NULL;
   size_t elf_size = 0;
   char *llvm_ir_string = NULL;

   if (options->dump_shader) {
      fprintf(stderr, "%s LLVM IR:\n\n", name);
      ac_dump_module(llvm_module);
      fprintf(stderr, "\n");
   }

   if (options->record_ir) {
      char *llvm_ir = LLVMPrintModuleToString(llvm_module);
      llvm_ir_string = strdup(llvm_ir);
      LLVMDisposeMessage(llvm_ir);
   }

   int v = radv_llvm_compile(llvm_module, &elf_buffer, &elf_size, ac_llvm);
   if (v) {
      fprintf(stderr, "compile failed\n");
   }

   LLVMContextRef ctx = LLVMGetModuleContext(llvm_module);
   LLVMDisposeModule(llvm_module);
   LLVMContextDispose(ctx);
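
   /* Pack the result into a radv_shader_binary_rtld: the ELF image comes
    * first in rbin->data, immediately followed by the NUL-terminated LLVM IR
    * string when IR recording was requested.
    */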
   size_t llvm_ir_size = llvm_ir_string ? strlen(llvm_ir_string) : 0;
   size_t alloc_size = sizeof(struct radv_shader_binary_rtld) + elf_size + llvm_ir_size + 1;
   struct radv_shader_binary_rtld *rbin = calloc(1, alloc_size);
   memcpy(rbin->data, elf_buffer, elf_size);
   if (llvm_ir_string)
      memcpy(rbin->data + elf_size, llvm_ir_string, llvm_ir_size + 1);

   rbin->base.type = RADV_BINARY_TYPE_RTLD;
   rbin->base.stage = stage;
   rbin->base.total_size = alloc_size;
   rbin->elf_size = elf_size;
   rbin->llvm_ir_size = llvm_ir_size;
   *rbinary = &rbin->base;

   free(llvm_ir_string);
   free(elf_buffer);
}

static void
radv_compile_nir_shader(struct ac_llvm_compiler *ac_llvm, struct radv_shader_binary **rbinary,
                        const struct radv_shader_args *args, struct nir_shader *const *nir,
                        int nir_count)
{
   LLVMModuleRef llvm_module;

   llvm_module = ac_translate_nir_to_llvm(ac_llvm, nir, nir_count, args);

   ac_compile_llvm_module(ac_llvm, llvm_module, rbinary, nir[nir_count - 1]->info.stage,
                          radv_get_shader_name(args->shader_info, nir[nir_count - 1]->info.stage),
                          args->options);

   /* Determine the ES type (VS or TES) for the GS on GFX9. */
   if (args->options->chip_class >= GFX9) {
      if (nir_count == 2 && nir[1]->info.stage == MESA_SHADER_GEOMETRY) {
         args->shader_info->gs.es_type = nir[0]->info.stage;
      }
   }
}
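
/* The GS copy shader runs as a hardware VS: it fetches the GS outputs of the
 * selected stream back from the GSVS ring and exports them (plus streamout,
 * when enabled) just like a regular vertex shader would.
 */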
static void
ac_gs_copy_shader_emit(struct radv_shader_context *ctx)
{
   LLVMValueRef vtx_offset =
      LLVMBuildMul(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->ac.vertex_id),
                   LLVMConstInt(ctx->ac.i32, 4, false), "");
   LLVMValueRef stream_id;

   /* Fetch the vertex stream ID. With hardware (non-NGG) streamout enabled,
    * bits [25:24] of the streamout config select which stream this copy
    * shader invocation processes; otherwise only stream 0 exists.
    */
   if (!ctx->args->options->use_ngg_streamout && ctx->args->shader_info->so.num_outputs) {
      stream_id =
         ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.streamout_config), 24, 2);
   } else {
      stream_id = ctx->ac.i32_0;
   }

   LLVMBasicBlockRef end_bb;
   LLVMValueRef switch_inst;

   end_bb = LLVMAppendBasicBlockInContext(ctx->ac.context, ctx->main_function, "end");
   switch_inst = LLVMBuildSwitch(ctx->ac.builder, stream_id, end_bb, 4);

   for (unsigned stream = 0; stream < 4; stream++) {
      unsigned num_components = ctx->args->shader_info->gs.num_stream_output_components[stream];
      LLVMBasicBlockRef bb;
      unsigned offset;

      if (stream > 0 && !num_components)
         continue;

      if (stream > 0 && !ctx->args->shader_info->so.num_outputs)
         continue;

      bb = LLVMInsertBasicBlockInContext(ctx->ac.context, end_bb, "out");
      LLVMAddCase(switch_inst, LLVMConstInt(ctx->ac.i32, stream, 0), bb);
      LLVMPositionBuilderAtEnd(ctx->ac.builder, bb);

      offset = 0;
      for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
         unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];
         unsigned output_stream = ctx->args->shader_info->gs.output_streams[i];
         int length = util_last_bit(output_usage_mask);

         if (!(ctx->output_mask & (1ull << i)) || output_stream != stream)
            continue;

         for (unsigned j = 0; j < length; j++) {
            LLVMValueRef value, soffset;

            if (!(output_usage_mask & (1 << j)))
               continue;
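
            /* The GSVS ring is laid out component-major: each used output
             * component occupies a contiguous block of
             * vertices_out * 16 * 4 bytes, so the n-th copied component
             * starts at byte offset n * vertices_out * 16 * 4.
             */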
            soffset = LLVMConstInt(ctx->ac.i32, offset * ctx->shader->info.gs.vertices_out * 16 * 4,
                                   false);

            offset++;

            value = ac_build_buffer_load(&ctx->ac, ctx->gsvs_ring[0], 1, ctx->ac.i32_0, vtx_offset,
                                         soffset, 0, ctx->ac.f32, ac_glc | ac_slc, true, false);

            LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
            if (ac_get_type_size(type) == 2) {
               value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->ac.i32, "");
               value = LLVMBuildTrunc(ctx->ac.builder, value, ctx->ac.i16, "");
            }

            LLVMBuildStore(ctx->ac.builder, ac_to_float(&ctx->ac, value),
                           ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
         }
      }

      if (!ctx->args->options->use_ngg_streamout && ctx->args->shader_info->so.num_outputs)
         radv_emit_streamout(ctx, stream);

      /* Only stream 0 can be rasterized, so only its outputs are exported. */
      if (stream == 0) {
         handle_vs_outputs_post(ctx, false, true, &ctx->args->shader_info->vs.outinfo);
      }

      LLVMBuildBr(ctx->ac.builder, end_bb);
   }

   LLVMPositionBuilderAtEnd(ctx->ac.builder, end_bb);
}

static void
radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm, struct nir_shader *geom_shader,
                            struct radv_shader_binary **rbinary,
                            const struct radv_shader_args *args)
{
   struct radv_shader_context ctx = {0};
   ctx.args = args;

   assert(args->is_gs_copy_shader);

   ac_llvm_context_init(&ctx.ac, ac_llvm, args->options->chip_class, args->options->family,
                        args->options->info, AC_FLOAT_MODE_DEFAULT, 64, 64);
   ctx.context = ctx.ac.context;

   ctx.stage = MESA_SHADER_VERTEX;
   ctx.shader = geom_shader;

   create_function(&ctx, MESA_SHADER_VERTEX, false);

   ac_setup_rings(&ctx);

   nir_foreach_shader_out_variable(variable, geom_shader)
   {
      scan_shader_output_decl(&ctx, variable, geom_shader, MESA_SHADER_VERTEX);
      ac_handle_shader_output_decl(&ctx.ac, &ctx.abi, geom_shader, variable, MESA_SHADER_VERTEX);
   }

   ac_gs_copy_shader_emit(&ctx);

   LLVMBuildRetVoid(ctx.ac.builder);

   ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, args->options);

   ac_compile_llvm_module(ac_llvm, ctx.ac.module, rbinary, MESA_SHADER_VERTEX, "GS Copy Shader",
                          args->options);
   (*rbinary)->is_gs_copy_shader = true;
}

void
llvm_compile_shader(struct radv_device *device, unsigned shader_count,
                    struct nir_shader *const *shaders, struct radv_shader_binary **binary,
                    struct radv_shader_args *args)
{
   enum ac_target_machine_options tm_options = 0;
   struct ac_llvm_compiler ac_llvm;

   tm_options |= AC_TM_SUPPORTS_SPILL;
   if (args->options->check_ir)
      tm_options |= AC_TM_CHECK_IR;

   radv_init_llvm_compiler(&ac_llvm, args->options->family, tm_options,
                           args->shader_info->wave_size);

   if (args->is_gs_copy_shader) {
      radv_compile_gs_copy_shader(&ac_llvm, *shaders, binary, args);
   } else {
      radv_compile_nir_shader(&ac_llvm, binary, args, shaders, shader_count);
   }
}