/* Source: GitHub repository PojavLauncherTeam/mesa,
 * path blob/21.2-virgl/src/amd/llvm/ac_llvm_build.c
 */
1
/*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 */
25
/* based on pieces from si_pipe.c and radeon_llvm_emit.c */
26
#include "ac_llvm_build.h"
27
28
#include "ac_exp_param.h"
29
#include "ac_llvm_util.h"
30
#include "ac_shader_util.h"
31
#include "c11/threads.h"
32
#include "shader_enums.h"
33
#include "sid.h"
34
#include "util/bitscan.h"
35
#include "util/macros.h"
36
#include "util/u_atomic.h"
37
#include "util/u_math.h"
38
#include <llvm-c/Core.h>
39
#include <llvm/Config/llvm-config.h>
40
41
#include <assert.h>
42
#include <stdio.h>
43
44
#define AC_LLVM_INITIAL_CF_DEPTH 4
45
46
/* Data for if/else/endif and bgnloop/endloop control flow structures.
47
*/
48
struct ac_llvm_flow {
49
/* Loop exit or next part of if/else/endif. */
50
LLVMBasicBlockRef next_block;
51
LLVMBasicBlockRef loop_entry_block;
52
};
53
54
/* Initialize module-independent parts of the context.
55
*
56
* The caller is responsible for initializing ctx::module and ctx::builder.
57
*/
58
void ac_llvm_context_init(struct ac_llvm_context *ctx, struct ac_llvm_compiler *compiler,
59
enum chip_class chip_class, enum radeon_family family,
60
const struct radeon_info *info,
61
enum ac_float_mode float_mode, unsigned wave_size,
62
unsigned ballot_mask_bits)
63
{
64
ctx->context = LLVMContextCreate();
65
66
ctx->chip_class = chip_class;
67
ctx->family = family;
68
ctx->info = info;
69
ctx->wave_size = wave_size;
70
ctx->ballot_mask_bits = ballot_mask_bits;
71
ctx->float_mode = float_mode;
72
ctx->module = ac_create_module(compiler->tm, ctx->context);
73
ctx->builder = ac_create_builder(ctx->context, float_mode);
74
75
ctx->voidt = LLVMVoidTypeInContext(ctx->context);
76
ctx->i1 = LLVMInt1TypeInContext(ctx->context);
77
ctx->i8 = LLVMInt8TypeInContext(ctx->context);
78
ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
79
ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
80
ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
81
ctx->i128 = LLVMIntTypeInContext(ctx->context, 128);
82
ctx->intptr = ctx->i32;
83
ctx->f16 = LLVMHalfTypeInContext(ctx->context);
84
ctx->f32 = LLVMFloatTypeInContext(ctx->context);
85
ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
86
ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
87
ctx->v4i16 = LLVMVectorType(ctx->i16, 4);
88
ctx->v2f16 = LLVMVectorType(ctx->f16, 2);
89
ctx->v4f16 = LLVMVectorType(ctx->f16, 4);
90
ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
91
ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
92
ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
93
ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
94
ctx->v3f32 = LLVMVectorType(ctx->f32, 3);
95
ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
96
ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
97
ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size);
98
ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits);
99
100
ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
101
ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
102
ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
103
ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
104
ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
105
ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
106
ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
107
ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
108
ctx->i128_0 = LLVMConstInt(ctx->i128, 0, false);
109
ctx->i128_1 = LLVMConstInt(ctx->i128, 1, false);
110
ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
111
ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
112
ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
113
ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
114
ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
115
ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);
116
117
ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
118
ctx->i1true = LLVMConstInt(ctx->i1, 1, false);
119
120
ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, "range", 5);
121
122
ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, "invariant.load", 14);
123
124
ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14);
125
126
ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
127
ctx->flow = calloc(1, sizeof(*ctx->flow));
128
}
129
130
void ac_llvm_context_dispose(struct ac_llvm_context *ctx)
131
{
132
free(ctx->flow->stack);
133
free(ctx->flow);
134
ctx->flow = NULL;
135
}
136
137
/* Return the element count of a vector value, or 1 for any non-vector value. */
int ac_get_llvm_num_components(LLVMValueRef value)
{
   LLVMTypeRef value_type = LLVMTypeOf(value);

   if (LLVMGetTypeKind(value_type) != LLVMVectorTypeKind)
      return 1;

   return LLVMGetVectorSize(value_type);
}
144
145
/* Extract element \p index from \p value.
 *
 * A non-vector value is returned unchanged (only index 0 is valid then);
 * for a vector an extractelement instruction with an i32 index is emitted.
 */
LLVMValueRef ac_llvm_extract_elem(struct ac_llvm_context *ac, LLVMValueRef value, int index)
{
   const bool is_vector = LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind;

   if (is_vector) {
      LLVMValueRef idx = LLVMConstInt(ac->i32, index, false);
      return LLVMBuildExtractElement(ac->builder, value, idx, "");
   }

   assert(index == 0);
   return value;
}
154
155
/* Return the bit width of \p type, or of its element type for vectors.
 *
 * Handles integers (their declared width), pointers in the LDS address
 * space (32) and the cached f16/f32/f64 types. Anything else hits
 * unreachable().
 */
int ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
{
   LLVMTypeRef elem = type;

   if (LLVMGetTypeKind(elem) == LLVMVectorTypeKind)
      elem = LLVMGetElementType(elem);

   switch (LLVMGetTypeKind(elem)) {
   case LLVMIntegerTypeKind:
      return LLVMGetIntTypeWidth(elem);
   case LLVMPointerTypeKind:
      if (LLVMGetPointerAddressSpace(elem) == AC_ADDR_SPACE_LDS)
         return 32;
      break;
   default:
      break;
   }

   if (elem == ctx->f16)
      return 16;
   if (elem == ctx->f32)
      return 32;
   if (elem == ctx->f64)
      return 64;

   unreachable("Unhandled type kind in get_elem_bits");
}
177
178
/* Return the size of \p type in bytes.
 *
 * Integers are width / 8, half/float/double are 2/4/8, pointers are 4 in
 * the AC_ADDR_SPACE_CONST_32BIT address space and 8 otherwise, and vectors
 * and arrays are element count times element size. Other kinds assert and
 * return 0.
 */
unsigned ac_get_type_size(LLVMTypeRef type)
{
   const LLVMTypeKind kind = LLVMGetTypeKind(type);

   if (kind == LLVMIntegerTypeKind)
      return LLVMGetIntTypeWidth(type) / 8;
   if (kind == LLVMHalfTypeKind)
      return 2;
   if (kind == LLVMFloatTypeKind)
      return 4;
   if (kind == LLVMDoubleTypeKind)
      return 8;
   if (kind == LLVMPointerTypeKind)
      return LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT ? 4 : 8;
   if (kind == LLVMVectorTypeKind)
      return LLVMGetVectorSize(type) * ac_get_type_size(LLVMGetElementType(type));
   if (kind == LLVMArrayTypeKind)
      return LLVMGetArrayLength(type) * ac_get_type_size(LLVMGetElementType(type));

   assert(0);
   return 0;
}
204
205
/* Map a scalar type to the integer type of the same width.
 *
 * i1 and i8 map to themselves; 16/32/64-bit integer or float types map to
 * i16/i32/i64. Any other type hits unreachable().
 */
static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (t == ctx->i1 || t == ctx->i8)
      return t;
   if (t == ctx->i16 || t == ctx->f16)
      return ctx->i16;
   if (t == ctx->i32 || t == ctx->f32)
      return ctx->i32;
   if (t == ctx->i64 || t == ctx->f64)
      return ctx->i64;

   unreachable("Unhandled integer size");
}
220
221
/* Return the integer type that corresponds to \p t.
 *
 * Vectors keep their length and get an integer element type. Pointers map
 * to i64 (global address space) or i32 (32-bit constant and LDS address
 * spaces); other address spaces hit unreachable(). Scalars go through
 * to_integer_type_scalar().
 */
LLVMTypeRef ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   const LLVMTypeKind kind = LLVMGetTypeKind(t);

   if (kind == LLVMVectorTypeKind) {
      LLVMTypeRef int_elem = to_integer_type_scalar(ctx, LLVMGetElementType(t));
      return LLVMVectorType(int_elem, LLVMGetVectorSize(t));
   }

   if (kind == LLVMPointerTypeKind) {
      const unsigned addr_space = LLVMGetPointerAddressSpace(t);

      if (addr_space == AC_ADDR_SPACE_GLOBAL)
         return ctx->i64;
      if (addr_space == AC_ADDR_SPACE_CONST_32BIT || addr_space == AC_ADDR_SPACE_LDS)
         return ctx->i32;

      unreachable("unhandled address space");
   }

   return to_integer_type_scalar(ctx, t);
}
240
241
/* Convert \p v to its integer counterpart: ptrtoint for pointers, bitcast
 * for everything else. The target type comes from ac_to_integer_type().
 */
LLVMValueRef ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef src_type = LLVMTypeOf(v);
   LLVMTypeRef int_type = ac_to_integer_type(ctx, src_type);

   if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind)
      return LLVMBuildPtrToInt(ctx->builder, v, int_type, "");

   return LLVMBuildBitCast(ctx->builder, v, int_type, "");
}
249
250
/* Like ac_to_integer(), except that pointer values are returned unchanged. */
LLVMValueRef ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   const bool is_pointer = LLVMGetTypeKind(LLVMTypeOf(v)) == LLVMPointerTypeKind;

   return is_pointer ? v : ac_to_integer(ctx, v);
}
257
258
/* Map a scalar type to the floating-point type of the same width.
 *
 * i8 is passed through unchanged; 16/32/64-bit integer or float types map
 * to f16/f32/f64. Any other type hits unreachable().
 */
static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (t == ctx->i8)
      return t;
   if (t == ctx->f16 || t == ctx->i16)
      return ctx->f16;
   if (t == ctx->f32 || t == ctx->i32)
      return ctx->f32;
   if (t == ctx->f64 || t == ctx->i64)
      return ctx->f64;

   unreachable("Unhandled float size");
}
271
272
/* Return the floating-point type that corresponds to \p t; vectors keep
 * their length and get a converted element type.
 */
LLVMTypeRef ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (LLVMGetTypeKind(t) != LLVMVectorTypeKind)
      return to_float_type_scalar(ctx, t);

   LLVMTypeRef float_elem = to_float_type_scalar(ctx, LLVMGetElementType(t));
   return LLVMVectorType(float_elem, LLVMGetVectorSize(t));
}
280
281
/* Bitcast \p v to the floating-point type given by ac_to_float_type(). */
LLVMValueRef ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef float_type = ac_to_float_type(ctx, LLVMTypeOf(v));

   return LLVMBuildBitCast(ctx->builder, v, float_type, "");
}
286
287
LLVMValueRef ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
288
LLVMTypeRef return_type, LLVMValueRef *params, unsigned param_count,
289
unsigned attrib_mask)
290
{
291
LLVMValueRef function, call;
292
bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY);
293
294
function = LLVMGetNamedFunction(ctx->module, name);
295
if (!function) {
296
LLVMTypeRef param_types[32], function_type;
297
unsigned i;
298
299
assert(param_count <= 32);
300
301
for (i = 0; i < param_count; ++i) {
302
assert(params[i]);
303
param_types[i] = LLVMTypeOf(params[i]);
304
}
305
function_type = LLVMFunctionType(return_type, param_types, param_count, 0);
306
function = LLVMAddFunction(ctx->module, name, function_type);
307
308
LLVMSetFunctionCallConv(function, LLVMCCallConv);
309
LLVMSetLinkage(function, LLVMExternalLinkage);
310
311
if (!set_callsite_attrs)
312
ac_add_func_attributes(ctx->context, function, attrib_mask);
313
}
314
315
call = LLVMBuildCall(ctx->builder, function, params, param_count, "");
316
if (set_callsite_attrs)
317
ac_add_func_attributes(ctx->context, call, attrib_mask);
318
return call;
319
}
320
321
/**
322
* Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
323
* intrinsic names).
324
*/
325
void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
326
{
327
LLVMTypeRef elem_type = type;
328
329
if (LLVMGetTypeKind(type) == LLVMStructTypeKind) {
330
unsigned count = LLVMCountStructElementTypes(type);
331
int ret = snprintf(buf, bufsize, "sl_");
332
buf += ret;
333
bufsize -= ret;
334
335
LLVMTypeRef *elems = alloca(count * sizeof(LLVMTypeRef));
336
LLVMGetStructElementTypes(type, elems);
337
338
for (unsigned i = 0; i < count; i++) {
339
ac_build_type_name_for_intr(elems[i], buf, bufsize);
340
ret = strlen(buf);
341
buf += ret;
342
bufsize -= ret;
343
}
344
345
snprintf(buf, bufsize, "s");
346
return;
347
}
348
349
assert(bufsize >= 8);
350
if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
351
int ret = snprintf(buf, bufsize, "v%u", LLVMGetVectorSize(type));
352
if (ret < 0) {
353
char *type_name = LLVMPrintTypeToString(type);
354
fprintf(stderr, "Error building type name for: %s\n", type_name);
355
LLVMDisposeMessage(type_name);
356
return;
357
}
358
elem_type = LLVMGetElementType(type);
359
buf += ret;
360
bufsize -= ret;
361
}
362
switch (LLVMGetTypeKind(elem_type)) {
363
default:
364
break;
365
case LLVMIntegerTypeKind:
366
snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
367
break;
368
case LLVMHalfTypeKind:
369
snprintf(buf, bufsize, "f16");
370
break;
371
case LLVMFloatTypeKind:
372
snprintf(buf, bufsize, "f32");
373
break;
374
case LLVMDoubleTypeKind:
375
snprintf(buf, bufsize, "f64");
376
break;
377
}
378
}
379
380
/**
381
* Helper function that builds an LLVM IR PHI node and immediately adds
382
* incoming edges.
383
*/
384
LLVMValueRef ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, unsigned count_incoming,
385
LLVMValueRef *values, LLVMBasicBlockRef *blocks)
386
{
387
LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
388
LLVMAddIncoming(phi, values, blocks, count_incoming);
389
return phi;
390
}
391
392
void ac_build_s_barrier(struct ac_llvm_context *ctx)
393
{
394
ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 0, AC_FUNC_ATTR_CONVERGENT);
395
}
396
397
/* Prevent optimizations (at least of memory accesses) across the current
398
* point in the program by emitting empty inline assembly that is marked as
399
* having side effects.
400
*
401
* Optionally, a value can be passed through the inline assembly to prevent
402
* LLVM from hoisting calls to ReadNone functions.
403
*/
404
void ac_build_optimization_barrier(struct ac_llvm_context *ctx, LLVMValueRef *pgpr, bool sgpr)
405
{
406
static int counter = 0;
407
408
LLVMBuilderRef builder = ctx->builder;
409
char code[16];
410
const char *constraint = sgpr ? "=s,0" : "=v,0";
411
412
snprintf(code, sizeof(code), "; %d", (int)p_atomic_inc_return(&counter));
413
414
if (!pgpr) {
415
LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
416
LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
417
LLVMBuildCall(builder, inlineasm, NULL, 0, "");
418
} else if (LLVMTypeOf(*pgpr) == ctx->i32) {
419
/* Simple version for i32 that allows the caller to set LLVM metadata on the call
420
* instruction. */
421
LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
422
LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);
423
424
*pgpr = LLVMBuildCall(builder, inlineasm, pgpr, 1, "");
425
} else if (LLVMTypeOf(*pgpr) == ctx->i16) {
426
/* Simple version for i16 that allows the caller to set LLVM metadata on the call
427
* instruction. */
428
LLVMTypeRef ftype = LLVMFunctionType(ctx->i16, &ctx->i16, 1, false);
429
LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);
430
431
*pgpr = LLVMBuildCall(builder, inlineasm, pgpr, 1, "");
432
} else if (LLVMGetTypeKind(LLVMTypeOf(*pgpr)) == LLVMPointerTypeKind) {
433
LLVMTypeRef type = LLVMTypeOf(*pgpr);
434
LLVMTypeRef ftype = LLVMFunctionType(type, &type, 1, false);
435
LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);
436
437
*pgpr = LLVMBuildCall(builder, inlineasm, pgpr, 1, "");
438
} else {
439
LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
440
LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);
441
LLVMTypeRef type = LLVMTypeOf(*pgpr);
442
unsigned bitsize = ac_get_elem_bits(ctx, type);
443
LLVMValueRef vgpr = *pgpr;
444
LLVMTypeRef vgpr_type;
445
unsigned vgpr_size;
446
LLVMValueRef vgpr0;
447
448
if (bitsize < 32)
449
vgpr = LLVMBuildZExt(ctx->builder, vgpr, ctx->i32, "");
450
451
vgpr_type = LLVMTypeOf(vgpr);
452
vgpr_size = ac_get_type_size(vgpr_type);
453
454
assert(vgpr_size % 4 == 0);
455
456
vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
457
vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
458
vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
459
vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
460
vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
461
462
if (bitsize < 32)
463
vgpr = LLVMBuildTrunc(builder, vgpr, type, "");
464
465
*pgpr = vgpr;
466
}
467
}
468
469
/* Read a 64-bit clock and return it bitcast to v2i32.
 *
 * NIR_SCOPE_DEVICE uses llvm.amdgcn.s.memrealtime; every other scope uses
 * llvm.readcyclecounter.
 */
LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx, nir_scope scope)
{
   const char *name = "llvm.readcyclecounter";

   if (scope == NIR_SCOPE_DEVICE)
      name = "llvm.amdgcn.s.memrealtime";

   LLVMValueRef clock64 = ac_build_intrinsic(ctx, name, ctx->i64, NULL, 0, 0);

   return LLVMBuildBitCast(ctx->builder, clock64, ctx->v2i32, "");
}
477
478
/* Build a wave-wide "value != 0" mask with llvm.amdgcn.icmp.
 *
 * An i1 input is zero-extended to i32 first. The result has the
 * iN_wavemask type; the i64 or i32 variant of the intrinsic is picked from
 * ctx->wave_size.
 */
LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *name =
      ctx->wave_size == 64 ? "llvm.amdgcn.icmp.i64.i32" : "llvm.amdgcn.icmp.i32.i32";
   const unsigned attrs =
      AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT;

   if (LLVMTypeOf(value) == ctx->i1)
      value = LLVMBuildZExt(ctx->builder, value, ctx->i32, "");

   LLVMValueRef args[3] = {value, ctx->i32_0, LLVMConstInt(ctx->i32, LLVMIntNE, 0)};

   /* We currently have no other way to prevent LLVM from lifting the icmp
    * calls to a dominating basic block.
    */
   ac_build_optimization_barrier(ctx, &args[0], false);

   args[0] = ac_to_integer(ctx, args[0]);

   return ac_build_intrinsic(ctx, name, ctx->iN_wavemask, args, 3, attrs);
}
503
504
/* Build a wave-wide mask from an i1 \p value by comparing it against
 * false with the i1 variant of llvm.amdgcn.icmp (predicate LLVMIntNE).
 * The result has the iN_wavemask type.
 */
LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *name =
      ctx->wave_size == 64 ? "llvm.amdgcn.icmp.i64.i1" : "llvm.amdgcn.icmp.i32.i1";
   const unsigned attrs =
      AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT;

   LLVMValueRef args[3];
   args[0] = value;
   args[1] = ctx->i1false;
   args[2] = LLVMConstInt(ctx->i32, LLVMIntNE, 0);

   return ac_build_intrinsic(ctx, name, ctx->iN_wavemask, args, 3, attrs);
}
523
524
/* Return an i1 that is true when the ballot of \p value equals the ballot
 * of the constant 1.
 */
LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef active = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef votes = ac_build_ballot(ctx, value);

   return LLVMBuildICmp(ctx->builder, LLVMIntEQ, votes, active, "");
}
530
531
/* Return an i1 that is true when the ballot of \p value is non-zero. */
LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef votes = ac_build_ballot(ctx, value);
   LLVMValueRef zero_mask = LLVMConstInt(ctx->iN_wavemask, 0, 0);

   return LLVMBuildICmp(ctx->builder, LLVMIntNE, votes, zero_mask, "");
}
537
538
/* Return an i1 that is true when the ballot of \p value either equals the
 * ballot of the constant 1 or is zero.
 */
LLVMValueRef ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef active = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef votes = ac_build_ballot(ctx, value);
   LLVMValueRef zero_mask = LLVMConstInt(ctx->iN_wavemask, 0, 0);

   LLVMValueRef all_set = LLVMBuildICmp(builder, LLVMIntEQ, votes, active, "");
   LLVMValueRef none_set = LLVMBuildICmp(builder, LLVMIntEQ, votes, zero_mask, "");

   return LLVMBuildOr(builder, all_set, none_set, "");
}
548
549
/* Gather values[component .. component + value_count - 1] into a vector.
 *
 * A single value is returned as a scalar; value_count == 0 is not allowed.
 * The vector element type is the type of the first gathered value.
 */
LLVMValueRef ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                            unsigned value_count, unsigned component)
{
   if (!value_count)
      unreachable("value_count is 0");

   if (value_count == 1)
      return values[component];

   LLVMTypeRef vec_type = LLVMVectorType(LLVMTypeOf(values[component]), value_count);
   LLVMValueRef vec = LLVMGetUndef(vec_type);

   for (unsigned n = 0; n < value_count; n++) {
      LLVMValueRef index = LLVMConstInt(ctx->i32, n, false);

      vec = LLVMBuildInsertElement(ctx->builder, vec, values[component + n], index, "");
   }

   return vec;
}
569
570
/* Gather value_count entries of \p values, taking every value_stride-th
 * entry, into a vector.
 *
 * With \p load set, each entry is loaded (LLVMBuildLoad) before use. A
 * single entry is returned as a scalar unless \p always_vector is set.
 * value_count == 0 is not allowed.
 */
LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                             unsigned value_count, unsigned value_stride, bool load,
                                             bool always_vector)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef vec = NULL;

   if (!value_count)
      unreachable("value_count is 0");

   if (value_count == 1 && !always_vector)
      return load ? LLVMBuildLoad(builder, values[0], "") : values[0];

   for (unsigned n = 0; n < value_count; n++) {
      LLVMValueRef elem = values[n * value_stride];

      if (load)
         elem = LLVMBuildLoad(builder, elem, "");

      /* The vector type is only known once the first element is available. */
      if (n == 0)
         vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(elem), value_count));

      vec = LLVMBuildInsertElement(builder, vec, elem, LLVMConstInt(ctx->i32, n, false), "");
   }

   return vec;
}
597
598
/* Gather value_count consecutive values into a vector (stride 1, no loads);
 * a single value is returned as a scalar.
 */
LLVMValueRef ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                    unsigned value_count)
{
   const unsigned stride = 1;
   const bool load = false;
   const bool always_vector = false;

   return ac_build_gather_values_extended(ctx, values, value_count, stride, load, always_vector);
}
603
604
/* Concatenate the components of \p a followed by those of \p b into one
 * vector; scalars count as one component.
 */
LLVMValueRef ac_build_concat(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   const unsigned a_size = ac_get_llvm_num_components(a);
   const unsigned b_size = ac_get_llvm_num_components(b);
   const unsigned total = a_size + b_size;
   LLVMValueRef *elems = alloca(total * sizeof(LLVMValueRef));

   for (unsigned n = 0; n < total; n++) {
      if (n < a_size)
         elems[n] = ac_llvm_extract_elem(ctx, a, n);
      else
         elems[n] = ac_llvm_extract_elem(ctx, b, n - a_size);
   }

   return ac_build_gather_values(ctx, elems, total);
}
617
618
/* Expand a scalar or vector to <dst_channels x type> by filling the remaining
619
* channels with undef. Extract at most src_channels components from the input.
620
*/
621
LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx, LLVMValueRef value,
622
unsigned src_channels, unsigned dst_channels)
623
{
624
LLVMTypeRef elemtype;
625
LLVMValueRef *const chan = alloca(dst_channels * sizeof(LLVMValueRef));
626
627
if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
628
unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
629
630
if (src_channels == dst_channels && vec_size == dst_channels)
631
return value;
632
633
src_channels = MIN2(src_channels, vec_size);
634
635
for (unsigned i = 0; i < src_channels; i++)
636
chan[i] = ac_llvm_extract_elem(ctx, value, i);
637
638
elemtype = LLVMGetElementType(LLVMTypeOf(value));
639
} else {
640
if (src_channels) {
641
assert(src_channels == 1);
642
chan[0] = value;
643
}
644
elemtype = LLVMTypeOf(value);
645
}
646
647
for (unsigned i = src_channels; i < dst_channels; i++)
648
chan[i] = LLVMGetUndef(elemtype);
649
650
return ac_build_gather_values(ctx, chan, dst_channels);
651
}
652
653
/* Extract components [start, start + channels) from a vector.
654
*/
655
LLVMValueRef ac_extract_components(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned start,
656
unsigned channels)
657
{
658
LLVMValueRef *const chan = alloca(channels * sizeof(LLVMValueRef));
659
660
for (unsigned i = 0; i < channels; i++)
661
chan[i] = ac_llvm_extract_elem(ctx, value, i + start);
662
663
return ac_build_gather_values(ctx, chan, channels);
664
}
665
666
/* Expand a scalar or vector to <4 x type> by filling the remaining channels
667
* with undef. Extract at most num_channels components from the input.
668
*/
669
LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, LLVMValueRef value,
670
unsigned num_channels)
671
{
672
return ac_build_expand(ctx, value, num_channels, 4);
673
}
674
675
/* Emit llvm.rint on \p value. The f16/f32/f64 variant is chosen from the
 * byte size of the value's type (2, 4, anything else).
 */
LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMTypeRef value_type = LLVMTypeOf(value);
   const char *name;

   switch (ac_get_type_size(value_type)) {
   case 2:
      name = "llvm.rint.f16";
      break;
   case 4:
      name = "llvm.rint.f32";
      break;
   default:
      name = "llvm.rint.f64";
      break;
   }

   return ac_build_intrinsic(ctx, name, value_type, &value, 1, AC_FUNC_ATTR_READNONE);
}
689
690
/* Divide \p num by \p den.
 *
 * Normally this is num * rcp(den) using llvm.amdgcn.rcp, with the
 * f16/f32/f64 variant chosen from the byte size of den's type. For 8-byte
 * types in AC_FLOAT_MODE_DEFAULT_OPENGL a real fdiv is emitted instead.
 */
LLVMValueRef ac_build_fdiv(struct ac_llvm_context *ctx, LLVMValueRef num, LLVMValueRef den)
{
   LLVMTypeRef den_type = LLVMTypeOf(den);
   const unsigned type_size = ac_get_type_size(den_type);
   const char *name;

   /* For doubles, we need precise division to pass GLCTS. */
   if (type_size == 8 && ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL)
      return LLVMBuildFDiv(ctx->builder, num, den, "");

   switch (type_size) {
   case 2:
      name = "llvm.amdgcn.rcp.f16";
      break;
   case 4:
      name = "llvm.amdgcn.rcp.f32";
      break;
   default:
      name = "llvm.amdgcn.rcp.f64";
      break;
   }

   LLVMValueRef reciprocal =
      ac_build_intrinsic(ctx, name, den_type, &den, 1, AC_FUNC_ATTR_READNONE);

   return LLVMBuildFMul(ctx->builder, num, reciprocal, "");
}
711
712
/* See fast_idiv_by_const.h. */
713
/* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
714
LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, LLVMValueRef num,
715
LLVMValueRef multiplier, LLVMValueRef pre_shift,
716
LLVMValueRef post_shift, LLVMValueRef increment)
717
{
718
LLVMBuilderRef builder = ctx->builder;
719
720
num = LLVMBuildLShr(builder, num, pre_shift, "");
721
num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
722
LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
723
num = LLVMBuildAdd(builder, num, LLVMBuildZExt(builder, increment, ctx->i64, ""), "");
724
num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
725
num = LLVMBuildTrunc(builder, num, ctx->i32, "");
726
return LLVMBuildLShr(builder, num, post_shift, "");
727
}
728
729
/* See fast_idiv_by_const.h. */
730
/* If num != UINT_MAX, this more efficient version can be used. */
731
/* Set: increment = util_fast_udiv_info::increment; */
732
LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, LLVMValueRef num,
733
LLVMValueRef multiplier, LLVMValueRef pre_shift,
734
LLVMValueRef post_shift, LLVMValueRef increment)
735
{
736
LLVMBuilderRef builder = ctx->builder;
737
738
num = LLVMBuildLShr(builder, num, pre_shift, "");
739
num = LLVMBuildNUWAdd(builder, num, increment, "");
740
num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
741
LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
742
num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
743
num = LLVMBuildTrunc(builder, num, ctx->i32, "");
744
return LLVMBuildLShr(builder, num, post_shift, "");
745
}
746
747
/* See fast_idiv_by_const.h. */
748
/* Both operands must fit in 31 bits and the divisor must not be 1. */
749
LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, LLVMValueRef num,
750
LLVMValueRef multiplier, LLVMValueRef post_shift)
751
{
752
LLVMBuilderRef builder = ctx->builder;
753
754
num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
755
LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
756
num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
757
num = LLVMBuildTrunc(builder, num, ctx->i32, "");
758
return LLVMBuildLShr(builder, num, post_shift, "");
759
}
760
761
/* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
762
* of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
763
* already multiplied by two. id is the cube face number.
764
*/
765
struct cube_selection_coords {
766
LLVMValueRef stc[2];
767
LLVMValueRef ma;
768
LLVMValueRef id;
769
};
770
771
/* Fill \p out from the three input coordinates using the
 * llvm.amdgcn.cube{tc,sc,ma,id} intrinsics; all results are f32.
 */
static void build_cube_intrinsic(struct ac_llvm_context *ctx, LLVMValueRef in[3],
                                 struct cube_selection_coords *out)
{
   const unsigned attrs = AC_FUNC_ATTR_READNONE;
   LLVMTypeRef f32 = ctx->f32;

   /* Calls are emitted in the order tc, sc, ma, id. */
   LLVMValueRef tc = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc", f32, in, 3, attrs);
   LLVMValueRef sc = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc", f32, in, 3, attrs);
   LLVMValueRef ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema", f32, in, 3, attrs);
   LLVMValueRef id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid", f32, in, 3, attrs);

   out->stc[0] = sc;
   out->stc[1] = tc;
   out->ma = ma;
   out->id = id;
}
781
782
/**
783
* Build a manual selection sequence for cube face sc/tc coordinates and
784
* major axis vector (multiplied by 2 for consistency) for the given
785
* vec3 \p coords, for the face implied by \p selcoords.
786
*
787
* For the major axis, we always adjust the sign to be in the direction of
788
* selcoords.ma; i.e., a positive out_ma means that coords is pointed towards
789
* the selcoords major axis.
790
*/
791
static void build_cube_select(struct ac_llvm_context *ctx,
792
const struct cube_selection_coords *selcoords,
793
const LLVMValueRef *coords, LLVMValueRef *out_st,
794
LLVMValueRef *out_ma)
795
{
796
LLVMBuilderRef builder = ctx->builder;
797
LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
798
LLVMValueRef is_ma_positive;
799
LLVMValueRef sgn_ma;
800
LLVMValueRef is_ma_z, is_not_ma_z;
801
LLVMValueRef is_ma_y;
802
LLVMValueRef is_ma_x;
803
LLVMValueRef sgn;
804
LLVMValueRef tmp;
805
806
is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->ma, LLVMConstReal(f32, 0.0), "");
807
sgn_ma = LLVMBuildSelect(builder, is_ma_positive, LLVMConstReal(f32, 1.0),
808
LLVMConstReal(f32, -1.0), "");
809
810
is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");
811
is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");
812
is_ma_y = LLVMBuildAnd(
813
builder, is_not_ma_z,
814
LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");
815
is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");
816
817
/* Select sc */
818
tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");
819
sgn = LLVMBuildSelect(
820
builder, is_ma_y, LLVMConstReal(f32, 1.0),
821
LLVMBuildSelect(builder, is_ma_z, sgn_ma, LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
822
out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");
823
824
/* Select tc */
825
tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
826
sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma, LLVMConstReal(f32, -1.0), "");
827
out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");
828
829
/* Select ma */
830
tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
831
LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
832
tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE);
833
*out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), "");
834
}
835
836
/* Convert cube map coordinates (and optionally derivatives) in place.
 *
 * On return coords_arg[0..2] hold the face s/t coordinates (scaled by
 * 1/|ma| and shifted by 1.5) and the face id; for arrays the third
 * component is layer * 8 + face. When is_deriv is set and derivs_arg is
 * non-NULL, the two input vec3 derivatives (derivs_arg[0..5]) are replaced
 * by four 2D derivative components (derivs_arg[0..3]).
 */
void ac_prepare_cube_coords(struct ac_llvm_context *ctx, bool is_deriv, bool is_array, bool is_lod,
                            LLVMValueRef *coords_arg, LLVMValueRef *derivs_arg)
{
   LLVMBuilderRef builder = ctx->builder;
   struct cube_selection_coords selcoords;
   LLVMValueRef coords[3];
   LLVMValueRef invma;

   if (is_array && !is_lod) {
      LLVMValueRef layer = ac_build_round(ctx, coords_arg[3]);

      /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
       *
       *    "For Array forms, the array layer used will be
       *
       *       max(0, min(d-1, floor(layer+0.5)))
       *
       *     where d is the depth of the texture array and layer
       *     comes from the component indicated in the tables below."
       *
       * Workaround for an issue where the layer is taken from a
       * helper invocation which happens to fall on a different
       * layer due to extrapolation.
       *
       * GFX8 and earlier attempt to implement this in hardware by
       * clamping the value of coords[2] = (8 * layer) + face.
       * Unfortunately, this means that we end up with the wrong
       * face when clamping occurs.
       *
       * Clamp the layer earlier to work around the issue.
       */
      if (ctx->chip_class <= GFX8) {
         LLVMValueRef is_non_negative =
            LLVMBuildFCmp(builder, LLVMRealOGE, layer, ctx->f32_0, "");
         layer = LLVMBuildSelect(builder, is_non_negative, layer, ctx->f32_0, "");
      }

      coords_arg[3] = layer;
   }

   build_cube_intrinsic(ctx, coords_arg, &selcoords);

   /* invma = 1 / |ma| */
   LLVMValueRef abs_ma =
      ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
   invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), abs_ma);

   coords[0] = LLVMBuildFMul(builder, selcoords.stc[0], invma, "");
   coords[1] = LLVMBuildFMul(builder, selcoords.stc[1], invma, "");
   coords[2] = selcoords.id;

   if (is_deriv && derivs_arg) {
      LLVMValueRef derivs[4];

      /* Convert cube derivatives to 2D derivatives. */
      for (int axis = 0; axis < 2; axis++) {
         LLVMValueRef deriv_st[2];
         LLVMValueRef deriv_ma;

         /* Transform the derivative alongside the texture
          * coordinate. Mathematically, the correct formula is
          * as follows. Assume we're projecting onto the +Z face
          * and denote by dx/dh the derivative of the (original)
          * X texture coordinate with respect to horizontal
          * window coordinates. The projection onto the +Z face
          * plane is:
          *
          *   f(x,z) = x/z
          *
          * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
          *            = 1/z * dx/dh - x/z * 1/z * dz/dh.
          *
          * This motivates the implementation below.
          *
          * Whether this actually gives the expected results for
          * apps that might feed in derivatives obtained via
          * finite differences is anyone's guess. The OpenGL spec
          * seems awfully quiet about how textureGrad for cube
          * maps should be handled.
          */
         build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3], deriv_st, &deriv_ma);

         deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");

         for (int c = 0; c < 2; c++) {
            LLVMValueRef scaled_st = LLVMBuildFMul(builder, deriv_st[c], invma, "");
            LLVMValueRef ma_term = LLVMBuildFMul(builder, deriv_ma, coords[c], "");

            derivs[axis * 2 + c] = LLVMBuildFSub(builder, scaled_st, ma_term, "");
         }
      }

      memcpy(derivs_arg, derivs, sizeof(derivs));
   }

   /* Shift the texture coordinate. This must be applied after the
    * derivative calculation.
    */
   for (int c = 0; c < 2; c++)
      coords[c] = LLVMBuildFAdd(builder, coords[c], LLVMConstReal(ctx->f32, 1.5), "");

   if (is_array) {
      /* for cube arrays coord.z = coord.w(array_index) * 8 + face */
      /* coords_arg.w component - array_index for cube arrays */
      coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]);
   }

   memcpy(coords_arg, coords, sizeof(coords));
}
944
945
/**
 * Emit the two-step f32 attribute interpolation: llvm.amdgcn.interp.p1 with
 * \p i, then llvm.amdgcn.interp.p2 with the p1 result and \p j.
 *
 * \return the value produced by llvm.amdgcn.interp.p2 (f32)
 */
LLVMValueRef ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                LLVMValueRef j)
{
   LLVMValueRef p1_args[4] = {i, llvm_chan, attr_number, params};
   LLVMValueRef p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", ctx->f32, p1_args, 4,
                                        AC_FUNC_ATTR_READNONE);

   LLVMValueRef p2_args[5] = {p1, j, llvm_chan, attr_number, params};
   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", ctx->f32, p2_args, 5,
                             AC_FUNC_ATTR_READNONE);
}
968
969
/**
 * 16-bit variant of the two-step interpolation: llvm.amdgcn.interp.p1.f16
 * (f32 result) followed by llvm.amdgcn.interp.p2.f16 (f16 result).
 *
 * \param high_16bits  passed to both intrinsics as an i1 constant
 * \return the value produced by llvm.amdgcn.interp.p2.f16 (f16)
 */
LLVMValueRef ac_build_fs_interp_f16(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                    LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                    LLVMValueRef j, bool high_16bits)
{
   LLVMValueRef high = high_16bits ? ctx->i1true : ctx->i1false;

   LLVMValueRef p1_args[5] = {i, llvm_chan, attr_number, high, params};
   LLVMValueRef p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", ctx->f32, p1_args, 5,
                                        AC_FUNC_ATTR_READNONE);

   LLVMValueRef p2_args[6] = {p1, j, llvm_chan, attr_number, high, params};
   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", ctx->f16, p2_args, 6,
                             AC_FUNC_ATTR_READNONE);
}
995
996
/**
 * Emit llvm.amdgcn.interp.mov with (parameter, llvm_chan, attr_number, params)
 * and return its f32 result.
 */
LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, LLVMValueRef parameter,
                                    LLVMValueRef llvm_chan, LLVMValueRef attr_number,
                                    LLVMValueRef params)
{
   LLVMValueRef mov_args[4] = {parameter, llvm_chan, attr_number, params};

   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov", ctx->f32, mov_args, 4,
                             AC_FUNC_ATTR_READNONE);
}
1010
1011
/** Build a single-index GEP: the address of base_ptr[index]. */
LLVMValueRef ac_build_gep_ptr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                              LLVMValueRef index)
{
   LLVMValueRef gep_indices[1] = {index};

   return LLVMBuildGEP(ctx->builder, base_ptr, gep_indices, 1, "");
}
1016
1017
/** Build a two-index GEP whose first index is the constant 0: {0, index}. */
LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index)
{
   LLVMValueRef gep_indices[2];

   gep_indices[0] = ctx->i32_0;
   gep_indices[1] = index;

   return LLVMBuildGEP(ctx->builder, base_ptr, gep_indices, 2, "");
}
1025
1026
/**
 * Advance \p ptr by \p index elements with a GEP and cast the result back to
 * the type of \p ptr.
 */
LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMValueRef index)
{
   LLVMValueRef advanced = LLVMBuildGEP(ctx->builder, ptr, &index, 1, "");

   return LLVMBuildPointerCast(ctx->builder, advanced, LLVMTypeOf(ptr), "");
}
1031
1032
/** Store \p value to the element addressed by ac_build_gep0(base_ptr, index). */
void ac_build_indexed_store(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index,
                            LLVMValueRef value)
{
   LLVMValueRef element_ptr = ac_build_gep0(ctx, base_ptr, index);

   LLVMBuildStore(ctx->builder, value, element_ptr);
}
1037
1038
/**
1039
* Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
1040
* It's equivalent to doing a load from &base_ptr[index].
1041
*
1042
* \param base_ptr Where the array starts.
1043
* \param index The element index into the array.
1044
* \param uniform Whether the base_ptr and index can be assumed to be
1045
* dynamically uniform (i.e. load to an SGPR)
1046
* \param invariant Whether the load is invariant (no other opcodes affect it)
1047
* \param no_unsigned_wraparound
1048
* For all possible re-associations and re-distributions of an expression
1049
* "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
1050
* without inbounds in base_ptr), this parameter is true if "addr + offset"
1051
* does not result in an unsigned integer wraparound. This is used for
1052
* optimal code generation of 32-bit pointer arithmetic.
1053
*
1054
* For example, a 32-bit immediate offset that causes a 32-bit unsigned
1055
* integer wraparound can't be an imm offset in s_load_dword, because
1056
* the instruction performs "addr + offset" in 64 bits.
1057
*
1058
* Expected usage for bindless textures by chaining GEPs:
1059
* // possible unsigned wraparound, don't use InBounds:
1060
* ptr1 = LLVMBuildGEP(base_ptr, index);
1061
* image = load(ptr1); // becomes "s_load ptr1, 0"
1062
*
1063
* ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
1064
* sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
1065
*/
1066
static LLVMValueRef ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1067
LLVMValueRef index, bool uniform, bool invariant,
1068
bool no_unsigned_wraparound)
1069
{
1070
LLVMValueRef pointer, result;
1071
1072
if (no_unsigned_wraparound &&
1073
LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
1074
pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, &index, 1, "");
1075
else
1076
pointer = LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
1077
1078
if (uniform)
1079
LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
1080
result = LLVMBuildLoad(ctx->builder, pointer, "");
1081
if (invariant)
1082
LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
1083
LLVMSetAlignment(result, 4);
1084
return result;
1085
}
1086
1087
/** Plain indexed load: not uniform, not invariant, wraparound possible. */
LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index)
{
   const bool uniform = false;
   const bool invariant = false;
   const bool no_unsigned_wraparound = false;

   return ac_build_load_custom(ctx, base_ptr, index, uniform, invariant, no_unsigned_wraparound);
}
1091
1092
/** Indexed load marked invariant; not uniform, wraparound possible. */
LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                                     LLVMValueRef index)
{
   const bool uniform = false;
   const bool invariant = true;
   const bool no_unsigned_wraparound = false;

   return ac_build_load_custom(ctx, base_ptr, index, uniform, invariant, no_unsigned_wraparound);
}
1097
1098
/* This assumes that there is no unsigned integer wraparound during the address
1099
* computation, excluding all GEPs within base_ptr. */
1100
LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1101
LLVMValueRef index)
1102
{
1103
return ac_build_load_custom(ctx, base_ptr, index, true, true, true);
1104
}
1105
1106
/* See ac_build_load_custom() documentation. */
1107
LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
1108
LLVMValueRef base_ptr, LLVMValueRef index)
1109
{
1110
return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
1111
}
1112
1113
static unsigned get_load_cache_policy(struct ac_llvm_context *ctx, unsigned cache_policy)
1114
{
1115
return cache_policy | (ctx->chip_class >= GFX10 && cache_policy & ac_glc ? ac_dlc : 0);
1116
}
1117
1118
/**
 * Emit an llvm.amdgcn.{raw,struct}.buffer.store[.format].<type> call.
 *
 * \param data         value to store; its type selects the intrinsic suffix
 * \param vindex       only passed for structurized stores; NULL means 0
 * \param voffset      NULL means 0
 * \param soffset      NULL means 0
 * \param use_format   select the ".format" flavour of the intrinsic
 * \param structurized select "struct" (with vindex) instead of "raw"
 */
static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef data, LLVMValueRef vindex,
                                         LLVMValueRef voffset, LLVMValueRef soffset,
                                         unsigned cache_policy, bool use_format, bool structurized)
{
   LLVMValueRef args[6];
   unsigned count = 0;

   args[count++] = data;
   args[count++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[count++] = vindex ? vindex : ctx->i32_0;
   args[count++] = voffset ? voffset : ctx->i32_0;
   args[count++] = soffset ? soffset : ctx->i32_0;
   args[count++] = LLVMConstInt(ctx->i32, cache_policy, 0);

   /* Assemble the intrinsic name from the indexing kind and the data type. */
   char type_name[8];
   ac_build_type_name_for_intr(LLVMTypeOf(data), type_name, sizeof(type_name));

   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256];

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s", indexing_kind, type_name);
   }

   ac_build_intrinsic(ctx, name, ctx->voidt, args, count, AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
}
1146
1147
/** Structurized ".format" buffer store with no soffset. */
void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data,
                                  LLVMValueRef vindex, LLVMValueRef voffset, unsigned cache_policy)
{
   const bool use_format = true;
   const bool structurized = true;

   ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, NULL, cache_policy, use_format,
                                structurized);
}
1152
1153
/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
1154
* The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
1155
* or v4i32 (num_channels=3,4).
1156
*/
1157
void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
1158
unsigned num_channels, LLVMValueRef voffset, LLVMValueRef soffset,
1159
unsigned inst_offset, unsigned cache_policy)
1160
{
1161
/* Split 3 channel stores. */
1162
if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) {
1163
LLVMValueRef v[3], v01;
1164
1165
for (int i = 0; i < 3; i++) {
1166
v[i] = LLVMBuildExtractElement(ctx->builder, vdata, LLVMConstInt(ctx->i32, i, 0), "");
1167
}
1168
v01 = ac_build_gather_values(ctx, v, 2);
1169
1170
ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset, soffset, inst_offset, cache_policy);
1171
ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset, soffset, inst_offset + 8,
1172
cache_policy);
1173
return;
1174
}
1175
1176
/* SWIZZLE_ENABLE requires that soffset isn't folded into voffset
1177
* (voffset is swizzled, but soffset isn't swizzled).
1178
* llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.
1179
*/
1180
if (!(cache_policy & ac_swizzled)) {
1181
LLVMValueRef offset = soffset;
1182
1183
if (inst_offset)
1184
offset = LLVMBuildAdd(ctx->builder, offset, LLVMConstInt(ctx->i32, inst_offset, 0), "");
1185
1186
ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), ctx->i32_0, voffset, offset,
1187
cache_policy, false, false);
1188
return;
1189
}
1190
1191
static const unsigned dfmts[] = {V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,
1192
V_008F0C_BUF_DATA_FORMAT_32_32_32,
1193
V_008F0C_BUF_DATA_FORMAT_32_32_32_32};
1194
unsigned dfmt = dfmts[num_channels - 1];
1195
unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1196
LLVMValueRef immoffset = LLVMConstInt(ctx->i32, inst_offset, 0);
1197
1198
ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, immoffset, num_channels, dfmt,
1199
nfmt, cache_policy);
1200
}
1201
1202
static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1203
LLVMValueRef vindex, LLVMValueRef voffset,
1204
LLVMValueRef soffset, unsigned num_channels,
1205
LLVMTypeRef channel_type, unsigned cache_policy,
1206
bool can_speculate, bool use_format,
1207
bool structurized)
1208
{
1209
LLVMValueRef args[5];
1210
int idx = 0;
1211
args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1212
if (structurized)
1213
args[idx++] = vindex ? vindex : ctx->i32_0;
1214
args[idx++] = voffset ? voffset : ctx->i32_0;
1215
args[idx++] = soffset ? soffset : ctx->i32_0;
1216
args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
1217
unsigned func =
1218
!ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
1219
const char *indexing_kind = structurized ? "struct" : "raw";
1220
char name[256], type_name[8];
1221
1222
/* D16 is only supported on gfx8+ */
1223
assert(!use_format || (channel_type != ctx->f16 && channel_type != ctx->i16) ||
1224
ctx->chip_class >= GFX8);
1225
1226
LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type;
1227
ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1228
1229
if (use_format) {
1230
snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s", indexing_kind,
1231
type_name);
1232
} else {
1233
snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s", indexing_kind, type_name);
1234
}
1235
1236
return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate));
1237
}
1238
1239
/**
 * Load num_channels values from a buffer.
 *
 * inst_offset, voffset and soffset (the latter two may be NULL) are summed
 * into one offset. When allow_smem is set and the cache policy permits it
 * (no ac_slc; ac_glc only on GFX8+), the load is emitted as one
 * llvm.amdgcn.s.buffer.load.f32 per channel at consecutive 4-byte offsets;
 * vindex must be NULL in that case. Otherwise a raw buffer load of
 * channel_type is emitted.
 *
 * \return a single value for one channel, otherwise a vector (3 channels are
 *         padded with an undef 4th one when there is no vec3 support)
 */
LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels,
                                  LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
                                  unsigned inst_offset, LLVMTypeRef channel_type,
                                  unsigned cache_policy, bool can_speculate, bool allow_smem)
{
   LLVMBuilderRef builder = ctx->builder;

   LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);
   if (voffset)
      offset = LLVMBuildAdd(builder, offset, voffset, "");
   if (soffset)
      offset = LLVMBuildAdd(builder, offset, soffset, "");

   const bool glc_allowed = !(cache_policy & ac_glc) || ctx->chip_class >= GFX8;
   const bool use_smem = allow_smem && !(cache_policy & ac_slc) && glc_allowed;

   if (!use_smem) {
      return ac_build_buffer_load_common(ctx, rsrc, vindex, offset, ctx->i32_0, num_channels,
                                         channel_type, cache_policy, can_speculate, false, false);
   }

   assert(vindex == NULL);

   LLVMValueRef result[8];

   for (int chan = 0; chan < num_channels; chan++) {
      /* Each channel after the first is 4 bytes further on. */
      if (chan > 0)
         offset = LLVMBuildAdd(builder, offset, LLVMConstInt(ctx->i32, 4, 0), "");

      LLVMValueRef args[3];
      args[0] = rsrc;
      args[1] = offset;
      args[2] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);

      result[chan] = ac_build_intrinsic(ctx, "llvm.amdgcn.s.buffer.load.f32", ctx->f32, args, 3,
                                        AC_FUNC_ATTR_READNONE);
   }

   if (num_channels == 1)
      return result[0];

   if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false))
      result[num_channels++] = LLVMGetUndef(ctx->f32);

   return ac_build_gather_values(ctx, result, num_channels);
}
1279
1280
/**
 * Structurized ".format" buffer load.
 *
 * \param d16  load f16 channels instead of f32 (must not be combined with tfe)
 * \param tfe  emit the load as inline assembly with the "tfe" modifier; the
 *             result is then the num_channels loaded values concatenated with
 *             element 4 of the 5-element inline-asm result
 */
LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef vindex, LLVMValueRef voffset,
                                         unsigned num_channels, unsigned cache_policy,
                                         bool can_speculate, bool d16, bool tfe)
{
   if (!tfe) {
      LLVMTypeRef channel_type = d16 ? ctx->f16 : ctx->f32;

      return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0, num_channels,
                                         channel_type, cache_policy, can_speculate, true, true);
   }

   assert(!d16);

   const char *glc = cache_policy & ac_glc ? "glc" : "";
   const char *slc = cache_policy & ac_slc ? "slc" : "";
   const char *dlc = cache_policy & ac_dlc ? "dlc" : "";

   char code[256];
   /* The definition in the assembly and the one in the constraint string
    * differs because of an assembler bug.
    */
   snprintf(code, sizeof(code),
            "v_mov_b32 v0, 0\n"
            "v_mov_b32 v1, 0\n"
            "v_mov_b32 v2, 0\n"
            "v_mov_b32 v3, 0\n"
            "v_mov_b32 v4, 0\n"
            "buffer_load_format_xyzw v[0:3], $1, $2, 0, idxen offen %s %s tfe %s\n"
            "s_waitcnt vmcnt(0)",
            glc, slc, dlc);

   /* The asm takes (v2i32 address, v4i32 descriptor) and yields 5 floats. */
   LLVMTypeRef param_types[2] = {ctx->v2i32, ctx->v4i32};
   LLVMTypeRef result_type = LLVMVectorType(ctx->f32, 5);
   LLVMTypeRef calltype = LLVMFunctionType(result_type, param_types, 2, false);
   LLVMValueRef inlineasm = LLVMConstInlineAsm(calltype, code, "=&{v[0:4]},v,s", false, false);

   LLVMValueRef addr_comp[2];
   addr_comp[0] = vindex ? vindex : ctx->i32_0;
   addr_comp[1] = voffset ? voffset : ctx->i32_0;

   LLVMValueRef call_args[2];
   call_args[0] = ac_build_gather_values(ctx, addr_comp, 2);
   call_args[1] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");

   LLVMValueRef res = LLVMBuildCall(ctx->builder, inlineasm, call_args, 2, "");

   LLVMValueRef texels = ac_trim_vector(ctx, res, num_channels);
   LLVMValueRef extra = ac_llvm_extract_elem(ctx, res, 4);

   return ac_build_concat(ctx, texels, extra);
}
1323
1324
/**
 * Emit an llvm.amdgcn.{raw,struct}.tbuffer.load.<type> call returning i32
 * channels.
 *
 * \param vindex        only passed for structurized loads; NULL means 0
 * \param voffset       NULL means 0
 * \param soffset       NULL means 0
 * \param immoffset     added to voffset
 * \param num_channels  number of channels requested; 3 is widened to 4 when
 *                      ac_has_vec3_support() reports no vec3 support
 * \param dfmt, nfmt    combined via ac_get_tbuffer_format()
 * \param cache_policy  passed through get_load_cache_policy()
 * \param structurized  select "struct" (with vindex) instead of "raw"
 */
static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                          LLVMValueRef vindex, LLVMValueRef voffset,
                                          LLVMValueRef soffset, LLVMValueRef immoffset,
                                          unsigned num_channels, unsigned dfmt, unsigned nfmt,
                                          unsigned cache_policy, bool can_speculate,
                                          bool structurized)
{
   /* Substitute 0 for a NULL voffset *before* the add, as the tbuffer store
    * path does. The NULL check used to come after LLVMBuildAdd, where it was
    * dead code and a NULL voffset had already been handed to the builder.
    */
   voffset = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0, immoffset, "");

   LLVMValueRef args[6];
   int idx = 0;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[idx++] = vindex ? vindex : ctx->i32_0;
   args[idx++] = voffset;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0);
   args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);

   /* Without vec3 support, a 3-channel load is done as a 4-channel one. */
   unsigned func =
      !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256], type_name[8];

   LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s", indexing_kind, type_name);

   return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate));
}
1354
1355
/** Structurized flavour of ac_build_tbuffer_load(). */
LLVMValueRef ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                          LLVMValueRef vindex, LLVMValueRef voffset,
                                          LLVMValueRef soffset, LLVMValueRef immoffset,
                                          unsigned num_channels, unsigned dfmt, unsigned nfmt,
                                          unsigned cache_policy, bool can_speculate)
{
   const bool structurized = true;

   return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset, immoffset, num_channels, dfmt,
                                nfmt, cache_policy, can_speculate, structurized);
}
1364
1365
/**
 * Load a single i16 with a raw (non-format, non-structurized) buffer load.
 * immoffset is added to voffset; the load is not marked speculatable.
 */
LLVMValueRef ac_build_tbuffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef voffset, LLVMValueRef soffset,
                                         LLVMValueRef immoffset, unsigned cache_policy)
{
   LLVMValueRef total_voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
   const bool can_speculate = false;
   const bool use_format = false;
   const bool structurized = false;

   return ac_build_buffer_load_common(ctx, rsrc, NULL, total_voffset, soffset, 1, ctx->i16,
                                      cache_policy, can_speculate, use_format, structurized);
}
1374
1375
/**
 * Load a single i8 with a raw (non-format, non-structurized) buffer load.
 * immoffset is added to voffset; the load is not marked speculatable.
 */
LLVMValueRef ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                        LLVMValueRef voffset, LLVMValueRef soffset,
                                        LLVMValueRef immoffset, unsigned cache_policy)
{
   LLVMValueRef total_voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
   const bool can_speculate = false;
   const bool use_format = false;
   const bool structurized = false;

   return ac_build_buffer_load_common(ctx, rsrc, NULL, total_voffset, soffset, 1, ctx->i8,
                                      cache_policy, can_speculate, use_format, structurized);
}
1384
1385
/**
1386
* Convert an 11- or 10-bit unsigned floating point number to an f32.
1387
*
1388
* The input exponent is expected to be biased analogous to IEEE-754, i.e. by
1389
* 2^(exp_bits-1) - 1 (as defined in OpenGL and other graphics APIs).
1390
*/
1391
static LLVMValueRef ac_ufN_to_float(struct ac_llvm_context *ctx, LLVMValueRef src,
1392
unsigned exp_bits, unsigned mant_bits)
1393
{
1394
assert(LLVMTypeOf(src) == ctx->i32);
1395
1396
LLVMValueRef tmp;
1397
LLVMValueRef mantissa;
1398
mantissa =
1399
LLVMBuildAnd(ctx->builder, src, LLVMConstInt(ctx->i32, (1 << mant_bits) - 1, false), "");
1400
1401
/* Converting normal numbers is just a shift + correcting the exponent bias */
1402
unsigned normal_shift = 23 - mant_bits;
1403
unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1);
1404
LLVMValueRef shifted, normal;
1405
1406
shifted = LLVMBuildShl(ctx->builder, src, LLVMConstInt(ctx->i32, normal_shift, false), "");
1407
normal =
1408
LLVMBuildAdd(ctx->builder, shifted, LLVMConstInt(ctx->i32, bias_shift << 23, false), "");
1409
1410
/* Converting nan/inf numbers is the same, but with a different exponent update */
1411
LLVMValueRef naninf;
1412
naninf = LLVMBuildOr(ctx->builder, normal, LLVMConstInt(ctx->i32, 0xff << 23, false), "");
1413
1414
/* Converting denormals is the complex case: determine the leading zeros of the
1415
* mantissa to obtain the correct shift for the mantissa and exponent correction.
1416
*/
1417
LLVMValueRef denormal;
1418
LLVMValueRef params[2] = {
1419
mantissa, ctx->i1true, /* result can be undef when arg is 0 */
1420
};
1421
LLVMValueRef ctlz =
1422
ac_build_intrinsic(ctx, "llvm.ctlz.i32", ctx->i32, params, 2, AC_FUNC_ATTR_READNONE);
1423
1424
/* Shift such that the leading 1 ends up as the LSB of the exponent field. */
1425
tmp = LLVMBuildSub(ctx->builder, ctlz, LLVMConstInt(ctx->i32, 8, false), "");
1426
denormal = LLVMBuildShl(ctx->builder, mantissa, tmp, "");
1427
1428
unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1;
1429
tmp = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, denormal_exp, false), ctlz, "");
1430
tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(ctx->i32, 23, false), "");
1431
denormal = LLVMBuildAdd(ctx->builder, denormal, tmp, "");
1432
1433
/* Select the final result. */
1434
LLVMValueRef result;
1435
1436
tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
1437
LLVMConstInt(ctx->i32, ((1ULL << exp_bits) - 1) << mant_bits, false), "");
1438
result = LLVMBuildSelect(ctx->builder, tmp, naninf, normal, "");
1439
1440
tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
1441
LLVMConstInt(ctx->i32, 1ULL << mant_bits, false), "");
1442
result = LLVMBuildSelect(ctx->builder, tmp, result, denormal, "");
1443
1444
tmp = LLVMBuildICmp(ctx->builder, LLVMIntNE, src, ctx->i32_0, "");
1445
result = LLVMBuildSelect(ctx->builder, tmp, result, ctx->i32_0, "");
1446
1447
return ac_to_float(ctx, result);
1448
}
1449
1450
/**
1451
* Generate a fully general open coded buffer format fetch with all required
1452
* fixups suitable for vertex fetch, using non-format buffer loads.
1453
*
1454
* Some combinations of argument values have special interpretations:
1455
* - size = 8 bytes, format = fixed indicates PIPE_FORMAT_R11G11B10_FLOAT
1456
* - size = 8 bytes, format != {float,fixed} indicates a 2_10_10_10 data format
1457
*
1458
* \param log_size log(size of channel in bytes)
1459
* \param num_channels number of channels (1 to 4)
1460
* \param format AC_FETCH_FORMAT_xxx value
1461
* \param reverse whether XYZ channels are reversed
1462
* \param known_aligned whether the source is known to be aligned to hardware's
1463
* effective element size for loading the given format
1464
* (note: this means dword alignment for 8_8_8_8, 16_16, etc.)
1465
* \param rsrc buffer resource descriptor
1466
* \return the resulting vector of floats or integers bitcast to <4 x i32>
1467
*/
1468
LLVMValueRef ac_build_opencoded_load_format(struct ac_llvm_context *ctx, unsigned log_size,
1469
unsigned num_channels, unsigned format, bool reverse,
1470
bool known_aligned, LLVMValueRef rsrc,
1471
LLVMValueRef vindex, LLVMValueRef voffset,
1472
LLVMValueRef soffset, unsigned cache_policy,
1473
bool can_speculate)
1474
{
1475
LLVMValueRef tmp;
1476
unsigned load_log_size = log_size;
1477
unsigned load_num_channels = num_channels;
1478
if (log_size == 3) {
1479
load_log_size = 2;
1480
if (format == AC_FETCH_FORMAT_FLOAT) {
1481
load_num_channels = 2 * num_channels;
1482
} else {
1483
load_num_channels = 1; /* 10_11_11 or 2_10_10_10 */
1484
}
1485
}
1486
1487
int log_recombine = 0;
1488
if ((ctx->chip_class == GFX6 || ctx->chip_class >= GFX10) && !known_aligned) {
1489
/* Avoid alignment restrictions by loading one byte at a time. */
1490
load_num_channels <<= load_log_size;
1491
log_recombine = load_log_size;
1492
load_log_size = 0;
1493
} else if (load_num_channels == 2 || load_num_channels == 4) {
1494
log_recombine = -util_logbase2(load_num_channels);
1495
load_num_channels = 1;
1496
load_log_size += -log_recombine;
1497
}
1498
1499
LLVMValueRef loads[32]; /* up to 32 bytes */
1500
for (unsigned i = 0; i < load_num_channels; ++i) {
1501
tmp =
1502
LLVMBuildAdd(ctx->builder, soffset, LLVMConstInt(ctx->i32, i << load_log_size, false), "");
1503
LLVMTypeRef channel_type =
1504
load_log_size == 0 ? ctx->i8 : load_log_size == 1 ? ctx->i16 : ctx->i32;
1505
unsigned num_channels = 1 << (MAX2(load_log_size, 2) - 2);
1506
loads[i] =
1507
ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, tmp, num_channels, channel_type,
1508
cache_policy, can_speculate, false, true);
1509
if (load_log_size >= 2)
1510
loads[i] = ac_to_integer(ctx, loads[i]);
1511
}
1512
1513
if (log_recombine > 0) {
1514
/* Recombine bytes if necessary (GFX6 only) */
1515
LLVMTypeRef dst_type = log_recombine == 2 ? ctx->i32 : ctx->i16;
1516
1517
for (unsigned src = 0, dst = 0; src < load_num_channels; ++dst) {
1518
LLVMValueRef accum = NULL;
1519
for (unsigned i = 0; i < (1 << log_recombine); ++i, ++src) {
1520
tmp = LLVMBuildZExt(ctx->builder, loads[src], dst_type, "");
1521
if (i == 0) {
1522
accum = tmp;
1523
} else {
1524
tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(dst_type, 8 * i, false), "");
1525
accum = LLVMBuildOr(ctx->builder, accum, tmp, "");
1526
}
1527
}
1528
loads[dst] = accum;
1529
}
1530
} else if (log_recombine < 0) {
1531
/* Split vectors of dwords */
1532
if (load_log_size > 2) {
1533
assert(load_num_channels == 1);
1534
LLVMValueRef loaded = loads[0];
1535
unsigned log_split = load_log_size - 2;
1536
log_recombine += log_split;
1537
load_num_channels = 1 << log_split;
1538
load_log_size = 2;
1539
for (unsigned i = 0; i < load_num_channels; ++i) {
1540
tmp = LLVMConstInt(ctx->i32, i, false);
1541
loads[i] = LLVMBuildExtractElement(ctx->builder, loaded, tmp, "");
1542
}
1543
}
1544
1545
/* Further split dwords and shorts if required */
1546
if (log_recombine < 0) {
1547
for (unsigned src = load_num_channels, dst = load_num_channels << -log_recombine; src > 0;
1548
--src) {
1549
unsigned dst_bits = 1 << (3 + load_log_size + log_recombine);
1550
LLVMTypeRef dst_type = LLVMIntTypeInContext(ctx->context, dst_bits);
1551
LLVMValueRef loaded = loads[src - 1];
1552
LLVMTypeRef loaded_type = LLVMTypeOf(loaded);
1553
for (unsigned i = 1 << -log_recombine; i > 0; --i, --dst) {
1554
tmp = LLVMConstInt(loaded_type, dst_bits * (i - 1), false);
1555
tmp = LLVMBuildLShr(ctx->builder, loaded, tmp, "");
1556
loads[dst - 1] = LLVMBuildTrunc(ctx->builder, tmp, dst_type, "");
1557
}
1558
}
1559
}
1560
}
1561
1562
if (log_size == 3) {
1563
if (format == AC_FETCH_FORMAT_FLOAT) {
1564
for (unsigned i = 0; i < num_channels; ++i) {
1565
tmp = ac_build_gather_values(ctx, &loads[2 * i], 2);
1566
loads[i] = LLVMBuildBitCast(ctx->builder, tmp, ctx->f64, "");
1567
}
1568
} else if (format == AC_FETCH_FORMAT_FIXED) {
1569
/* 10_11_11_FLOAT */
1570
LLVMValueRef data = loads[0];
1571
LLVMValueRef i32_2047 = LLVMConstInt(ctx->i32, 2047, false);
1572
LLVMValueRef r = LLVMBuildAnd(ctx->builder, data, i32_2047, "");
1573
tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 11, false), "");
1574
LLVMValueRef g = LLVMBuildAnd(ctx->builder, tmp, i32_2047, "");
1575
LLVMValueRef b = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 22, false), "");
1576
1577
loads[0] = ac_to_integer(ctx, ac_ufN_to_float(ctx, r, 5, 6));
1578
loads[1] = ac_to_integer(ctx, ac_ufN_to_float(ctx, g, 5, 6));
1579
loads[2] = ac_to_integer(ctx, ac_ufN_to_float(ctx, b, 5, 5));
1580
1581
num_channels = 3;
1582
log_size = 2;
1583
format = AC_FETCH_FORMAT_FLOAT;
1584
} else {
1585
/* 2_10_10_10 data formats */
1586
LLVMValueRef data = loads[0];
1587
LLVMTypeRef i10 = LLVMIntTypeInContext(ctx->context, 10);
1588
LLVMTypeRef i2 = LLVMIntTypeInContext(ctx->context, 2);
1589
loads[0] = LLVMBuildTrunc(ctx->builder, data, i10, "");
1590
tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 10, false), "");
1591
loads[1] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
1592
tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 20, false), "");
1593
loads[2] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
1594
tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 30, false), "");
1595
loads[3] = LLVMBuildTrunc(ctx->builder, tmp, i2, "");
1596
1597
num_channels = 4;
1598
}
1599
}
1600
1601
if (format == AC_FETCH_FORMAT_FLOAT) {
1602
if (log_size != 2) {
1603
for (unsigned chan = 0; chan < num_channels; ++chan) {
1604
tmp = ac_to_float(ctx, loads[chan]);
1605
if (log_size == 3)
1606
tmp = LLVMBuildFPTrunc(ctx->builder, tmp, ctx->f32, "");
1607
else if (log_size == 1)
1608
tmp = LLVMBuildFPExt(ctx->builder, tmp, ctx->f32, "");
1609
loads[chan] = ac_to_integer(ctx, tmp);
1610
}
1611
}
1612
} else if (format == AC_FETCH_FORMAT_UINT) {
1613
if (log_size != 2) {
1614
for (unsigned chan = 0; chan < num_channels; ++chan)
1615
loads[chan] = LLVMBuildZExt(ctx->builder, loads[chan], ctx->i32, "");
1616
}
1617
} else if (format == AC_FETCH_FORMAT_SINT) {
1618
if (log_size != 2) {
1619
for (unsigned chan = 0; chan < num_channels; ++chan)
1620
loads[chan] = LLVMBuildSExt(ctx->builder, loads[chan], ctx->i32, "");
1621
}
1622
} else {
1623
bool unsign = format == AC_FETCH_FORMAT_UNORM || format == AC_FETCH_FORMAT_USCALED ||
1624
format == AC_FETCH_FORMAT_UINT;
1625
1626
for (unsigned chan = 0; chan < num_channels; ++chan) {
1627
if (unsign) {
1628
tmp = LLVMBuildUIToFP(ctx->builder, loads[chan], ctx->f32, "");
1629
} else {
1630
tmp = LLVMBuildSIToFP(ctx->builder, loads[chan], ctx->f32, "");
1631
}
1632
1633
LLVMValueRef scale = NULL;
1634
if (format == AC_FETCH_FORMAT_FIXED) {
1635
assert(log_size == 2);
1636
scale = LLVMConstReal(ctx->f32, 1.0 / 0x10000);
1637
} else if (format == AC_FETCH_FORMAT_UNORM) {
1638
unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
1639
scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << bits) - 1));
1640
} else if (format == AC_FETCH_FORMAT_SNORM) {
1641
unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
1642
scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << (bits - 1)) - 1));
1643
}
1644
if (scale)
1645
tmp = LLVMBuildFMul(ctx->builder, tmp, scale, "");
1646
1647
if (format == AC_FETCH_FORMAT_SNORM) {
1648
/* Clamp to [-1, 1] */
1649
LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
1650
LLVMValueRef clamp = LLVMBuildFCmp(ctx->builder, LLVMRealULT, tmp, neg_one, "");
1651
tmp = LLVMBuildSelect(ctx->builder, clamp, neg_one, tmp, "");
1652
}
1653
1654
loads[chan] = ac_to_integer(ctx, tmp);
1655
}
1656
}
1657
1658
while (num_channels < 4) {
1659
if (format == AC_FETCH_FORMAT_UINT || format == AC_FETCH_FORMAT_SINT) {
1660
loads[num_channels] = num_channels == 3 ? ctx->i32_1 : ctx->i32_0;
1661
} else {
1662
loads[num_channels] = ac_to_integer(ctx, num_channels == 3 ? ctx->f32_1 : ctx->f32_0);
1663
}
1664
num_channels++;
1665
}
1666
1667
if (reverse) {
1668
tmp = loads[0];
1669
loads[0] = loads[2];
1670
loads[2] = tmp;
1671
}
1672
1673
return ac_build_gather_values(ctx, loads, 4);
1674
}
1675
1676
/**
 * Emit an llvm.amdgcn.{raw,struct}.tbuffer.store.<type> call.
 *
 * \param vdata         value to store
 * \param vindex        only passed for structurized stores; NULL means 0
 * \param voffset       NULL means 0
 * \param soffset       NULL means 0
 * \param immoffset     added to voffset
 * \param num_channels  selects the intrinsic type suffix; 3 is widened to 4
 *                      when ac_has_vec3_support() reports no vec3 support
 * \param dfmt, nfmt    combined via ac_get_tbuffer_format()
 * \param structurized  select "struct" (with vindex) instead of "raw"
 */
static void ac_build_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                   LLVMValueRef vdata, LLVMValueRef vindex, LLVMValueRef voffset,
                                   LLVMValueRef soffset, LLVMValueRef immoffset,
                                   unsigned num_channels, unsigned dfmt, unsigned nfmt,
                                   unsigned cache_policy, bool structurized)
{
   /* Fold the immediate offset into voffset; the sum is never NULL. */
   LLVMValueRef total_voffset =
      LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0, immoffset, "");

   LLVMValueRef args[7];
   unsigned count = 0;

   args[count++] = vdata;
   args[count++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[count++] = vindex ? vindex : ctx->i32_0;
   args[count++] = total_voffset;
   args[count++] = soffset ? soffset : ctx->i32_0;
   args[count++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0);
   args[count++] = LLVMConstInt(ctx->i32, cache_policy, 0);

   /* Without vec3 support, a 3-channel store uses the 4-channel intrinsic. */
   unsigned vec_width = num_channels;
   if (!ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3)
      vec_width = 4;

   LLVMTypeRef data_type = ctx->i32;
   if (vec_width > 1)
      data_type = LLVMVectorType(ctx->i32, vec_width);

   char type_name[8];
   ac_build_type_name_for_intr(data_type, type_name, sizeof(type_name));

   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256];
   snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.store.%s", indexing_kind, type_name);

   ac_build_intrinsic(ctx, name, ctx->voidt, args, count, AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
}
1706
1707
/* Structured ("struct") flavour of the typed buffer store: forwards every
 * argument, including vindex, to ac_build_tbuffer_store().
 */
void ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                   LLVMValueRef vdata, LLVMValueRef vindex, LLVMValueRef voffset,
                                   LLVMValueRef soffset, LLVMValueRef immoffset,
                                   unsigned num_channels, unsigned dfmt, unsigned nfmt,
                                   unsigned cache_policy)
{
   const bool structurized = true;

   ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset, immoffset, num_channels,
                          dfmt, nfmt, cache_policy, structurized);
}
1716
1717
/* Raw flavour of the typed buffer store: there is no vertex index, so NULL is
 * forwarded for vindex and the "raw" intrinsic is selected.
 */
void ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                LLVMValueRef voffset, LLVMValueRef soffset, LLVMValueRef immoffset,
                                unsigned num_channels, unsigned dfmt, unsigned nfmt,
                                unsigned cache_policy)
{
   const bool structurized = false;

   ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset, immoffset, num_channels,
                          dfmt, nfmt, cache_policy, structurized);
}
1725
1726
/* Store a 16-bit value: vdata is bitcast to i16 and handed to
 * ac_build_buffer_store_common() with a NULL vindex.
 */
void ac_build_tbuffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                  LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset,
                                  unsigned cache_policy)
{
   LLVMValueRef data16 = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");

   ac_build_buffer_store_common(ctx, rsrc, data16, NULL, voffset, soffset, cache_policy, false,
                                false);
}
1735
1736
/* Store an 8-bit value: vdata is bitcast to i8 and handed to
 * ac_build_buffer_store_common() with a NULL vindex.
 */
void ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                 LLVMValueRef voffset, LLVMValueRef soffset, unsigned cache_policy)
{
   LLVMValueRef data8 = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");

   ac_build_buffer_store_common(ctx, rsrc, data8, NULL, voffset, soffset, cache_policy, false,
                                false);
}
1744
1745
/**
1746
* Set range metadata on an instruction. This can only be used on load and
1747
* call instructions. If you know an instruction can only produce the values
1748
* 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1749
* \p lo is the minimum value inclusive.
1750
* \p hi is the maximum value exclusive.
1751
*/
1752
void ac_set_range_metadata(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned lo,
1753
unsigned hi)
1754
{
1755
LLVMValueRef range_md, md_args[2];
1756
LLVMTypeRef type = LLVMTypeOf(value);
1757
LLVMContextRef context = LLVMGetTypeContext(type);
1758
1759
md_args[0] = LLVMConstInt(type, lo, false);
1760
md_args[1] = LLVMConstInt(type, hi, false);
1761
range_md = LLVMMDNodeInContext(context, md_args, 2);
1762
LLVMSetMetadata(value, ctx->range_md_kind, range_md);
1763
}
1764
1765
LLVMValueRef ac_get_thread_id(struct ac_llvm_context *ctx)
1766
{
1767
return ac_build_mbcnt(ctx, LLVMConstInt(ctx->iN_wavemask, ~0ull, 0));
1768
}
1769
1770
/*
1771
* AMD GCN implements derivatives using the local data store (LDS)
1772
* All writes to the LDS happen in all executing threads at
1773
* the same time. TID is the Thread ID for the current
1774
* thread and is a value between 0 and 63, representing
1775
* the thread's position in the wavefront.
1776
*
1777
* For the pixel shader threads are grouped into quads of four pixels.
1778
* The TIDs of the pixels of a quad are:
1779
*
1780
* +------+------+
1781
* |4n + 0|4n + 1|
1782
* +------+------+
1783
* |4n + 2|4n + 3|
1784
* +------+------+
1785
*
1786
* So, masking the TID with 0xfffffffc yields the TID of the top left pixel
1787
* of the quad, masking with 0xfffffffd yields the TID of the top pixel of
1788
* the current pixel's column, and masking with 0xfffffffe yields the TID
1789
* of the left pixel of the current pixel's row.
1790
*
1791
* Adding 1 yields the TID of the pixel to the right of the left pixel, and
1792
* adding 2 yields the TID of the pixel below the top pixel.
1793
*/
1794
LLVMValueRef ac_build_ddxy(struct ac_llvm_context *ctx, uint32_t mask, int idx, LLVMValueRef val)
1795
{
1796
unsigned tl_lanes[4], trbl_lanes[4];
1797
char name[32], type[8];
1798
LLVMValueRef tl, trbl;
1799
LLVMTypeRef result_type;
1800
LLVMValueRef result;
1801
1802
result_type = ac_to_float_type(ctx, LLVMTypeOf(val));
1803
1804
if (result_type == ctx->f16)
1805
val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
1806
else if (result_type == ctx->v2f16)
1807
val = LLVMBuildBitCast(ctx->builder, val, ctx->i32, "");
1808
1809
for (unsigned i = 0; i < 4; ++i) {
1810
tl_lanes[i] = i & mask;
1811
trbl_lanes[i] = (i & mask) + idx;
1812
}
1813
1814
tl = ac_build_quad_swizzle(ctx, val, tl_lanes[0], tl_lanes[1], tl_lanes[2], tl_lanes[3]);
1815
trbl =
1816
ac_build_quad_swizzle(ctx, val, trbl_lanes[0], trbl_lanes[1], trbl_lanes[2], trbl_lanes[3]);
1817
1818
if (result_type == ctx->f16) {
1819
tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
1820
trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
1821
}
1822
1823
tl = LLVMBuildBitCast(ctx->builder, tl, result_type, "");
1824
trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, "");
1825
result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
1826
1827
ac_build_type_name_for_intr(result_type, type, sizeof(type));
1828
snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type);
1829
1830
return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0);
1831
}
1832
1833
/* Emit llvm.amdgcn.s.sendmsg with the constant message id "msg" and "wave_id". */
void ac_build_sendmsg(struct ac_llvm_context *ctx, uint32_t msg, LLVMValueRef wave_id)
{
   LLVMValueRef operands[2] = {
      LLVMConstInt(ctx->i32, msg, false),
      wave_id,
   };

   ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, operands, 2, 0);
}
1840
1841
/* Signed "find most significant bit" built on llvm.amdgcn.sffbh.i32.
 *
 * Returns -1 when arg is 0 or -1, otherwise 31 minus the intrinsic's result.
 * dst_type is used as the return type of the intrinsic call.
 */
LLVMValueRef ac_build_imsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
{
   LLVMValueRef from_msb =
      ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", dst_type, &arg, 1, AC_FUNC_ATTR_READNONE);

   /* The HW counts from the MSB, but NIR/TGSI wants the index counted from
    * the LSB, so flip it with "31 - msb". */
   LLVMValueRef thirty_one = LLVMConstInt(ctx->i32, 31, false);
   LLVMValueRef from_lsb = LLVMBuildSub(ctx->builder, thirty_one, from_msb, "");

   LLVMValueRef minus_one = LLVMConstInt(ctx->i32, -1, true);
   LLVMValueRef is_zero = LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, ctx->i32_0, "");
   LLVMValueRef is_minus_one = LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, minus_one, "");
   LLVMValueRef no_msb = LLVMBuildOr(ctx->builder, is_zero, is_minus_one, "");

   return LLVMBuildSelect(ctx->builder, no_msb, minus_one, from_lsb, "");
}
1857
1858
/* Unsigned "find most significant bit" built on llvm.ctlz.iN.
 *
 * The element width of arg (8, 16, 32 or 64 bits) selects the ctlz overload.
 * The result is always an i32: -1 when arg is zero, otherwise
 * (bitsize - 1) - ctlz(arg).
 *
 * NOTE: dst_type is accepted for signature compatibility but is not used.
 */
LLVMValueRef ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
{
   const char *intrin_name;
   LLVMTypeRef type;
   LLVMValueRef highest_bit;
   LLVMValueRef zero;
   unsigned bitsize;

   bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
   switch (bitsize) {
   case 64:
      intrin_name = "llvm.ctlz.i64";
      type = ctx->i64;
      highest_bit = LLVMConstInt(ctx->i64, 63, false);
      zero = ctx->i64_0;
      break;
   case 32:
      intrin_name = "llvm.ctlz.i32";
      type = ctx->i32;
      highest_bit = LLVMConstInt(ctx->i32, 31, false);
      zero = ctx->i32_0;
      break;
   case 16:
      intrin_name = "llvm.ctlz.i16";
      type = ctx->i16;
      highest_bit = LLVMConstInt(ctx->i16, 15, false);
      zero = ctx->i16_0;
      break;
   case 8:
      intrin_name = "llvm.ctlz.i8";
      type = ctx->i8;
      highest_bit = LLVMConstInt(ctx->i8, 7, false);
      zero = ctx->i8_0;
      break;
   default:
      /* Pass the message itself, as every other unreachable() call in this
       * file does, rather than a negated string literal. */
      unreachable("invalid bitsize");
   }

   /* The second operand of llvm.ctlz is passed as true; the zero-input case
    * is handled by the final select below. */
   LLVMValueRef params[2] = {
      arg,
      ctx->i1true,
   };

   LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, AC_FUNC_ATTR_READNONE);

   /* ctlz counts from the MSB, but TGSI/NIR wants the index counted from the
    * LSB. Invert it by doing "(bitsize - 1) - msb". */
   msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");

   /* Normalize the result to i32 regardless of the input width. */
   if (bitsize == 64) {
      msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, "");
   } else if (bitsize < 32) {
      msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, "");
   }

   /* check for zero */
   return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""),
                          LLVMConstInt(ctx->i32, -1, true), msb, "");
}
1918
1919
/* Emit llvm.minnum.<type>(a, b); the overload and result type come from a. */
LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMTypeRef operand_type = LLVMTypeOf(a);
   LLVMValueRef operands[2] = {a, b};
   char type_name[64];
   char intr_name[64];

   ac_build_type_name_for_intr(operand_type, type_name, sizeof(type_name));
   snprintf(intr_name, sizeof(intr_name), "llvm.minnum.%s", type_name);

   return ac_build_intrinsic(ctx, intr_name, operand_type, operands, 2, AC_FUNC_ATTR_READNONE);
}
1928
1929
/* Emit llvm.maxnum.<type>(a, b); the overload and result type come from a. */
LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMTypeRef operand_type = LLVMTypeOf(a);
   LLVMValueRef operands[2] = {a, b};
   char type_name[64];
   char intr_name[64];

   ac_build_type_name_for_intr(operand_type, type_name, sizeof(type_name));
   snprintf(intr_name, sizeof(intr_name), "llvm.maxnum.%s", type_name);

   return ac_build_intrinsic(ctx, intr_name, operand_type, operands, 2, AC_FUNC_ATTR_READNONE);
}
1938
1939
/* Signed integer minimum: select a when a <= b (signed), otherwise b. */
LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef pick_a = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, "");

   return LLVMBuildSelect(ctx->builder, pick_a, a, b, "");
}
1944
1945
/* Signed integer maximum: select a when a > b (signed), otherwise b. */
LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef pick_a = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");

   return LLVMBuildSelect(ctx->builder, pick_a, a, b, "");
}
1950
1951
/* Unsigned integer minimum: select a when a <= b (unsigned), otherwise b. */
LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef pick_a = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");

   return LLVMBuildSelect(ctx->builder, pick_a, a, b, "");
}
1956
1957
/* Unsigned integer maximum: select a when a >= b (unsigned), otherwise b. */
LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef pick_a = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, "");

   return LLVMBuildSelect(ctx->builder, pick_a, a, b, "");
}
1962
1963
/* Clamp a float value to [0, 1] as fmin(fmax(value, 0.0), 1.0). */
LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMTypeRef value_type = LLVMTypeOf(value);
   LLVMValueRef lower_bounded = ac_build_fmax(ctx, value, LLVMConstReal(value_type, 0.0));

   return ac_build_fmin(ctx, lower_bounded, LLVMConstReal(value_type, 1.0));
}
1969
1970
void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
1971
{
1972
LLVMValueRef args[9];
1973
1974
args[0] = LLVMConstInt(ctx->i32, a->target, 0);
1975
args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
1976
1977
if (a->compr) {
1978
args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->v2i16, "");
1979
args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->v2i16, "");
1980
args[4] = LLVMConstInt(ctx->i1, a->done, 0);
1981
args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
1982
1983
ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16", ctx->voidt, args, 6, 0);
1984
} else {
1985
args[2] = a->out[0];
1986
args[3] = a->out[1];
1987
args[4] = a->out[2];
1988
args[5] = a->out[3];
1989
args[6] = LLVMConstInt(ctx->i1, a->done, 0);
1990
args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
1991
1992
ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32", ctx->voidt, args, 8, 0);
1993
}
1994
}
1995
1996
void ac_build_export_null(struct ac_llvm_context *ctx)
1997
{
1998
struct ac_export_args args;
1999
2000
args.enabled_channels = 0x0; /* enabled channels */
2001
args.valid_mask = 1; /* whether the EXEC mask is valid */
2002
args.done = 1; /* DONE bit */
2003
args.target = V_008DFC_SQ_EXP_NULL;
2004
args.compr = 0; /* COMPR flag (0 = 32-bit export) */
2005
args.out[0] = LLVMGetUndef(ctx->f32); /* R */
2006
args.out[1] = LLVMGetUndef(ctx->f32); /* G */
2007
args.out[2] = LLVMGetUndef(ctx->f32); /* B */
2008
args.out[3] = LLVMGetUndef(ctx->f32); /* A */
2009
2010
ac_build_export(ctx, &args);
2011
}
2012
2013
static unsigned ac_num_coords(enum ac_image_dim dim)
2014
{
2015
switch (dim) {
2016
case ac_image_1d:
2017
return 1;
2018
case ac_image_2d:
2019
case ac_image_1darray:
2020
return 2;
2021
case ac_image_3d:
2022
case ac_image_cube:
2023
case ac_image_2darray:
2024
case ac_image_2dmsaa:
2025
return 3;
2026
case ac_image_2darraymsaa:
2027
return 4;
2028
default:
2029
unreachable("ac_num_coords: bad dim");
2030
}
2031
}
2032
2033
static unsigned ac_num_derivs(enum ac_image_dim dim)
2034
{
2035
switch (dim) {
2036
case ac_image_1d:
2037
case ac_image_1darray:
2038
return 2;
2039
case ac_image_2d:
2040
case ac_image_2darray:
2041
case ac_image_cube:
2042
return 4;
2043
case ac_image_3d:
2044
return 6;
2045
case ac_image_2dmsaa:
2046
case ac_image_2darraymsaa:
2047
default:
2048
unreachable("derivatives not supported");
2049
}
2050
}
2051
2052
static const char *get_atomic_name(enum ac_atomic_op op)
2053
{
2054
switch (op) {
2055
case ac_atomic_swap:
2056
return "swap";
2057
case ac_atomic_add:
2058
return "add";
2059
case ac_atomic_sub:
2060
return "sub";
2061
case ac_atomic_smin:
2062
return "smin";
2063
case ac_atomic_umin:
2064
return "umin";
2065
case ac_atomic_smax:
2066
return "smax";
2067
case ac_atomic_umax:
2068
return "umax";
2069
case ac_atomic_and:
2070
return "and";
2071
case ac_atomic_or:
2072
return "or";
2073
case ac_atomic_xor:
2074
return "xor";
2075
case ac_atomic_inc_wrap:
2076
return "inc";
2077
case ac_atomic_dec_wrap:
2078
return "dec";
2079
}
2080
unreachable("bad atomic op");
2081
}
2082
2083
LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_args *a)
2084
{
2085
const char *overload[3] = {"", "", ""};
2086
unsigned num_overloads = 0;
2087
LLVMValueRef args[18];
2088
unsigned num_args = 0;
2089
enum ac_image_dim dim = a->dim;
2090
2091
assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 || !a->level_zero);
2092
assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
2093
a->opcode != ac_image_store_mip) ||
2094
a->lod);
2095
assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2096
(!a->compare && !a->offset));
2097
assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2098
a->opcode == ac_image_get_lod) ||
2099
!a->bias);
2100
assert((a->bias ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) + (a->derivs[0] ? 1 : 0) <=
2101
1);
2102
assert((a->min_lod ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) <= 1);
2103
assert(!a->d16 || (ctx->chip_class >= GFX8 && a->opcode != ac_image_atomic &&
2104
a->opcode != ac_image_atomic_cmpswap && a->opcode != ac_image_get_lod &&
2105
a->opcode != ac_image_get_resinfo));
2106
assert(!a->a16 || ctx->chip_class >= GFX9);
2107
assert(a->g16 == a->a16 || ctx->chip_class >= GFX10);
2108
2109
assert(!a->offset ||
2110
ac_get_elem_bits(ctx, LLVMTypeOf(a->offset)) == 32);
2111
assert(!a->bias ||
2112
ac_get_elem_bits(ctx, LLVMTypeOf(a->bias)) == 32);
2113
assert(!a->compare ||
2114
ac_get_elem_bits(ctx, LLVMTypeOf(a->compare)) == 32);
2115
assert(!a->derivs[0] ||
2116
((!a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 16) &&
2117
(a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 32)));
2118
assert(!a->coords[0] ||
2119
((!a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 16) &&
2120
(a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 32)));
2121
assert(!a->lod ||
2122
((a->opcode != ac_image_get_resinfo || ac_get_elem_bits(ctx, LLVMTypeOf(a->lod))) &&
2123
(a->opcode == ac_image_get_resinfo ||
2124
ac_get_elem_bits(ctx, LLVMTypeOf(a->lod)) ==
2125
ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])))));
2126
assert(!a->min_lod ||
2127
ac_get_elem_bits(ctx, LLVMTypeOf(a->min_lod)) ==
2128
ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])));
2129
2130
if (a->opcode == ac_image_get_lod) {
2131
switch (dim) {
2132
case ac_image_1darray:
2133
dim = ac_image_1d;
2134
break;
2135
case ac_image_2darray:
2136
case ac_image_cube:
2137
dim = ac_image_2d;
2138
break;
2139
default:
2140
break;
2141
}
2142
}
2143
2144
bool sample = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2145
a->opcode == ac_image_get_lod;
2146
bool atomic = a->opcode == ac_image_atomic || a->opcode == ac_image_atomic_cmpswap;
2147
bool load = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2148
a->opcode == ac_image_load || a->opcode == ac_image_load_mip;
2149
LLVMTypeRef coord_type = sample ? (a->a16 ? ctx->f16 : ctx->f32) : (a->a16 ? ctx->i16 : ctx->i32);
2150
uint8_t dmask = a->dmask;
2151
LLVMTypeRef data_type;
2152
char data_type_str[32];
2153
2154
if (atomic) {
2155
data_type = LLVMTypeOf(a->data[0]);
2156
} else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
2157
/* Image stores might have been shrinked using the format. */
2158
data_type = LLVMTypeOf(a->data[0]);
2159
dmask = (1 << ac_get_llvm_num_components(a->data[0])) - 1;
2160
} else {
2161
data_type = a->d16 ? ctx->v4f16 : ctx->v4f32;
2162
}
2163
2164
if (a->tfe) {
2165
data_type = LLVMStructTypeInContext(
2166
ctx->context, (LLVMTypeRef[]){data_type, ctx->i32}, 2, false);
2167
}
2168
2169
if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
2170
args[num_args++] = a->data[0];
2171
if (a->opcode == ac_image_atomic_cmpswap)
2172
args[num_args++] = a->data[1];
2173
}
2174
2175
if (!atomic)
2176
args[num_args++] = LLVMConstInt(ctx->i32, dmask, false);
2177
2178
if (a->offset)
2179
args[num_args++] = ac_to_integer(ctx, a->offset);
2180
if (a->bias) {
2181
args[num_args++] = ac_to_float(ctx, a->bias);
2182
overload[num_overloads++] = ".f32";
2183
}
2184
if (a->compare)
2185
args[num_args++] = ac_to_float(ctx, a->compare);
2186
if (a->derivs[0]) {
2187
unsigned count = ac_num_derivs(dim);
2188
for (unsigned i = 0; i < count; ++i)
2189
args[num_args++] = ac_to_float(ctx, a->derivs[i]);
2190
overload[num_overloads++] = a->g16 ? ".f16" : ".f32";
2191
}
2192
unsigned num_coords = a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
2193
for (unsigned i = 0; i < num_coords; ++i)
2194
args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
2195
if (a->lod)
2196
args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
2197
if (a->min_lod)
2198
args[num_args++] = LLVMBuildBitCast(ctx->builder, a->min_lod, coord_type, "");
2199
2200
overload[num_overloads++] = sample ? (a->a16 ? ".f16" : ".f32") : (a->a16 ? ".i16" : ".i32");
2201
2202
args[num_args++] = a->resource;
2203
if (sample) {
2204
args[num_args++] = a->sampler;
2205
args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
2206
}
2207
2208
args[num_args++] = a->tfe ? ctx->i32_1 : ctx->i32_0; /* texfailctrl */
2209
args[num_args++] = LLVMConstInt(
2210
ctx->i32, load ? get_load_cache_policy(ctx, a->cache_policy) : a->cache_policy, false);
2211
2212
const char *name;
2213
const char *atomic_subop = "";
2214
switch (a->opcode) {
2215
case ac_image_sample:
2216
name = "sample";
2217
break;
2218
case ac_image_gather4:
2219
name = "gather4";
2220
break;
2221
case ac_image_load:
2222
name = "load";
2223
break;
2224
case ac_image_load_mip:
2225
name = "load.mip";
2226
break;
2227
case ac_image_store:
2228
name = "store";
2229
break;
2230
case ac_image_store_mip:
2231
name = "store.mip";
2232
break;
2233
case ac_image_atomic:
2234
name = "atomic.";
2235
atomic_subop = get_atomic_name(a->atomic);
2236
break;
2237
case ac_image_atomic_cmpswap:
2238
name = "atomic.";
2239
atomic_subop = "cmpswap";
2240
break;
2241
case ac_image_get_lod:
2242
name = "getlod";
2243
break;
2244
case ac_image_get_resinfo:
2245
name = "getresinfo";
2246
break;
2247
default:
2248
unreachable("invalid image opcode");
2249
}
2250
2251
const char *dimname;
2252
switch (dim) {
2253
case ac_image_1d:
2254
dimname = "1d";
2255
break;
2256
case ac_image_2d:
2257
dimname = "2d";
2258
break;
2259
case ac_image_3d:
2260
dimname = "3d";
2261
break;
2262
case ac_image_cube:
2263
dimname = "cube";
2264
break;
2265
case ac_image_1darray:
2266
dimname = "1darray";
2267
break;
2268
case ac_image_2darray:
2269
dimname = "2darray";
2270
break;
2271
case ac_image_2dmsaa:
2272
dimname = "2dmsaa";
2273
break;
2274
case ac_image_2darraymsaa:
2275
dimname = "2darraymsaa";
2276
break;
2277
default:
2278
unreachable("invalid dim");
2279
}
2280
2281
ac_build_type_name_for_intr(data_type, data_type_str, sizeof(data_type_str));
2282
2283
bool lod_suffix = a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
2284
char intr_name[96];
2285
snprintf(intr_name, sizeof(intr_name),
2286
"llvm.amdgcn.image.%s%s" /* base name */
2287
"%s%s%s%s" /* sample/gather modifiers */
2288
".%s.%s%s%s%s", /* dimension and type overloads */
2289
name, atomic_subop, a->compare ? ".c" : "",
2290
a->bias ? ".b" : lod_suffix ? ".l" : a->derivs[0] ? ".d" : a->level_zero ? ".lz" : "",
2291
a->min_lod ? ".cl" : "", a->offset ? ".o" : "", dimname,
2292
data_type_str, overload[0], overload[1], overload[2]);
2293
2294
LLVMTypeRef retty;
2295
if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
2296
retty = ctx->voidt;
2297
else
2298
retty = data_type;
2299
2300
LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, retty, args, num_args, a->attributes);
2301
if (a->tfe) {
2302
LLVMValueRef texel = LLVMBuildExtractValue(ctx->builder, result, 0, "");
2303
LLVMValueRef code = LLVMBuildExtractValue(ctx->builder, result, 1, "");
2304
result = ac_build_concat(ctx, texel, ac_to_float(ctx, code));
2305
}
2306
2307
if (!sample && !atomic && retty != ctx->voidt)
2308
result = ac_to_integer(ctx, result);
2309
2310
return result;
2311
}
2312
2313
/* Read the sample count directly from the descriptor, since the hardware has
 * no instruction for it: take bits [19:16] of element 3 of rsrc and return
 * 1 shifted left by that field.
 */
LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, LLVMValueRef rsrc)
{
   LLVMBuilderRef builder = ctx->builder;

   LLVMValueRef dword3 = LLVMBuildExtractElement(builder, rsrc, LLVMConstInt(ctx->i32, 3, 0), "");
   LLVMValueRef shifted = LLVMBuildLShr(builder, dword3, LLVMConstInt(ctx->i32, 16, 0), "");
   LLVMValueRef log2_samples = LLVMBuildAnd(builder, shifted, LLVMConstInt(ctx->i32, 0xf, 0), "");

   return LLVMBuildShl(builder, ctx->i32_1, log2_samples, "");
}
2326
2327
/* Emit llvm.amdgcn.cvt.pkrtz on the two operands; the result is a v2f16. */
LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   static const char intr_name[] = "llvm.amdgcn.cvt.pkrtz";

   return ac_build_intrinsic(ctx, intr_name, ctx->v2f16, args, 2, AC_FUNC_ATTR_READNONE);
}
2332
2333
/* Emit llvm.amdgcn.cvt.pknorm.i16 and return its v2i16 result bitcast to i32. */
LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef packed = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16", ctx->v2i16, args,
                                            2, AC_FUNC_ATTR_READNONE);

   return LLVMBuildBitCast(ctx->builder, packed, ctx->i32, "");
}
2339
2340
/* Emit llvm.amdgcn.cvt.pknorm.u16 and return its v2i16 result bitcast to i32. */
LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef packed = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16", ctx->v2i16, args,
                                            2, AC_FUNC_ATTR_READNONE);

   return LLVMBuildBitCast(ctx->builder, packed, ctx->i32, "");
}
2346
2347
/* Emit v_cvt_pknorm_i16_f16 as inline assembly taking two f16 operands in
 * vector registers and returning an i32.
 */
LLVMValueRef ac_build_cvt_pknorm_i16_f16(struct ac_llvm_context *ctx,
                                         LLVMValueRef args[2])
{
   LLVMTypeRef operand_types[2] = {ctx->f16, ctx->f16};
   LLVMTypeRef asm_type = LLVMFunctionType(ctx->i32, operand_types, 2, false);
   LLVMValueRef asm_code =
      LLVMConstInlineAsm(asm_type, "v_cvt_pknorm_i16_f16 $0, $1, $2", "=v,v,v", false, false);

   return LLVMBuildCall(ctx->builder, asm_code, args, 2, "");
}
2357
2358
/* Emit v_cvt_pknorm_u16_f16 as inline assembly taking two f16 operands in
 * vector registers and returning an i32.
 */
LLVMValueRef ac_build_cvt_pknorm_u16_f16(struct ac_llvm_context *ctx,
                                         LLVMValueRef args[2])
{
   LLVMTypeRef operand_types[2] = {ctx->f16, ctx->f16};
   LLVMTypeRef asm_type = LLVMFunctionType(ctx->i32, operand_types, 2, false);
   LLVMValueRef asm_code =
      LLVMConstInlineAsm(asm_type, "v_cvt_pknorm_u16_f16 $0, $1, $2", "=v,v,v", false, false);

   return LLVMBuildCall(ctx->builder, asm_code, args, 2, "");
}
2368
2369
/* The 8-bit and 10-bit clamping is for HW workarounds. */
2370
LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
2371
bool hi)
2372
{
2373
assert(bits == 8 || bits == 10 || bits == 16);
2374
2375
LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
2376
LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
2377
LLVMValueRef max_alpha = bits != 10 ? max_rgb : ctx->i32_1;
2378
LLVMValueRef min_alpha = bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2379
2380
/* Clamp. */
2381
if (bits != 16) {
2382
for (int i = 0; i < 2; i++) {
2383
bool alpha = hi && i == 1;
2384
args[i] = ac_build_imin(ctx, args[i], alpha ? max_alpha : max_rgb);
2385
args[i] = ac_build_imax(ctx, args[i], alpha ? min_alpha : min_rgb);
2386
}
2387
}
2388
2389
LLVMValueRef res =
2390
ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE);
2391
return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2392
}
2393
2394
/* The 8-bit and 10-bit clamping is for HW workarounds. */
2395
LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
2396
bool hi)
2397
{
2398
assert(bits == 8 || bits == 10 || bits == 16);
2399
2400
LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
2401
LLVMValueRef max_alpha = bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2402
2403
/* Clamp. */
2404
if (bits != 16) {
2405
for (int i = 0; i < 2; i++) {
2406
bool alpha = hi && i == 1;
2407
args[i] = ac_build_umin(ctx, args[i], alpha ? max_alpha : max_rgb);
2408
}
2409
}
2410
2411
LLVMValueRef res =
2412
ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE);
2413
return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2414
}
2415
2416
/* Emit llvm.amdgcn.wqm.vote on an i1 value and return its i1 result. */
LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
   LLVMValueRef operand[1] = {i1};

   return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, operand, 1,
                             AC_FUNC_ATTR_READNONE);
}
2420
2421
/* Emit llvm.amdgcn.kill with the given i1 condition. */
void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
   LLVMValueRef operand[1] = {i1};

   ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, operand, 1, 0);
}
2425
2426
/* 32-bit bitfield extract: emits llvm.amdgcn.sbfe.i32 when is_signed is set,
 * llvm.amdgcn.ubfe.i32 otherwise, with operands (input, offset, width).
 */
LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, LLVMValueRef offset,
                          LLVMValueRef width, bool is_signed)
{
   const char *intr_name;
   LLVMValueRef operands[3];

   if (is_signed)
      intr_name = "llvm.amdgcn.sbfe.i32";
   else
      intr_name = "llvm.amdgcn.ubfe.i32";

   operands[0] = input;
   operands[1] = offset;
   operands[2] = width;

   return ac_build_intrinsic(ctx, intr_name, ctx->i32, operands, 3, AC_FUNC_ATTR_READNONE);
}
2438
2439
/* Integer multiply-add: returns s0 * s1 + s2. */
LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                           LLVMValueRef s2)
{
   LLVMValueRef product = LLVMBuildMul(ctx->builder, s0, s1, "");

   return LLVMBuildAdd(ctx->builder, product, s2, "");
}
2444
2445
/* Float multiply-add: returns s0 * s1 + s2.
 *
 * On GFX10 and newer this emits llvm.fma.f32, because that hardware has FMA
 * units instead of MUL-ADD units; older chips get a separate fmul and fadd.
 */
LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                           LLVMValueRef s2)
{
   if (ctx->chip_class < GFX10) {
      LLVMValueRef product = LLVMBuildFMul(ctx->builder, s0, s1, "");

      return LLVMBuildFAdd(ctx->builder, product, s2, "");
   }

   LLVMValueRef operands[3] = {s0, s1, s2};

   return ac_build_intrinsic(ctx, "llvm.fma.f32", ctx->f32, operands, 3, AC_FUNC_ATTR_READNONE);
}
2456
2457
void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
2458
{
2459
if (!wait_flags)
2460
return;
2461
2462
unsigned lgkmcnt = 63;
2463
unsigned vmcnt = ctx->chip_class >= GFX9 ? 63 : 15;
2464
unsigned vscnt = 63;
2465
2466
if (wait_flags & AC_WAIT_LGKM)
2467
lgkmcnt = 0;
2468
if (wait_flags & AC_WAIT_VLOAD)
2469
vmcnt = 0;
2470
2471
if (wait_flags & AC_WAIT_VSTORE) {
2472
if (ctx->chip_class >= GFX10)
2473
vscnt = 0;
2474
else
2475
vmcnt = 0;
2476
}
2477
2478
/* There is no intrinsic for vscnt(0), so use a fence. */
2479
if ((wait_flags & AC_WAIT_LGKM && wait_flags & AC_WAIT_VLOAD && wait_flags & AC_WAIT_VSTORE) ||
2480
vscnt == 0) {
2481
LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
2482
return;
2483
}
2484
2485
unsigned simm16 = (lgkmcnt << 8) | (7 << 4) | /* expcnt */
2486
(vmcnt & 0xf) | ((vmcnt >> 4) << 14);
2487
2488
LLVMValueRef args[1] = {
2489
LLVMConstInt(ctx->i32, simm16, false),
2490
};
2491
ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0);
2492
}
2493
2494
LLVMValueRef ac_build_fsat(struct ac_llvm_context *ctx, LLVMValueRef src,
2495
LLVMTypeRef type)
2496
{
2497
unsigned bitsize = ac_get_elem_bits(ctx, type);
2498
LLVMValueRef zero = LLVMConstReal(type, 0.0);
2499
LLVMValueRef one = LLVMConstReal(type, 1.0);
2500
LLVMValueRef result;
2501
2502
if (bitsize == 64 || (bitsize == 16 && ctx->chip_class <= GFX8)) {
2503
/* Use fmin/fmax for 64-bit fsat or 16-bit on GFX6-GFX8 because LLVM
2504
* doesn't expose an intrinsic.
2505
*/
2506
result = ac_build_fmin(ctx, ac_build_fmax(ctx, src, zero), one);
2507
} else {
2508
LLVMTypeRef type;
2509
char *intr;
2510
2511
if (bitsize == 16) {
2512
intr = "llvm.amdgcn.fmed3.f16";
2513
type = ctx->f16;
2514
} else {
2515
assert(bitsize == 32);
2516
intr = "llvm.amdgcn.fmed3.f32";
2517
type = ctx->f32;
2518
}
2519
2520
LLVMValueRef params[] = {
2521
zero,
2522
one,
2523
src,
2524
};
2525
2526
result = ac_build_intrinsic(ctx, intr, type, params, 3,
2527
AC_FUNC_ATTR_READNONE);
2528
}
2529
2530
if (ctx->chip_class < GFX9 && bitsize == 32) {
2531
/* Only pre-GFX9 chips do not flush denorms. */
2532
result = ac_build_canonicalize(ctx, result, bitsize);
2533
}
2534
2535
return result;
2536
}
2537
2538
/* Emit llvm.amdgcn.fract.{f16,f32,f64} on src0.
 *
 * bitsize selects the overload: 16 and 32 pick f16 and f32; any other value
 * falls through to the f64 variant.
 */
LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   const char *intr; /* points at string literals, hence const */

   if (bitsize == 16) {
      intr = "llvm.amdgcn.fract.f16";
      type = ctx->f16;
   } else if (bitsize == 32) {
      intr = "llvm.amdgcn.fract.f32";
      type = ctx->f32;
   } else {
      intr = "llvm.amdgcn.fract.f64";
      type = ctx->f64;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
}
2559
2560
/* Build an unsigned integer constant of the given type. For a vector type,
 * every element is set to "value"; otherwise a scalar constant is returned.
 */
LLVMValueRef ac_const_uint_vec(struct ac_llvm_context *ctx, LLVMTypeRef type, uint64_t value)
{
   if (LLVMGetTypeKind(type) != LLVMVectorTypeKind)
      return LLVMConstInt(type, value, 0);

   LLVMValueRef elem = LLVMConstInt(LLVMGetElementType(type), value, 0);
   unsigned num_elems = LLVMGetVectorSize(type);
   LLVMValueRef *elems = alloca(num_elems * sizeof(LLVMValueRef));

   /* Splat the scalar across all lanes. */
   for (unsigned lane = 0; lane < num_elems; lane++)
      elems[lane] = elem;

   return LLVMConstVector(elems, num_elems);
}
2574
2575
/* Integer sign: clamps src0 to [-1, 1] as imin(imax(src0, -1), 1), using
 * constants of src0's own (scalar or vector) type.
 */
LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   LLVMTypeRef src_type = LLVMTypeOf(src0);

   /* v_med3 is selected only when max is first. (LLVM bug?) */
   LLVMValueRef at_least_minus_one =
      ac_build_imax(ctx, src0, ac_const_uint_vec(ctx, src_type, -1));

   return ac_build_imin(ctx, at_least_minus_one, ac_const_uint_vec(ctx, src_type, 1));
}
2584
2585
/* Return val + 0, built between ac_enable_signed_zeros() and
 * ac_disable_signed_zeros(). (val + 0) converts negative zero to positive
 * zero.
 */
static LLVMValueRef ac_eliminate_negative_zero(struct ac_llvm_context *ctx, LLVMValueRef val)
{
   LLVMValueRef zero = LLVMConstNull(LLVMTypeOf(val));
   LLVMValueRef sum;

   ac_enable_signed_zeros(ctx);
   sum = LLVMBuildFAdd(ctx->builder, val, zero, "");
   ac_disable_signed_zeros(ctx);

   return sum;
}
2593
2594
/* Float sign of src, returned in src's own float type. */
LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   unsigned bitsize = ac_get_elem_bits(ctx, src_type);

   /* The standard version leads to this:
    *   v_cmp_ngt_f32_e64 s[0:1], s4, 0                       ; D40B0000 00010004
    *   v_cndmask_b32_e64 v4, 1.0, s4, s[0:1]                 ; D5010004 000008F2
    *   v_cmp_le_f32_e32 vcc, 0, v4                           ; 7C060880
    *   v_cndmask_b32_e32 v4, -1.0, v4, vcc                   ; 020808F3
    *
    * The isign version:
    *   v_add_f32_e64 v4, s4, 0                               ; D5030004 00010004
    *   v_med3_i32 v4, v4, -1, 1                              ; D5580004 02058304
    *   v_cvt_f32_i32_e32 v4, v4                              ; 7E080B04
    *
    * (src0 + 0) converts negative zero to positive zero.
    * After that, int(fsign(x)) == isign(floatBitsToInt(x)).
    *
    * For FP64, use the standard version, which doesn't suffer from the huge DP rate
    * reduction. (FP64 comparisons are as fast as int64 comparisons)
    */
   if (bitsize == 16 || bitsize == 32) {
      LLVMValueRef bits = ac_to_integer(ctx, ac_eliminate_negative_zero(ctx, src));
      LLVMValueRef int_sign = ac_build_isign(ctx, bits);

      return LLVMBuildSIToFP(ctx->builder, int_sign, src_type, "");
   }

   assert(bitsize == 64);

   LLVMValueRef is_pos = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src, ctx->f64_0, "");
   LLVMValueRef is_neg = LLVMBuildFCmp(ctx->builder, LLVMRealOLT, src, ctx->f64_0, "");

   /* Assemble the f64 from two dwords: the low dword is always zero and the
    * high dword is the upper half of +1.0, -1.0 or 0.0. */
   LLVMValueRef hi_pos_one = LLVMConstInt(ctx->i32, 0x3FF00000, 0);
   LLVMValueRef hi_neg_one = LLVMConstInt(ctx->i32, 0xBFF00000, 0);
   LLVMValueRef neg_or_zero = LLVMBuildSelect(ctx->builder, is_neg, hi_neg_one, ctx->i32_0, "");

   LLVMValueRef dwords[2];
   dwords[0] = ctx->i32_0;
   dwords[1] = LLVMBuildSelect(ctx->builder, is_pos, hi_pos_one, neg_or_zero, "");

   return LLVMBuildBitCast(ctx->builder, ac_build_gather_values(ctx, dwords, 2), ctx->f64, "");
}
2633
2634
/* Population count of src0 via llvm.ctpop, returned as i32.
 *
 * Supported element widths are 8, 16, 32, 64 and 128 bits; narrower results
 * are zero-extended and wider results truncated to 32 bits.
 */
LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   const unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *name;
   LLVMTypeRef int_type;

   switch (bitsize) {
   case 128:
      name = "llvm.ctpop.i128";
      int_type = ctx->i128;
      break;
   case 64:
      name = "llvm.ctpop.i64";
      int_type = ctx->i64;
      break;
   case 32:
      name = "llvm.ctpop.i32";
      int_type = ctx->i32;
      break;
   case 16:
      name = "llvm.ctpop.i16";
      int_type = ctx->i16;
      break;
   case 8:
      name = "llvm.ctpop.i8";
      int_type = ctx->i8;
      break;
   default:
      unreachable(!"invalid bitsize");
      break;
   }

   LLVMValueRef count =
      ac_build_intrinsic(ctx, name, int_type, (LLVMValueRef[]){src0}, 1, AC_FUNC_ATTR_READNONE);

   if (bitsize > 32)
      count = LLVMBuildTrunc(ctx->builder, count, ctx->i32, "");
   else if (bitsize < 32)
      count = LLVMBuildZExt(ctx->builder, count, ctx->i32, "");

   return count;
}
2676
2677
/* Reverse the bits of src0 via llvm.bitreverse and return the result as i32.
 *
 * Supported element widths are 8, 16, 32 and 64 bits; the 64-bit result is
 * truncated and the 8/16-bit results are zero-extended to 32 bits.
 */
LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   const unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *name;
   LLVMTypeRef int_type;

   switch (bitsize) {
   case 64:
      name = "llvm.bitreverse.i64";
      int_type = ctx->i64;
      break;
   case 32:
      name = "llvm.bitreverse.i32";
      int_type = ctx->i32;
      break;
   case 16:
      name = "llvm.bitreverse.i16";
      int_type = ctx->i16;
      break;
   case 8:
      name = "llvm.bitreverse.i8";
      int_type = ctx->i8;
      break;
   default:
      unreachable(!"invalid bitsize");
      break;
   }

   LLVMValueRef reversed =
      ac_build_intrinsic(ctx, name, int_type, (LLVMValueRef[]){src0}, 1, AC_FUNC_ATTR_READNONE);

   if (bitsize > 32)
      reversed = LLVMBuildTrunc(ctx->builder, reversed, ctx->i32, "");
   else if (bitsize < 32)
      reversed = LLVMBuildZExt(ctx->builder, reversed, ctx->i32, "");

   return reversed;
}
2714
2715
/* Operand indices of an export call instruction. */
#define AC_EXP_TARGET           0 /* export target */
#define AC_EXP_ENABLED_CHANNELS 1 /* bitmask of enabled channels */
#define AC_EXP_OUT0             2 /* first of the four channel values */
2718
2719
/* Classification of one channel operand of an export instruction. */
enum ac_ir_type
{
   AC_IR_UNDEF, /* the operand is undef */
   AC_IR_CONST, /* the operand is a floating-point constant */
   AC_IR_VALUE, /* any other value */
};
2725
2726
struct ac_vs_exp_chan {
2727
LLVMValueRef value;
2728
float const_float;
2729
enum ac_ir_type type;
2730
};
2731
2732
struct ac_vs_exp_inst {
2733
unsigned offset;
2734
LLVMValueRef inst;
2735
struct ac_vs_exp_chan chan[4];
2736
};
2737
2738
struct ac_vs_exports {
2739
unsigned num;
2740
struct ac_vs_exp_inst exp[VARYING_SLOT_MAX];
2741
};
2742
2743
/* Return true if the PARAM export has been eliminated. */
2744
static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset, uint32_t num_outputs,
2745
struct ac_vs_exp_inst *exp)
2746
{
2747
unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
2748
bool is_zero[4] = {0}, is_one[4] = {0};
2749
2750
for (i = 0; i < 4; i++) {
2751
/* It's a constant expression. Undef outputs are eliminated too. */
2752
if (exp->chan[i].type == AC_IR_UNDEF) {
2753
is_zero[i] = true;
2754
is_one[i] = true;
2755
} else if (exp->chan[i].type == AC_IR_CONST) {
2756
if (exp->chan[i].const_float == 0)
2757
is_zero[i] = true;
2758
else if (exp->chan[i].const_float == 1)
2759
is_one[i] = true;
2760
else
2761
return false; /* other constant */
2762
} else
2763
return false;
2764
}
2765
2766
/* Only certain combinations of 0 and 1 can be eliminated. */
2767
if (is_zero[0] && is_zero[1] && is_zero[2])
2768
default_val = is_zero[3] ? 0 : 1;
2769
else if (is_one[0] && is_one[1] && is_one[2])
2770
default_val = is_zero[3] ? 2 : 3;
2771
else
2772
return false;
2773
2774
/* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
2775
LLVMInstructionEraseFromParent(exp->inst);
2776
2777
/* Change OFFSET to DEFAULT_VAL. */
2778
for (i = 0; i < num_outputs; i++) {
2779
if (vs_output_param_offset[i] == exp->offset) {
2780
vs_output_param_offset[i] = AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val;
2781
break;
2782
}
2783
}
2784
return true;
2785
}
2786
2787
static bool ac_eliminate_duplicated_output(struct ac_llvm_context *ctx,
2788
uint8_t *vs_output_param_offset, uint32_t num_outputs,
2789
struct ac_vs_exports *processed,
2790
struct ac_vs_exp_inst *exp)
2791
{
2792
unsigned p, copy_back_channels = 0;
2793
2794
/* See if the output is already in the list of processed outputs.
2795
* The LLVMValueRef comparison relies on SSA.
2796
*/
2797
for (p = 0; p < processed->num; p++) {
2798
bool different = false;
2799
2800
for (unsigned j = 0; j < 4; j++) {
2801
struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j];
2802
struct ac_vs_exp_chan *c2 = &exp->chan[j];
2803
2804
/* Treat undef as a match. */
2805
if (c2->type == AC_IR_UNDEF)
2806
continue;
2807
2808
/* If c1 is undef but c2 isn't, we can copy c2 to c1
2809
* and consider the instruction duplicated.
2810
*/
2811
if (c1->type == AC_IR_UNDEF) {
2812
copy_back_channels |= 1 << j;
2813
continue;
2814
}
2815
2816
/* Test whether the channels are not equal. */
2817
if (c1->type != c2->type ||
2818
(c1->type == AC_IR_CONST && c1->const_float != c2->const_float) ||
2819
(c1->type == AC_IR_VALUE && c1->value != c2->value)) {
2820
different = true;
2821
break;
2822
}
2823
}
2824
if (!different)
2825
break;
2826
2827
copy_back_channels = 0;
2828
}
2829
if (p == processed->num)
2830
return false;
2831
2832
/* If a match was found, but the matching export has undef where the new
2833
* one has a normal value, copy the normal value to the undef channel.
2834
*/
2835
struct ac_vs_exp_inst *match = &processed->exp[p];
2836
2837
/* Get current enabled channels mask. */
2838
LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS);
2839
unsigned enabled_channels = LLVMConstIntGetZExtValue(arg);
2840
2841
while (copy_back_channels) {
2842
unsigned chan = u_bit_scan(&copy_back_channels);
2843
2844
assert(match->chan[chan].type == AC_IR_UNDEF);
2845
LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan, exp->chan[chan].value);
2846
match->chan[chan] = exp->chan[chan];
2847
2848
/* Update number of enabled channels because the original mask
2849
* is not always 0xf.
2850
*/
2851
enabled_channels |= (1 << chan);
2852
LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS,
2853
LLVMConstInt(ctx->i32, enabled_channels, 0));
2854
}
2855
2856
/* The PARAM export is duplicated. Kill it. */
2857
LLVMInstructionEraseFromParent(exp->inst);
2858
2859
/* Change OFFSET to the matching export. */
2860
for (unsigned i = 0; i < num_outputs; i++) {
2861
if (vs_output_param_offset[i] == exp->offset) {
2862
vs_output_param_offset[i] = match->offset;
2863
break;
2864
}
2865
}
2866
return true;
2867
}
2868
2869
/**
 * Remove constant and duplicated PARAM exports from main_fn and renumber the
 * remaining ones so that they are contiguous.
 *
 * @param ctx                    LLVM build context
 * @param main_fn                function whose export calls are scanned
 * @param vs_output_param_offset per-output PARAM offsets, updated in place;
 *                               must hold at least num_outputs entries
 * @param num_outputs            number of entries in vs_output_param_offset
 *                               (at most VARYING_SLOT_MAX)
 * @param skip_output_mask       bit N set means PARAM export N is never
 *                               eliminated
 * @param num_param_exports      receives the number of remaining PARAM
 *                               exports; only written when at least one
 *                               export was removed
 */
void ac_optimize_vs_outputs(struct ac_llvm_context *ctx, LLVMValueRef main_fn,
                            uint8_t *vs_output_param_offset, uint32_t num_outputs,
                            uint32_t skip_output_mask, uint8_t *num_param_exports)
{
   LLVMBasicBlockRef bb;
   bool removed_any = false;
   struct ac_vs_exports exports;

   exports.num = 0;

   /* Process all LLVM instructions. */
   for (bb = LLVMGetFirstBasicBlock(main_fn); bb; bb = LLVMGetNextBasicBlock(bb)) {
      LLVMValueRef inst = LLVMGetFirstInstruction(bb);

      while (inst) {
         LLVMValueRef cur = inst;
         struct ac_vs_exp_inst exp;

         /* Advance before processing: cur may be erased below. */
         inst = LLVMGetNextInstruction(inst);

         if (LLVMGetInstructionOpcode(cur) != LLVMCall)
            continue;

         LLVMValueRef callee = ac_llvm_get_called_value(cur);

         if (!ac_llvm_is_function(callee))
            continue;

         const char *name = LLVMGetValueName(callee);
         unsigned num_args = LLVMCountParams(callee);

         /* Check if this is an export instruction. */
         if ((num_args != 9 && num_args != 8) ||
             (strcmp(name, "llvm.SI.export") && strcmp(name, "llvm.amdgcn.exp.f32")))
            continue;

         LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET);
         unsigned target = LLVMConstIntGetZExtValue(arg);

         if (target < V_008DFC_SQ_EXP_PARAM)
            continue;

         target -= V_008DFC_SQ_EXP_PARAM;

         /* Parse the instruction. */
         memset(&exp, 0, sizeof(exp));
         exp.offset = target;
         exp.inst = cur;

         for (unsigned i = 0; i < 4; i++) {
            LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i);

            exp.chan[i].value = v;

            if (LLVMIsUndef(v)) {
               exp.chan[i].type = AC_IR_UNDEF;
            } else if (LLVMIsAConstantFP(v)) {
               LLVMBool loses_info;
               exp.chan[i].type = AC_IR_CONST;
               exp.chan[i].const_float = LLVMConstRealGetDouble(v, &loses_info);
            } else {
               exp.chan[i].type = AC_IR_VALUE;
            }
         }

         /* Eliminate constant and duplicated PARAM exports. */
         if (!((1u << target) & skip_output_mask) &&
             (ac_eliminate_const_output(vs_output_param_offset, num_outputs, &exp) ||
              ac_eliminate_duplicated_output(ctx, vs_output_param_offset, num_outputs, &exports,
                                             &exp))) {
            removed_any = true;
         } else {
            /* The list of kept exports has a fixed capacity. */
            assert(exports.num < VARYING_SLOT_MAX);
            exports.exp[exports.num++] = exp;
         }
      }
   }

   if (!removed_any)
      return;

   /* Remove holes in export memory due to removed PARAM exports.
    * This is done by renumbering all PARAM exports.
    */
   uint8_t old_offset[VARYING_SLOT_MAX];

   /* Make a copy of the offsets. We need the old version while we are
    * modifying some of them. Only the caller-provided entries are copied:
    * the caller's array is not required to be VARYING_SLOT_MAX entries long,
    * and entries past num_outputs are never read below.
    */
   assert(num_outputs <= VARYING_SLOT_MAX);
   memcpy(old_offset, vs_output_param_offset,
          (num_outputs < VARYING_SLOT_MAX ? num_outputs : VARYING_SLOT_MAX) *
             sizeof(old_offset[0]));

   for (unsigned i = 0; i < exports.num; i++) {
      unsigned offset = exports.exp[i].offset;

      /* Update vs_output_param_offset. Multiple outputs can
       * have the same offset.
       */
      for (unsigned out = 0; out < num_outputs; out++) {
         if (old_offset[out] == offset)
            vs_output_param_offset[out] = i;
      }

      /* Change the PARAM offset in the instruction. */
      LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET,
                     LLVMConstInt(ctx->i32, V_008DFC_SQ_EXP_PARAM + i, 0));
   }
   *num_param_exports = exports.num;
}
2976
2977
void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
2978
{
2979
LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
2980
ac_build_intrinsic(ctx, "llvm.amdgcn.init.exec", ctx->voidt, &full_mask, 1,
2981
AC_FUNC_ATTR_CONVERGENT);
2982
}
2983
2984
void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
2985
{
2986
unsigned lds_size = ctx->chip_class >= GFX7 ? 65536 : 32768;
2987
ctx->lds = LLVMBuildIntToPtr(
2988
ctx->builder, ctx->i32_0,
2989
LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS), "lds");
2990
}
2991
2992
/* Load the element of ctx->lds selected by dw_addr. */
LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, LLVMValueRef dw_addr)
{
   LLVMValueRef elem_ptr = ac_build_gep0(ctx, ctx->lds, dw_addr);

   return LLVMBuildLoad(ctx->builder, elem_ptr, "");
}
2996
2997
/* Convert value to an integer type and store it into ctx->lds at index dw_addr. */
void ac_lds_store(struct ac_llvm_context *ctx, LLVMValueRef dw_addr, LLVMValueRef value)
{
   LLVMValueRef int_value = ac_to_integer(ctx, value);

   ac_build_indexed_store(ctx, ctx->lds, dw_addr, int_value);
}
3002
3003
/* Index of the least significant set bit of src0, as an i32.
 *
 * Implemented with llvm.cttz for 8/16/32/64-bit sources; the result is -1
 * when src0 is zero. dst_type is not used.
 */
LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, LLVMTypeRef dst_type, LLVMValueRef src0)
{
   const unsigned bits = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *cttz_name;
   LLVMTypeRef int_type;
   LLVMValueRef int_zero;

   switch (bits) {
   case 64:
      cttz_name = "llvm.cttz.i64";
      int_type = ctx->i64;
      int_zero = ctx->i64_0;
      break;
   case 32:
      cttz_name = "llvm.cttz.i32";
      int_type = ctx->i32;
      int_zero = ctx->i32_0;
      break;
   case 16:
      cttz_name = "llvm.cttz.i16";
      int_type = ctx->i16;
      int_zero = ctx->i16_0;
      break;
   case 8:
      cttz_name = "llvm.cttz.i8";
      int_type = ctx->i8;
      int_zero = ctx->i8_0;
      break;
   default:
      unreachable(!"invalid bitsize");
   }

   LLVMValueRef cttz_args[2];

   cttz_args[0] = src0;
   /* The value of 1 means that ffs(x=0) = undef, so LLVM won't
    * add special code to check for x=0. The reason is that
    * the LLVM behavior for x=0 is different from what we
    * need here. However, LLVM also assumes that ffs(x) is
    * in [0, 31], but GLSL expects that ffs(0) = -1, so
    * a conditional assignment to handle 0 is still required.
    *
    * The hardware already implements the correct behavior.
    */
   cttz_args[1] = ctx->i1true;

   LLVMValueRef index =
      ac_build_intrinsic(ctx, cttz_name, int_type, cttz_args, 2, AC_FUNC_ATTR_READNONE);

   /* Bring the index to 32 bits. */
   if (bits == 64)
      index = LLVMBuildTrunc(ctx->builder, index, ctx->i32, "");
   else if (bits < 32)
      index = LLVMBuildSExt(ctx->builder, index, ctx->i32, "");

   /* TODO: We need an intrinsic to skip this conditional. */
   /* Check for zero: */
   LLVMValueRef src_is_zero = LLVMBuildICmp(ctx->builder, LLVMIntEQ, src0, int_zero, "");

   return LLVMBuildSelect(ctx->builder, src_is_zero, LLVMConstInt(ctx->i32, -1, 0), index, "");
}
3063
3064
/* Pointer type to elem_type in the AC_ADDR_SPACE_CONST address space. */
LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
{
   LLVMTypeRef ptr_type = LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST);

   return ptr_type;
}
3068
3069
/* Pointer type to elem_type in the AC_ADDR_SPACE_CONST_32BIT address space. */
LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
{
   LLVMTypeRef ptr_type = LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT);

   return ptr_type;
}
3073
3074
static struct ac_llvm_flow *get_current_flow(struct ac_llvm_context *ctx)
3075
{
3076
if (ctx->flow->depth > 0)
3077
return &ctx->flow->stack[ctx->flow->depth - 1];
3078
return NULL;
3079
}
3080
3081
static struct ac_llvm_flow *get_innermost_loop(struct ac_llvm_context *ctx)
3082
{
3083
for (unsigned i = ctx->flow->depth; i > 0; --i) {
3084
if (ctx->flow->stack[i - 1].loop_entry_block)
3085
return &ctx->flow->stack[i - 1];
3086
}
3087
return NULL;
3088
}
3089
3090
static struct ac_llvm_flow *push_flow(struct ac_llvm_context *ctx)
3091
{
3092
struct ac_llvm_flow *flow;
3093
3094
if (ctx->flow->depth >= ctx->flow->depth_max) {
3095
unsigned new_max = MAX2(ctx->flow->depth << 1, AC_LLVM_INITIAL_CF_DEPTH);
3096
3097
ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack));
3098
ctx->flow->depth_max = new_max;
3099
}
3100
3101
flow = &ctx->flow->stack[ctx->flow->depth];
3102
ctx->flow->depth++;
3103
3104
flow->next_block = NULL;
3105
flow->loop_entry_block = NULL;
3106
return flow;
3107
}
3108
3109
/* Name the basic block "<base><label_id>" (truncated to 31 characters). */
static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base, int label_id)
{
   char name[32];
   LLVMValueRef bb_value = LLVMBasicBlockAsValue(bb);

   snprintf(name, sizeof(name), "%s%d", base, label_id);
   LLVMSetValueName(bb_value, name);
}
3115
3116
/* Append a basic block at the level of the parent flow.
3117
*/
3118
static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx, const char *name)
3119
{
3120
assert(ctx->flow->depth >= 1);
3121
3122
if (ctx->flow->depth >= 2) {
3123
struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2];
3124
3125
return LLVMInsertBasicBlockInContext(ctx->context, flow->next_block, name);
3126
}
3127
3128
LLVMValueRef main_fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
3129
return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
3130
}
3131
3132
/* Emit a branch to the given default target for the current block if
3133
* applicable -- that is, if the current block does not already contain a
3134
* branch from a break or continue.
3135
*/
3136
static void emit_default_branch(LLVMBuilderRef builder, LLVMBasicBlockRef target)
3137
{
3138
if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
3139
LLVMBuildBr(builder, target);
3140
}
3141
3142
void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
3143
{
3144
struct ac_llvm_flow *flow = push_flow(ctx);
3145
flow->loop_entry_block = append_basic_block(ctx, "LOOP");
3146
flow->next_block = append_basic_block(ctx, "ENDLOOP");
3147
set_basicblock_name(flow->loop_entry_block, "loop", label_id);
3148
LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3149
LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
3150
}
3151
3152
void ac_build_break(struct ac_llvm_context *ctx)
3153
{
3154
struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3155
LLVMBuildBr(ctx->builder, flow->next_block);
3156
}
3157
3158
void ac_build_continue(struct ac_llvm_context *ctx)
3159
{
3160
struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3161
LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3162
}
3163
3164
void ac_build_else(struct ac_llvm_context *ctx, int label_id)
3165
{
3166
struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3167
LLVMBasicBlockRef endif_block;
3168
3169
assert(!current_branch->loop_entry_block);
3170
3171
endif_block = append_basic_block(ctx, "ENDIF");
3172
emit_default_branch(ctx->builder, endif_block);
3173
3174
LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3175
set_basicblock_name(current_branch->next_block, "else", label_id);
3176
3177
current_branch->next_block = endif_block;
3178
}
3179
3180
/* Invoked after a branch is exited. */
3181
static void ac_branch_exited(struct ac_llvm_context *ctx)
3182
{
3183
if (ctx->flow->depth == 0 && ctx->conditional_demote_seen) {
3184
/* The previous conditional branch contained demote. Kill threads
3185
* after all conditional blocks because amdgcn.wqm.vote doesn't
3186
* return usable values inside the blocks.
3187
*
3188
* This is an optional optimization that only kills whole inactive quads.
3189
*/
3190
LLVMValueRef cond = LLVMBuildLoad(ctx->builder, ctx->postponed_kill, "");
3191
ac_build_kill_if_false(ctx, ac_build_wqm_vote(ctx, cond));
3192
ctx->conditional_demote_seen = false;
3193
}
3194
}
3195
3196
void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
3197
{
3198
struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3199
3200
assert(!current_branch->loop_entry_block);
3201
3202
emit_default_branch(ctx->builder, current_branch->next_block);
3203
LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3204
set_basicblock_name(current_branch->next_block, "endif", label_id);
3205
3206
ctx->flow->depth--;
3207
ac_branch_exited(ctx);
3208
}
3209
3210
void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
3211
{
3212
struct ac_llvm_flow *current_loop = get_current_flow(ctx);
3213
3214
assert(current_loop->loop_entry_block);
3215
3216
emit_default_branch(ctx->builder, current_loop->loop_entry_block);
3217
3218
LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
3219
set_basicblock_name(current_loop->next_block, "endloop", label_id);
3220
ctx->flow->depth--;
3221
ac_branch_exited(ctx);
3222
}
3223
3224
/* Open a conditional branch on cond: push a flow entry, create the
 * "if<label_id>" block and the block taken when cond is false, emit the
 * conditional branch and continue building in the "if" block.
 */
void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
{
   struct ac_llvm_flow *branch = push_flow(ctx);
   LLVMBasicBlockRef then_block = append_basic_block(ctx, "IF");

   branch->next_block = append_basic_block(ctx, "ELSE");
   set_basicblock_name(then_block, "if", label_id);

   LLVMBuildCondBr(ctx->builder, cond, then_block, branch->next_block);
   LLVMPositionBuilderAtEnd(ctx->builder, then_block);
}
3235
3236
/* Create an alloca of the given type at the very start of the entry block of
 * the function currently being built. The memory is left uninitialized.
 *
 * A temporary builder is used so that the position of ac->builder is not
 * disturbed.
 */
LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
{
   LLVMBasicBlockRef insert_block = LLVMGetInsertBlock(ac->builder);
   LLVMValueRef fn = LLVMGetBasicBlockParent(insert_block);
   LLVMBasicBlockRef entry_block = LLVMGetEntryBasicBlock(fn);
   LLVMValueRef first_inst = LLVMGetFirstInstruction(entry_block);
   LLVMBuilderRef entry_builder = LLVMCreateBuilderInContext(ac->context);

   if (first_inst)
      LLVMPositionBuilderBefore(entry_builder, first_inst);
   else
      LLVMPositionBuilderAtEnd(entry_builder, entry_block);

   LLVMValueRef slot = LLVMBuildAlloca(entry_builder, type, name);

   LLVMDisposeBuilder(entry_builder);
   return slot;
}
3256
3257
/* Like ac_build_alloca_undef, but also stores a null value of the given type
 * into the new slot at the current builder position.
 */
LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
{
   LLVMValueRef slot = ac_build_alloca_undef(ac, type, name);
   LLVMValueRef zero = LLVMConstNull(type);

   LLVMBuildStore(ac->builder, zero, slot);
   return slot;
}
3263
3264
/* Allocate a slot of val's type via ac_build_alloca_undef and store val into
 * it at the current builder position.
 */
LLVMValueRef ac_build_alloca_init(struct ac_llvm_context *ac, LLVMValueRef val, const char *name)
{
   LLVMTypeRef val_type = LLVMTypeOf(val);
   LLVMValueRef slot = ac_build_alloca_undef(ac, val_type, name);

   LLVMBuildStore(ac->builder, val, slot);
   return slot;
}
3270
3271
/* Bitcast ptr to a pointer to "type", keeping ptr's address space. */
LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMTypeRef type)
{
   int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
   LLVMTypeRef new_ptr_type = LLVMPointerType(type, addr_space);

   return LLVMBuildBitCast(ctx->builder, ptr, new_ptr_type, "");
}
3276
3277
/* Return the first "count" components of value.
 *
 * The value is returned unchanged when it already has "count" components, a
 * single component is returned as a scalar, and any other count yields a
 * vector built with a shuffle.
 */
LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned count)
{
   if (ac_get_llvm_num_components(value) == count)
      return value;

   if (count == 1)
      return LLVMBuildExtractElement(ctx->builder, value, ctx->i32_0, "");

   /* Identity shuffle mask 0..count-1; room for two entries is always reserved. */
   LLVMValueRef *const indices = alloca(MAX2(count, 2) * sizeof(LLVMValueRef));

   indices[0] = ctx->i32_0;
   indices[1] = ctx->i32_1;
   for (unsigned idx = 2; idx < count; idx++)
      indices[idx] = LLVMConstInt(ctx->i32, idx, false);

   LLVMValueRef shuffle_mask = LLVMConstVector(indices, count);

   return LLVMBuildShuffleVector(ctx->builder, value, value, shuffle_mask, "");
}
3295
3296
/* If param is i64 and bitwidth <= 32, the return value will be i32. */
3297
LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, unsigned rshift,
3298
unsigned bitwidth)
3299
{
3300
LLVMValueRef value = param;
3301
if (rshift)
3302
value = LLVMBuildLShr(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), rshift, false), "");
3303
3304
if (rshift + bitwidth < 32) {
3305
uint64_t mask = (1ull << bitwidth) - 1;
3306
value = LLVMBuildAnd(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), mask, false), "");
3307
}
3308
3309
if (bitwidth <= 32 && LLVMTypeOf(param) == ctx->i64)
3310
value = LLVMBuildTrunc(ctx->builder, value, ctx->i32, "");
3311
return value;
3312
}
3313
3314
/* Adjust the sample index according to FMASK.
3315
*
3316
* For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3317
* which is the identity mapping. Each nibble says which physical sample
3318
* should be fetched to get that sample.
3319
*
3320
* For example, 0x11111100 means there are only 2 samples stored and
3321
* the second sample covers 3/4 of the pixel. When reading samples 0
3322
* and 1, return physical sample 0 (determined by the first two 0s
3323
* in FMASK), otherwise return physical sample 1.
3324
*
3325
* The sample index should be adjusted as follows:
3326
* addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF;
3327
*/
3328
void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, LLVMValueRef *addr,
3329
bool is_array_tex)
3330
{
3331
struct ac_image_args fmask_load = {0};
3332
fmask_load.opcode = ac_image_load;
3333
fmask_load.resource = fmask;
3334
fmask_load.dmask = 0xf;
3335
fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d;
3336
fmask_load.attributes = AC_FUNC_ATTR_READNONE;
3337
3338
fmask_load.coords[0] = addr[0];
3339
fmask_load.coords[1] = addr[1];
3340
if (is_array_tex)
3341
fmask_load.coords[2] = addr[2];
3342
fmask_load.a16 = ac_get_elem_bits(ac, LLVMTypeOf(addr[0])) == 16;
3343
3344
LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load);
3345
fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value, ac->i32_0, "");
3346
3347
/* Apply the formula. */
3348
unsigned sample_chan = is_array_tex ? 3 : 2;
3349
LLVMValueRef final_sample;
3350
final_sample = LLVMBuildMul(ac->builder, addr[sample_chan],
3351
LLVMConstInt(LLVMTypeOf(addr[0]), 4, 0), "");
3352
final_sample = LLVMBuildLShr(ac->builder, fmask_value,
3353
LLVMBuildZExt(ac->builder, final_sample, ac->i32, ""), "");
3354
/* Mask the sample index by 0x7, because 0x8 means an unknown value
3355
* with EQAA, so those will map to 0. */
3356
final_sample = LLVMBuildAnd(ac->builder, final_sample, LLVMConstInt(ac->i32, 0x7, 0), "");
3357
if (fmask_load.a16)
3358
final_sample = LLVMBuildTrunc(ac->builder, final_sample, ac->i16, "");
3359
3360
/* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3361
* resource descriptor is 0 (invalid).
3362
*/
3363
LLVMValueRef tmp;
3364
tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");
3365
tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, "");
3366
tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, "");
3367
3368
/* Replace the MSAA sample index. */
3369
addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample, addr[sample_chan], "");
3370
}
3371
3372
/* Read one integer value of at most 32 bits from another lane.
 *
 * src is zero-extended to i32, passed through llvm.amdgcn.readlane (or
 * llvm.amdgcn.readfirstlane when lane is NULL) and truncated back to its
 * original type. When with_opt_barrier is set, an optimization barrier is
 * applied to src first.
 */
static LLVMValueRef _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src,
                                       LLVMValueRef lane, bool with_opt_barrier)
{
   const unsigned attrs = AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT;
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   LLVMValueRef lane_value;

   if (with_opt_barrier)
      ac_build_optimization_barrier(ctx, &src, false);

   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   if (lane == NULL) {
      lane_value = ac_build_intrinsic(ctx, "llvm.amdgcn.readfirstlane", ctx->i32,
                                      (LLVMValueRef[]){src, NULL}, 1, attrs);
   } else {
      lane = LLVMBuildZExt(ctx->builder, lane, ctx->i32, "");
      lane_value = ac_build_intrinsic(ctx, "llvm.amdgcn.readlane", ctx->i32,
                                      (LLVMValueRef[]){src, lane}, 2, attrs);
   }

   return LLVMBuildTrunc(ctx->builder, lane_value, orig_type, "");
}
3392
3393
/* Read src from another lane for values of any supported width.
 *
 * src is converted to an integer; values wider than 32 bits are split into
 * 32-bit pieces that are read one by one and reassembled. The result is
 * converted back to the original type of src (inttoptr for pointers, bitcast
 * otherwise).
 */
static LLVMValueRef ac_build_readlane_common(struct ac_llvm_context *ctx, LLVMValueRef src,
                                             LLVMValueRef lane, bool with_opt_barrier)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   LLVMValueRef int_src = ac_to_integer(ctx, src);
   const unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(int_src));
   LLVMValueRef result;

   if (bits <= 32) {
      result = _ac_build_readlane(ctx, int_src, lane, with_opt_barrier);
   } else {
      assert(bits % 32 == 0);

      const unsigned num_dwords = bits / 32;
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, num_dwords);
      LLVMValueRef src_vec = LLVMBuildBitCast(ctx->builder, int_src, vec_type, "");

      result = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < num_dwords; i++) {
         LLVMValueRef dword =
            LLVMBuildExtractElement(ctx->builder, src_vec, LLVMConstInt(ctx->i32, i, 0), "");
         LLVMValueRef lane_dword = _ac_build_readlane(ctx, dword, lane, with_opt_barrier);

         result = LLVMBuildInsertElement(ctx->builder, result, lane_dword,
                                         LLVMConstInt(ctx->i32, i, 0), "");
      }
   }

   if (LLVMGetTypeKind(orig_type) == LLVMPointerTypeKind)
      return LLVMBuildIntToPtr(ctx->builder, result, orig_type, "");
   return LLVMBuildBitCast(ctx->builder, result, orig_type, "");
}
3424
3425
/**
3426
* Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
3427
*
3428
* The optimization barrier is not needed if the value is the same in all lanes
3429
* or if this is called in the outermost block.
3430
*
3431
* @param ctx
3432
* @param src
3433
* @param lane - id of the lane or NULL for the first active lane
3434
* @return value of the lane
3435
*/
3436
LLVMValueRef ac_build_readlane_no_opt_barrier(struct ac_llvm_context *ctx, LLVMValueRef src,
3437
LLVMValueRef lane)
3438
{
3439
return ac_build_readlane_common(ctx, src, lane, false);
3440
}
3441
3442
/* Read src from the given lane (or from the first active lane when lane is
 * NULL), applying an optimization barrier to the source.
 */
LLVMValueRef ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
{
   const bool with_opt_barrier = true;

   return ac_build_readlane_common(ctx, src, lane, with_opt_barrier);
}
3446
3447
/* Emit llvm.amdgcn.writelane(value, lane, src) and return its i32 result. */
LLVMValueRef ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value,
                                LLVMValueRef lane)
{
   LLVMValueRef args[3];

   args[0] = value;
   args[1] = lane;
   args[2] = src;

   return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32, args, 3,
                             AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
}
3454
3455
/* Return mbcnt(mask) + add_src as an i32.
 *
 * In wave32 mode, mask is passed directly to llvm.amdgcn.mbcnt.lo. Otherwise
 * it is bitcast to two dwords that are fed to mbcnt.lo and mbcnt.hi.
 *
 * The count itself is always computed with a zero addend and add_src is
 * added with a separate instruction. This keeps the [0, wave_size) range
 * metadata attached only to the bare count, for which it is valid, and makes
 * the wave32 path honor add_src like the wave64 path.
 */
LLVMValueRef ac_build_mbcnt_add(struct ac_llvm_context *ctx, LLVMValueRef mask, LLVMValueRef add_src)
{
   LLVMValueRef val;

   if (ctx->wave_size == 32) {
      val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
                               (LLVMValueRef[]){mask, ctx->i32_0}, 2, AC_FUNC_ATTR_READNONE);
   } else {
      LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask, ctx->v2i32, "");
      LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_0, "");
      LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_1, "");

      val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
                               (LLVMValueRef[]){mask_lo, ctx->i32_0}, 2, AC_FUNC_ATTR_READNONE);
      val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32,
                               (LLVMValueRef[]){mask_hi, val}, 2, AC_FUNC_ATTR_READNONE);
   }

   /* Same bounds as before, now applied to the count without the addend. */
   ac_set_range_metadata(ctx, val, 0, ctx->wave_size);

   /* Skip the add for the common zero addend to keep the IR unchanged. */
   if (add_src != ctx->i32_0)
      val = LLVMBuildAdd(ctx->builder, val, add_src, "");

   return val;
}
3474
3475
/* Shorthand for ac_build_mbcnt_add with a zero addend. */
LLVMValueRef ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
{
   LLVMValueRef zero_addend = ctx->i32_0;

   return ac_build_mbcnt_add(ctx, mask, zero_addend);
}
3479
3480
/* DPP control values. Entries with a leading underscore are base values that
 * get combined with lane selectors or shift amounts.
 */
enum dpp_ctrl
{
   _dpp_quad_perm = 0x000,
   _dpp_row_sl = 0x100,
   _dpp_row_sr = 0x110,
   _dpp_row_rr = 0x120,
   dpp_wf_sl1 = 0x130,
   dpp_wf_rl1 = 0x134,
   dpp_wf_sr1 = 0x138,
   dpp_wf_rr1 = 0x13C,
   dpp_row_mirror = 0x140,
   dpp_row_half_mirror = 0x141,
   dpp_row_bcast15 = 0x142,
   dpp_row_bcast31 = 0x143
};
3495
3496
static inline enum dpp_ctrl dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2,
3497
unsigned lane3)
3498
{
3499
assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
3500
return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
3501
}
3502
3503
static inline enum dpp_ctrl dpp_row_sl(unsigned amount)
3504
{
3505
assert(amount > 0 && amount < 16);
3506
return _dpp_row_sl | amount;
3507
}
3508
3509
static inline enum dpp_ctrl dpp_row_sr(unsigned amount)
3510
{
3511
assert(amount > 0 && amount < 16);
3512
return _dpp_row_sr | amount;
3513
}
3514
3515
/* Emit llvm.amdgcn.update.dpp.i32 for integer operands of at most 32 bits.
 *
 * old and src are zero-extended to i32 for the intrinsic and the result is
 * truncated back to the original type of src.
 */
static LLVMValueRef _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
                                  enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
                                  bool bound_ctrl)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   LLVMValueRef args[6];

   args[0] = LLVMBuildZExt(ctx->builder, old, ctx->i32, "");
   args[1] = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
   args[2] = LLVMConstInt(ctx->i32, dpp_ctrl, 0);
   args[3] = LLVMConstInt(ctx->i32, row_mask, 0);
   args[4] = LLVMConstInt(ctx->i32, bank_mask, 0);
   args[5] = LLVMConstInt(ctx->i1, bound_ctrl, 0);

   LLVMValueRef dpp = ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32", ctx->i32, args, 6,
                                         AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   return LLVMBuildTrunc(ctx->builder, dpp, orig_type, "");
}
3534
3535
/**
 * DPP move for values of arbitrary width.
 *
 * \p src and \p old are converted with ac_to_integer(). Values of at most
 * 32 bits go through _ac_build_dpp() directly; wider values (whose width must
 * be a multiple of 32) are viewed as a vector of i32, processed one dword at a
 * time and reassembled. The result is bitcast back to the original type of
 * \p src.
 */
static LLVMValueRef ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
                                 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
                                 bool bound_ctrl)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   LLVMValueRef src_int = ac_to_integer(ctx, src);
   LLVMValueRef old_int = ac_to_integer(ctx, old);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src_int));
   LLVMValueRef out;

   if (bits > 32) {
      assert(bits % 32 == 0);

      const unsigned num_dwords = bits / 32;
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, num_dwords);
      LLVMValueRef src_dwords = LLVMBuildBitCast(ctx->builder, src_int, vec_type, "");
      LLVMValueRef old_dwords = LLVMBuildBitCast(ctx->builder, old_int, vec_type, "");

      out = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < num_dwords; i++) {
         LLVMValueRef idx = LLVMConstInt(ctx->i32, i, 0);
         LLVMValueRef src_dw = LLVMBuildExtractElement(ctx->builder, src_dwords, idx, "");
         LLVMValueRef old_dw = LLVMBuildExtractElement(ctx->builder, old_dwords, idx, "");
         LLVMValueRef moved =
            _ac_build_dpp(ctx, old_dw, src_dw, dpp_ctrl, row_mask, bank_mask, bound_ctrl);

         out = LLVMBuildInsertElement(ctx->builder, out, moved, idx, "");
      }
   } else {
      out = _ac_build_dpp(ctx, old_int, src_int, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
   }

   return LLVMBuildBitCast(ctx->builder, out, orig_type, "");
}
3563
3564
/**
 * Emit a single llvm.amdgcn.permlane16 / llvm.amdgcn.permlanex16 call.
 *
 * \param sel            64-bit selector; the low 32 bits become the third
 *                       operand and the high 32 bits the fourth.
 * \param exchange_rows  Selects the permlanex16 variant when true.
 * \param bound_ctrl     Forwarded as the last (i1) operand.
 *
 * \p src is zero-extended to i32 and used for the first two operands; the
 * result is truncated back to the type \p src had on entry.
 */
static LLVMValueRef _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src,
                                         uint64_t sel, bool exchange_rows, bool bound_ctrl)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   LLVMValueRef src32 = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef args[6] = {
      src32,
      src32,
      LLVMConstInt(ctx->i32, sel, false),
      LLVMConstInt(ctx->i32, sel >> 32, false),
      ctx->i1true, /* fi */
      bound_ctrl ? ctx->i1true : ctx->i1false,
   };

   LLVMValueRef permuted =
      ac_build_intrinsic(ctx, exchange_rows ? "llvm.amdgcn.permlanex16" : "llvm.amdgcn.permlane16",
                         ctx->i32, args, 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   return LLVMBuildTrunc(ctx->builder, permuted, orig_type, "");
}
3587
3588
/**
 * permlane16 / permlanex16 for values of arbitrary width.
 *
 * \p src is converted with ac_to_integer(). Values of at most 32 bits go
 * through _ac_build_permlane16() directly; wider values (whose width must be a
 * multiple of 32) are split into i32 dwords that are permuted individually and
 * reassembled. The result is bitcast back to the original type of \p src.
 */
static LLVMValueRef ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
                                        bool exchange_rows, bool bound_ctrl)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   LLVMValueRef src_int = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src_int));
   LLVMValueRef out;

   if (bits > 32) {
      assert(bits % 32 == 0);

      const unsigned num_dwords = bits / 32;
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, num_dwords);
      LLVMValueRef dwords = LLVMBuildBitCast(ctx->builder, src_int, vec_type, "");

      out = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < num_dwords; i++) {
         LLVMValueRef idx = LLVMConstInt(ctx->i32, i, 0);
         LLVMValueRef dword = LLVMBuildExtractElement(ctx->builder, dwords, idx, "");
         LLVMValueRef permuted = _ac_build_permlane16(ctx, dword, sel, exchange_rows, bound_ctrl);

         out = LLVMBuildInsertElement(ctx->builder, out, permuted, idx, "");
      }
   } else {
      out = _ac_build_permlane16(ctx, src_int, sel, exchange_rows, bound_ctrl);
   }

   return LLVMBuildBitCast(ctx->builder, out, orig_type, "");
}
3611
3612
/**
 * Pack three 5-bit masks into a single swizzle pattern word.
 *
 * Layout: and_mask in bits 4:0, or_mask in bits 9:5, xor_mask in bits 14:10.
 * Each mask must be below 32 (asserted).
 */
static inline unsigned ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
{
   assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);

   unsigned pattern = and_mask;

   pattern |= or_mask << 5;
   pattern |= xor_mask << 10;
   return pattern;
}
3617
3618
/**
 * Emit a single llvm.amdgcn.ds.swizzle call with the given pattern \p mask.
 *
 * \p src is zero-extended to i32 for the call and the result is truncated back
 * to the type \p src had on entry.
 */
static LLVMValueRef _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
                                         unsigned mask)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   LLVMValueRef src32 = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef args[2] = {
      src32,
      LLVMConstInt(ctx->i32, mask, 0),
   };

   LLVMValueRef swizzled =
      ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", ctx->i32, args, 2,
                         AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   return LLVMBuildTrunc(ctx->builder, swizzled, orig_type, "");
}
3632
3633
/**
 * ds_swizzle for values of arbitrary width.
 *
 * \p src is converted with ac_to_integer(). Values of at most 32 bits go
 * through _ac_build_ds_swizzle() directly; wider values (whose width must be a
 * multiple of 32) are split into i32 dwords that are swizzled individually and
 * reassembled. The result is bitcast back to the original type of \p src.
 */
LLVMValueRef ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   LLVMValueRef src_int = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src_int));
   LLVMValueRef out;

   if (bits > 32) {
      assert(bits % 32 == 0);

      const unsigned num_dwords = bits / 32;
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, num_dwords);
      LLVMValueRef dwords = LLVMBuildBitCast(ctx->builder, src_int, vec_type, "");

      out = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < num_dwords; i++) {
         LLVMValueRef idx = LLVMConstInt(ctx->i32, i, 0);
         LLVMValueRef dword = LLVMBuildExtractElement(ctx->builder, dwords, idx, "");
         LLVMValueRef swizzled = _ac_build_ds_swizzle(ctx, dword, mask);

         out = LLVMBuildInsertElement(ctx->builder, out, swizzled, idx, "");
      }
   } else {
      out = _ac_build_ds_swizzle(ctx, src_int, mask);
   }

   return LLVMBuildBitCast(ctx->builder, out, orig_type, "");
}
3655
3656
/**
 * Pass \p src through llvm.amdgcn.wwm.<type>.
 *
 * The value is converted with ac_to_integer() first. When its element size is
 * below 32 bits it is zero-extended to i32 for the call and truncated to the
 * matching integer type afterwards. The result is bitcast back to the original
 * type of \p src.
 */
static LLVMValueRef ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   const bool narrow = ac_get_elem_bits(ctx, orig_type) < 32;
   char name[32], type[8];

   LLVMValueRef value = ac_to_integer(ctx, src);

   if (narrow)
      value = LLVMBuildZExt(ctx->builder, value, ctx->i32, "");

   /* The intrinsic name is overloaded on the (possibly widened) operand type. */
   LLVMTypeRef call_type = LLVMTypeOf(value);
   ac_build_type_name_for_intr(call_type, type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type);

   LLVMValueRef args[1] = {value};
   LLVMValueRef wwm = ac_build_intrinsic(ctx, name, call_type, args, 1, AC_FUNC_ATTR_READNONE);

   if (narrow)
      wwm = LLVMBuildTrunc(ctx->builder, wwm, ac_to_integer_type(ctx, orig_type), "");

   return LLVMBuildBitCast(ctx->builder, wwm, orig_type, "");
}
3678
3679
/**
 * Emit llvm.amdgcn.set.inactive.<type> with operands \p src and \p inactive.
 *
 * Both operands are converted with ac_to_integer(). When the element size
 * reported by ac_get_elem_bits() for the type of \p src is below 32 bits, the
 * operands are zero-extended to i32 for the call and the result is truncated
 * again afterwards.
 *
 * \return the intrinsic result as an integer value; no bitcast back to the
 *         original type of \p src is performed here.
 */
static LLVMValueRef ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
                                          LLVMValueRef inactive)
{
   char name[33], type[8];
   LLVMTypeRef src_type = LLVMTypeOf(src);
   unsigned bitsize = ac_get_elem_bits(ctx, src_type);

   src = ac_to_integer(ctx, src);
   inactive = ac_to_integer(ctx, inactive);

   /* Integer type of the value before any widening. A trunc needs an integer
    * destination type, so the narrowed result is truncated to this type rather
    * than to src_type, which may be a floating-point type such as f16. For
    * integer inputs this is presumably the same type as src_type.
    */
   LLVMTypeRef int_type = LLVMTypeOf(src);

   if (bitsize < 32) {
      src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
      inactive = LLVMBuildZExt(ctx->builder, inactive, ctx->i32, "");
   }

   /* The intrinsic name is overloaded on the (possibly widened) operand type. */
   ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);

   LLVMValueRef ret =
      ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src, inactive}, 2,
                         AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   if (bitsize < 32)
      ret = LLVMBuildTrunc(ctx->builder, ret, int_type, "");

   return ret;
}
3703
3704
/**
 * Return the identity constant used for reduction/scan operation \p op.
 *
 * \param type_size  Selects the constant's type: 0 yields i1 constants, 1 yields
 *                   i8, 2 yields i16/f16, 4 yields i32/f32 and any other value
 *                   is treated as 64-bit (i64/f64).
 *
 * Operations not listed for a given size hit unreachable().
 */
static LLVMValueRef get_reduction_identity(struct ac_llvm_context *ctx, nir_op op,
                                           unsigned type_size)
{
   switch (type_size) {
   case 0:
      switch (op) {
      case nir_op_ior:
      case nir_op_ixor:
         return LLVMConstInt(ctx->i1, 0, 0);
      case nir_op_iand:
         return LLVMConstInt(ctx->i1, 1, 0);
      default:
         unreachable("bad reduction intrinsic");
      }

   case 1:
      switch (op) {
      case nir_op_iadd:
      case nir_op_umax:
      case nir_op_ior:
      case nir_op_ixor:
         return ctx->i8_0;
      case nir_op_imul:
         return ctx->i8_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i8, INT8_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i8, UINT8_MAX, 0);
      case nir_op_imax:
         return LLVMConstInt(ctx->i8, INT8_MIN, 0);
      case nir_op_iand:
         return LLVMConstInt(ctx->i8, -1, 0);
      default:
         unreachable("bad reduction intrinsic");
      }

   case 2:
      switch (op) {
      case nir_op_iadd:
      case nir_op_umax:
      case nir_op_ior:
      case nir_op_ixor:
         return ctx->i16_0;
      case nir_op_fadd:
         return ctx->f16_0;
      case nir_op_imul:
         return ctx->i16_1;
      case nir_op_fmul:
         return ctx->f16_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i16, INT16_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i16, UINT16_MAX, 0);
      case nir_op_fmin:
         return LLVMConstReal(ctx->f16, INFINITY);
      case nir_op_imax:
         return LLVMConstInt(ctx->i16, INT16_MIN, 0);
      case nir_op_fmax:
         return LLVMConstReal(ctx->f16, -INFINITY);
      case nir_op_iand:
         return LLVMConstInt(ctx->i16, -1, 0);
      default:
         unreachable("bad reduction intrinsic");
      }

   case 4:
      switch (op) {
      case nir_op_iadd:
      case nir_op_umax:
      case nir_op_ior:
      case nir_op_ixor:
         return ctx->i32_0;
      case nir_op_fadd:
         return ctx->f32_0;
      case nir_op_imul:
         return ctx->i32_1;
      case nir_op_fmul:
         return ctx->f32_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i32, INT32_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
      case nir_op_fmin:
         return LLVMConstReal(ctx->f32, INFINITY);
      case nir_op_imax:
         return LLVMConstInt(ctx->i32, INT32_MIN, 0);
      case nir_op_fmax:
         return LLVMConstReal(ctx->f32, -INFINITY);
      case nir_op_iand:
         return LLVMConstInt(ctx->i32, -1, 0);
      default:
         unreachable("bad reduction intrinsic");
      }

   default: /* type_size == 64bit */
      switch (op) {
      case nir_op_iadd:
      case nir_op_umax:
      case nir_op_ior:
      case nir_op_ixor:
         return ctx->i64_0;
      case nir_op_fadd:
         return ctx->f64_0;
      case nir_op_imul:
         return ctx->i64_1;
      case nir_op_fmul:
         return ctx->f64_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i64, INT64_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
      case nir_op_fmin:
         return LLVMConstReal(ctx->f64, INFINITY);
      case nir_op_imax:
         return LLVMConstInt(ctx->i64, INT64_MIN, 0);
      case nir_op_fmax:
         return LLVMConstReal(ctx->f64, -INFINITY);
      case nir_op_iand:
         return LLVMConstInt(ctx->i64, -1, 0);
      default:
         unreachable("bad reduction intrinsic");
      }
   }
}
3836
3837
static LLVMValueRef ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs,
3838
nir_op op)
3839
{
3840
bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8;
3841
bool _32bit = ac_get_type_size(LLVMTypeOf(lhs)) == 4;
3842
switch (op) {
3843
case nir_op_iadd:
3844
return LLVMBuildAdd(ctx->builder, lhs, rhs, "");
3845
case nir_op_fadd:
3846
return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");
3847
case nir_op_imul:
3848
return LLVMBuildMul(ctx->builder, lhs, rhs, "");
3849
case nir_op_fmul:
3850
return LLVMBuildFMul(ctx->builder, lhs, rhs, "");
3851
case nir_op_imin:
3852
return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),
3853
lhs, rhs, "");
3854
case nir_op_umin:
3855
return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),
3856
lhs, rhs, "");
3857
case nir_op_fmin:
3858
return ac_build_intrinsic(
3859
ctx, _64bit ? "llvm.minnum.f64" : _32bit ? "llvm.minnum.f32" : "llvm.minnum.f16",
3860
_64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2,
3861
AC_FUNC_ATTR_READNONE);
3862
case nir_op_imax:
3863
return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),
3864
lhs, rhs, "");
3865
case nir_op_umax:
3866
return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),
3867
lhs, rhs, "");
3868
case nir_op_fmax:
3869
return ac_build_intrinsic(
3870
ctx, _64bit ? "llvm.maxnum.f64" : _32bit ? "llvm.maxnum.f32" : "llvm.maxnum.f16",
3871
_64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2,
3872
AC_FUNC_ATTR_READNONE);
3873
case nir_op_iand:
3874
return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
3875
case nir_op_ior:
3876
return LLVMBuildOr(ctx->builder, lhs, rhs, "");
3877
case nir_op_ixor:
3878
return LLVMBuildXor(ctx->builder, lhs, rhs, "");
3879
default:
3880
unreachable("bad reduction intrinsic");
3881
}
3882
}
3883
3884
/**
3885
* \param src The value to shift.
3886
* \param identity The value to use the first lane.
3887
* \param maxprefix specifies that the result only needs to be correct for a
3888
* prefix of this many threads
3889
* \return src, shifted 1 lane up, and identity shifted into lane 0.
3890
*/
3891
static LLVMValueRef ac_wavefront_shift_right_1(struct ac_llvm_context *ctx, LLVMValueRef src,
3892
LLVMValueRef identity, unsigned maxprefix)
3893
{
3894
if (ctx->chip_class >= GFX10) {
3895
/* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */
3896
LLVMValueRef active, tmp1, tmp2;
3897
LLVMValueRef tid = ac_get_thread_id(ctx);
3898
3899
tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
3900
3901
tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false);
3902
3903
if (maxprefix > 32) {
3904
active =
3905
LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, false), "");
3906
3907
tmp2 = LLVMBuildSelect(ctx->builder, active,
3908
ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, false)),
3909
tmp2, "");
3910
3911
active = LLVMBuildOr(
3912
ctx->builder, active,
3913
LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3914
LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, false), ""),
3915
LLVMConstInt(ctx->i32, 0x10, false), ""),
3916
"");
3917
return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3918
} else if (maxprefix > 16) {
3919
active =
3920
LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 16, false), "");
3921
3922
return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3923
}
3924
} else if (ctx->chip_class >= GFX8) {
3925
return ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
3926
}
3927
3928
/* wavefront shift_right by 1 on SI/CI */
3929
LLVMValueRef active, tmp1, tmp2;
3930
LLVMValueRef tid = ac_get_thread_id(ctx);
3931
tmp1 = ac_build_ds_swizzle(ctx, src, (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
3932
tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x18, 0x03, 0x00));
3933
active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3934
LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x7, 0), ""),
3935
LLVMConstInt(ctx->i32, 0x4, 0), "");
3936
tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3937
tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x10, 0x07, 0x00));
3938
active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3939
LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0xf, 0), ""),
3940
LLVMConstInt(ctx->i32, 0x8, 0), "");
3941
tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3942
tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x00, 0x0f, 0x00));
3943
active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3944
LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, 0), ""),
3945
LLVMConstInt(ctx->i32, 0x10, 0), "");
3946
tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3947
tmp2 = ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, 0));
3948
active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, 0), "");
3949
tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3950
active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 0, 0), "");
3951
return LLVMBuildSelect(ctx->builder, active, identity, tmp1, "");
3952
}
3953
3954
/**
3955
* \param maxprefix specifies that the result only needs to be correct for a
3956
* prefix of this many threads
3957
*/
3958
static LLVMValueRef ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src,
3959
LLVMValueRef identity, unsigned maxprefix, bool inclusive)
3960
{
3961
LLVMValueRef result, tmp;
3962
3963
if (!inclusive)
3964
src = ac_wavefront_shift_right_1(ctx, src, identity, maxprefix);
3965
3966
result = src;
3967
3968
if (ctx->chip_class <= GFX7) {
3969
assert(maxprefix == 64);
3970
LLVMValueRef tid = ac_get_thread_id(ctx);
3971
LLVMValueRef active;
3972
tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 0x00, 0x00));
3973
active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3974
LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""), ctx->i32_0, "");
3975
tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3976
result = ac_build_alu_op(ctx, result, tmp, op);
3977
tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 0x01, 0x00));
3978
active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3979
LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 2, 0), ""),
3980
ctx->i32_0, "");
3981
tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3982
result = ac_build_alu_op(ctx, result, tmp, op);
3983
tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00));
3984
active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3985
LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 4, 0), ""),
3986
ctx->i32_0, "");
3987
tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3988
result = ac_build_alu_op(ctx, result, tmp, op);
3989
tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00));
3990
active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3991
LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 8, 0), ""),
3992
ctx->i32_0, "");
3993
tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3994
result = ac_build_alu_op(ctx, result, tmp, op);
3995
tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00));
3996
active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3997
LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, 0), ""),
3998
ctx->i32_0, "");
3999
tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
4000
result = ac_build_alu_op(ctx, result, tmp, op);
4001
tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0));
4002
active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
4003
LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 32, 0), ""),
4004
ctx->i32_0, "");
4005
tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
4006
result = ac_build_alu_op(ctx, result, tmp, op);
4007
return result;
4008
}
4009
4010
if (maxprefix <= 1)
4011
return result;
4012
tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
4013
result = ac_build_alu_op(ctx, result, tmp, op);
4014
if (maxprefix <= 2)
4015
return result;
4016
tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
4017
result = ac_build_alu_op(ctx, result, tmp, op);
4018
if (maxprefix <= 3)
4019
return result;
4020
tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
4021
result = ac_build_alu_op(ctx, result, tmp, op);
4022
if (maxprefix <= 4)
4023
return result;
4024
tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
4025
result = ac_build_alu_op(ctx, result, tmp, op);
4026
if (maxprefix <= 8)
4027
return result;
4028
tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
4029
result = ac_build_alu_op(ctx, result, tmp, op);
4030
if (maxprefix <= 16)
4031
return result;
4032
4033
if (ctx->chip_class >= GFX10) {
4034
LLVMValueRef tid = ac_get_thread_id(ctx);
4035
LLVMValueRef active;
4036
4037
tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false);
4038
4039
active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
4040
LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, false), ""),
4041
ctx->i32_0, "");
4042
4043
tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
4044
4045
result = ac_build_alu_op(ctx, result, tmp, op);
4046
4047
if (maxprefix <= 32)
4048
return result;
4049
4050
tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
4051
4052
active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, LLVMConstInt(ctx->i32, 32, false), "");
4053
4054
tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
4055
4056
result = ac_build_alu_op(ctx, result, tmp, op);
4057
return result;
4058
}
4059
4060
tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
4061
result = ac_build_alu_op(ctx, result, tmp, op);
4062
if (maxprefix <= 32)
4063
return result;
4064
tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
4065
result = ac_build_alu_op(ctx, result, tmp, op);
4066
return result;
4067
}
4068
4069
/**
 * Inclusive scan of \p src across the wave with operation \p op.
 *
 * An i1 source combined with nir_op_iadd takes a shortcut: the source is
 * zero-extended to i32, and the result is ac_build_mbcnt() of its ballot plus
 * the zero-extended source.
 *
 * Every other case replaces inactive lanes with the identity of \p op, runs
 * ac_build_scan() over ctx->wave_size lanes and wraps the result in
 * ac_build_wwm().
 */
LLVMValueRef ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
   const bool is_bool_add = LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd;

   if (is_bool_add) {
      LLVMBuilderRef builder = ctx->builder;
      LLVMValueRef src32 = LLVMBuildZExt(builder, src, ctx->i32, "");
      LLVMValueRef ballot = ac_build_ballot(ctx, src32);
      LLVMValueRef count = ac_build_mbcnt(ctx, ballot);

      return LLVMBuildAdd(builder, count, src32, "");
   }

   ac_build_optimization_barrier(ctx, &src, false);

   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   LLVMValueRef masked = ac_build_set_inactive(ctx, src, identity);
   LLVMValueRef scan = LLVMBuildBitCast(ctx->builder, masked, LLVMTypeOf(identity), "");

   scan = ac_build_scan(ctx, op, scan, identity, ctx->wave_size, true);

   return ac_build_wwm(ctx, scan);
}
4091
4092
/**
 * Exclusive scan of \p src across the wave with operation \p op.
 *
 * An i1 source combined with nir_op_iadd takes a shortcut: the source is
 * zero-extended to i32 and the result is ac_build_mbcnt() of its ballot.
 *
 * Every other case replaces inactive lanes with the identity of \p op, runs a
 * non-inclusive ac_build_scan() over ctx->wave_size lanes and wraps the result
 * in ac_build_wwm().
 */
LLVMValueRef ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
   const bool is_bool_add = LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd;

   if (is_bool_add) {
      LLVMBuilderRef builder = ctx->builder;
      LLVMValueRef src32 = LLVMBuildZExt(builder, src, ctx->i32, "");
      LLVMValueRef ballot = ac_build_ballot(ctx, src32);

      return ac_build_mbcnt(ctx, ballot);
   }

   ac_build_optimization_barrier(ctx, &src, false);

   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   LLVMValueRef masked = ac_build_set_inactive(ctx, src, identity);
   LLVMValueRef scan = LLVMBuildBitCast(ctx->builder, masked, LLVMTypeOf(identity), "");

   scan = ac_build_scan(ctx, op, scan, identity, ctx->wave_size, false);

   return ac_build_wwm(ctx, scan);
}
4113
4114
/**
 * Reduce \p src with operation \p op over clusters of \p cluster_size lanes.
 *
 * A cluster size of 1 returns \p src untouched. Otherwise inactive lanes are
 * replaced with the identity of \p op and the value is combined in stages of
 * doubling width; the function returns through ac_build_wwm() as soon as the
 * stage matching \p cluster_size (2, 4, 8, 16 or 32) has been emitted. Any
 * other cluster size continues to the final whole-wave stage.
 */
LLVMValueRef ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op,
                             unsigned cluster_size)
{
   if (cluster_size == 1)
      return src;

   ac_build_optimization_barrier(ctx, &src, false);

   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   LLVMValueRef masked = ac_build_set_inactive(ctx, src, identity);
   LLVMValueRef acc = LLVMBuildBitCast(ctx->builder, masked, LLVMTypeOf(identity), "");
   LLVMValueRef other;

   /* Stage for clusters of 2: quad swizzle (1, 0, 3, 2). */
   other = ac_build_quad_swizzle(ctx, acc, 1, 0, 3, 2);
   acc = ac_build_alu_op(ctx, acc, other, op);
   if (cluster_size == 2)
      return ac_build_wwm(ctx, acc);

   /* Stage for clusters of 4: quad swizzle (2, 3, 0, 1). */
   other = ac_build_quad_swizzle(ctx, acc, 2, 3, 0, 1);
   acc = ac_build_alu_op(ctx, acc, other, op);
   if (cluster_size == 4)
      return ac_build_wwm(ctx, acc);

   /* Stage for clusters of 8: DPP half-mirror on GFX8+, else swizzle xor 0x04. */
   if (ctx->chip_class >= GFX8) {
      other = ac_build_dpp(ctx, identity, acc, dpp_row_half_mirror, 0xf, 0xf, false);
   } else {
      other = ac_build_ds_swizzle(ctx, acc, ds_pattern_bitmode(0x1f, 0, 0x04));
   }
   acc = ac_build_alu_op(ctx, acc, other, op);
   if (cluster_size == 8)
      return ac_build_wwm(ctx, acc);

   /* Stage for clusters of 16: DPP row mirror on GFX8+, else swizzle xor 0x08. */
   if (ctx->chip_class >= GFX8) {
      other = ac_build_dpp(ctx, identity, acc, dpp_row_mirror, 0xf, 0xf, false);
   } else {
      other = ac_build_ds_swizzle(ctx, acc, ds_pattern_bitmode(0x1f, 0, 0x08));
   }
   acc = ac_build_alu_op(ctx, acc, other, op);
   if (cluster_size == 16)
      return ac_build_wwm(ctx, acc);

   /* Stage for clusters of 32: permlanex16 on GFX10+; row broadcast on GFX8+
    * unless the cluster size is exactly 32; swizzle xor 0x10 otherwise.
    */
   if (ctx->chip_class >= GFX10) {
      other = ac_build_permlane16(ctx, acc, 0, true, false);
   } else if (ctx->chip_class >= GFX8 && cluster_size != 32) {
      other = ac_build_dpp(ctx, identity, acc, dpp_row_bcast15, 0xa, 0xf, false);
   } else {
      other = ac_build_ds_swizzle(ctx, acc, ds_pattern_bitmode(0x1f, 0, 0x10));
   }
   acc = ac_build_alu_op(ctx, acc, other, op);
   if (cluster_size == 32)
      return ac_build_wwm(ctx, acc);

   /* Whole-wave stage. */
   if (ctx->chip_class < GFX8) {
      /* Combine the values held by lane 32 and lane 0. */
      other = ac_build_readlane(ctx, acc, ctx->i32_0);
      acc = ac_build_readlane(ctx, acc, LLVMConstInt(ctx->i32, 32, 0));
      acc = ac_build_alu_op(ctx, acc, other, op);
      return ac_build_wwm(ctx, acc);
   }

   if (ctx->wave_size == 64) {
      if (ctx->chip_class >= GFX10) {
         other = ac_build_readlane(ctx, acc, LLVMConstInt(ctx->i32, 31, false));
      } else {
         other = ac_build_dpp(ctx, identity, acc, dpp_row_bcast31, 0xc, 0xf, false);
      }
      acc = ac_build_alu_op(ctx, acc, other, op);
      /* The result is read from lane 63. */
      acc = ac_build_readlane(ctx, acc, LLVMConstInt(ctx->i32, 63, 0));
   }

   return ac_build_wwm(ctx, acc);
}
4178
4179
/**
4180
* "Top half" of a scan that reduces per-wave values across an entire
4181
* workgroup.
4182
*
4183
* The source value must be present in the highest lane of the wave, and the
4184
* highest lane must be live.
4185
*/
4186
void ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4187
{
4188
if (ws->maxwaves <= 1)
4189
return;
4190
4191
const LLVMValueRef last_lane = LLVMConstInt(ctx->i32, ctx->wave_size - 1, false);
4192
LLVMBuilderRef builder = ctx->builder;
4193
LLVMValueRef tid = ac_get_thread_id(ctx);
4194
LLVMValueRef tmp;
4195
4196
tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, last_lane, "");
4197
ac_build_ifcc(ctx, tmp, 1000);
4198
LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, ""));
4199
ac_build_endif(ctx, 1000);
4200
}
4201
4202
/**
4203
* "Bottom half" of a scan that reduces per-wave values across an entire
4204
* workgroup.
4205
*
4206
* The caller must place a barrier between the top and bottom halves.
4207
*/
4208
void ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4209
{
4210
const LLVMTypeRef type = LLVMTypeOf(ws->src);
4211
const LLVMValueRef identity = get_reduction_identity(ctx, ws->op, ac_get_type_size(type));
4212
4213
if (ws->maxwaves <= 1) {
4214
ws->result_reduce = ws->src;
4215
ws->result_inclusive = ws->src;
4216
ws->result_exclusive = identity;
4217
return;
4218
}
4219
assert(ws->maxwaves <= 32);
4220
4221
LLVMBuilderRef builder = ctx->builder;
4222
LLVMValueRef tid = ac_get_thread_id(ctx);
4223
LLVMBasicBlockRef bbs[2];
4224
LLVMValueRef phivalues_scan[2];
4225
LLVMValueRef tmp, tmp2;
4226
4227
bbs[0] = LLVMGetInsertBlock(builder);
4228
phivalues_scan[0] = LLVMGetUndef(type);
4229
4230
if (ws->enable_reduce)
4231
tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, "");
4232
else if (ws->enable_inclusive)
4233
tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, "");
4234
else
4235
tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, "");
4236
ac_build_ifcc(ctx, tmp, 1001);
4237
{
4238
tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), "");
4239
4240
ac_build_optimization_barrier(ctx, &tmp, false);
4241
4242
bbs[1] = LLVMGetInsertBlock(builder);
4243
phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves, true);
4244
}
4245
ac_build_endif(ctx, 1001);
4246
4247
const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs);
4248
4249
if (ws->enable_reduce) {
4250
tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, "");
4251
ws->result_reduce = ac_build_readlane(ctx, scan, tmp);
4252
}
4253
if (ws->enable_inclusive)
4254
ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx);
4255
if (ws->enable_exclusive) {
4256
tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, "");
4257
tmp = ac_build_readlane(ctx, scan, tmp);
4258
tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, "");
4259
ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, "");
4260
}
4261
}
4262
4263
/**
 * Inclusive scan of a per-wave value across an entire workgroup.
 *
 * This implies an s_barrier instruction: the top half, ac_build_s_barrier()
 * and the bottom half are emitted back to back.
 *
 * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads
 * of the workgroup are live. (This requirement cannot easily be relaxed in a
 * useful manner because of the barrier in the algorithm.)
 */
void ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
   /* Publish the per-wave value. */
   ac_build_wg_wavescan_top(ctx, ws);

   /* Barrier between the two halves, as the bottom half requires. */
   ac_build_s_barrier(ctx);

   /* Scan the published values. */
   ac_build_wg_wavescan_bottom(ctx, ws);
}
4278
4279
/**
4280
* "Top half" of a scan that reduces per-thread values across an entire
4281
* workgroup.
4282
*
4283
* All lanes must be active when this code runs.
4284
*/
4285
void ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4286
{
4287
if (ws->enable_exclusive) {
4288
ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op);
4289
if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd)
4290
ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, "");
4291
ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op);
4292
} else {
4293
ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op);
4294
}
4295
4296
bool enable_inclusive = ws->enable_inclusive;
4297
bool enable_exclusive = ws->enable_exclusive;
4298
ws->enable_inclusive = false;
4299
ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4300
ac_build_wg_wavescan_top(ctx, ws);
4301
ws->enable_inclusive = enable_inclusive;
4302
ws->enable_exclusive = enable_exclusive;
4303
}
4304
4305
/**
4306
* "Bottom half" of a scan that reduces per-thread values across an entire
4307
* workgroup.
4308
*
4309
* The caller must place a barrier between the top and bottom halves.
4310
*/
4311
void ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4312
{
4313
bool enable_inclusive = ws->enable_inclusive;
4314
bool enable_exclusive = ws->enable_exclusive;
4315
ws->enable_inclusive = false;
4316
ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4317
ac_build_wg_wavescan_bottom(ctx, ws);
4318
ws->enable_inclusive = enable_inclusive;
4319
ws->enable_exclusive = enable_exclusive;
4320
4321
/* ws->result_reduce is already the correct value */
4322
if (ws->enable_inclusive)
4323
ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op);
4324
if (ws->enable_exclusive)
4325
ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
4326
}
4327
4328
/**
 * A scan that reduces per-thread values across an entire workgroup.
 *
 * The caller must ensure that all lanes are active when this code runs
 * (WWM is insufficient!), because there is an implied barrier: the top half,
 * ac_build_s_barrier() and the bottom half are emitted back to back.
 */
void ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
   /* Per-wave scan plus publication of the per-wave value. */
   ac_build_wg_scan_top(ctx, ws);

   /* Barrier between the two halves, as the bottom half requires. */
   ac_build_s_barrier(ctx);

   /* Cross-wave scan and final combination. */
   ac_build_wg_scan_bottom(ctx, ws);
}
4340
4341
/**
 * Permute \p src with the quad permutation (lane0, lane1, lane2, lane3).
 *
 * The permutation is encoded with dpp_quad_perm(). GFX8 and newer use
 * ac_build_dpp() with \p src as both operands; older chips use
 * ac_build_ds_swizzle() with bit 15 set in the pattern.
 */
LLVMValueRef ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned lane0,
                                   unsigned lane1, unsigned lane2, unsigned lane3)
{
   unsigned perm = dpp_quad_perm(lane0, lane1, lane2, lane3);

   if (ctx->chip_class < GFX8)
      return ac_build_ds_swizzle(ctx, src, (1 << 15) | perm);

   return ac_build_dpp(ctx, src, src, perm, 0xf, 0xf, false);
}
4351
4352
/**
 * Emit llvm.amdgcn.ds.bpermute on \p src with \p index.
 *
 * The index is multiplied by 4 before being handed to the intrinsic. The
 * source is zero-extended to i32 for the call and the result is truncated
 * back to the original type of \p src.
 */
LLVMValueRef ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMTypeRef src_type = LLVMTypeOf(src);

   LLVMValueRef scaled_index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
   LLVMValueRef wide_src = LLVMBuildZExt(builder, src, ctx->i32, "");

   LLVMValueRef args[2] = {scaled_index, wide_src};
   LLVMValueRef permuted = ac_build_intrinsic(ctx, "llvm.amdgcn.ds.bpermute", ctx->i32, args, 2,
                                              AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   return LLVMBuildTrunc(builder, permuted, src_type, "");
}
4365
4366
/**
 * Emit the AMDGCN frexp-exponent intrinsic for \p src0.
 *
 * \param bitsize  16 selects the i16/f16 variant and 32 the i32/f32 variant;
 *                 any other value selects the i32/f64 variant.
 * \return the result of the intrinsic call (i16 for bitsize 16, i32
 *         otherwise).
 */
LLVMValueRef ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   /* Only ever points at string literals, so it must be const-qualified. */
   const char *intr;

   if (bitsize == 16) {
      intr = "llvm.amdgcn.frexp.exp.i16.f16";
      type = ctx->i16;
   } else if (bitsize == 32) {
      intr = "llvm.amdgcn.frexp.exp.i32.f32";
      type = ctx->i32;
   } else {
      intr = "llvm.amdgcn.frexp.exp.i32.f64";
      type = ctx->i32;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
}
4387
/**
 * Emit the AMDGCN frexp-mantissa intrinsic for \p src0.
 *
 * \param bitsize  16 selects the f16 variant and 32 the f32 variant; any
 *                 other value selects the f64 variant.
 * \return the result of the intrinsic call, in the float type matching the
 *         selected variant.
 */
LLVMValueRef ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   /* Only ever points at string literals, so it must be const-qualified. */
   const char *intr;

   if (bitsize == 16) {
      intr = "llvm.amdgcn.frexp.mant.f16";
      type = ctx->f16;
   } else if (bitsize == 32) {
      intr = "llvm.amdgcn.frexp.mant.f32";
      type = ctx->f32;
   } else {
      intr = "llvm.amdgcn.frexp.mant.f64";
      type = ctx->f64;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
}
4408
4409
/**
 * Emit llvm.canonicalize for \p src0.
 *
 * \param bitsize  16 selects the f16 variant and 32 the f32 variant; any
 *                 other value selects the f64 variant.
 * \return the result of the intrinsic call, in the float type matching the
 *         selected variant.
 */
LLVMValueRef ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   /* Only ever points at string literals, so it must be const-qualified. */
   const char *intr;

   if (bitsize == 16) {
      intr = "llvm.canonicalize.f16";
      type = ctx->f16;
   } else if (bitsize == 32) {
      intr = "llvm.canonicalize.f32";
      type = ctx->f32;
   } else {
      intr = "llvm.canonicalize.f64";
      type = ctx->f64;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
}
4430
4431
/*
4432
* this takes an I,J coordinate pair,
4433
* and works out the X and Y derivatives.
4434
* it returns DDX(I), DDX(J), DDY(I), DDY(J).
4435
*/
4436
LLVMValueRef ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij)
4437
{
4438
LLVMValueRef result[4], a;
4439
unsigned i;
4440
4441
for (i = 0; i < 2; i++) {
4442
a = LLVMBuildExtractElement(ctx->builder, interp_ij, LLVMConstInt(ctx->i32, i, false), "");
4443
result[i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, a);
4444
result[2 + i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, a);
4445
}
4446
return ac_build_gather_values(ctx, result, 4);
4447
}
4448
4449
LLVMValueRef ac_build_load_helper_invocation(struct ac_llvm_context *ctx)
4450
{
4451
LLVMValueRef result;
4452
4453
if (LLVM_VERSION_MAJOR >= 13) {
4454
result = ac_build_intrinsic(ctx, "llvm.amdgcn.live.mask", ctx->i1, NULL, 0,
4455
AC_FUNC_ATTR_READONLY | AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
4456
} else {
4457
result = ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0,
4458
AC_FUNC_ATTR_READNONE);
4459
}
4460
return LLVMBuildNot(ctx->builder, result, "");
4461
}
4462
4463
LLVMValueRef ac_build_is_helper_invocation(struct ac_llvm_context *ctx)
4464
{
4465
if (!ctx->postponed_kill)
4466
return ac_build_load_helper_invocation(ctx);
4467
4468
/* postponed_kill should be NULL on LLVM 13+ */
4469
assert(LLVM_VERSION_MAJOR < 13);
4470
4471
/* !(exact && postponed) */
4472
LLVMValueRef exact =
4473
ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0, AC_FUNC_ATTR_READNONE);
4474
4475
LLVMValueRef postponed = LLVMBuildLoad(ctx->builder, ctx->postponed_kill, "");
4476
return LLVMBuildNot(ctx->builder, LLVMBuildAnd(ctx->builder, exact, postponed, ""), "");
4477
}
4478
4479
/**
 * Emit a call to \p func with \p num_args arguments taken from \p args.
 *
 * The call instruction is given the same calling convention as \p func.
 *
 * \return the call instruction.
 */
LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func, LLVMValueRef *args,
                           unsigned num_args)
{
   LLVMValueRef call = LLVMBuildCall(ctx->builder, func, args, num_args, "");

   /* Copy the callee's calling convention onto the call site. */
   LLVMSetInstructionCallConv(call, LLVMGetFunctionCallConv(func));

   return call;
}
4486
4487
/**
 * Fill \p args with the export arguments for the MRTZ target.
 *
 * At least one of \p depth, \p stencil and \p samplemask must be non-NULL
 * (asserted). \p args is cleared first, so nothing of its previous contents
 * survives.
 *
 * When the Z format is SPI_SHADER_UINT16_ABGR the export is compressed:
 * stencil goes to X[23:16] and the sample mask to Y[15:0], and \p depth
 * must be NULL. Otherwise depth, stencil and sample mask are written to
 * channels 0, 1 and 2 respectively.
 */
void ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,
                     LLVMValueRef samplemask, struct ac_export_args *args)
{
   unsigned channels = 0;
   const unsigned format =
      ac_get_spi_shader_z_format(depth != NULL, stencil != NULL, samplemask != NULL);
   const bool compressed = format == V_028710_SPI_SHADER_UINT16_ABGR;

   assert(depth || stencil || samplemask);

   memset(args, 0, sizeof(*args));

   args->valid_mask = 1; /* whether the EXEC mask is valid */
   args->done = 1;       /* DONE bit */

   /* Specify the target we are exporting */
   args->target = V_008DFC_SQ_EXP_MRTZ;

   args->compr = compressed ? 1 : 0; /* COMPR flag */

   /* Channel layout of the uncompressed export:
    *   0: R, depth
    *   1: G, stencil test val[0:7], stencil op val[8:15]
    *   2: B, sample mask
    *   3: A, alpha to mask
    */
   for (unsigned chan = 0; chan < 4; chan++)
      args->out[chan] = LLVMGetUndef(ctx->f32);

   if (compressed) {
      assert(!depth);

      if (stencil) {
         /* Stencil should be in X[23:16]. */
         LLVMValueRef shifted = ac_to_integer(ctx, stencil);
         shifted = LLVMBuildShl(ctx->builder, shifted, LLVMConstInt(ctx->i32, 16, 0), "");
         args->out[0] = ac_to_float(ctx, shifted);
         channels |= 0x3;
      }
      if (samplemask) {
         /* SampleMask should be in Y[15:0]. */
         args->out[1] = samplemask;
         channels |= 0xc;
      }
   } else {
      if (depth) {
         args->out[0] = depth;
         channels |= 0x1;
      }
      if (stencil) {
         args->out[1] = stencil;
         channels |= 0x2;
      }
      if (samplemask) {
         args->out[2] = samplemask;
         channels |= 0x4;
      }
   }

   /* GFX6 (except OLAND and HAINAN) has a bug that it only looks
    * at the X writemask component. */
   const bool x_writemask_bug =
      ctx->chip_class == GFX6 && ctx->family != CHIP_OLAND && ctx->family != CHIP_HAINAN;
   if (x_writemask_bug)
      channels |= 0x1;

   /* Specify which components to enable */
   args->enabled_channels = channels;
}
4548
4549
/* Send GS Alloc Req message from the first wave of the group to SPI.
4550
* Message payload is:
4551
* - bits 0..10: vertices in group
4552
* - bits 12..22: primitives in group
4553
*/
4554
void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wave_id,
4555
LLVMValueRef vtx_cnt, LLVMValueRef prim_cnt)
4556
{
4557
LLVMBuilderRef builder = ctx->builder;
4558
LLVMValueRef tmp;
4559
bool export_dummy_prim = false;
4560
4561
/* HW workaround for a GPU hang with 100% culling.
4562
* We always have to export at least 1 primitive.
4563
* Export a degenerate triangle using vertex 0 for all 3 vertices.
4564
*/
4565
if (prim_cnt == ctx->i32_0 && ctx->chip_class == GFX10) {
4566
assert(vtx_cnt == ctx->i32_0);
4567
prim_cnt = ctx->i32_1;
4568
vtx_cnt = ctx->i32_1;
4569
export_dummy_prim = true;
4570
}
4571
4572
ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, wave_id, ctx->i32_0, ""), 5020);
4573
4574
tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->i32, 12, false), "");
4575
tmp = LLVMBuildOr(builder, tmp, vtx_cnt, "");
4576
ac_build_sendmsg(ctx, AC_SENDMSG_GS_ALLOC_REQ, tmp);
4577
4578
if (export_dummy_prim) {
4579
struct ac_ngg_prim prim = {0};
4580
/* The vertex indices are 0,0,0. */
4581
prim.passthrough = ctx->i32_0;
4582
4583
struct ac_export_args pos = {0};
4584
/* The hw culls primitives with NaN. */
4585
pos.out[0] = pos.out[1] = pos.out[2] = pos.out[3] = LLVMConstReal(ctx->f32, NAN);
4586
pos.target = V_008DFC_SQ_EXP_POS;
4587
pos.enabled_channels = 0xf;
4588
pos.done = true;
4589
4590
ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(ctx), ctx->i32_0, ""),
4591
5021);
4592
ac_build_export_prim(ctx, &prim);
4593
ac_build_export(ctx, &pos);
4594
ac_build_endif(ctx, 5021);
4595
}
4596
4597
ac_build_endif(ctx, 5020);
4598
}
4599
4600
LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim)
4601
{
4602
/* The prim export format is:
4603
* - bits 0..8: index 0
4604
* - bit 9: edge flag 0
4605
* - bits 10..18: index 1
4606
* - bit 19: edge flag 1
4607
* - bits 20..28: index 2
4608
* - bit 29: edge flag 2
4609
* - bit 31: null primitive (skip)
4610
*/
4611
LLVMBuilderRef builder = ctx->builder;
4612
LLVMValueRef tmp = LLVMBuildZExt(builder, prim->isnull, ctx->i32, "");
4613
LLVMValueRef result = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 31, false), "");
4614
4615
for (unsigned i = 0; i < prim->num_vertices; ++i) {
4616
tmp = LLVMBuildShl(builder, prim->index[i], LLVMConstInt(ctx->i32, 10 * i, false), "");
4617
result = LLVMBuildOr(builder, result, tmp, "");
4618
tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->i32, "");
4619
tmp = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 10 * i + 9, false), "");
4620
result = LLVMBuildOr(builder, result, tmp, "");
4621
}
4622
return result;
4623
}
4624
4625
void ac_build_export_prim(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim)
4626
{
4627
struct ac_export_args args;
4628
4629
if (prim->passthrough) {
4630
args.out[0] = prim->passthrough;
4631
} else {
4632
args.out[0] = ac_pack_prim_export(ctx, prim);
4633
}
4634
4635
args.out[0] = LLVMBuildBitCast(ctx->builder, args.out[0], ctx->f32, "");
4636
args.out[1] = LLVMGetUndef(ctx->f32);
4637
args.out[2] = LLVMGetUndef(ctx->f32);
4638
args.out[3] = LLVMGetUndef(ctx->f32);
4639
4640
args.target = V_008DFC_SQ_EXP_PRIM;
4641
args.enabled_channels = 1;
4642
args.done = true;
4643
args.valid_mask = false;
4644
args.compr = false;
4645
4646
ac_build_export(ctx, &args);
4647
}
4648
4649
static LLVMTypeRef arg_llvm_type(enum ac_arg_type type, unsigned size, struct ac_llvm_context *ctx)
4650
{
4651
if (type == AC_ARG_FLOAT) {
4652
return size == 1 ? ctx->f32 : LLVMVectorType(ctx->f32, size);
4653
} else if (type == AC_ARG_INT) {
4654
return size == 1 ? ctx->i32 : LLVMVectorType(ctx->i32, size);
4655
} else {
4656
LLVMTypeRef ptr_type;
4657
switch (type) {
4658
case AC_ARG_CONST_PTR:
4659
ptr_type = ctx->i8;
4660
break;
4661
case AC_ARG_CONST_FLOAT_PTR:
4662
ptr_type = ctx->f32;
4663
break;
4664
case AC_ARG_CONST_PTR_PTR:
4665
ptr_type = ac_array_in_const32_addr_space(ctx->i8);
4666
break;
4667
case AC_ARG_CONST_DESC_PTR:
4668
ptr_type = ctx->v4i32;
4669
break;
4670
case AC_ARG_CONST_IMAGE_PTR:
4671
ptr_type = ctx->v8i32;
4672
break;
4673
default:
4674
unreachable("unknown arg type");
4675
}
4676
if (size == 1) {
4677
return ac_array_in_const32_addr_space(ptr_type);
4678
} else {
4679
assert(size == 2);
4680
return ac_array_in_const_addr_space(ptr_type);
4681
}
4682
}
4683
}
4684
4685
LLVMValueRef ac_build_main(const struct ac_shader_args *args, struct ac_llvm_context *ctx,
4686
enum ac_llvm_calling_convention convention, const char *name,
4687
LLVMTypeRef ret_type, LLVMModuleRef module)
4688
{
4689
LLVMTypeRef arg_types[AC_MAX_ARGS];
4690
4691
for (unsigned i = 0; i < args->arg_count; i++) {
4692
arg_types[i] = arg_llvm_type(args->args[i].type, args->args[i].size, ctx);
4693
}
4694
4695
LLVMTypeRef main_function_type = LLVMFunctionType(ret_type, arg_types, args->arg_count, 0);
4696
4697
LLVMValueRef main_function = LLVMAddFunction(module, name, main_function_type);
4698
LLVMBasicBlockRef main_function_body =
4699
LLVMAppendBasicBlockInContext(ctx->context, main_function, "main_body");
4700
LLVMPositionBuilderAtEnd(ctx->builder, main_function_body);
4701
4702
LLVMSetFunctionCallConv(main_function, convention);
4703
for (unsigned i = 0; i < args->arg_count; ++i) {
4704
LLVMValueRef P = LLVMGetParam(main_function, i);
4705
4706
if (args->args[i].file != AC_ARG_SGPR)
4707
continue;
4708
4709
ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_INREG);
4710
4711
if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
4712
ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_NOALIAS);
4713
ac_add_attr_dereferenceable(P, UINT64_MAX);
4714
ac_add_attr_alignment(P, 4);
4715
}
4716
}
4717
4718
ctx->main_function = main_function;
4719
4720
/* Enable denormals for FP16 and FP64: */
4721
LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math", "ieee,ieee");
4722
/* Disable denormals for FP32: */
4723
LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math-f32",
4724
"preserve-sign,preserve-sign");
4725
return main_function;
4726
}
4727
4728
void ac_build_s_endpgm(struct ac_llvm_context *ctx)
4729
{
4730
LLVMTypeRef calltype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
4731
LLVMValueRef code = LLVMConstInlineAsm(calltype, "s_endpgm", "", true, false);
4732
LLVMBuildCall(ctx->builder, code, NULL, 0, "");
4733
}
4734
4735
/**
4736
* Convert triangle strip indices to triangle indices. This is used to decompose
4737
* triangle strips into triangles.
4738
*/
4739
void ac_build_triangle_strip_indices_to_triangle(struct ac_llvm_context *ctx, LLVMValueRef is_odd,
4740
LLVMValueRef flatshade_first,
4741
LLVMValueRef index[3])
4742
{
4743
LLVMBuilderRef builder = ctx->builder;
4744
LLVMValueRef out[3];
4745
4746
/* We need to change the vertex order for odd triangles to get correct
4747
* front/back facing by swapping 2 vertex indices, but we also have to
4748
* keep the provoking vertex in the same place.
4749
*
4750
* If the first vertex is provoking, swap index 1 and 2.
4751
* If the last vertex is provoking, swap index 0 and 1.
4752
*/
4753
out[0] = LLVMBuildSelect(builder, flatshade_first, index[0],
4754
LLVMBuildSelect(builder, is_odd, index[1], index[0], ""), "");
4755
out[1] = LLVMBuildSelect(builder, flatshade_first,
4756
LLVMBuildSelect(builder, is_odd, index[2], index[1], ""),
4757
LLVMBuildSelect(builder, is_odd, index[0], index[1], ""), "");
4758
out[2] = LLVMBuildSelect(builder, flatshade_first,
4759
LLVMBuildSelect(builder, is_odd, index[1], index[2], ""), index[2], "");
4760
memcpy(index, out, sizeof(out));
4761
}
4762
4763