GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/auxiliary/gallivm/lp_bld_arit.c
/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper arithmetic functions.
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. Reasons are:
 * - it is very easy given we have all necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - We often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in [0, 1] range.
 *
 * @author Jose Fonseca <[email protected]>
 */


#include <float.h>

#include <llvm/Config/llvm-config.h>

#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_arit.h"
#include "lp_bld_flow.h"

#if defined(PIPE_ARCH_SSE)
#include <xmmintrin.h>
#endif

#ifndef _MM_DENORMALS_ZERO_MASK
#define _MM_DENORMALS_ZERO_MASK 0x0040
#endif

#ifndef _MM_FLUSH_ZERO_MASK
#define _MM_FLUSH_ZERO_MASK 0x8000
#endif

#define EXP_POLY_DEGREE 5

#define LOG_POLY_DEGREE 4


/**
 * Generate min(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 * NaN's are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_get_cpu_caps()->has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.min.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
            intrinsic = "llvm.x86.sse.min.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.min.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
            intrinsic = "llvm.x86.sse2.min.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_get_cpu_caps()->has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vminfp";
         intr_size = 128;
      }
   } else if (util_get_cpu_caps()->has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminub";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsw";
         }
      }
   }

   if (intrinsic) {
      /* We need to handle NaNs for floating point numbers. If one of the
       * inputs is NaN the other should be returned (required by both D3D10+
       * and OpenCL).
       * The SSE intrinsics return the second operand in case of NaN by
       * default, so we need special code to handle those cases.
       */
      if (util_get_cpu_caps()->has_sse && type.floating &&
          nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
         LLVMValueRef isnan, min;
         min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type,
                                                   intr_size, a, b);
         isnan = lp_build_isnan(bld, b);
         return lp_build_select(bld, isnan, a, min);
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type,
                                                    intr_size, a, b);
      }
   }

   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
         break;
      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
         return lp_build_select(bld, cond, b, a);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
         break;
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}


LLVMValueRef
lp_build_fmuladd(LLVMBuilderRef builder,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 LLVMValueRef c)
{
   LLVMTypeRef type = LLVMTypeOf(a);
   assert(type == LLVMTypeOf(b));
   assert(type == LLVMTypeOf(c));

   char intrinsic[32];
   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
   LLVMValueRef args[] = { a, b, c };
   return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
}


/**
 * Generate max(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 * NaN's are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_get_cpu_caps()->has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.max.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
            intrinsic = "llvm.x86.sse.max.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.max.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
            intrinsic = "llvm.x86.sse2.max.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_get_cpu_caps()->has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vmaxfp";
         intr_size = 128;
      }
   } else if (util_get_cpu_caps()->has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxub";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsw";
         }
      }
   }

   if (intrinsic) {
      if (util_get_cpu_caps()->has_sse && type.floating &&
          nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
         LLVMValueRef isnan, max;
         max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type,
                                                   intr_size, a, b);
         isnan = lp_build_isnan(bld, b);
         return lp_build_select(bld, isnan, a, max);
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type,
                                                    intr_size, a, b);
      }
   }

   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
         break;
      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
         return lp_build_select(bld, cond, b, a);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
         break;
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}


/**
 * Generate 1 - a, or ~a depending on bld->type.
 */
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->one)
      return bld->zero;
   if(a == bld->zero)
      return bld->one;

   if(type.norm && !type.floating && !type.fixed && !type.sign) {
      if(LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(builder, a, "");
   }

   if(LLVMIsConstant(a))
      if (type.floating)
         return LLVMConstFSub(bld->one, a);
      else
         return LLVMConstSub(bld->one, a);
   else
      if (type.floating)
         return LLVMBuildFSub(builder, bld->one, a, "");
      else
         return LLVMBuildSub(builder, bld->one, a, "");
}


/**
 * Generate a + b
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == bld->zero)
      return b;
   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (type.norm) {
      const char *intrinsic = NULL;

      if (!type.sign && (a == bld->one || b == bld->one))
         return bld->one;

      if (!type.floating && !type.fixed) {
         if (LLVM_VERSION_MAJOR >= 8) {
            char intrin[32];
            intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
            lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
            return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
         }
         if (type.width * type.length == 128) {
            if (util_get_cpu_caps()->has_sse2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
            } else if (util_get_cpu_caps()->has_altivec) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
            }
         }
         if (type.width * type.length == 256) {
            if (util_get_cpu_caps()->has_avx2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
            }
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(type.norm && !type.floating && !type.fixed) {
      if (type.sign) {
         uint64_t sign = (uint64_t)1 << (type.width - 1);
         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
         /* a_clamp_max is the maximum a for positive b,
            a_clamp_min is the minimum a for negative b. */
         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
      }
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);

   if (type.norm && !type.floating && !type.fixed) {
      if (!type.sign) {
         /*
          * newer llvm versions no longer support the intrinsics, but recognize
          * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
          * code, it is important we match the pattern llvm uses (and pray llvm
          * doesn't change it - and hope they decide on the same pattern for
          * all backends supporting it...).
          * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
          * interfere with llvm's ability to recognize the pattern but seems
          * a bit brittle.
          * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
          */
         LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
         res = lp_build_select(bld, overflowed,
                               LLVMConstAllOnes(bld->int_vec_type), res);
      }
   }

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}
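
/*
 * Editorial note (illustration, not part of the original source): for signed
 * normalized types the pre-clamping of 'a' above has the same net effect as
 * the straightforward scalar saturated add sketched below for hypothetical
 * 8-bit lanes (would need <stdint.h>).
 */
#if 0
static int8_t
ref_sadd8_sat(int8_t a, int8_t b)
{
   int sum = (int)a + (int)b;   /* widen so the sum cannot wrap */
   if (sum > 127)
      sum = 127;                /* clamp to the largest representable value */
   if (sum < -128)
      sum = -128;               /* clamp to the smallest representable value */
   return (int8_t)sum;
}
#endif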


/** Return the scalar sum of the elements of a.
 * Callers should avoid this operation whenever possible.
 */
LLVMValueRef
lp_build_horizontal_add(struct lp_build_context *bld,
                        LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i, length;
   LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef vecres, elem2;

   assert(lp_check_value(type, a));

   if (type.length == 1) {
      return a;
   }

   assert(!bld->type.norm);

   /*
    * For byte vectors we could do much better with psadbw.
    * Using repeated shuffle/adds here. Note with multiple vectors
    * this can be done more efficiently as outlined in the Intel
    * optimization manual.
    * Note: could cause data rearrangement if used with smaller element
    * sizes.
    */

   vecres = a;
   length = type.length / 2;
   while (length > 1) {
      LLVMValueRef vec1, vec2;
      for (i = 0; i < length; i++) {
         shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
         shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
      }
      vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles1, length), "");
      vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles2, length), "");
      if (type.floating) {
         vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
      }
      else {
         vecres = LLVMBuildAdd(builder, vec1, vec2, "");
      }
      length = length >> 1;
   }

   /* always have vector of size 2 here */
   assert(length == 1);

   index = lp_build_const_int32(bld->gallivm, 0);
   res = LLVMBuildExtractElement(builder, vecres, index, "");
   index = lp_build_const_int32(bld->gallivm, 1);
   elem2 = LLVMBuildExtractElement(builder, vecres, index, "");

   if (type.floating)
      res = LLVMBuildFAdd(builder, res, elem2, "");
   else
      res = LLVMBuildAdd(builder, res, elem2, "");

   return res;
}

/**
 * Return the horizontal sums of 4 float vectors as a float4 vector.
 * This uses the technique as outlined in the Intel Optimization Manual.
 */
static LLVMValueRef
lp_build_horizontal_add4x4f(struct lp_build_context *bld,
                            LLVMValueRef src[4])
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef shuffles[4];
   LLVMValueRef tmp[4];
   LLVMValueRef sumtmp[2], shuftmp[2];

   /* lower half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 1);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 5);
   tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   /* upper half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 2);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 6);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
   sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");

   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 2);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 6);
   shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   shuffles[0] = lp_build_const_int32(gallivm, 1);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 5);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
}


/*
 * partially horizontally add 2-4 float vectors with length nx4,
 * i.e. only four adjacent values in each vector will be added,
 * assuming values are really grouped in 4 which also determines
 * output order.
 *
 * Return a vector of the same length as the initial vectors,
 * with the excess elements (if any) being undefined.
 * The element order is independent of number of input vectors.
 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 * the output order thus will be
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
 */
LLVMValueRef
lp_build_hadd_partial4(struct lp_build_context *bld,
                       LLVMValueRef vectors[],
                       unsigned num_vecs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ret_vec;
   LLVMValueRef tmp[4];
   const char *intrinsic = NULL;

   assert(num_vecs >= 2 && num_vecs <= 4);
   assert(bld->type.floating);

   /* only use this with at least 2 vectors, as it is sort of expensive
    * (depending on cpu) and we always need two horizontal adds anyway,
    * so a shuffle/add approach might be better.
    */

   tmp[0] = vectors[0];
   tmp[1] = vectors[1];

   tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
   tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];

   if (util_get_cpu_caps()->has_sse3 && bld->type.width == 32 &&
       bld->type.length == 4) {
      intrinsic = "llvm.x86.sse3.hadd.ps";
   }
   else if (util_get_cpu_caps()->has_avx && bld->type.width == 32 &&
            bld->type.length == 8) {
      intrinsic = "llvm.x86.avx.hadd.ps.256";
   }
   if (intrinsic) {
      tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, bld->type),
                                         tmp[0], tmp[1]);
      if (num_vecs > 2) {
         tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                            lp_build_vec_type(gallivm, bld->type),
                                            tmp[2], tmp[3]);
      }
      else {
         tmp[1] = tmp[0];
      }
      return lp_build_intrinsic_binary(builder, intrinsic,
                                       lp_build_vec_type(gallivm, bld->type),
                                       tmp[0], tmp[1]);
   }

   if (bld->type.length == 4) {
      ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
   }
   else {
      LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
      unsigned j;
      unsigned num_iter = bld->type.length / 4;
      struct lp_type parttype = bld->type;
      parttype.length = 4;
      for (j = 0; j < num_iter; j++) {
         LLVMValueRef partsrc[4];
         unsigned i;
         for (i = 0; i < 4; i++) {
            partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
         }
         partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
      }
      ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
   }
   return ret_vec;
}

/**
 * Generate a - b
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;
   if (a == b)
      return bld->zero;

   if (type.norm) {
      const char *intrinsic = NULL;

      if (!type.sign && b == bld->one)
         return bld->zero;

      if (!type.floating && !type.fixed) {
         if (LLVM_VERSION_MAJOR >= 8) {
            char intrin[32];
            intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
            lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
            return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
         }
         if (type.width * type.length == 128) {
            if (util_get_cpu_caps()->has_sse2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
            } else if (util_get_cpu_caps()->has_altivec) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
            }
         }
         if (type.width * type.length == 256) {
            if (util_get_cpu_caps()->has_avx2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
            }
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(type.norm && !type.floating && !type.fixed) {
      if (type.sign) {
         uint64_t sign = (uint64_t)1 << (type.width - 1);
         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
         /* a_clamp_max is the maximum a for negative b,
            a_clamp_min is the minimum a for positive b. */
         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
      } else {
         /*
          * This must match llvm pattern for saturated unsigned sub.
          * (lp_build_max_simple actually does the job with its current
          * definition but do it explicitly here.)
          * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
          * interfere with llvm's ability to recognize the pattern but seems
          * a bit brittle.
          * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
          */
         LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         a = lp_build_select(bld, no_ov, a, b);
      }
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");

   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);

   return res;
}



/**
 * Normalized multiplication.
 *
 * There are several approaches (using 8-bit normalized multiplication as
 * an example):
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL criteria of
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms to fit in 16bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that just by itself it doesn't satisfy the OpenGL criteria, as
 *     255*255 = 254, so the special case b = 255 must be accounted for or
 *     roundoff must be used.
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the result
 *     use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving exact results.
 *
 *
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
LLVMValueRef
lp_build_mul_norm(struct gallivm_state *gallivm,
                  struct lp_type wide_type,
                  LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context bld;
   unsigned n;
   LLVMValueRef half;
   LLVMValueRef ab;

   assert(!wide_type.floating);
   assert(lp_check_value(wide_type, a));
   assert(lp_check_value(wide_type, b));

   lp_build_context_init(&bld, gallivm, wide_type);

   n = wide_type.width / 2;
   if (wide_type.sign) {
      --n;
   }

   /*
    * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
    * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
    */

   /*
    * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
    */

   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");

   /*
    * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
    */

   half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
   if (wide_type.sign) {
      LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
      LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
      half = lp_build_select(&bld, sign, minus_half, half);
   }
   ab = LLVMBuildAdd(builder, ab, half, "");

   /* Final division */
   ab = lp_build_shr_imm(&bld, ab, n);

   return ab;
}
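
/*
 * Editorial worked example (not in the original source): with n = 8 the
 * rounded approximation used above is exact at the OpenGL end points:
 *
 *    a = b = 255:  t = 255*255 = 65025
 *                  (65025 + (65025 >> 8) + 0x80) >> 8
 *                  = (65025 + 254 + 128) >> 8 = 65407 >> 8 = 255
 *
 *    a = b = 0:    (0 + 0 + 0x80) >> 8 = 0
 */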

/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if (!type.floating && !type.fixed && type.norm) {
      struct lp_type wide_type = lp_wider_type(type);
      LLVMValueRef al, ah, bl, bh, abl, abh, ab;

      lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);

      /* PMULLW, PSRLW, PADDW */
      abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
      abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);

      ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);

      return ab;
   }

   if(type.fixed)
      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   else
      shift = NULL;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if(shift) {
         if(type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(builder, a, b, "");
      else
         res = LLVMBuildMul(builder, a, b, "");
      if(shift) {
         if(type.sign)
            res = LLVMBuildAShr(builder, res, shift, "");
         else
            res = LLVMBuildLShr(builder, res, shift, "");
      }
   }

   return res;
}

/*
 * Widening mul, valid for 32x32 bit -> 64bit only.
 * Result is low 32bits, high bits returned in res_hi.
 *
 * Emits code that is meant to be compiled for the host CPU.
 */
LLVMValueRef
lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
                         LLVMValueRef a,
                         LLVMValueRef b,
                         LLVMValueRef *res_hi)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;

   assert(bld->type.width == 32);
   assert(bld->type.floating == 0);
   assert(bld->type.fixed == 0);
   assert(bld->type.norm == 0);

   /*
    * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
    * for x86 simd is atrocious (even if the high bits weren't required),
    * trying to handle real 64bit inputs (which of course can't happen due
    * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
    * apparently llvm does not recognize this widening mul). This includes 6
    * (instead of 2) pmuludq plus extra adds and shifts
    * The same story applies to signed mul, albeit fixing this requires sse41.
    * https://llvm.org/bugs/show_bug.cgi?id=30845
    * So, whip up our own code, albeit only for length 4 and 8 (which
    * should be good enough)...
    * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
    * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
    * for signed), which the fallback code does not, without this llvm
    * will likely still produce atrocious code.
    */
   if (LLVM_VERSION_MAJOR < 7 &&
       (bld->type.length == 4 || bld->type.length == 8) &&
       ((util_get_cpu_caps()->has_sse2 && (bld->type.sign == 0)) ||
        util_get_cpu_caps()->has_sse4_1)) {
      const char *intrinsic = NULL;
      LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
      LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
      struct lp_type type_wide = lp_wider_type(bld->type);
      LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
      unsigned i;
      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i+1);
         shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      aeven = a;
      beven = b;
      aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
      bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");

      if (util_get_cpu_caps()->has_avx2 && bld->type.length == 8) {
         if (bld->type.sign) {
            intrinsic = "llvm.x86.avx2.pmul.dq";
         } else {
            intrinsic = "llvm.x86.avx2.pmulu.dq";
         }
         muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                             wider_type, aeven, beven);
         mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                            wider_type, aodd, bodd);
      }
      else {
         /* for consistent naming look elsewhere... */
         if (bld->type.sign) {
            intrinsic = "llvm.x86.sse41.pmuldq";
         } else {
            intrinsic = "llvm.x86.sse2.pmulu.dq";
         }
         /*
          * XXX If we only have AVX but not AVX2 this is a pain.
          * lp_build_intrinsic_binary_anylength() can't handle it
          * (due to src and dst type not being identical).
          */
         if (bld->type.length == 8) {
            LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
            LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
            LLVMValueRef muleven2[2], mulodd2[2];
            struct lp_type type_wide_half = type_wide;
            LLVMTypeRef wtype_half;
            type_wide_half.length = 2;
            wtype_half = lp_build_vec_type(gallivm, type_wide_half);
            aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
            aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
            bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
            bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
            aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
            aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
            boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
            boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
            muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenlo, bevenlo);
            mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddlo, boddlo);
            muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenhi, bevenhi);
            mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddhi, boddhi);
            muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
            mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);

         }
         else {
            muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                                wider_type, aeven, beven);
            mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                               wider_type, aodd, bodd);
         }
      }
      muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
      mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");

      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i + 1);
         shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");

      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i);
         shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
   }
   else {
      return lp_build_mul_32_lohi(bld, a, b, res_hi);
   }
}


/*
 * Widening mul, valid for <= 32 (8, 16, 32) -> 64
 * Result is low N bits, high bits returned in res_hi.
 *
 * Emits generic code.
 */
LLVMValueRef
lp_build_mul_32_lohi(struct lp_build_context *bld,
                     LLVMValueRef a,
                     LLVMValueRef b,
                     LLVMValueRef *res_hi)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef tmp, shift, res_lo;
   struct lp_type type_tmp;
   LLVMTypeRef wide_type, narrow_type;

   type_tmp = bld->type;
   narrow_type = lp_build_vec_type(gallivm, type_tmp);
   if (bld->type.width < 32)
      type_tmp.width = 32;
   else
      type_tmp.width *= 2;
   wide_type = lp_build_vec_type(gallivm, type_tmp);
   shift = lp_build_const_vec(gallivm, type_tmp, bld->type.width);

   if (bld->type.sign) {
      a = LLVMBuildSExt(builder, a, wide_type, "");
      b = LLVMBuildSExt(builder, b, wide_type, "");
   } else {
      a = LLVMBuildZExt(builder, a, wide_type, "");
      b = LLVMBuildZExt(builder, b, wide_type, "");
   }
   tmp = LLVMBuildMul(builder, a, b, "");

   res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");

   /* Since we truncate anyway, LShr and AShr are equivalent. */
   tmp = LLVMBuildLShr(builder, tmp, shift, "");
   *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");

   return res_lo;
}
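
/*
 * Editorial illustration (not part of the original source): the
 * zext/mul/trunc/shift sequence above corresponds to this scalar
 * computation for unsigned 32-bit inputs (would need <stdint.h>).
 */
#if 0
static uint32_t
ref_umul32_lohi(uint32_t a, uint32_t b, uint32_t *hi)
{
   uint64_t prod = (uint64_t)a * b;   /* widen, then multiply */
   *hi = (uint32_t)(prod >> 32);      /* high half of the product */
   return (uint32_t)prod;             /* low half of the product */
}
#endif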


/* a * b + c */
LLVMValueRef
lp_build_mad(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b,
             LLVMValueRef c)
{
   const struct lp_type type = bld->type;
   if (type.floating) {
      return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
   } else {
      return lp_build_add(bld, lp_build_mul(bld, a, b), c);
   }
}


/**
 * Small vector x scale multiplication optimization.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   if(b == 0)
      return bld->zero;

   if(b == 1)
      return a;

   if(b == -1)
      return lp_build_negate(bld, a);

   if(b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if(util_is_power_of_two_or_zero(b)) {
      unsigned shift = ffs(b) - 1;

      if(bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the exponent.
          *
          * XXX: This might not be always faster, it will introduce a small error
          * for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
      }
      else {
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}


/**
 * Generate a / b
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one && type.floating)
      return lp_build_rcp(bld, b);
   if(b == bld->zero)
      return bld->undef;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         return LLVMConstFDiv(a, b);
      else if (type.sign)
         return LLVMConstSDiv(a, b);
      else
         return LLVMConstUDiv(a, b);
   }

   /* fast rcp is disabled (just uses div), so makes no sense to try that */
   if(FALSE &&
      ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
       (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) &&
      type.floating)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   if (type.floating)
      return LLVMBuildFDiv(builder, a, b, "");
   else if (type.sign)
      return LLVMBuildSDiv(builder, a, b, "");
   else
      return LLVMBuildUDiv(builder, a, b, "");
}


/**
 * Linear interpolation helper.
 *
 * @param normalized whether we are interpolating normalized values,
 *        encoded in normalized integers, twice as wide.
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static inline LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1,
                     unsigned flags)
{
   unsigned half_width = bld->type.width/2;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   delta = lp_build_sub(bld, v1, v0);

   if (bld->type.floating) {
      assert(flags == 0);
      return lp_build_mad(bld, x, delta, v0);
   }

   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
      if (!bld->type.sign) {
         if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
            /*
             * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
             * most significant bit to the least significant bit, so that
             * later we can just divide by 2**n instead of 2**n - 1.
             */

            x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
         }

         /* (x * delta) >> n */
         res = lp_build_mul(bld, x, delta);
         res = lp_build_shr_imm(bld, res, half_width);
      } else {
         /*
          * The rescaling trick above doesn't work for signed numbers, so
          * use the 2**n - 1 division approximation in lp_build_mul_norm
          * instead.
          */
         assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
         res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
      }
   } else {
      assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
      res = lp_build_mul(bld, x, delta);
   }

   if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
      /*
       * At this point both res and v0 only use the lower half of the bits,
       * the rest is zero. Instead of add / mask, do add with half wide type.
       */
      struct lp_type narrow_type;
      struct lp_build_context narrow_bld;

      memset(&narrow_type, 0, sizeof narrow_type);
      narrow_type.sign = bld->type.sign;
      narrow_type.width = bld->type.width/2;
      narrow_type.length = bld->type.length*2;

      lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
      res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
      v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
      res = lp_build_add(&narrow_bld, v0, res);
      res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
   } else {
      res = lp_build_add(bld, v0, res);

      if (bld->type.fixed) {
         /*
          * We need to mask out the high order bits when lerping 8bit
          * normalized colors stored on 16bits
          */
         /* XXX: This step is necessary for lerping 8bit colors stored on
          * 16bits, but it will be wrong for true fixed point use cases.
          * Basically we need a more powerful lp_type, capable of further
          * distinguishing the values interpretation from the value storage.
          */
         LLVMValueRef low_bits;
         low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
         res = LLVMBuildAnd(builder, res, low_bits, "");
      }
   }

   return res;
}
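
/*
 * Editorial worked example (not in the original source): for 8-bit weights
 * held in 16-bit lanes (half_width = 8) the rescaling step above gives
 *
 *    x = 255:  255 + (255 >> 7) = 256,  so (256 * delta) >> 8 = delta
 *    x = 0:    0 + (0 >> 7) = 0,        so the result stays at v0
 *
 * i.e. the end points interpolate exactly to v1 and v0 respectively.
 */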


/**
 * Linear interpolation.
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1,
              unsigned flags)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;

      assert(type.length >= 2);

      /*
       * Create a wider integer type, enough to hold the
       * intermediate result of the multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.sign = type.sign;
      wide_type.width = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);

      lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);

      /*
       * Lerp both halves.
       */

      flags |= LP_BLD_LERP_WIDE_NORMALIZED;

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);

      res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
   } else {
      res = lp_build_lerp_simple(bld, x, v0, v1, flags);
   }

   return res;
}


/**
 * Bilinear interpolation.
 *
 * Value indices are in v_{yx}.
 */
LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef v00,
                 LLVMValueRef v01,
                 LLVMValueRef v10,
                 LLVMValueRef v11,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
   return lp_build_lerp(bld, y, v0, v1, flags);
}


LLVMValueRef
lp_build_lerp_3d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef z,
                 LLVMValueRef v000,
                 LLVMValueRef v001,
                 LLVMValueRef v010,
                 LLVMValueRef v011,
                 LLVMValueRef v100,
                 LLVMValueRef v101,
                 LLVMValueRef v110,
                 LLVMValueRef v111,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
   LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
   return lp_build_lerp(bld, z, v0, v1, flags);
}


/**
 * Generate min(a, b)
 * Do checks for special cases but not for NaNs.
 */
LLVMValueRef
lp_build_min(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
}

/**
 * Generate min(a, b)
 * NaN's are handled according to the behavior specified by the
 * nan_behavior argument.
 */
LLVMValueRef
lp_build_min_ext(struct lp_build_context *bld,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 enum gallivm_nan_behavior nan_behavior)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b, nan_behavior);
}

/**
 * Generate max(a, b)
 * Do checks for special cases, but NaN behavior is undefined.
 */
LLVMValueRef
lp_build_max(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
}


/**
 * Generate max(a, b)
 * Checks for special cases.
 * NaN's are handled according to the behavior specified by the
 * nan_behavior argument.
 */
LLVMValueRef
lp_build_max_ext(struct lp_build_context *bld,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 enum gallivm_nan_behavior nan_behavior)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b, nan_behavior);
}

/**
 * Generate clamp(a, min, max)
 * NaN behavior (for any of a, min, max) is undefined.
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_clamp(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef min,
               LLVMValueRef max)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, min));
   assert(lp_check_value(bld->type, max));

   a = lp_build_min(bld, a, max);
   a = lp_build_max(bld, a, min);
   return a;
}


/**
 * Generate clamp(a, 0, 1)
 * A NaN will get converted to zero.
 */
LLVMValueRef
lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
                                LLVMValueRef a)
{
   a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
   a = lp_build_min(bld, a, bld->one);
   return a;
}


/**
 * Generate abs(a)
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(lp_check_value(type, a));

   if(!type.sign)
      return a;

   if(type.floating) {
      char intrinsic[32];
      lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
      return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
   }

   if(type.width*type.length == 128 && util_get_cpu_caps()->has_ssse3 && LLVM_VERSION_MAJOR < 6) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }
   else if (type.width*type.length == 256 && util_get_cpu_caps()->has_avx2 && LLVM_VERSION_MAJOR < 6) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
      }
   }

   return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
                          a, LLVMBuildNeg(builder, a, ""));
}


LLVMValueRef
lp_build_negate(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;

   assert(lp_check_value(bld->type, a));

   if (bld->type.floating)
      a = LLVMBuildFNeg(builder, a, "");
   else
      a = LLVMBuildNeg(builder, a, "");

   return a;
}


/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));

   /* Handle non-zero case */
   if(!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if(type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      int_type = lp_build_int_vec_type(bld->gallivm, type);
      vec_type = lp_build_vec_type(bld->gallivm, type);
      mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);

      /* Take the sign bit and add it to 1 constant */
      sign = LLVMBuildBitCast(builder, a, int_type, "");
      sign = LLVMBuildAnd(builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      res = LLVMBuildOr(builder, sign, one, "");
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }
   else
   {
      /* signed int/norm/fixed point */
      /* could use psign with sse3 and appropriate vectors here */
      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}
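
/*
 * Editorial worked example (not in the original source): in the float path
 * above the sign bit of 'a' is OR'ed into the bit pattern of 1.0, e.g. for
 * 32-bit floats:
 *
 *    a = -3.5:  sign = 0x80000000,  0x80000000 | 0x3f800000 = 0xbf800000 = -1.0f
 *    a = +3.5:  sign = 0x00000000,  0x00000000 | 0x3f800000 = 0x3f800000 = +1.0f
 *
 * The a == 0 case is then fixed up by the final cmp/select.
 */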
1809
1810
1811
/**
1812
* Set the sign of float vector 'a' according to 'sign'.
1813
* If sign==0, return abs(a).
1814
* If sign==1, return -abs(a);
1815
* Other values for sign produce undefined results.
1816
*/
1817
LLVMValueRef
1818
lp_build_set_sign(struct lp_build_context *bld,
1819
LLVMValueRef a, LLVMValueRef sign)
1820
{
1821
LLVMBuilderRef builder = bld->gallivm->builder;
1822
const struct lp_type type = bld->type;
1823
LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1824
LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1825
LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1826
LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1827
~((unsigned long long) 1 << (type.width - 1)));
1828
LLVMValueRef val, res;
1829
1830
assert(type.floating);
1831
assert(lp_check_value(type, a));
1832
1833
/* val = reinterpret_cast<int>(a) */
1834
val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1835
/* val = val & mask */
1836
val = LLVMBuildAnd(builder, val, mask, "");
1837
/* sign = sign << shift */
1838
sign = LLVMBuildShl(builder, sign, shift, "");
1839
/* res = val | sign */
1840
res = LLVMBuildOr(builder, val, sign, "");
1841
/* res = reinterpret_cast<float>(res) */
1842
res = LLVMBuildBitCast(builder, res, vec_type, "");
1843
1844
return res;
1845
}
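/*
 * Scalar sketch of the bit manipulation above (assumes 32-bit floats;
 * float_to_bits()/bits_to_float() are stand-ins for bitcasts):
 *
 *    uint32_t bits = float_to_bits(a) & 0x7fffffffu;   // clear sign -> abs(a)
 *    bits |= (uint32_t)sign << 31;                      // inject requested sign
 *    return bits_to_float(bits);
 */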
1846
1847
1848
/**
1849
* Convert vector of (or scalar) int to vector of (or scalar) float.
1850
*/
1851
LLVMValueRef
1852
lp_build_int_to_float(struct lp_build_context *bld,
1853
LLVMValueRef a)
1854
{
1855
LLVMBuilderRef builder = bld->gallivm->builder;
1856
const struct lp_type type = bld->type;
1857
LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1858
1859
assert(type.floating);
1860
1861
return LLVMBuildSIToFP(builder, a, vec_type, "");
1862
}
1863
1864
static boolean
1865
arch_rounding_available(const struct lp_type type)
1866
{
1867
if ((util_get_cpu_caps()->has_sse4_1 &&
1868
(type.length == 1 || type.width*type.length == 128)) ||
1869
(util_get_cpu_caps()->has_avx && type.width*type.length == 256) ||
1870
(util_get_cpu_caps()->has_avx512f && type.width*type.length == 512))
1871
return TRUE;
1872
else if ((util_get_cpu_caps()->has_altivec &&
1873
(type.width == 32 && type.length == 4)))
1874
return TRUE;
1875
else if (util_get_cpu_caps()->has_neon)
1876
return TRUE;
1877
1878
return FALSE;
1879
}
1880
1881
enum lp_build_round_mode
1882
{
1883
LP_BUILD_ROUND_NEAREST = 0,
1884
LP_BUILD_ROUND_FLOOR = 1,
1885
LP_BUILD_ROUND_CEIL = 2,
1886
LP_BUILD_ROUND_TRUNCATE = 3
1887
};
1888
1889
static inline LLVMValueRef
1890
lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1891
LLVMValueRef a)
1892
{
1893
LLVMBuilderRef builder = bld->gallivm->builder;
1894
const struct lp_type type = bld->type;
1895
LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1896
LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1897
const char *intrinsic;
1898
LLVMValueRef res;
1899
1900
assert(type.floating);
1901
/* using the double precision conversions is a bit more complicated */
1902
assert(type.width == 32);
1903
1904
assert(lp_check_value(type, a));
1905
assert(util_get_cpu_caps()->has_sse2);
1906
1907
/* This is relying on MXCSR rounding mode, which should always be nearest. */
1908
if (type.length == 1) {
1909
LLVMTypeRef vec_type;
1910
LLVMValueRef undef;
1911
LLVMValueRef arg;
1912
LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1913
1914
vec_type = LLVMVectorType(bld->elem_type, 4);
1915
1916
intrinsic = "llvm.x86.sse.cvtss2si";
1917
1918
undef = LLVMGetUndef(vec_type);
1919
1920
arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1921
1922
res = lp_build_intrinsic_unary(builder, intrinsic,
1923
ret_type, arg);
1924
}
1925
else {
1926
if (type.width* type.length == 128) {
1927
intrinsic = "llvm.x86.sse2.cvtps2dq";
1928
}
1929
else {
1930
assert(type.width*type.length == 256);
1931
assert(util_get_cpu_caps()->has_avx);
1932
1933
intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1934
}
1935
res = lp_build_intrinsic_unary(builder, intrinsic,
1936
ret_type, a);
1937
}
1938
1939
return res;
1940
}
1941
1942
1943
/*
 * Round a float vector using the AltiVec vrfi* intrinsics.
 */
1945
static inline LLVMValueRef
1946
lp_build_round_altivec(struct lp_build_context *bld,
1947
LLVMValueRef a,
1948
enum lp_build_round_mode mode)
1949
{
1950
LLVMBuilderRef builder = bld->gallivm->builder;
1951
const struct lp_type type = bld->type;
1952
const char *intrinsic = NULL;
1953
1954
assert(type.floating);
1955
1956
assert(lp_check_value(type, a));
1957
assert(util_get_cpu_caps()->has_altivec);
1958
1959
(void)type;
1960
1961
switch (mode) {
1962
case LP_BUILD_ROUND_NEAREST:
1963
intrinsic = "llvm.ppc.altivec.vrfin";
1964
break;
1965
case LP_BUILD_ROUND_FLOOR:
1966
intrinsic = "llvm.ppc.altivec.vrfim";
1967
break;
1968
case LP_BUILD_ROUND_CEIL:
1969
intrinsic = "llvm.ppc.altivec.vrfip";
1970
break;
1971
case LP_BUILD_ROUND_TRUNCATE:
1972
intrinsic = "llvm.ppc.altivec.vrfiz";
1973
break;
1974
}
1975
1976
return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1977
}
1978
1979
static inline LLVMValueRef
1980
lp_build_round_arch(struct lp_build_context *bld,
1981
LLVMValueRef a,
1982
enum lp_build_round_mode mode)
1983
{
1984
if (util_get_cpu_caps()->has_sse4_1 || util_get_cpu_caps()->has_neon) {
1985
LLVMBuilderRef builder = bld->gallivm->builder;
1986
const struct lp_type type = bld->type;
1987
const char *intrinsic_root;
1988
char intrinsic[32];
1989
1990
assert(type.floating);
1991
assert(lp_check_value(type, a));
1992
(void)type;
1993
1994
switch (mode) {
1995
case LP_BUILD_ROUND_NEAREST:
1996
intrinsic_root = "llvm.nearbyint";
1997
break;
1998
case LP_BUILD_ROUND_FLOOR:
1999
intrinsic_root = "llvm.floor";
2000
break;
2001
case LP_BUILD_ROUND_CEIL:
2002
intrinsic_root = "llvm.ceil";
2003
break;
2004
case LP_BUILD_ROUND_TRUNCATE:
2005
intrinsic_root = "llvm.trunc";
2006
break;
2007
default:
2008
unreachable("unhandled lp_build_round_mode");
2009
}
2010
2011
lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2012
return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2013
}
2014
else /* (util_get_cpu_caps()->has_altivec) */
2015
return lp_build_round_altivec(bld, a, mode);
2016
}
2017
2018
/**
2019
* Return the integer part of a float (vector) value (== round toward zero).
2020
* The returned value is a float (vector).
2021
* Ex: trunc(-1.5) = -1.0
2022
*/
2023
LLVMValueRef
2024
lp_build_trunc(struct lp_build_context *bld,
2025
LLVMValueRef a)
2026
{
2027
LLVMBuilderRef builder = bld->gallivm->builder;
2028
const struct lp_type type = bld->type;
2029
2030
assert(type.floating);
2031
assert(lp_check_value(type, a));
2032
2033
if (arch_rounding_available(type)) {
2034
return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2035
}
2036
else {
2037
const struct lp_type type = bld->type;
2038
struct lp_type inttype;
2039
struct lp_build_context intbld;
2040
LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2041
LLVMValueRef trunc, res, anosign, mask;
2042
LLVMTypeRef int_vec_type = bld->int_vec_type;
2043
LLVMTypeRef vec_type = bld->vec_type;
2044
2045
inttype = type;
2046
inttype.floating = 0;
2047
lp_build_context_init(&intbld, bld->gallivm, inttype);
2048
2049
/* round by truncation */
2050
trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2051
res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2052
2053
/* mask out sign bit */
2054
anosign = lp_build_abs(bld, a);
2055
/*
 * Mask out all values if anosign > 2^24.
 * This works both for large ints (rounding is a no-op for them because
 * such floats are always exact) as well as special cases like NaNs and
 * Infs (taking advantage of the fact that they use the max exponent).
 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
 */
2062
anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2063
cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2064
mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2065
return lp_build_select(bld, mask, a, res);
2066
}
2067
}
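/*
 * The non-SSE4.1/AltiVec fallback above corresponds roughly to this scalar
 * sketch (float_to_bits() is a stand-in for a bitcast; the comparison is
 * done on the bit pattern of |a| so that NaN/Inf also take the "keep a"
 * path):
 *
 *    float   res      = (float)(int32_t)a;                      // trunc via int round-trip
 *    int32_t abs_bits = float_to_bits(a) & 0x7fffffff;          // |a| as an integer pattern
 *    return abs_bits > float_to_bits(16777216.0f) ? a : res;    // 16777216 = 2^24
 */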
2068
2069
2070
/**
2071
* Return float (vector) rounded to nearest integer (vector). The returned
2072
* value is a float (vector).
2073
* Ex: round(0.9) = 1.0
2074
* Ex: round(-1.5) = -2.0
2075
*/
2076
LLVMValueRef
2077
lp_build_round(struct lp_build_context *bld,
2078
LLVMValueRef a)
2079
{
2080
LLVMBuilderRef builder = bld->gallivm->builder;
2081
const struct lp_type type = bld->type;
2082
2083
assert(type.floating);
2084
assert(lp_check_value(type, a));
2085
2086
if (arch_rounding_available(type)) {
2087
return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2088
}
2089
else {
2090
const struct lp_type type = bld->type;
2091
struct lp_type inttype;
2092
struct lp_build_context intbld;
2093
LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2094
LLVMValueRef res, anosign, mask;
2095
LLVMTypeRef int_vec_type = bld->int_vec_type;
2096
LLVMTypeRef vec_type = bld->vec_type;
2097
2098
inttype = type;
2099
inttype.floating = 0;
2100
lp_build_context_init(&intbld, bld->gallivm, inttype);
2101
2102
res = lp_build_iround(bld, a);
2103
res = LLVMBuildSIToFP(builder, res, vec_type, "");
2104
2105
/* mask out sign bit */
2106
anosign = lp_build_abs(bld, a);
2107
/*
 * Mask out all values if anosign > 2^24.
 * This works both for large ints (rounding is a no-op for them because
 * such floats are always exact) as well as special cases like NaNs and
 * Infs (taking advantage of the fact that they use the max exponent).
 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
 */
2114
anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2115
cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2116
mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2117
return lp_build_select(bld, mask, a, res);
2118
}
2119
}
2120
2121
2122
/**
2123
* Return floor of float (vector), result is a float (vector)
2124
* Ex: floor(1.1) = 1.0
2125
* Ex: floor(-1.1) = -2.0
2126
*/
2127
LLVMValueRef
2128
lp_build_floor(struct lp_build_context *bld,
2129
LLVMValueRef a)
2130
{
2131
LLVMBuilderRef builder = bld->gallivm->builder;
2132
const struct lp_type type = bld->type;
2133
2134
assert(type.floating);
2135
assert(lp_check_value(type, a));
2136
2137
if (arch_rounding_available(type)) {
2138
return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2139
}
2140
else {
2141
const struct lp_type type = bld->type;
2142
struct lp_type inttype;
2143
struct lp_build_context intbld;
2144
LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2145
LLVMValueRef trunc, res, anosign, mask;
2146
LLVMTypeRef int_vec_type = bld->int_vec_type;
2147
LLVMTypeRef vec_type = bld->vec_type;
2148
2149
if (type.width != 32) {
2150
char intrinsic[32];
2151
lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2152
return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2153
}
2154
2155
assert(type.width == 32); /* might want to handle doubles at some point */
2156
2157
inttype = type;
2158
inttype.floating = 0;
2159
lp_build_context_init(&intbld, bld->gallivm, inttype);
2160
2161
/* round by truncation */
2162
trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2163
res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2164
2165
if (type.sign) {
2166
LLVMValueRef tmp;
2167
2168
/*
2169
* fix values if rounding is wrong (for non-special cases)
2170
* - this is the case if trunc > a
2171
*/
2172
mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2173
/* tmp = trunc > a ? 1.0 : 0.0 */
2174
tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2175
tmp = lp_build_and(&intbld, mask, tmp);
2176
tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2177
res = lp_build_sub(bld, res, tmp);
2178
}
2179
2180
/* mask out sign bit */
2181
anosign = lp_build_abs(bld, a);
2182
/*
 * Mask out all values if anosign > 2^24.
 * This works both for large ints (rounding is a no-op for them because
 * such floats are always exact) as well as special cases like NaNs and
 * Infs (taking advantage of the fact that they use the max exponent).
 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
 */
2189
anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2190
cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2191
mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2192
return lp_build_select(bld, mask, a, res);
2193
}
2194
}
2195
2196
2197
/**
2198
* Return ceiling of float (vector), returning float (vector).
2199
* Ex: ceil( 1.1) = 2.0
2200
* Ex: ceil(-1.1) = -1.0
2201
*/
2202
LLVMValueRef
2203
lp_build_ceil(struct lp_build_context *bld,
2204
LLVMValueRef a)
2205
{
2206
LLVMBuilderRef builder = bld->gallivm->builder;
2207
const struct lp_type type = bld->type;
2208
2209
assert(type.floating);
2210
assert(lp_check_value(type, a));
2211
2212
if (arch_rounding_available(type)) {
2213
return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2214
}
2215
else {
2216
const struct lp_type type = bld->type;
2217
struct lp_type inttype;
2218
struct lp_build_context intbld;
2219
LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2220
LLVMValueRef trunc, res, anosign, mask, tmp;
2221
LLVMTypeRef int_vec_type = bld->int_vec_type;
2222
LLVMTypeRef vec_type = bld->vec_type;
2223
2224
if (type.width != 32) {
2225
char intrinsic[32];
2226
lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2227
return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2228
}
2229
2230
assert(type.width == 32); /* might want to handle doubles at some point */
2231
2232
inttype = type;
2233
inttype.floating = 0;
2234
lp_build_context_init(&intbld, bld->gallivm, inttype);
2235
2236
/* round by truncation */
2237
trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2238
trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2239
2240
/*
2241
* fix values if rounding is wrong (for non-special cases)
2242
* - this is the case if trunc < a
2243
*/
2244
mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2245
/* tmp = trunc < a ? 1.0 : 0.0 */
2246
tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2247
tmp = lp_build_and(&intbld, mask, tmp);
2248
tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2249
res = lp_build_add(bld, trunc, tmp);
2250
2251
/* mask out sign bit */
2252
anosign = lp_build_abs(bld, a);
2253
/*
 * Mask out all values if anosign > 2^24.
 * This works both for large ints (rounding is a no-op for them because
 * such floats are always exact) as well as special cases like NaNs and
 * Infs (taking advantage of the fact that they use the max exponent).
 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
 */
2260
anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2261
cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2262
mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2263
return lp_build_select(bld, mask, a, res);
2264
}
2265
}
2266
2267
2268
/**
2269
* Return fractional part of 'a' computed as a - floor(a)
2270
* Typically used in texture coord arithmetic.
2271
*/
2272
LLVMValueRef
2273
lp_build_fract(struct lp_build_context *bld,
2274
LLVMValueRef a)
2275
{
2276
assert(bld->type.floating);
2277
return lp_build_sub(bld, a, lp_build_floor(bld, a));
2278
}
2279
2280
2281
/**
2282
* Prevent returning 1.0 for very small negative values of 'a' by clamping
2283
* against 0.99999(9). (Will also return that value for NaNs.)
2284
*/
2285
static inline LLVMValueRef
2286
clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2287
{
2288
LLVMValueRef max;
2289
2290
/* this is the largest number smaller than 1.0 representable as float */
2291
max = lp_build_const_vec(bld->gallivm, bld->type,
2292
1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2293
return lp_build_min_ext(bld, fract, max,
2294
GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2295
}
2296
2297
2298
/**
2299
* Same as lp_build_fract, but guarantees that the result is always smaller
2300
* than one. Will also return the smaller-than-one value for infs, NaNs.
2301
*/
2302
LLVMValueRef
2303
lp_build_fract_safe(struct lp_build_context *bld,
2304
LLVMValueRef a)
2305
{
2306
return clamp_fract(bld, lp_build_fract(bld, a));
2307
}
2308
2309
2310
/**
2311
* Return the integer part of a float (vector) value (== round toward zero).
2312
* The returned value is an integer (vector).
2313
* Ex: itrunc(-1.5) = -1
2314
*/
2315
LLVMValueRef
2316
lp_build_itrunc(struct lp_build_context *bld,
2317
LLVMValueRef a)
2318
{
2319
LLVMBuilderRef builder = bld->gallivm->builder;
2320
const struct lp_type type = bld->type;
2321
LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2322
2323
assert(type.floating);
2324
assert(lp_check_value(type, a));
2325
2326
return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2327
}
2328
2329
2330
/**
2331
* Return float (vector) rounded to nearest integer (vector). The returned
2332
* value is an integer (vector).
2333
* Ex: iround(0.9) = 1
2334
* Ex: iround(-1.5) = -2
2335
*/
2336
LLVMValueRef
2337
lp_build_iround(struct lp_build_context *bld,
2338
LLVMValueRef a)
2339
{
2340
LLVMBuilderRef builder = bld->gallivm->builder;
2341
const struct lp_type type = bld->type;
2342
LLVMTypeRef int_vec_type = bld->int_vec_type;
2343
LLVMValueRef res;
2344
2345
assert(type.floating);
2346
2347
assert(lp_check_value(type, a));
2348
2349
if ((util_get_cpu_caps()->has_sse2 &&
2350
((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2351
(util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
2352
return lp_build_iround_nearest_sse2(bld, a);
2353
}
2354
if (arch_rounding_available(type)) {
2355
res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2356
}
2357
else {
2358
LLVMValueRef half;
2359
2360
half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
2361
2362
if (type.sign) {
2363
LLVMTypeRef vec_type = bld->vec_type;
2364
LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2365
(unsigned long long)1 << (type.width - 1));
2366
LLVMValueRef sign;
2367
2368
/* get sign bit */
2369
sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2370
sign = LLVMBuildAnd(builder, sign, mask, "");
2371
2372
/* sign * 0.5 */
2373
half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2374
half = LLVMBuildOr(builder, sign, half, "");
2375
half = LLVMBuildBitCast(builder, half, vec_type, "");
2376
}
2377
2378
res = LLVMBuildFAdd(builder, a, half, "");
2379
}
2380
2381
res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2382
2383
return res;
2384
}
2385
2386
2387
/**
 * Return floor of float (vector), result is an int (vector).
 * Ex: ifloor(1.1) = 1
 * Ex: ifloor(-1.1) = -2
 */
2392
LLVMValueRef
2393
lp_build_ifloor(struct lp_build_context *bld,
2394
LLVMValueRef a)
2395
{
2396
LLVMBuilderRef builder = bld->gallivm->builder;
2397
const struct lp_type type = bld->type;
2398
LLVMTypeRef int_vec_type = bld->int_vec_type;
2399
LLVMValueRef res;
2400
2401
assert(type.floating);
2402
assert(lp_check_value(type, a));
2403
2404
res = a;
2405
if (type.sign) {
2406
if (arch_rounding_available(type)) {
2407
res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2408
}
2409
else {
2410
struct lp_type inttype;
2411
struct lp_build_context intbld;
2412
LLVMValueRef trunc, itrunc, mask;
2413
2414
assert(type.floating);
2415
assert(lp_check_value(type, a));
2416
2417
inttype = type;
2418
inttype.floating = 0;
2419
lp_build_context_init(&intbld, bld->gallivm, inttype);
2420
2421
/* round by truncation */
2422
itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2423
trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2424
2425
/*
2426
* fix values if rounding is wrong (for non-special cases)
2427
* - this is the case if trunc > a
2428
* The results of doing this with NaNs, very large values etc.
2429
* are undefined but this seems to be the case anyway.
2430
*/
2431
mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2432
/* cheapie minus one with mask since the mask is minus one / zero */
2433
return lp_build_add(&intbld, itrunc, mask);
2434
}
2435
}
2436
2437
/* convert to int; truncation is exact here (value is non-negative or already floored) */
2438
res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2439
2440
return res;
2441
}
2442
2443
2444
/**
2445
* Return ceiling of float (vector), returning int (vector).
2446
* Ex: iceil( 1.1) = 2
2447
* Ex: iceil(-1.1) = -1
2448
*/
2449
LLVMValueRef
2450
lp_build_iceil(struct lp_build_context *bld,
2451
LLVMValueRef a)
2452
{
2453
LLVMBuilderRef builder = bld->gallivm->builder;
2454
const struct lp_type type = bld->type;
2455
LLVMTypeRef int_vec_type = bld->int_vec_type;
2456
LLVMValueRef res;
2457
2458
assert(type.floating);
2459
assert(lp_check_value(type, a));
2460
2461
if (arch_rounding_available(type)) {
2462
res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2463
}
2464
else {
2465
struct lp_type inttype;
2466
struct lp_build_context intbld;
2467
LLVMValueRef trunc, itrunc, mask;
2468
2469
assert(type.floating);
2470
assert(lp_check_value(type, a));
2471
2472
inttype = type;
2473
inttype.floating = 0;
2474
lp_build_context_init(&intbld, bld->gallivm, inttype);
2475
2476
/* round by truncation */
2477
itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2478
trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2479
2480
/*
2481
* fix values if rounding is wrong (for non-special cases)
2482
* - this is the case if trunc < a
2483
* The results of doing this with NaNs, very large values etc.
2484
* are undefined but this seems to be the case anyway.
2485
*/
2486
mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2487
/* cheapie plus one with mask since the mask is minus one / zero */
2488
return lp_build_sub(&intbld, itrunc, mask);
2489
}
2490
2491
/* convert to int; res is already an integral (ceiled) value here, so truncation is exact */
2492
res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2493
2494
return res;
2495
}
2496
2497
2498
/**
2499
* Combined ifloor() & fract().
2500
*
2501
* Preferred to calling the functions separately, as it will ensure that the
2502
* strategy (floor() vs ifloor()) that results in less redundant work is used.
2503
*/
2504
void
2505
lp_build_ifloor_fract(struct lp_build_context *bld,
2506
LLVMValueRef a,
2507
LLVMValueRef *out_ipart,
2508
LLVMValueRef *out_fpart)
2509
{
2510
LLVMBuilderRef builder = bld->gallivm->builder;
2511
const struct lp_type type = bld->type;
2512
LLVMValueRef ipart;
2513
2514
assert(type.floating);
2515
assert(lp_check_value(type, a));
2516
2517
if (arch_rounding_available(type)) {
2518
/*
2519
* floor() is easier.
2520
*/
2521
2522
ipart = lp_build_floor(bld, a);
2523
*out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2524
*out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2525
}
2526
else {
2527
/*
2528
* ifloor() is easier.
2529
*/
2530
2531
*out_ipart = lp_build_ifloor(bld, a);
2532
ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2533
*out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2534
}
2535
}
2536
2537
2538
/**
2539
* Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2540
* always smaller than one.
2541
*/
2542
void
2543
lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2544
LLVMValueRef a,
2545
LLVMValueRef *out_ipart,
2546
LLVMValueRef *out_fpart)
2547
{
2548
lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2549
*out_fpart = clamp_fract(bld, *out_fpart);
2550
}
2551
2552
2553
LLVMValueRef
2554
lp_build_sqrt(struct lp_build_context *bld,
2555
LLVMValueRef a)
2556
{
2557
LLVMBuilderRef builder = bld->gallivm->builder;
2558
const struct lp_type type = bld->type;
2559
LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2560
char intrinsic[32];
2561
2562
assert(lp_check_value(type, a));
2563
2564
assert(type.floating);
2565
lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2566
2567
return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2568
}
2569
2570
2571
/**
 * Do one Newton-Raphson step to improve reciprocal precision:
 *
 *   x_{i+1} = x_i + x_i * (1 - a * x_i)
 *
 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
 * halo. It would be necessary to clamp the argument to prevent this.
 *
 * See also:
 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
2585
static inline LLVMValueRef
2586
lp_build_rcp_refine(struct lp_build_context *bld,
2587
LLVMValueRef a,
2588
LLVMValueRef rcp_a)
2589
{
2590
LLVMBuilderRef builder = bld->gallivm->builder;
2591
LLVMValueRef neg_a;
2592
LLVMValueRef res;
2593
2594
neg_a = LLVMBuildFNeg(builder, a, "");
2595
res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
2596
res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);
2597
2598
return res;
2599
}
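/*
 * The two fmuladds above compute the classic Newton-Raphson update for a
 * reciprocal in fused form (scalar sketch, assuming fmaf()):
 *
 *    t = fmaf(-a, x, 1.0f);     // 1 - a*x
 *    x = fmaf( t, x, x);        // x + x*(1 - a*x)
 *
 * Each step roughly doubles the number of correct mantissa bits of the
 * initial estimate x ~ 1/a.
 */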
2600
2601
2602
LLVMValueRef
2603
lp_build_rcp(struct lp_build_context *bld,
2604
LLVMValueRef a)
2605
{
2606
LLVMBuilderRef builder = bld->gallivm->builder;
2607
const struct lp_type type = bld->type;
2608
2609
assert(lp_check_value(type, a));
2610
2611
if(a == bld->zero)
2612
return bld->undef;
2613
if(a == bld->one)
2614
return bld->one;
2615
if(a == bld->undef)
2616
return bld->undef;
2617
2618
assert(type.floating);
2619
2620
if(LLVMIsConstant(a))
2621
return LLVMConstFDiv(bld->one, a);
2622
2623
/*
 * We don't use RCPPS because:
 * - it only has 10 bits of precision
 * - it doesn't even get the reciprocal of 1.0 exactly
 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
 * - for recent processors the benefit over DIVPS is marginal, and case
 *   dependent
 *
 * We could still use it on certain processors if benchmarks show that
 * RCPPS plus the necessary workarounds is still preferable to DIVPS; or for
 * particular uses that require fewer workarounds.
 */
2635
2636
if (FALSE && ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
2637
(util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8))){
2638
const unsigned num_iterations = 0;
2639
LLVMValueRef res;
2640
unsigned i;
2641
const char *intrinsic = NULL;
2642
2643
if (type.length == 4) {
2644
intrinsic = "llvm.x86.sse.rcp.ps";
2645
}
2646
else {
2647
intrinsic = "llvm.x86.avx.rcp.ps.256";
2648
}
2649
2650
res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2651
2652
for (i = 0; i < num_iterations; ++i) {
2653
res = lp_build_rcp_refine(bld, a, res);
2654
}
2655
2656
return res;
2657
}
2658
2659
return LLVMBuildFDiv(builder, bld->one, a, "");
2660
}
2661
2662
2663
/**
2664
* Do one Newton-Raphson step to improve rsqrt precision:
2665
*
2666
* x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2667
*
2668
* See also Intel 64 and IA-32 Architectures Optimization Manual.
2669
*/
2670
static inline LLVMValueRef
2671
lp_build_rsqrt_refine(struct lp_build_context *bld,
2672
LLVMValueRef a,
2673
LLVMValueRef rsqrt_a)
2674
{
2675
LLVMBuilderRef builder = bld->gallivm->builder;
2676
LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2677
LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2678
LLVMValueRef res;
2679
2680
res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2681
res = LLVMBuildFMul(builder, a, res, "");
2682
res = LLVMBuildFSub(builder, three, res, "");
2683
res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2684
res = LLVMBuildFMul(builder, half, res, "");
2685
2686
return res;
2687
}
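/*
 * Scalar sketch of the refinement step above, assuming x ~ 1/sqrt(a):
 *
 *    x = 0.5f * x * (3.0f - a * x * x);
 *
 * As with the rcp refinement, one step roughly doubles the precision of
 * the ~10-bit estimate produced by rsqrtps.
 */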
2688
2689
2690
/**
2691
* Generate 1/sqrt(a).
2692
* Result is undefined for values < 0, infinity for +0.
2693
*/
2694
LLVMValueRef
2695
lp_build_rsqrt(struct lp_build_context *bld,
2696
LLVMValueRef a)
2697
{
2698
const struct lp_type type = bld->type;
2699
2700
assert(lp_check_value(type, a));
2701
2702
assert(type.floating);
2703
2704
/*
2705
* This should be faster but all denormals will end up as infinity.
2706
*/
2707
if (0 && lp_build_fast_rsqrt_available(type)) {
2708
const unsigned num_iterations = 1;
2709
LLVMValueRef res;
2710
unsigned i;
2711
2712
/* rsqrt(1.0) != 1.0 here */
2713
res = lp_build_fast_rsqrt(bld, a);
2714
2715
if (num_iterations) {
2716
/*
2717
* Newton-Raphson will result in NaN instead of infinity for zero,
2718
* and NaN instead of zero for infinity.
2719
* Also, need to ensure rsqrt(1.0) == 1.0.
2720
* All numbers smaller than FLT_MIN will result in +infinity
2721
* (rsqrtps treats all denormals as zero).
2722
*/
2723
LLVMValueRef cmp;
2724
LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2725
LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2726
2727
for (i = 0; i < num_iterations; ++i) {
2728
res = lp_build_rsqrt_refine(bld, a, res);
2729
}
2730
cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2731
res = lp_build_select(bld, cmp, inf, res);
2732
cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2733
res = lp_build_select(bld, cmp, bld->zero, res);
2734
cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2735
res = lp_build_select(bld, cmp, bld->one, res);
2736
}
2737
2738
return res;
2739
}
2740
2741
return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2742
}
2743
2744
/**
 * Report whether a fast (but inaccurate) rsqrt instruction is available.
 * Callers may want to avoid rsqrt_fast if it isn't: e.g. x^0.5 can be
 * computed as rsqrt_fast(x) * x, but if rsqrt has to be emulated that
 * becomes sqrt/div/mul, in which case it is better to just call sqrt
 * directly, skipping both the div and the mul.
 */
2751
boolean
2752
lp_build_fast_rsqrt_available(struct lp_type type)
2753
{
2754
assert(type.floating);
2755
2756
if ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
2757
(util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
2758
return true;
2759
}
2760
return false;
2761
}
2762
2763
2764
/**
2765
* Generate 1/sqrt(a).
2766
* Result is undefined for values < 0, infinity for +0.
2767
* Precision is limited, only ~10 bits guaranteed
2768
* (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2769
*/
2770
LLVMValueRef
2771
lp_build_fast_rsqrt(struct lp_build_context *bld,
2772
LLVMValueRef a)
2773
{
2774
LLVMBuilderRef builder = bld->gallivm->builder;
2775
const struct lp_type type = bld->type;
2776
2777
assert(lp_check_value(type, a));
2778
2779
if (lp_build_fast_rsqrt_available(type)) {
2780
const char *intrinsic = NULL;
2781
2782
if (type.length == 4) {
2783
intrinsic = "llvm.x86.sse.rsqrt.ps";
2784
}
2785
else {
2786
intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2787
}
2788
return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2789
}
2790
else {
2791
debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2792
}
2793
return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2794
}
2795
2796
2797
/**
 * Generate sin(a) or cos(a) using polynomial approximation.
 * TODO: it might be worth recognizing sin and cos of the same source
 * (i.e. the d3d10 sincos opcode). Doing both at the same time would be
 * far cheaper than calculating (nearly) everything twice. It is unclear
 * whether that case is common enough to be worth the effort; the scs
 * opcode could also benefit from computing both at once.
 */
2805
static LLVMValueRef
2806
lp_build_sin_or_cos(struct lp_build_context *bld,
2807
LLVMValueRef a,
2808
boolean cos)
2809
{
2810
struct gallivm_state *gallivm = bld->gallivm;
2811
LLVMBuilderRef b = gallivm->builder;
2812
struct lp_type int_type = lp_int_type(bld->type);
2813
2814
/*
2815
* take the absolute value,
2816
* x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2817
*/
2818
2819
LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2820
LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2821
2822
LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2823
LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2824
2825
/*
2826
* scale by 4/Pi
2827
* y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2828
*/
2829
2830
LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2831
LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2832
2833
/*
2834
* store the integer part of y in mm0
2835
* emm2 = _mm_cvttps_epi32(y);
2836
*/
2837
2838
LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2839
2840
/*
2841
* j=(j+1) & (~1) (see the cephes sources)
2842
* emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2843
*/
2844
2845
LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2846
LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2847
/*
2848
* emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2849
*/
2850
LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2851
LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2852
2853
/*
2854
* y = _mm_cvtepi32_ps(emm2);
2855
*/
2856
LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2857
2858
LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2859
LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2860
LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2861
LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2862
2863
/*
2864
* Argument used for poly selection and sign bit determination
2865
* is different for sin vs. cos.
2866
*/
2867
LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2868
emm2_and;
2869
2870
LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2871
LLVMBuildNot(b, emm2_2, ""), ""),
2872
const_29, "sign_bit") :
2873
LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2874
LLVMBuildShl(b, emm2_add,
2875
const_29, ""), ""),
2876
sign_mask, "sign_bit");
2877
2878
/*
 * get the polynomial selection mask
 * there is one polynomial for 0 <= x <= Pi/4
 * and another one for Pi/4 < x <= Pi/2
 * Both branches will be computed.
 *
 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
 */
2887
2888
LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2889
LLVMValueRef poly_mask = lp_build_compare(gallivm,
2890
int_type, PIPE_FUNC_EQUAL,
2891
emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2892
2893
/*
2894
* _PS_CONST(minus_cephes_DP1, -0.78515625);
2895
* _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2896
* _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2897
*/
2898
LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2899
LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2900
LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2901
2902
/*
2903
* The magic pass: "Extended precision modular arithmetic"
2904
* x = ((x - y * DP1) - y * DP2) - y * DP3;
2905
*/
2906
LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2907
LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2908
LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2909
2910
/*
2911
* Evaluate the first polynomial (0 <= x <= Pi/4)
2912
*
2913
* z = _mm_mul_ps(x,x);
2914
*/
2915
LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2916
2917
/*
2918
* _PS_CONST(coscof_p0, 2.443315711809948E-005);
2919
* _PS_CONST(coscof_p1, -1.388731625493765E-003);
2920
* _PS_CONST(coscof_p2, 4.166664568298827E-002);
2921
*/
2922
LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2923
LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2924
LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2925
2926
/*
2927
* y = *(v4sf*)_ps_coscof_p0;
2928
* y = _mm_mul_ps(y, z);
2929
*/
2930
LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
2931
LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
2932
LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2933
LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2934
2935
2936
/*
2937
* tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2938
* y = _mm_sub_ps(y, tmp);
2939
* y = _mm_add_ps(y, *(v4sf*)_ps_1);
2940
*/
2941
LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2942
LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2943
LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2944
LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2945
LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2946
2947
/*
2948
* _PS_CONST(sincof_p0, -1.9515295891E-4);
2949
* _PS_CONST(sincof_p1, 8.3321608736E-3);
2950
* _PS_CONST(sincof_p2, -1.6666654611E-1);
2951
*/
2952
LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2953
LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2954
LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2955
2956
/*
2957
* Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
2958
*
2959
* y2 = *(v4sf*)_ps_sincof_p0;
2960
* y2 = _mm_mul_ps(y2, z);
2961
* y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2962
* y2 = _mm_mul_ps(y2, z);
2963
* y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2964
* y2 = _mm_mul_ps(y2, z);
2965
* y2 = _mm_mul_ps(y2, x);
2966
* y2 = _mm_add_ps(y2, x);
2967
*/
2968
2969
LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
2970
LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
2971
LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2972
LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
2973
2974
/*
2975
* select the correct result from the two polynomials
2976
* xmm3 = poly_mask;
2977
* y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2978
* y = _mm_andnot_ps(xmm3, y);
2979
* y = _mm_or_ps(y,y2);
2980
*/
2981
LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2982
LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2983
LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2984
LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2985
LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2986
LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2987
2988
/*
2989
* update the sign
2990
* y = _mm_xor_ps(y, sign_bit);
2991
*/
2992
LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
2993
LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2994
2995
LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2996
2997
/* clamp output to be within [-1, 1] */
2998
y_result = lp_build_clamp(bld, y_result,
2999
lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3000
lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3001
/* If a is -inf, inf or NaN then return NaN */
3002
y_result = lp_build_select(bld, isfinite, y_result,
3003
lp_build_const_vec(bld->gallivm, bld->type, NAN));
3004
return y_result;
3005
}
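/*
 * High-level recipe of the Cephes-style range reduction implemented above
 * (scalar sketch; DP1+DP2+DP3 is a three-term split of -Pi/4 used for
 * extended precision):
 *
 *    x = fabsf(a);
 *    j = ((int)(x * 4.0f/M_PI) + 1) & ~1;        // even multiple of Pi/4
 *    x = ((x + j*DP1) + j*DP2) + j*DP3;          // reduce into [-Pi/4, Pi/4]
 *    // the low bits of j select between the sin and cos minimax
 *    // polynomials and determine the sign of the final result
 */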
3006
3007
3008
/**
3009
* Generate sin(a)
3010
*/
3011
LLVMValueRef
3012
lp_build_sin(struct lp_build_context *bld,
3013
LLVMValueRef a)
3014
{
3015
return lp_build_sin_or_cos(bld, a, FALSE);
3016
}
3017
3018
3019
/**
3020
* Generate cos(a)
3021
*/
3022
LLVMValueRef
3023
lp_build_cos(struct lp_build_context *bld,
3024
LLVMValueRef a)
3025
{
3026
return lp_build_sin_or_cos(bld, a, TRUE);
3027
}
3028
3029
3030
/**
3031
* Generate pow(x, y)
3032
*/
3033
LLVMValueRef
3034
lp_build_pow(struct lp_build_context *bld,
3035
LLVMValueRef x,
3036
LLVMValueRef y)
3037
{
3038
/* TODO: optimize the constant case */
3039
if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3040
LLVMIsConstant(x) && LLVMIsConstant(y)) {
3041
debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3042
__FUNCTION__);
3043
}
3044
3045
LLVMValueRef cmp = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x, lp_build_const_vec(bld->gallivm, bld->type, 0.0f));
3046
LLVMValueRef res = lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3047
3048
res = lp_build_select(bld, cmp, lp_build_const_vec(bld->gallivm, bld->type, 0.0f), res);
3049
return res;
3050
}
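/*
 * In scalar terms the helper above computes (sketch):
 *
 *    pow(x, y) = (x == 0.0f) ? 0.0f : exp2f(y * log2f(x));
 *
 * i.e. pow is built on top of the exp2/log2 approximations in this file,
 * with x == 0 special-cased so that 0^y yields 0 rather than garbage from
 * log2(0).
 */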
3051
3052
3053
/**
3054
* Generate exp(x)
3055
*/
3056
LLVMValueRef
3057
lp_build_exp(struct lp_build_context *bld,
3058
LLVMValueRef x)
3059
{
3060
/* log2(e) = 1/log(2) */
3061
LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3062
1.4426950408889634);
3063
3064
assert(lp_check_value(bld->type, x));
3065
3066
return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3067
}
3068
3069
3070
/**
3071
* Generate log(x)
3072
* Behavior is undefined with infs, 0s and nans
3073
*/
3074
LLVMValueRef
3075
lp_build_log(struct lp_build_context *bld,
3076
LLVMValueRef x)
3077
{
3078
/* log(2) */
3079
LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3080
0.69314718055994529);
3081
3082
assert(lp_check_value(bld->type, x));
3083
3084
return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3085
}
3086
3087
/**
3088
* Generate log(x) that handles edge cases (infs, 0s and nans)
3089
*/
3090
LLVMValueRef
3091
lp_build_log_safe(struct lp_build_context *bld,
3092
LLVMValueRef x)
3093
{
3094
/* log(2) */
3095
LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3096
0.69314718055994529);
3097
3098
assert(lp_check_value(bld->type, x));
3099
3100
return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3101
}
3102
3103
3104
/**
3105
* Generate polynomial.
3106
* Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3107
*/
3108
LLVMValueRef
3109
lp_build_polynomial(struct lp_build_context *bld,
3110
LLVMValueRef x,
3111
const double *coeffs,
3112
unsigned num_coeffs)
3113
{
3114
const struct lp_type type = bld->type;
3115
LLVMValueRef even = NULL, odd = NULL;
3116
LLVMValueRef x2;
3117
unsigned i;
3118
3119
assert(lp_check_value(bld->type, x));
3120
3121
/* TODO: optimize the constant case */
3122
if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3123
LLVMIsConstant(x)) {
3124
debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3125
__FUNCTION__);
3126
}
3127
3128
/*
 * Calculate odd and even terms separately to decrease data dependency
 * Ex:
 * c[0] + x^2 * c[2] + x^4 * c[4] ...
 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
 */
3134
x2 = lp_build_mul(bld, x, x);
3135
3136
for (i = num_coeffs; i--; ) {
3137
LLVMValueRef coeff;
3138
3139
coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3140
3141
if (i % 2 == 0) {
3142
if (even)
3143
even = lp_build_mad(bld, x2, even, coeff);
3144
else
3145
even = coeff;
3146
} else {
3147
if (odd)
3148
odd = lp_build_mad(bld, x2, odd, coeff);
3149
else
3150
odd = coeff;
3151
}
3152
}
3153
3154
if (odd)
3155
return lp_build_mad(bld, odd, x, even);
3156
else if (even)
3157
return even;
3158
else
3159
return bld->undef;
3160
}
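/*
 * Scalar sketch of the even/odd Horner split above for a degree-4
 * polynomial c0 + c1*x + c2*x^2 + c3*x^3 + c4*x^4 (coefficient names are
 * illustrative only):
 *
 *    x2   = x * x;
 *    even = fmaf(x2, fmaf(x2, c4, c2), c0);   // c0 + c2*x^2 + c4*x^4
 *    odd  = fmaf(x2, c3, c1);                 // c1 + c3*x^2
 *    return fmaf(odd, x, even);               // even + x * odd
 */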
3161
3162
3163
/**
3164
* Minimax polynomial fit of 2**x, in range [0, 1[
3165
*/
3166
const double lp_build_exp2_polynomial[] = {
3167
#if EXP_POLY_DEGREE == 5
3168
1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3169
0.693153073200168932794,
3170
0.240153617044375388211,
3171
0.0558263180532956664775,
3172
0.00898934009049466391101,
3173
0.00187757667519147912699
3174
#elif EXP_POLY_DEGREE == 4
3175
1.00000259337069434683,
3176
0.693003834469974940458,
3177
0.24144275689150793076,
3178
0.0520114606103070150235,
3179
0.0135341679161270268764
3180
#elif EXP_POLY_DEGREE == 3
3181
0.999925218562710312959,
3182
0.695833540494823811697,
3183
0.226067155427249155588,
3184
0.0780245226406372992967
3185
#elif EXP_POLY_DEGREE == 2
3186
1.00172476321474503578,
3187
0.657636275736077639316,
3188
0.33718943461968720704
3189
#else
3190
#error
3191
#endif
3192
};
3193
3194
3195
LLVMValueRef
3196
lp_build_exp2(struct lp_build_context *bld,
3197
LLVMValueRef x)
3198
{
3199
LLVMBuilderRef builder = bld->gallivm->builder;
3200
const struct lp_type type = bld->type;
3201
LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3202
LLVMValueRef ipart = NULL;
3203
LLVMValueRef fpart = NULL;
3204
LLVMValueRef expipart = NULL;
3205
LLVMValueRef expfpart = NULL;
3206
LLVMValueRef res = NULL;
3207
3208
assert(lp_check_value(bld->type, x));
3209
3210
/* TODO: optimize the constant case */
3211
if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3212
LLVMIsConstant(x)) {
3213
debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3214
__FUNCTION__);
3215
}
3216
3217
assert(type.floating && type.width == 32);
3218
3219
/* We want to preserve NaN and make sure that for exp2 if x > 128,
 * the result is INF and if it's smaller than -126.9 the result is 0 */
3221
x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3222
GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3223
x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3224
x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3225
3226
/* ipart = floor(x) */
3227
/* fpart = x - ipart */
3228
lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3229
3230
/* expipart = (float) (1 << ipart) */
3231
expipart = LLVMBuildAdd(builder, ipart,
3232
lp_build_const_int_vec(bld->gallivm, type, 127), "");
3233
expipart = LLVMBuildShl(builder, expipart,
3234
lp_build_const_int_vec(bld->gallivm, type, 23), "");
3235
expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3236
3237
expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3238
ARRAY_SIZE(lp_build_exp2_polynomial));
3239
3240
res = LLVMBuildFMul(builder, expipart, expfpart, "");
3241
3242
return res;
3243
}
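/*
 * Scalar sketch of the strategy above (assumes 32-bit IEEE floats and a
 * bits_to_float() helper standing in for a bitcast):
 *
 *    x     = clamp(x, -126.99999f, 128.0f);
 *    ipart = (int)floorf(x);
 *    fpart = x - ipart;                                       // in [0, 1)
 *    scale = bits_to_float((uint32_t)(ipart + 127) << 23);    // 2^ipart
 *    return scale * poly(fpart);                              // poly(f) ~ 2^f
 */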
3244
3245
3246
3247
/**
3248
* Extract the exponent of an IEEE-754 floating point value.
3249
*
3250
* Optionally apply an integer bias.
3251
*
3252
* Result is an integer value with
3253
*
3254
* ifloor(log2(x)) + bias
3255
*/
3256
LLVMValueRef
3257
lp_build_extract_exponent(struct lp_build_context *bld,
3258
LLVMValueRef x,
3259
int bias)
3260
{
3261
LLVMBuilderRef builder = bld->gallivm->builder;
3262
const struct lp_type type = bld->type;
3263
unsigned mantissa = lp_mantissa(type);
3264
LLVMValueRef res;
3265
3266
assert(type.floating);
3267
3268
assert(lp_check_value(bld->type, x));
3269
3270
x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3271
3272
res = LLVMBuildLShr(builder, x,
3273
lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3274
res = LLVMBuildAnd(builder, res,
3275
lp_build_const_int_vec(bld->gallivm, type, 255), "");
3276
res = LLVMBuildSub(builder, res,
3277
lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3278
3279
return res;
3280
}
3281
3282
3283
/**
 * Extract the mantissa of a floating point value.
 *
 * Result is a floating point value with
 *
 *   x / 2**floor(log2(x))
 */
3290
LLVMValueRef
3291
lp_build_extract_mantissa(struct lp_build_context *bld,
3292
LLVMValueRef x)
3293
{
3294
LLVMBuilderRef builder = bld->gallivm->builder;
3295
const struct lp_type type = bld->type;
3296
unsigned mantissa = lp_mantissa(type);
3297
LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3298
(1ULL << mantissa) - 1);
3299
LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3300
LLVMValueRef res;
3301
3302
assert(lp_check_value(bld->type, x));
3303
3304
assert(type.floating);
3305
3306
x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3307
3308
/* res = x / 2**ipart */
3309
res = LLVMBuildAnd(builder, x, mantmask, "");
3310
res = LLVMBuildOr(builder, res, one, "");
3311
res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3312
3313
return res;
3314
}
3315
3316
3317
3318
/**
 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 */
3323
const double lp_build_log2_polynomial[] = {
3324
#if LOG_POLY_DEGREE == 5
3325
2.88539008148777786488L,
3326
0.961796878841293367824L,
3327
0.577058946784739859012L,
3328
0.412914355135828735411L,
3329
0.308591899232910175289L,
3330
0.352376952300281371868L,
3331
#elif LOG_POLY_DEGREE == 4
3332
2.88539009343309178325L,
3333
0.961791550404184197881L,
3334
0.577440339438736392009L,
3335
0.403343858251329912514L,
3336
0.406718052498846252698L,
3337
#elif LOG_POLY_DEGREE == 3
3338
2.88538959748872753838L,
3339
0.961932915889597772928L,
3340
0.571118517972136195241L,
3341
0.493997535084709500285L,
3342
#else
3343
#error
3344
#endif
3345
};
3346
3347
/**
3348
* See http://www.devmaster.net/forums/showthread.php?p=43580
3349
* http://en.wikipedia.org/wiki/Logarithm#Calculation
3350
* http://www.nezumi.demon.co.uk/consult/logx.htm
3351
*
3352
* If handle_edge_cases is true the function will perform computations
3353
* to match the required D3D10+ behavior for each of the edge cases.
3354
* That means that if input is:
3355
* - less than zero (down to and including -inf) then NaN will be returned
3356
* - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3357
* - +infinity, then +infinity will be returned
3358
* - NaN, then NaN will be returned
3359
*
3360
* Those checks are fairly expensive so if you don't need them make sure
3361
* handle_edge_cases is false.
3362
*/
3363
void
3364
lp_build_log2_approx(struct lp_build_context *bld,
3365
LLVMValueRef x,
3366
LLVMValueRef *p_exp,
3367
LLVMValueRef *p_floor_log2,
3368
LLVMValueRef *p_log2,
3369
boolean handle_edge_cases)
3370
{
3371
LLVMBuilderRef builder = bld->gallivm->builder;
3372
const struct lp_type type = bld->type;
3373
LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3374
LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3375
3376
LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3377
LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3378
LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3379
3380
LLVMValueRef i = NULL;
3381
LLVMValueRef y = NULL;
3382
LLVMValueRef z = NULL;
3383
LLVMValueRef exp = NULL;
3384
LLVMValueRef mant = NULL;
3385
LLVMValueRef logexp = NULL;
3386
LLVMValueRef p_z = NULL;
3387
LLVMValueRef res = NULL;
3388
3389
assert(lp_check_value(bld->type, x));
3390
3391
if(p_exp || p_floor_log2 || p_log2) {
3392
/* TODO: optimize the constant case */
3393
if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3394
LLVMIsConstant(x)) {
3395
debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3396
__FUNCTION__);
3397
}
3398
3399
assert(type.floating && type.width == 32);
3400
3401
/*
 * We don't explicitly handle denormalized numbers. They will yield a
 * result in the neighbourhood of -127, which appears to be adequate.
 */
3406
3407
i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3408
3409
/* exp = (float) exponent(x) */
3410
exp = LLVMBuildAnd(builder, i, expmask, "");
3411
}
3412
3413
if(p_floor_log2 || p_log2) {
3414
logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3415
logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3416
logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3417
}
3418
3419
if (p_log2) {
3420
/* mant = 1 + (float) mantissa(x) */
3421
mant = LLVMBuildAnd(builder, i, mantmask, "");
3422
mant = LLVMBuildOr(builder, mant, one, "");
3423
mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3424
3425
/* y = (mant - 1) / (mant + 1) */
3426
y = lp_build_div(bld,
3427
lp_build_sub(bld, mant, bld->one),
3428
lp_build_add(bld, mant, bld->one)
3429
);
3430
3431
/* z = y^2 */
3432
z = lp_build_mul(bld, y, y);
3433
3434
/* compute P(z) */
3435
p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3436
ARRAY_SIZE(lp_build_log2_polynomial));
3437
3438
/* y * P(z) + logexp */
3439
res = lp_build_mad(bld, y, p_z, logexp);
3440
3441
if (type.floating && handle_edge_cases) {
3442
LLVMValueRef negmask, infmask, zmask;
3443
negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3444
lp_build_const_vec(bld->gallivm, type, 0.0f));
3445
zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3446
lp_build_const_vec(bld->gallivm, type, 0.0f));
3447
infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3448
lp_build_const_vec(bld->gallivm, type, INFINITY));
3449
3450
/* If x is equal to inf make sure we return inf */
3451
res = lp_build_select(bld, infmask,
3452
lp_build_const_vec(bld->gallivm, type, INFINITY),
3453
res);
3454
/* If x is equal to 0, return -inf */
3455
res = lp_build_select(bld, zmask,
3456
lp_build_const_vec(bld->gallivm, type, -INFINITY),
3457
res);
3458
/* If x is nan or less than 0, return nan */
3459
res = lp_build_select(bld, negmask,
3460
lp_build_const_vec(bld->gallivm, type, NAN),
3461
res);
3462
}
3463
}
3464
3465
if (p_exp) {
3466
exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3467
*p_exp = exp;
3468
}
3469
3470
if (p_floor_log2)
3471
*p_floor_log2 = logexp;
3472
3473
if (p_log2)
3474
*p_log2 = res;
3475
}
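/*
 * The math behind the p_log2 path above (sketch):
 *
 *    x       = 2^e * m,  with 1 <= m < 2
 *    log2(x) = e + log2(m)
 *    y       = (m - 1) / (m + 1)
 *    log2(m) = 2/ln(2) * (y + y^3/3 + y^5/5 + ...)  ~=  y * P(y^2)
 *
 * where P is the minimax polynomial in lp_build_log2_polynomial (note its
 * leading coefficient ~2.885 = 2/ln(2)).
 */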
3476
3477
3478
/*
3479
* log2 implementation which doesn't have special code to
3480
* handle edge cases (-inf, 0, inf, NaN). It's faster but
3481
* the results for those cases are undefined.
3482
*/
3483
LLVMValueRef
3484
lp_build_log2(struct lp_build_context *bld,
3485
LLVMValueRef x)
3486
{
3487
LLVMValueRef res;
3488
lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3489
return res;
3490
}
3491
3492
/*
3493
* Version of log2 which handles all edge cases.
3494
* Look at documentation of lp_build_log2_approx for
3495
* description of the behavior for each of the edge cases.
3496
*/
3497
LLVMValueRef
3498
lp_build_log2_safe(struct lp_build_context *bld,
3499
LLVMValueRef x)
3500
{
3501
LLVMValueRef res;
3502
lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3503
return res;
3504
}
3505
3506
3507
/**
3508
* Faster (and less accurate) log2.
3509
*
3510
* log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3511
*
3512
* Piece-wise linear approximation, with exact results when x is a
3513
* power of two.
3514
*
3515
* See http://www.flipcode.com/archives/Fast_log_Function.shtml
3516
*/
3517
LLVMValueRef
3518
lp_build_fast_log2(struct lp_build_context *bld,
3519
LLVMValueRef x)
3520
{
3521
LLVMBuilderRef builder = bld->gallivm->builder;
3522
LLVMValueRef ipart;
3523
LLVMValueRef fpart;
3524
3525
assert(lp_check_value(bld->type, x));
3526
3527
assert(bld->type.floating);
3528
3529
/* ipart = floor(log2(x)) - 1 */
3530
ipart = lp_build_extract_exponent(bld, x, -1);
3531
ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3532
3533
/* fpart = x / 2**ipart */
3534
fpart = lp_build_extract_mantissa(bld, x);
3535
3536
/* ipart + fpart */
3537
return LLVMBuildFAdd(builder, ipart, fpart, "");
3538
}
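/*
 * Worked example of the piece-wise linear approximation above: for
 * x = 6.0 we have floor(log2(x)) = 2, so
 *
 *    ipart  = 2 - 1        = 1.0
 *    fpart  = 6.0 / 2^2    = 1.5
 *    result = 1.0 + 1.5    = 2.5      (log2(6.0) ~ 2.585)
 *
 * and the result is exact whenever x is a power of two.
 */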
3539
3540
3541
/**
3542
* Fast implementation of iround(log2(x)).
3543
*
3544
* Not an approximation -- it should give accurate results all the time.
3545
*/
3546
LLVMValueRef
3547
lp_build_ilog2(struct lp_build_context *bld,
3548
LLVMValueRef x)
3549
{
3550
LLVMBuilderRef builder = bld->gallivm->builder;
3551
LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3552
LLVMValueRef ipart;
3553
3554
assert(bld->type.floating);
3555
3556
assert(lp_check_value(bld->type, x));
3557
3558
/* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3559
x = LLVMBuildFMul(builder, x, sqrt2, "");
3560
3561
/* ipart = floor(log2(x) + 0.5) */
3562
ipart = lp_build_extract_exponent(bld, x, 0);
3563
3564
return ipart;
3565
}
3566
3567
LLVMValueRef
3568
lp_build_mod(struct lp_build_context *bld,
3569
LLVMValueRef x,
3570
LLVMValueRef y)
3571
{
3572
LLVMBuilderRef builder = bld->gallivm->builder;
3573
LLVMValueRef res;
3574
const struct lp_type type = bld->type;
3575
3576
assert(lp_check_value(type, x));
3577
assert(lp_check_value(type, y));
3578
3579
if (type.floating)
3580
res = LLVMBuildFRem(builder, x, y, "");
3581
else if (type.sign)
3582
res = LLVMBuildSRem(builder, x, y, "");
3583
else
3584
res = LLVMBuildURem(builder, x, y, "");
3585
return res;
3586
}
3587
3588
3589
/*
3590
* For floating inputs it creates and returns a mask
3591
* which is all 1's for channels which are NaN.
3592
* Channels inside x which are not NaN will be 0.
3593
*/
3594
LLVMValueRef
3595
lp_build_isnan(struct lp_build_context *bld,
3596
LLVMValueRef x)
3597
{
3598
LLVMValueRef mask;
3599
LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3600
3601
assert(bld->type.floating);
3602
assert(lp_check_value(bld->type, x));
3603
3604
mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3605
"isnotnan");
3606
mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3607
mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3608
return mask;
3609
}
3610
3611
/* Returns all 1's for floating point numbers that are
 * finite and returns all zeros for -inf,
 * +inf and NaNs */
3614
LLVMValueRef
3615
lp_build_isfinite(struct lp_build_context *bld,
3616
LLVMValueRef x)
3617
{
3618
LLVMBuilderRef builder = bld->gallivm->builder;
3619
LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3620
struct lp_type int_type = lp_int_type(bld->type);
3621
LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3622
LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3623
0x7f800000);
3624
3625
if (!bld->type.floating) {
3626
return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3627
}
3628
assert(bld->type.floating);
3629
assert(lp_check_value(bld->type, x));
3630
assert(bld->type.width == 32);
3631
3632
intx = LLVMBuildAnd(builder, intx, infornan32, "");
3633
return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3634
intx, infornan32);
3635
}
3636
3637
/*
3638
* Returns true if the number is nan or inf and false otherwise.
3639
* The input has to be a floating point vector.
3640
*/
3641
LLVMValueRef
3642
lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3643
const struct lp_type type,
3644
LLVMValueRef x)
3645
{
3646
LLVMBuilderRef builder = gallivm->builder;
3647
struct lp_type int_type = lp_int_type(type);
3648
LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3649
0x7f800000);
3650
LLVMValueRef ret;
3651
3652
assert(type.floating);
3653
3654
ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3655
ret = LLVMBuildAnd(builder, ret, const0, "");
3656
ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3657
ret, const0);
3658
3659
return ret;
3660
}
3661
3662
3663
LLVMValueRef
3664
lp_build_fpstate_get(struct gallivm_state *gallivm)
3665
{
3666
if (util_get_cpu_caps()->has_sse) {
3667
LLVMBuilderRef builder = gallivm->builder;
3668
LLVMValueRef mxcsr_ptr = lp_build_alloca(
3669
gallivm,
3670
LLVMInt32TypeInContext(gallivm->context),
3671
"mxcsr_ptr");
3672
LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3673
LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3674
lp_build_intrinsic(builder,
3675
"llvm.x86.sse.stmxcsr",
3676
LLVMVoidTypeInContext(gallivm->context),
3677
&mxcsr_ptr8, 1, 0);
3678
return mxcsr_ptr;
3679
}
3680
return 0;
3681
}
3682
3683
void
3684
lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3685
boolean zero)
3686
{
3687
if (util_get_cpu_caps()->has_sse) {
3688
/* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3689
int daz_ftz = _MM_FLUSH_ZERO_MASK;
3690
3691
LLVMBuilderRef builder = gallivm->builder;
3692
LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3693
LLVMValueRef mxcsr =
3694
LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3695
3696
if (util_get_cpu_caps()->has_daz) {
3697
/* Enable denormals-are-zero mode */
3698
daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3699
}
3700
if (zero) {
3701
mxcsr = LLVMBuildOr(builder, mxcsr,
3702
LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3703
} else {
3704
mxcsr = LLVMBuildAnd(builder, mxcsr,
3705
LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3706
}
3707
3708
LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3709
lp_build_fpstate_set(gallivm, mxcsr_ptr);
3710
}
3711
}
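/*
 * At runtime the generated code amounts to the following host-side sketch
 * using the SSE control-register intrinsics (DAZ is only set when the CPU
 * reports support for it):
 *
 *    unsigned mxcsr   = _mm_getcsr();
 *    unsigned daz_ftz = _MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK;
 *    if (zero) mxcsr |=  daz_ftz;
 *    else      mxcsr &= ~daz_ftz;
 *    _mm_setcsr(mxcsr);
 */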
3712
3713
void
3714
lp_build_fpstate_set(struct gallivm_state *gallivm,
3715
LLVMValueRef mxcsr_ptr)
3716
{
3717
if (util_get_cpu_caps()->has_sse) {
3718
LLVMBuilderRef builder = gallivm->builder;
3719
mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3720
LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3721
lp_build_intrinsic(builder,
3722
"llvm.x86.sse.ldmxcsr",
3723
LLVMVoidTypeInContext(gallivm->context),
3724
&mxcsr_ptr, 1, 0);
3725
}
3726
}
3727