Path: blob/21.2-virgl/src/gallium/auxiliary/gallivm/lp_bld_arit.c
/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. Reasons are:
 * - it is very easy given we have all necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - We often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in [0, 1] range.
 *
 * @author Jose Fonseca <[email protected]>
 */


#include <float.h>

#include <llvm/Config/llvm-config.h>

#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_arit.h"
#include "lp_bld_flow.h"

#if defined(PIPE_ARCH_SSE)
#include <xmmintrin.h>
#endif

#ifndef _MM_DENORMALS_ZERO_MASK
#define _MM_DENORMALS_ZERO_MASK 0x0040
#endif

#ifndef _MM_FLUSH_ZERO_MASK
#define _MM_FLUSH_ZERO_MASK 0x8000
#endif

#define EXP_POLY_DEGREE 5

#define LOG_POLY_DEGREE 4


/**
 * Generate min(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_get_cpu_caps()->has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.min.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
            intrinsic = "llvm.x86.sse.min.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.min.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
            intrinsic = "llvm.x86.sse2.min.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_get_cpu_caps()->has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vminfp";
         intr_size = 128;
      }
   } else if (util_get_cpu_caps()->has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminub";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsw";
         }
      }
   }

   if (intrinsic) {
      /* We need to handle NaNs for floating point numbers. If one of the
       * inputs is NaN the other should be returned (required by both D3D10+
       * and OpenCL).
       * The SSE intrinsics return the second operand in case of NaN by
       * default, so we need special code to handle those.
       */
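      /*
       * Minimal worked example of the select below (illustrative comment,
       * not from the original source): SSE minps(a, b) returns b whenever
       * either input is NaN. So for min = minps(a, b):
       *   a = NaN, b = 2.0  ->  min = 2.0, the non-NaN, already correct;
       *   a = 2.0, b = NaN  ->  min = NaN, so we must detect isnan(b) and
       *                         select a instead.
       * Hence a single isnan(b) test suffices to get "return the other
       * operand" semantics.
       */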
      if (util_get_cpu_caps()->has_sse && type.floating &&
          nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
         LLVMValueRef isnan, min;
         min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type,
                                                   intr_size, a, b);
         isnan = lp_build_isnan(bld, b);
         return lp_build_select(bld, isnan, a, min);
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type,
                                                    intr_size, a, b);
      }
   }

   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
         break;
      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
         return lp_build_select(bld, cond, b, a);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
         break;
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}


LLVMValueRef
lp_build_fmuladd(LLVMBuilderRef builder,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 LLVMValueRef c)
{
   LLVMTypeRef type = LLVMTypeOf(a);
   assert(type == LLVMTypeOf(b));
   assert(type == LLVMTypeOf(c));

   char intrinsic[32];
   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
   LLVMValueRef args[] = { a, b, c };
   return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
}
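/*
 * Illustrative note (not from the original source): lp_format_intrinsic
 * mangles the argument type into the name, so for a <4 x float> argument
 * the call above resolves to the overloaded "llvm.fmuladd.v4f32" intrinsic,
 * and for a plain float to "llvm.fmuladd.f32".
 */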
/**
 * Generate max(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_get_cpu_caps()->has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.max.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
            intrinsic = "llvm.x86.sse.max.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.max.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
            intrinsic = "llvm.x86.sse2.max.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_get_cpu_caps()->has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vmaxfp";
         intr_size = 128;
      }
   } else if (util_get_cpu_caps()->has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxub";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsw";
         }
      }
   }

   if (intrinsic) {
      if (util_get_cpu_caps()->has_sse && type.floating &&
          nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
         LLVMValueRef isnan, max;
         max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type,
                                                   intr_size, a, b);
         isnan = lp_build_isnan(bld, b);
         return lp_build_select(bld, isnan, a, max);
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type,
                                                    intr_size, a, b);
      }
   }

   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
         break;
      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
         return lp_build_select(bld, cond, b, a);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
         break;
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}


/**
 * Generate 1 - a, or ~a depending on bld->type.
 */
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->one)
      return bld->zero;
   if(a == bld->zero)
      return bld->one;

   if(type.norm && !type.floating && !type.fixed && !type.sign) {
      if(LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(builder, a, "");
   }

   if(LLVMIsConstant(a))
      if (type.floating)
          return LLVMConstFSub(bld->one, a);
      else
          return LLVMConstSub(bld->one, a);
   else
      if (type.floating)
         return LLVMBuildFSub(builder, bld->one, a, "");
      else
         return LLVMBuildSub(builder, bld->one, a, "");
}


/**
 * Generate a + b
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == bld->zero)
      return b;
   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (type.norm) {
      const char *intrinsic = NULL;

      if (!type.sign && (a == bld->one || b == bld->one))
        return bld->one;

      if (!type.floating && !type.fixed) {
         if (LLVM_VERSION_MAJOR >= 8) {
            char intrin[32];
            intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
            lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
            return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
         }
         if (type.width * type.length == 128) {
            if (util_get_cpu_caps()->has_sse2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
            } else if (util_get_cpu_caps()->has_altivec) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
            }
         }
         if (type.width * type.length == 256) {
            if (util_get_cpu_caps()->has_avx2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
            }
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(type.norm && !type.floating && !type.fixed) {
      if (type.sign) {
         uint64_t sign = (uint64_t)1 << (type.width - 1);
         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
         /* a_clamp_max is the maximum a for positive b,
            a_clamp_min is the minimum a for negative b. */
         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
      }
   }
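   /*
    * Worked example of the clamping above (illustrative comment, not from
    * the original source), for signed normalized 8-bit values where
    * max_val = 127 and min_val = -128:
    *   a = 100, b = 50:  a_clamp_max = min(100, 127 - 50) = 77, and since
    *   b > 0 it is chosen, so the subsequent add yields 77 + 50 = 127,
    *   i.e. the correctly saturated result instead of the wrapped -106.
    */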
   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);

   if (type.norm && !type.floating && !type.fixed) {
      if (!type.sign) {
         /*
          * Newer llvm versions no longer support the intrinsics, but recognize
          * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
          * code, it is important we match the pattern llvm uses (and pray llvm
          * doesn't change it - and hope they decide on the same pattern for
          * all backends supporting it...).
          * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
          * interfere with llvm's ability to recognize the pattern but seems
          * a bit brittle.
          * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
          */
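         /*
          * Illustrative example (comment added for clarity, not from the
          * original source), for unsigned normalized 8-bit values:
          *   a = 200, b = 100:  the plain add wraps to res = 44, so the
          *   "overflowed" test a > res (200 > 44) fires and the select
          *   replaces res with all-ones, i.e. the saturated value 255.
          */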
         LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
         res = lp_build_select(bld, overflowed,
                               LLVMConstAllOnes(bld->int_vec_type), res);
      }
   }

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}


/** Return the scalar sum of the elements of a.
 * Should avoid this operation whenever possible.
 */
LLVMValueRef
lp_build_horizontal_add(struct lp_build_context *bld,
                        LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i, length;
   LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef vecres, elem2;

   assert(lp_check_value(type, a));

   if (type.length == 1) {
      return a;
   }

   assert(!bld->type.norm);

   /*
    * For byte vectors can do much better with psadbw.
    * Using repeated shuffle/adds here. Note with multiple vectors
    * this can be done more efficiently as outlined in the intel
    * optimization manual.
    * Note: could cause data rearrangement if used with smaller element
    * sizes.
    */
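   /*
    * Minimal worked example of the halving loop below (illustrative comment,
    * not from the original source), for a length-4 vector [a0 a1 a2 a3]:
    *   pass 1: [a0 a1] + [a2 a3] -> [a0+a2  a1+a3]
    * then the two remaining elements are extracted and added as scalars,
    * giving (a0+a2) + (a1+a3).
    */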
   vecres = a;
   length = type.length / 2;
   while (length > 1) {
      LLVMValueRef vec1, vec2;
      for (i = 0; i < length; i++) {
         shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
         shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
      }
      vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles1, length), "");
      vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles2, length), "");
      if (type.floating) {
         vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
      }
      else {
         vecres = LLVMBuildAdd(builder, vec1, vec2, "");
      }
      length = length >> 1;
   }

   /* always have vector of size 2 here */
   assert(length == 1);

   index = lp_build_const_int32(bld->gallivm, 0);
   res = LLVMBuildExtractElement(builder, vecres, index, "");
   index = lp_build_const_int32(bld->gallivm, 1);
   elem2 = LLVMBuildExtractElement(builder, vecres, index, "");

   if (type.floating)
      res = LLVMBuildFAdd(builder, res, elem2, "");
   else
      res = LLVMBuildAdd(builder, res, elem2, "");

   return res;
}

/**
 * Return the horizontal sums of 4 float vectors as a float4 vector.
 * This uses the technique as outlined in Intel Optimization Manual.
 */
static LLVMValueRef
lp_build_horizontal_add4x4f(struct lp_build_context *bld,
                            LLVMValueRef src[4])
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef shuffles[4];
   LLVMValueRef tmp[4];
   LLVMValueRef sumtmp[2], shuftmp[2];

   /* lower half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 1);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 5);
   tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   /* upper half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 2);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 6);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
   sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");

   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 2);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 6);
   shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   shuffles[0] = lp_build_const_int32(gallivm, 1);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 5);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
}


/*
 * Partially horizontally add 2-4 float vectors with length nx4,
 * i.e. only four adjacent values in each vector will be added,
 * assuming values are really grouped in 4 which also determines
 * output order.
 *
 * Return a vector of the same length as the initial vectors,
 * with the excess elements (if any) being undefined.
 * The element order is independent of number of input vectors.
 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 * the output order thus will be
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
 */
LLVMValueRef
lp_build_hadd_partial4(struct lp_build_context *bld,
                       LLVMValueRef vectors[],
                       unsigned num_vecs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ret_vec;
   LLVMValueRef tmp[4];
   const char *intrinsic = NULL;

   assert(num_vecs >= 2 && num_vecs <= 4);
   assert(bld->type.floating);

   /* only use this with at least 2 vectors, as it is sort of expensive
    * (depending on cpu) and we always need two horizontal adds anyway,
    * so a shuffle/add approach might be better.
    */

   tmp[0] = vectors[0];
   tmp[1] = vectors[1];

   tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
   tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];

   if (util_get_cpu_caps()->has_sse3 && bld->type.width == 32 &&
       bld->type.length == 4) {
      intrinsic = "llvm.x86.sse3.hadd.ps";
   }
   else if (util_get_cpu_caps()->has_avx && bld->type.width == 32 &&
            bld->type.length == 8) {
      intrinsic = "llvm.x86.avx.hadd.ps.256";
   }
   if (intrinsic) {
      tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, bld->type),
                                         tmp[0], tmp[1]);
      if (num_vecs > 2) {
         tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                            lp_build_vec_type(gallivm, bld->type),
                                            tmp[2], tmp[3]);
      }
      else {
         tmp[1] = tmp[0];
      }
      return lp_build_intrinsic_binary(builder, intrinsic,
                                       lp_build_vec_type(gallivm, bld->type),
                                       tmp[0], tmp[1]);
   }

   if (bld->type.length == 4) {
      ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
   }
   else {
      LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
      unsigned j;
      unsigned num_iter = bld->type.length / 4;
      struct lp_type parttype = bld->type;
      parttype.length = 4;
      for (j = 0; j < num_iter; j++) {
         LLVMValueRef partsrc[4];
         unsigned i;
         for (i = 0; i < 4; i++) {
            partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
         }
         partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
      }
      ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
   }
   return ret_vec;
}

/**
 * Generate a - b
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;
   if (a == b)
      return bld->zero;

   if (type.norm) {
      const char *intrinsic = NULL;

      if (!type.sign && b == bld->one)
        return bld->zero;

      if (!type.floating && !type.fixed) {
         if (LLVM_VERSION_MAJOR >= 8) {
            char intrin[32];
            intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
            lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
            return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
         }
         if (type.width * type.length == 128) {
            if (util_get_cpu_caps()->has_sse2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
            } else if (util_get_cpu_caps()->has_altivec) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
            }
         }
         if (type.width * type.length == 256) {
            if (util_get_cpu_caps()->has_avx2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
            }
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(type.norm && !type.floating && !type.fixed) {
      if (type.sign) {
         uint64_t sign = (uint64_t)1 << (type.width - 1);
         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
         /* a_clamp_max is the maximum a for negative b,
            a_clamp_min is the minimum a for positive b. */
         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
      } else {
         /*
          * This must match llvm pattern for saturated unsigned sub.
          * (lp_build_max_simple actually does the job with its current
          * definition but do it explicitly here.)
          * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
          * interfere with llvm's ability to recognize the pattern but seems
          * a bit brittle.
          * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
          */
         LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         a = lp_build_select(bld, no_ov, a, b);
      }
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");

   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);

   return res;
}



/**
 * Normalized multiplication.
 *
 * There are several approaches for (using 8-bit normalized multiplication as
 * an example):
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL
 *     criteria of
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms to fit in 16bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that just by itself it doesn't satisfy the OpenGL criteria, as
 *     255*255 = 254, so the special case b = 255 must be accounted for or
 *     roundoff must be used.
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the result
 *     use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving the exact results.
 *
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
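/*
 * Worked example of the three approximations above (illustrative comment,
 * not from the original source), for a = b = 255, t = a*b = 65025:
 *   alpha plus one:        (255 * 256) >> 8                    = 255 (exact)
 *   geometric series:      (65025 + (65025 >> 8)) >> 8         = 254 (off by one)
 *   series plus rounding:  (65025 + (65025 >> 8) + 0x80) >> 8  = 255 (exact)
 * The function below uses the rounding variant, generalized to 2**n - 1.
 */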
LLVMValueRef
lp_build_mul_norm(struct gallivm_state *gallivm,
                  struct lp_type wide_type,
                  LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context bld;
   unsigned n;
   LLVMValueRef half;
   LLVMValueRef ab;

   assert(!wide_type.floating);
   assert(lp_check_value(wide_type, a));
   assert(lp_check_value(wide_type, b));

   lp_build_context_init(&bld, gallivm, wide_type);

   n = wide_type.width / 2;
   if (wide_type.sign) {
      --n;
   }

   /*
    * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
    * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
    */

   /*
    * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
    */

   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");

   /*
    * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
    */

   half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
   if (wide_type.sign) {
      LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
      LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
      half = lp_build_select(&bld, sign, minus_half, half);
   }
   ab = LLVMBuildAdd(builder, ab, half, "");

   /* Final division */
   ab = lp_build_shr_imm(&bld, ab, n);

   return ab;
}

/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if (!type.floating && !type.fixed && type.norm) {
      struct lp_type wide_type = lp_wider_type(type);
      LLVMValueRef al, ah, bl, bh, abl, abh, ab;

      lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);

      /* PMULLW, PSRLW, PADDW */
      abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
      abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);

      ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);

      return ab;
   }

   if(type.fixed)
      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   else
      shift = NULL;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if(shift) {
         if(type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(builder, a, b, "");
      else
         res = LLVMBuildMul(builder, a, b, "");
      if(shift) {
         if(type.sign)
            res = LLVMBuildAShr(builder, res, shift, "");
         else
            res = LLVMBuildLShr(builder, res, shift, "");
      }
   }

   return res;
}

/*
 * Widening mul, valid for 32x32 bit -> 64bit only.
 * Result is low 32bits, high bits returned in res_hi.
 *
 * Emits code that is meant to be compiled for the host CPU.
 */
LLVMValueRef
lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
                         LLVMValueRef a,
                         LLVMValueRef b,
                         LLVMValueRef *res_hi)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;

   assert(bld->type.width == 32);
   assert(bld->type.floating == 0);
   assert(bld->type.fixed == 0);
   assert(bld->type.norm == 0);

   /*
    * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
    * for x86 simd is atrocious (even if the high bits weren't required),
    * trying to handle real 64bit inputs (which of course can't happen due
    * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
    * apparently llvm does not recognize this widening mul). This includes
    * 6 (instead of 2) pmuludq plus extra adds and shifts.
    * The same story applies to signed mul, albeit fixing this requires sse41.
    * https://llvm.org/bugs/show_bug.cgi?id=30845
    * So, whip up our own code, albeit only for length 4 and 8 (which
    * should be good enough)...
    * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
    * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
    * for signed), which the fallback code does not, without this llvm
    * will likely still produce atrocious code.
    */
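   /*
    * Layout sketch for the even/odd split below (illustrative comment, not
    * from the original source): pmuludq/pmuldq multiply the 32-bit elements
    * in the even lanes and produce 64-bit results, so for a = [a0 a1 a2 a3]:
    *   muleven = a0*b0, a2*b2   (two 64-bit lanes)
    *   mulodd  = a1*b1, a3*b3   (after shuffling the odd lanes into even
    *                             positions)
    * The final shuffles interleave the 32-bit halves back into lane order.
    */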
   if (LLVM_VERSION_MAJOR < 7 &&
       (bld->type.length == 4 || bld->type.length == 8) &&
       ((util_get_cpu_caps()->has_sse2 && (bld->type.sign == 0)) ||
        util_get_cpu_caps()->has_sse4_1)) {
      const char *intrinsic = NULL;
      LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
      LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
      struct lp_type type_wide = lp_wider_type(bld->type);
      LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
      unsigned i;
      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i+1);
         shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      aeven = a;
      beven = b;
      aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
      bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");

      if (util_get_cpu_caps()->has_avx2 && bld->type.length == 8) {
         if (bld->type.sign) {
            intrinsic = "llvm.x86.avx2.pmul.dq";
         } else {
            intrinsic = "llvm.x86.avx2.pmulu.dq";
         }
         muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                             wider_type, aeven, beven);
         mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                            wider_type, aodd, bodd);
      }
      else {
         /* for consistent naming look elsewhere... */
         if (bld->type.sign) {
            intrinsic = "llvm.x86.sse41.pmuldq";
         } else {
            intrinsic = "llvm.x86.sse2.pmulu.dq";
         }
         /*
          * XXX If we only have AVX but not AVX2 this is a pain.
          * lp_build_intrinsic_binary_anylength() can't handle it
          * (due to src and dst type not being identical).
          */
         if (bld->type.length == 8) {
            LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
            LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
            LLVMValueRef muleven2[2], mulodd2[2];
            struct lp_type type_wide_half = type_wide;
            LLVMTypeRef wtype_half;
            type_wide_half.length = 2;
            wtype_half = lp_build_vec_type(gallivm, type_wide_half);
            aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
            aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
            bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
            bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
            aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
            aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
            boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
            boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
            muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenlo, bevenlo);
            mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddlo, boddlo);
            muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenhi, bevenhi);
            mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddhi, boddhi);
            muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
            mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);

         }
         else {
            muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                                wider_type, aeven, beven);
            mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                               wider_type, aodd, bodd);
         }
      }
      muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
      mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");

      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i + 1);
         shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");

      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i);
         shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
   }
   else {
      return lp_build_mul_32_lohi(bld, a, b, res_hi);
   }
}


/*
 * Widening mul, valid for <= 32 (8, 16, 32) -> 64
 * Result is low N bits, high bits returned in res_hi.
 *
 * Emits generic code.
 */
LLVMValueRef
lp_build_mul_32_lohi(struct lp_build_context *bld,
                     LLVMValueRef a,
                     LLVMValueRef b,
                     LLVMValueRef *res_hi)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef tmp, shift, res_lo;
   struct lp_type type_tmp;
   LLVMTypeRef wide_type, narrow_type;

   type_tmp = bld->type;
   narrow_type = lp_build_vec_type(gallivm, type_tmp);
   if (bld->type.width < 32)
      type_tmp.width = 32;
   else
      type_tmp.width *= 2;
   wide_type = lp_build_vec_type(gallivm, type_tmp);
   shift = lp_build_const_vec(gallivm, type_tmp, bld->type.width);

   if (bld->type.sign) {
      a = LLVMBuildSExt(builder, a, wide_type, "");
      b = LLVMBuildSExt(builder, b, wide_type, "");
   } else {
      a = LLVMBuildZExt(builder, a, wide_type, "");
      b = LLVMBuildZExt(builder, b, wide_type, "");
   }
   tmp = LLVMBuildMul(builder, a, b, "");

   res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");

   /* Since we truncate anyway, LShr and AShr are equivalent. */
   tmp = LLVMBuildLShr(builder, tmp, shift, "");
   *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");

   return res_lo;
}

/* a * b + c */
LLVMValueRef
lp_build_mad(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b,
             LLVMValueRef c)
{
   const struct lp_type type = bld->type;
   if (type.floating) {
      return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
   } else {
      return lp_build_add(bld, lp_build_mul(bld, a, b), c);
   }
}


/**
 * Small vector x scale multiplication optimization.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   if(b == 0)
      return bld->zero;

   if(b == 1)
      return a;

   if(b == -1)
      return lp_build_negate(bld, a);

   if(b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if(util_is_power_of_two_or_zero(b)) {
      unsigned shift = ffs(b) - 1;

      if(bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the exponent.
          *
          * XXX: This might not be always faster, it will introduce a small error
          * for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
      }
      else {
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}
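/*
 * Worked example for the disabled exponent trick above (illustrative
 * comment, not from the original source): an IEEE float stores its exponent
 * in bits 23..30, so adding shift << 23 to the bit pattern of a normal float
 * increments the exponent by "shift", i.e. multiplies by 2**shift:
 *   1.5f = 0x3FC00000;  0x3FC00000 + (3 << 23) = 0x41400000 = 12.0f = 1.5 * 8
 * which is also why it misbehaves for 0.0, Inf and NaN (their encodings are
 * not normal numbers).
 */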

/**
 * Generate a / b
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one && type.floating)
      return lp_build_rcp(bld, b);
   if(b == bld->zero)
      return bld->undef;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         return LLVMConstFDiv(a, b);
      else if (type.sign)
         return LLVMConstSDiv(a, b);
      else
         return LLVMConstUDiv(a, b);
   }

   /* fast rcp is disabled (just uses div), so makes no sense to try that */
   if(FALSE &&
      ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
       (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) &&
      type.floating)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   if (type.floating)
      return LLVMBuildFDiv(builder, a, b, "");
   else if (type.sign)
      return LLVMBuildSDiv(builder, a, b, "");
   else
      return LLVMBuildUDiv(builder, a, b, "");
}


/**
 * Linear interpolation helper.
 *
 * @param normalized whether we are interpolating normalized values,
 *        encoded in normalized integers, twice as wide.
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static inline LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1,
                     unsigned flags)
{
   unsigned half_width = bld->type.width/2;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   delta = lp_build_sub(bld, v1, v0);

   if (bld->type.floating) {
      assert(flags == 0);
      return lp_build_mad(bld, x, delta, v0);
   }

   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
      if (!bld->type.sign) {
         if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
            /*
             * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
             * most-significant-bit to the least-significant-bit, so that
             * later we can just divide by 2**n instead of 2**n - 1.
             */
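            /*
             * E.g. for n = 8 (illustrative comment, not from the original
             * source): x = 255 becomes 255 + (255 >> 7) = 256, so the
             * subsequent (x * delta) >> 8 yields exactly delta, and the
             * lerp reaches v1 precisely at the top of the weight range.
             */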
            x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
         }

         /* (x * delta) >> n */
         res = lp_build_mul(bld, x, delta);
         res = lp_build_shr_imm(bld, res, half_width);
      } else {
         /*
          * The rescaling trick above doesn't work for signed numbers, so
          * use the 2**n - 1 division approximation in lp_build_mul_norm
          * instead.
          */
         assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
         res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
      }
   } else {
      assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
      res = lp_build_mul(bld, x, delta);
   }

   if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
      /*
       * At this point both res and v0 only use the lower half of the bits,
       * the rest is zero. Instead of add / mask, do add with half wide type.
       */
      struct lp_type narrow_type;
      struct lp_build_context narrow_bld;

      memset(&narrow_type, 0, sizeof narrow_type);
      narrow_type.sign   = bld->type.sign;
      narrow_type.width  = bld->type.width/2;
      narrow_type.length = bld->type.length*2;

      lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
      res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
      v0  = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
      res = lp_build_add(&narrow_bld, v0, res);
      res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
   } else {
      res = lp_build_add(bld, v0, res);

      if (bld->type.fixed) {
         /*
          * We need to mask out the high order bits when lerping 8bit
          * normalized colors stored on 16bits
          */
         /* XXX: This step is necessary for lerping 8bit colors stored on
          * 16bits, but it will be wrong for true fixed point use cases.
          * Basically we need a more powerful lp_type, capable of further
          * distinguishing the values interpretation from the value storage.
          */
         LLVMValueRef low_bits;
         low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
         res = LLVMBuildAnd(builder, res, low_bits, "");
      }
   }

   return res;
}


/**
 * Linear interpolation.
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1,
              unsigned flags)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;

      assert(type.length >= 2);

      /*
       * Create a wider integer type, enough to hold the
       * intermediate result of the multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.sign   = type.sign;
      wide_type.width  = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);

      lp_build_unpack2_native(bld->gallivm, type, wide_type, x,  &xl,  &xh);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);

      /*
       * Lerp both halves.
       */

      flags |= LP_BLD_LERP_WIDE_NORMALIZED;

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);

      res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
   } else {
      res = lp_build_lerp_simple(bld, x, v0, v1, flags);
   }

   return res;
}


/**
 * Bilinear interpolation.
 *
 * Value indices are in v_{yx}.
 */
LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef v00,
                 LLVMValueRef v01,
                 LLVMValueRef v10,
                 LLVMValueRef v11,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
   return lp_build_lerp(bld, y, v0, v1, flags);
}


LLVMValueRef
lp_build_lerp_3d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef z,
                 LLVMValueRef v000,
                 LLVMValueRef v001,
                 LLVMValueRef v010,
                 LLVMValueRef v011,
                 LLVMValueRef v100,
                 LLVMValueRef v101,
                 LLVMValueRef v110,
                 LLVMValueRef v111,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
   LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
   return lp_build_lerp(bld, z, v0, v1, flags);
}


/**
 * Generate min(a, b)
 * Do checks for special cases but not for NaNs.
 */
LLVMValueRef
lp_build_min(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
}

/**
 * Generate min(a, b)
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
LLVMValueRef
lp_build_min_ext(struct lp_build_context *bld,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 enum gallivm_nan_behavior nan_behavior)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b, nan_behavior);
}

/**
 * Generate max(a, b)
 * Do checks for special cases, but NaN behavior is undefined.
 */
LLVMValueRef
lp_build_max(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
}


/**
 * Generate max(a, b)
 * Checks for special cases.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
LLVMValueRef
lp_build_max_ext(struct lp_build_context *bld,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 enum gallivm_nan_behavior nan_behavior)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b, nan_behavior);
}

/**
 * Generate clamp(a, min, max)
 * NaN behavior (for any of a, min, max) is undefined.
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_clamp(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef min,
               LLVMValueRef max)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, min));
   assert(lp_check_value(bld->type, max));

   a = lp_build_min(bld, a, max);
   a = lp_build_max(bld, a, min);
   return a;
}


/**
 * Generate clamp(a, 0, 1)
 * A NaN will get converted to zero.
 */
LLVMValueRef
lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
                                LLVMValueRef a)
{
   a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
   a = lp_build_min(bld, a, bld->one);
   return a;
}


/**
 * Generate abs(a)
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(lp_check_value(type, a));

   if(!type.sign)
      return a;

   if(type.floating) {
      char intrinsic[32];
      lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
      return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
   }

   if(type.width*type.length == 128 && util_get_cpu_caps()->has_ssse3 && LLVM_VERSION_MAJOR < 6) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }
   else if (type.width*type.length == 256 && util_get_cpu_caps()->has_avx2 && LLVM_VERSION_MAJOR < 6) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
      }
   }

   return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
                          a, LLVMBuildNeg(builder, a, ""));
}


LLVMValueRef
lp_build_negate(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;

   assert(lp_check_value(bld->type, a));

   if (bld->type.floating)
      a = LLVMBuildFNeg(builder, a, "");
   else
      a = LLVMBuildNeg(builder, a, "");

   return a;
}


/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));

   /* Handle non-zero case */
   if(!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if(type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      int_type = lp_build_int_vec_type(bld->gallivm, type);
      vec_type = lp_build_vec_type(bld->gallivm, type);
      mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);

      /* Take the sign bit and add it to 1 constant */
      sign = LLVMBuildBitCast(builder, a, int_type, "");
      sign = LLVMBuildAnd(builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      res = LLVMBuildOr(builder, sign, one, "");
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }
   else
   {
      /* signed int/norm/fixed point */
      /* could use psign with sse3 and appropriate vectors here */
      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}


/**
 * Set the sign of float vector 'a' according to 'sign'.
 * If sign==0, return abs(a).
 * If sign==1, return -abs(a);
 * Other values for sign produce undefined results.
 */
LLVMValueRef
lp_build_set_sign(struct lp_build_context *bld,
                  LLVMValueRef a, LLVMValueRef sign)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
   LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                             ~((unsigned long long) 1 << (type.width - 1)));
   LLVMValueRef val, res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   /* val = reinterpret_cast<int>(a) */
   val = LLVMBuildBitCast(builder, a, int_vec_type, "");
   /* val = val & mask */
   val = LLVMBuildAnd(builder, val, mask, "");
   /* sign = sign << shift */
   sign = LLVMBuildShl(builder, sign, shift, "");
   /* res = val | sign */
   res = LLVMBuildOr(builder, val, sign, "");
   /* res = reinterpret_cast<float>(res) */
   res = LLVMBuildBitCast(builder, res, vec_type, "");

   return res;
}


/**
 * Convert vector of (or scalar) int to vector of (or scalar) float.
 */
LLVMValueRef
lp_build_int_to_float(struct lp_build_context *bld,
                      LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(type.floating);

   return LLVMBuildSIToFP(builder, a, vec_type, "");
}

static boolean
arch_rounding_available(const struct lp_type type)
{
   if ((util_get_cpu_caps()->has_sse4_1 &&
        (type.length == 1 || type.width*type.length == 128)) ||
       (util_get_cpu_caps()->has_avx && type.width*type.length == 256) ||
       (util_get_cpu_caps()->has_avx512f && type.width*type.length == 512))
      return TRUE;
   else if ((util_get_cpu_caps()->has_altivec &&
            (type.width == 32 && type.length == 4)))
      return TRUE;
   else if (util_get_cpu_caps()->has_neon)
      return TRUE;

   return FALSE;
}

enum lp_build_round_mode
{
   LP_BUILD_ROUND_NEAREST = 0,
   LP_BUILD_ROUND_FLOOR = 1,
   LP_BUILD_ROUND_CEIL = 2,
   LP_BUILD_ROUND_TRUNCATE = 3
};

static inline LLVMValueRef
lp_build_iround_nearest_sse2(struct lp_build_context *bld,
                             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);
   /* using the double precision conversions is a bit more complicated */
   assert(type.width == 32);

   assert(lp_check_value(type, a));
   assert(util_get_cpu_caps()->has_sse2);

   /* This is relying on MXCSR rounding mode, which should always be nearest. */
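   /*
    * With the default round-to-nearest-even MXCSR mode, cvtss2si/cvtps2dq
    * give e.g. 0.5 -> 0, 1.5 -> 2, -1.5 -> -2 (illustrative comment, not
    * from the original source).
    */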
   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef arg;
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      vec_type = LLVMVectorType(bld->elem_type, 4);

      intrinsic = "llvm.x86.sse.cvtss2si";

      undef = LLVMGetUndef(vec_type);

      arg = LLVMBuildInsertElement(builder, undef, a, index0, "");

      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, arg);
   }
   else {
      if (type.width * type.length == 128) {
         intrinsic = "llvm.x86.sse2.cvtps2dq";
      }
      else {
         assert(type.width * type.length == 256);
         assert(util_get_cpu_caps()->has_avx);

         intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
      }
      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, a);
   }

   return res;
}


/*
 */
static inline LLVMValueRef
lp_build_round_altivec(struct lp_build_context *bld,
                       LLVMValueRef a,
                       enum lp_build_round_mode mode)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_get_cpu_caps()->has_altivec);

   (void)type;

   switch (mode) {
   case LP_BUILD_ROUND_NEAREST:
      intrinsic = "llvm.ppc.altivec.vrfin";
      break;
   case LP_BUILD_ROUND_FLOOR:
      intrinsic = "llvm.ppc.altivec.vrfim";
      break;
   case LP_BUILD_ROUND_CEIL:
      intrinsic = "llvm.ppc.altivec.vrfip";
      break;
   case LP_BUILD_ROUND_TRUNCATE:
      intrinsic = "llvm.ppc.altivec.vrfiz";
      break;
   }

   return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
}

static inline LLVMValueRef
lp_build_round_arch(struct lp_build_context *bld,
                    LLVMValueRef a,
                    enum lp_build_round_mode mode)
{
   if (util_get_cpu_caps()->has_sse4_1 || util_get_cpu_caps()->has_neon) {
      LLVMBuilderRef builder = bld->gallivm->builder;
      const struct lp_type type = bld->type;
      const char *intrinsic_root;
      char intrinsic[32];

      assert(type.floating);
      assert(lp_check_value(type, a));
      (void)type;

      switch (mode) {
      case LP_BUILD_ROUND_NEAREST:
         intrinsic_root = "llvm.nearbyint";
         break;
      case LP_BUILD_ROUND_FLOOR:
         intrinsic_root = "llvm.floor";
         break;
      case LP_BUILD_ROUND_CEIL:
         intrinsic_root = "llvm.ceil";
         break;
      case LP_BUILD_ROUND_TRUNCATE:
         intrinsic_root = "llvm.trunc";
         break;
      default:
         unreachable("unhandled lp_build_round_mode");
      }

      lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   }
   else /* (util_get_cpu_caps()->has_altivec) */
      return lp_build_round_altivec(bld, a, mode);
}

/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is a float (vector).
 * Ex: trunc(-1.5) = -1.0
 */
LLVMValueRef
lp_build_trunc(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef trunc, res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
       */
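      /*
       * For instance (illustrative comment, not from the original source):
       * a = 3.0e7 has |a| > 2^24 = 16777216, so the select below returns a
       * unchanged, which is correct since every float of that magnitude is
       * already an integer; the same path also catches NaN and Inf, whose
       * FPToSI result would be garbage.
       */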
      LLVMTypeRef vec_type = bld->vec_type;

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}


/**
 * Return float (vector) rounded to nearest integer (vector). The returned
 * value is a float (vector).
 * Ex: round(0.9) = 1.0
 * Ex: round(-1.5) = -2.0
 */
LLVMValueRef
lp_build_round(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      res = lp_build_iround(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}
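
/*
 * Illustrative scalar sketch (not compiled): the convert-and-guard trick
 * used by the non-arch paths of lp_build_trunc()/lp_build_round() above,
 * assuming 32-bit IEEE floats and <stdint.h>. Anything whose magnitude
 * exceeds 2^24 (including Inf/NaN, which have the maximum exponent) is
 * passed through unchanged, since such floats are already integral.
 */
#if 0
static float
ref_trunc_f32(float a)
{
   union { float f; uint32_t u; } bits = { a };
   uint32_t anosign = bits.u & 0x7fffffffu;   /* like lp_build_abs() */
   if (anosign > 0x4b800000u)                 /* |a| > 2^24, or Inf/NaN */
      return a;
   return (float)(int32_t)a;                  /* round toward zero */
}
#endif
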
/**
 * Return floor of float (vector), result is a float (vector)
 * Ex: floor(1.1) = 1.0
 * Ex: floor(-1.1) = -2.0
 */
LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef trunc, res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      if (type.width != 32) {
         char intrinsic[32];
         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
      }

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      if (type.sign) {
         LLVMValueRef tmp;

         /*
          * fix values if rounding is wrong (for non-special cases)
          * - this is the case if trunc > a
          */
         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
         /* tmp = trunc > a ? 1.0 : 0.0 */
         tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
         tmp = lp_build_and(&intbld, mask, tmp);
         tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
         res = lp_build_sub(bld, res, tmp);
      }

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}


/**
 * Return ceiling of float (vector), returning float (vector).
 * Ex: ceil( 1.1) = 2.0
 * Ex: ceil(-1.1) = -1.0
 */
LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef trunc, res, anosign, mask, tmp;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      if (type.width != 32) {
         char intrinsic[32];
         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
      }

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");

      /*
       * fix values if rounding is wrong (for non-special cases)
       * - this is the case if trunc < a
       */
      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
      /* tmp = trunc < a ? 1.0 : 0.0 */
      tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
      tmp = lp_build_and(&intbld, mask, tmp);
      tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
      res = lp_build_add(bld, trunc, tmp);

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}


/**
 * Return fractional part of 'a' computed as a - floor(a)
 * Typically used in texture coord arithmetic.
 */
LLVMValueRef
lp_build_fract(struct lp_build_context *bld,
               LLVMValueRef a)
{
   assert(bld->type.floating);
   return lp_build_sub(bld, a, lp_build_floor(bld, a));
}


/**
 * Prevent returning 1.0 for very small negative values of 'a' by clamping
 * against 0.99999(9). (Will also return that value for NaNs.)
 */
static inline LLVMValueRef
clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
{
   LLVMValueRef max;

   /* this is the largest number smaller than 1.0 representable as float */
   max = lp_build_const_vec(bld->gallivm, bld->type,
                            1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
   return lp_build_min_ext(bld, fract, max,
                           GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
}


/**
 * Same as lp_build_fract, but guarantees that the result is always smaller
 * than one. Will also return the smaller-than-one value for infs, NaNs.
 */
LLVMValueRef
lp_build_fract_safe(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   return clamp_fract(bld, lp_build_fract(bld, a));
}


/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is an integer (vector).
 * Ex: itrunc(-1.5) = -1
 */
LLVMValueRef
lp_build_itrunc(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   assert(type.floating);
   assert(lp_check_value(type, a));

   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
}


/**
 * Return float (vector) rounded to nearest integer (vector). The returned
 * value is an integer (vector).
 * Ex: iround(0.9) = 1
 * Ex: iround(-1.5) = -2
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));

   if ((util_get_cpu_caps()->has_sse2 &&
        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
       (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
      return lp_build_iround_nearest_sse2(bld, a);
   }
   if (arch_rounding_available(type)) {
      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      LLVMValueRef half;

      half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));

      if (type.sign) {
         LLVMTypeRef vec_type = bld->vec_type;
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                                    (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* get sign bit */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");

         /* sign * 0.5 */
         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
         half = LLVMBuildOr(builder, sign, half, "");
         half = LLVMBuildBitCast(builder, half, vec_type, "");
      }

      res = LLVMBuildFAdd(builder, a, half, "");
   }

   res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

   return res;
}

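
/*
 * Illustrative scalar sketch (not compiled) of the fallback path in
 * lp_build_iround() above: copy the sign of 'a' onto 0.5, add, and let
 * the float->int conversion truncate. nextafterf(0.5, 0.0) is used just
 * like in the builder, so rounding inside the addition can't bump values
 * slightly below .5 up to a whole number.
 */
#if 0
static int32_t
ref_iround_f32(float a)
{
   union { float f; uint32_t u; } half = { nextafterf(0.5f, 0.0f) };
   union { float f; uint32_t u; } bits = { a };
   half.u |= bits.u & 0x80000000u;   /* half = copysign(half, a) */
   return (int32_t)(a + half.f);     /* truncation now rounds to nearest */
}
#endif
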

/**
 * Return floor of float (vector), result is an int (vector)
 * Ex: ifloor(1.1) = 1
 * Ex: ifloor(-1.1) = -2
 */
LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   res = a;
   if (type.sign) {
      if (arch_rounding_available(type)) {
         res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
      }
      else {
         struct lp_type inttype;
         struct lp_build_context intbld;
         LLVMValueRef trunc, itrunc, mask;

         assert(type.floating);
         assert(lp_check_value(type, a));

         inttype = type;
         inttype.floating = 0;
         lp_build_context_init(&intbld, bld->gallivm, inttype);

         /* round by truncation */
         itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
         trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");

         /*
          * fix values if rounding is wrong (for non-special cases)
          * - this is the case if trunc > a
          * The results of doing this with NaNs, very large values etc.
          * are undefined but this seems to be the case anyway.
          */
         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
         /* cheapie minus one with mask since the mask is minus one / zero */
         return lp_build_add(&intbld, itrunc, mask);
      }
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");

   return res;
}


/**
 * Return ceiling of float (vector), returning int (vector).
 * Ex: iceil( 1.1) = 2
 * Ex: iceil(-1.1) = -1
 */
LLVMValueRef
lp_build_iceil(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }
   else {
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef trunc, itrunc, mask;

      assert(type.floating);
      assert(lp_check_value(type, a));

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");

      /*
       * fix values if rounding is wrong (for non-special cases)
       * - this is the case if trunc < a
       * The results of doing this with NaNs, very large values etc.
       * are undefined but this seems to be the case anyway.
       */
      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
      /* cheapie plus one with mask since the mask is minus one / zero */
      return lp_build_sub(&intbld, itrunc, mask);
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");

   return res;
}


/**
 * Combined ifloor() & fract().
 *
 * Preferred to calling the functions separately, as it will ensure that the
 * strategy (floor() vs ifloor()) that results in less redundant work is used.
 */
void
lp_build_ifloor_fract(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef *out_ipart,
                      LLVMValueRef *out_fpart)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef ipart;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      /*
       * floor() is easier.
       */

      ipart = lp_build_floor(bld, a);
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
      *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
   }
   else {
      /*
       * ifloor() is easier.
       */

      *out_ipart = lp_build_ifloor(bld, a);
      ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
   }
}


/**
 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
 * always smaller than one.
 */
void
lp_build_ifloor_fract_safe(struct lp_build_context *bld,
                           LLVMValueRef a,
                           LLVMValueRef *out_ipart,
                           LLVMValueRef *out_fpart)
{
   lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
   *out_fpart = clamp_fract(bld, *out_fpart);
}


LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   char intrinsic[32];

   assert(lp_check_value(type, a));

   assert(type.floating);
   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);

   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}

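
/*
 * Illustrative scalar sketch (not compiled) of the refinement step defined
 * below: one Newton-Raphson iteration roughly doubles the number of
 * correct bits of an initial reciprocal estimate rcp_a ~= 1/a.
 */
#if 0
static float
ref_rcp_refine(float a, float rcp_a)
{
   float err = 1.0f - a * rcp_a;     /* residual error of the estimate */
   return rcp_a + rcp_a * err;       /* x1 = x0 + x0 * (1 - a * x0) */
}
#endif
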

/**
 * Do one Newton-Raphson step to improve reciprocal precision:
 *
 *    x_{i+1} = x_i + x_i * (1 - a * x_i)
 *
 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
 * halo. It would be necessary to clamp the argument to prevent this.
 *
 * See also:
 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static inline LLVMValueRef
lp_build_rcp_refine(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef rcp_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef neg_a;
   LLVMValueRef res;

   neg_a = LLVMBuildFNeg(builder, a, "");
   res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
   res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);

   return res;
}


LLVMValueRef
lp_build_rcp(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->zero)
      return bld->undef;
   if(a == bld->one)
      return bld->one;
   if(a == bld->undef)
      return bld->undef;

   assert(type.floating);

   if(LLVMIsConstant(a))
      return LLVMConstFDiv(bld->one, a);

   /*
    * We don't use RCPPS because:
    * - it only has 10 bits of precision
    * - it doesn't even get the reciprocal of 1.0 exactly
    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal, and case
    *   dependent
    *
    * We could still use it on certain processors if benchmarks show that the
    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
    * particular uses that require fewer workarounds.
    */

   if (FALSE && ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
                 (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8))){
      const unsigned num_iterations = 0;
      LLVMValueRef res;
      unsigned i;
      const char *intrinsic = NULL;

      if (type.length == 4) {
         intrinsic = "llvm.x86.sse.rcp.ps";
      }
      else {
         intrinsic = "llvm.x86.avx.rcp.ps.256";
      }

      res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rcp_refine(bld, a, res);
      }

      return res;
   }

   return LLVMBuildFDiv(builder, bld->one, a, "");
}


/**
 * Do one Newton-Raphson step to improve rsqrt precision:
 *
 *    x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
 *
 * See also Intel 64 and IA-32 Architectures Optimization Manual.
 */
static inline LLVMValueRef
lp_build_rsqrt_refine(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef rsqrt_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
   LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
   LLVMValueRef res;

   res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
   res = LLVMBuildFMul(builder, a, res, "");
   res = LLVMBuildFSub(builder, three, res, "");
   res = LLVMBuildFMul(builder, rsqrt_a, res, "");
   res = LLVMBuildFMul(builder, half, res, "");

   return res;
}

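
/*
 * Illustrative scalar sketch (not compiled) of the step above:
 * one Newton-Raphson iteration for rsqrt.
 */
#if 0
static float
ref_rsqrt_refine(float a, float rsqrt_a)
{
   /* x1 = 0.5 * x0 * (3 - a * x0 * x0) */
   return 0.5f * rsqrt_a * (3.0f - a * rsqrt_a * rsqrt_a);
}
#endif
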

/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 */
LLVMValueRef
lp_build_rsqrt(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   assert(type.floating);

   /*
    * This should be faster but all denormals will end up as infinity.
    */
   if (0 && lp_build_fast_rsqrt_available(type)) {
      const unsigned num_iterations = 1;
      LLVMValueRef res;
      unsigned i;

      /* rsqrt(1.0) != 1.0 here */
      res = lp_build_fast_rsqrt(bld, a);

      if (num_iterations) {
         /*
          * Newton-Raphson will result in NaN instead of infinity for zero,
          * and NaN instead of zero for infinity.
          * Also, need to ensure rsqrt(1.0) == 1.0.
          * All numbers smaller than FLT_MIN will result in +infinity
          * (rsqrtps treats all denormals as zero).
          */
         LLVMValueRef cmp;
         LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
         LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);

         for (i = 0; i < num_iterations; ++i) {
            res = lp_build_rsqrt_refine(bld, a, res);
         }
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
         res = lp_build_select(bld, cmp, inf, res);
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
         res = lp_build_select(bld, cmp, bld->zero, res);
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
         res = lp_build_select(bld, cmp, bld->one, res);
      }

      return res;
   }

   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}

/**
 * Return whether a fast (but inaccurate) rsqrt instruction is available.
 * (The caller may want to avoid calling rsqrt_fast if it's not available:
 * for calculating x^0.5 it can do rsqrt_fast(x) * x, but if that is
 * unavailable it would result in sqrt/div/mul, so it's obviously much
 * better to just call sqrt, skipping both div and mul.)
 */
boolean
lp_build_fast_rsqrt_available(struct lp_type type)
{
   assert(type.floating);

   if ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
       (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
      return true;
   }
   return false;
}


/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 * Precision is limited, only ~10 bits guaranteed
 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
 */
LLVMValueRef
lp_build_fast_rsqrt(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if (lp_build_fast_rsqrt_available(type)) {
      const char *intrinsic = NULL;

      if (type.length == 4) {
         intrinsic = "llvm.x86.sse.rsqrt.ps";
      }
      else {
         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
      }
      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   }
   else {
      debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
   }
   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}


/**
 * Generate sin(a) or cos(a) using polynomial approximation.
 * TODO: it might be worth recognizing sin and cos using same source
 * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
 * would be way cheaper than calculating (nearly) everything twice...
 * Not sure it's common enough to be worth bothering, however; the scs
 * opcode could also benefit from calculating both, though.
 */
static LLVMValueRef
lp_build_sin_or_cos(struct lp_build_context *bld,
                    LLVMValueRef a,
                    boolean cos)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef b = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");

   LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);

   /*
    * Argument used for poly selection and sign bit determination
    * is different for sin vs. cos.
    */
   LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
                               emm2_and;

   LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
                                                              LLVMBuildNot(b, emm2_2, ""), ""),
                                              const_29, "sign_bit") :
                                 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
                                                              LLVMBuildShl(b, emm2_add,
                                                                           const_29, ""), ""),
                                              sign_mask, "sign_bit");

   /*
    * get the polynomial selection mask
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    */
   LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
   LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
   LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);

   /*
    * Evaluate the first polynomial (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
   LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");


   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
   LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);

   /*
    * select the correct result from the two polynomials
    * xmm3 = poly_mask;
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_or_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");

   LLVMValueRef isfinite = lp_build_isfinite(bld, a);

   /* clamp output to be within [-1, 1] */
   y_result = lp_build_clamp(bld, y_result,
                             lp_build_const_vec(bld->gallivm, bld->type, -1.f),
                             lp_build_const_vec(bld->gallivm, bld->type, 1.f));
   /* If a is -inf, inf or NaN then return NaN */
   y_result = lp_build_select(bld, isfinite, y_result,
                              lp_build_const_vec(bld->gallivm, bld->type, NAN));
   return y_result;
}
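
/*
 * Illustrative scalar sketch (not compiled) of the Cephes-style range
 * reduction used above, shown for sin only and assuming <math.h>: scale by
 * 4/Pi, round the quotient up to an even integer j, then subtract j*Pi/4
 * in three steps ("extended precision modular arithmetic"). Afterwards the
 * 2s bit of j selects between the two polynomials and the 4s bit feeds the
 * sign flip, exactly as the masks above do. The polynomial evaluation
 * itself is elided here.
 */
#if 0
static float
ref_sin_reduce(float a, int *j_out)
{
   float x = fabsf(a);
   int j = (int)(x * 1.27323954473516f);   /* x * 4/Pi, truncated */
   j = (j + 1) & ~1;                       /* j = (j+1) & (~1) */
   float y = (float)j;
   x = x - y * 0.78515625f;                /* -DP1 */
   x = x - y * 2.4187564849853515625e-4f;  /* -DP2 */
   x = x - y * 3.77489497744594108e-8f;    /* -DP3 */
   *j_out = j;
   return x;   /* reduced argument, roughly in [-Pi/4, Pi/4] */
}
#endif
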


/**
 * Generate sin(a)
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   return lp_build_sin_or_cos(bld, a, FALSE);
}


/**
 * Generate cos(a)
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   return lp_build_sin_or_cos(bld, a, TRUE);
}


/**
 * Generate pow(x, y)
 */
LLVMValueRef
lp_build_pow(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x) && LLVMIsConstant(y)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   LLVMValueRef cmp = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x, lp_build_const_vec(bld->gallivm, bld->type, 0.0f));
   LLVMValueRef res = lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));

   res = lp_build_select(bld, cmp, lp_build_const_vec(bld->gallivm, bld->type, 0.0f), res);
   return res;
}


/**
 * Generate exp(x)
 */
LLVMValueRef
lp_build_exp(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log2(e) = 1/log(2) */
   LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
                                           1.4426950408889634);

   assert(lp_check_value(bld->type, x));

   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
}


/**
 * Generate log(x)
 * Behavior is undefined with infs, 0s and nans
 */
LLVMValueRef
lp_build_log(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2(bld, x));
}

/**
 * Generate log(x) that handles edge cases (infs, 0s and nans)
 */
LLVMValueRef
lp_build_log_safe(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
}


/**
 * Generate polynomial.
 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
 */
LLVMValueRef
lp_build_polynomial(struct lp_build_context *bld,
                    LLVMValueRef x,
                    const double *coeffs,
                    unsigned num_coeffs)
{
   const struct lp_type type = bld->type;
   LLVMValueRef even = NULL, odd = NULL;
   LLVMValueRef x2;
   unsigned i;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   /*
    * Calculate odd and even terms separately to decrease data dependency
    * Ex:
    *     c[0] + x^2 * c[2] + x^4 * c[4] ...
    *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
    */
   x2 = lp_build_mul(bld, x, x);

   for (i = num_coeffs; i--; ) {
      LLVMValueRef coeff;

      coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);

      if (i % 2 == 0) {
         if (even)
            even = lp_build_mad(bld, x2, even, coeff);
         else
            even = coeff;
      } else {
         if (odd)
            odd = lp_build_mad(bld, x2, odd, coeff);
         else
            odd = coeff;
      }
   }

   if (odd)
      return lp_build_mad(bld, odd, x, even);
   else if (even)
      return even;
   else
      return bld->undef;
}


/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
   0.693153073200168932794,
   0.240153617044375388211,
   0.0558263180532956664775,
   0.00898934009049466391101,
   0.00187757667519147912699
#elif EXP_POLY_DEGREE == 4
   1.00000259337069434683,
   0.693003834469974940458,
   0.24144275689150793076,
   0.0520114606103070150235,
   0.0135341679161270268764
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.337189434619687207043
#else
#error
#endif
};


LLVMValueRef
lp_build_exp2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   assert(type.floating && type.width == 32);

   /* We want to preserve NaN and make sure that for exp2 if x > 128,
    * the result is INF and if it's smaller than -126.9 the result is 0 */
   x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
                        GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
   x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
                        x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);

   /* ipart = floor(x) */
   /* fpart = x - ipart */
   lp_build_ifloor_fract(bld, x, &ipart, &fpart);

   /* expipart = (float) (1 << ipart) */
   expipart = LLVMBuildAdd(builder, ipart,
                           lp_build_const_int_vec(bld->gallivm, type, 127), "");
   expipart = LLVMBuildShl(builder, expipart,
                           lp_build_const_int_vec(bld->gallivm, type, 23), "");
   expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");

   expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                  ARRAY_SIZE(lp_build_exp2_polynomial));

   res = LLVMBuildFMul(builder, expipart, expfpart, "");

   return res;
}
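
/*
 * Illustrative scalar sketch (not compiled) of the exp2 scheme above,
 * assuming x was already clamped to [-126.99999, 128] as the builder does:
 * split x into integer and fractional parts, build 2^ipart directly in the
 * exponent field, and approximate 2^fpart with a polynomial (here the
 * degree-2 coefficients from the table above).
 */
#if 0
static float
ref_exp2_f32(float x)
{
   int32_t ipart = (int32_t)floorf(x);
   float fpart = x - (float)ipart;                   /* in [0, 1) */
   union { uint32_t u; float f; } expipart =
      { (uint32_t)(ipart + 127) << 23 };             /* 2^ipart */
   float expfpart = 1.00172476321474503578f +
      fpart * (0.657636275736077639316f +
               fpart * 0.337189434619687207043f);    /* ~2^fpart */
   return expipart.f * expfpart;
}
#endif
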


/**
 * Extract the exponent of an IEEE-754 floating point value.
 *
 * Optionally apply an integer bias.
 *
 * Result is an integer value with
 *
 *   ifloor(log2(x)) + bias
 */
LLVMValueRef
lp_build_extract_exponent(struct lp_build_context *bld,
                          LLVMValueRef x,
                          int bias)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(bld->type, x));

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   res = LLVMBuildLShr(builder, x,
                       lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
   res = LLVMBuildAnd(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 255), "");
   res = LLVMBuildSub(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");

   return res;
}


/**
 * Extract the mantissa of a floating point value.
 *
 * Result is a floating point value with
 *
 *   x / 2**floor(log2(x))
 */
LLVMValueRef
lp_build_extract_mantissa(struct lp_build_context *bld,
                          LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
                                                  (1ULL << mantissa) - 1);
   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));

   assert(type.floating);

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   /* res = x / 2**ipart */
   res = LLVMBuildAnd(builder, x, mantmask, "");
   res = LLVMBuildOr(builder, res, one, "");
   res = LLVMBuildBitCast(builder, res, bld->vec_type, "");

   return res;
}

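
/*
 * Illustrative scalar sketch (not compiled) of the two helpers above for
 * 32-bit floats: the exponent is bits 23..30 minus the 127 bias, and the
 * mantissa is renormalized into [1, 2) by substituting the exponent field
 * of 1.0.
 */
#if 0
static int32_t
ref_extract_exponent(float x, int bias)
{
   union { float f; uint32_t u; } v = { x };
   return (int32_t)((v.u >> 23) & 255) - (127 - bias);
}

static float
ref_extract_mantissa(float x)
{
   union { float f; uint32_t u; } v = { x };
   v.u = (v.u & 0x007fffffu) | 0x3f800000u;   /* mantissa, exponent of 1.0 */
   return v.f;                                /* x / 2**floor(log2(x)) */
}
#endif
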


/**
 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x),
 * for x in range of [0, 1/9[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 5
   2.88539008148777786488L,
   0.961796878841293367824L,
   0.577058946784739859012L,
   0.412914355135828735411L,
   0.308591899232910175289L,
   0.352376952300281371868L,
#elif LOG_POLY_DEGREE == 4
   2.88539009343309178325L,
   0.961791550404184197881L,
   0.577440339438736392009L,
   0.403343858251329912514L,
   0.406718052498846252698L,
#elif LOG_POLY_DEGREE == 3
   2.88538959748872753838L,
   0.961932915889597772928L,
   0.571118517972136195241L,
   0.493997535084709500285L,
#else
#error
#endif
};

/**
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 * http://en.wikipedia.org/wiki/Logarithm#Calculation
 * http://www.nezumi.demon.co.uk/consult/logx.htm
 *
 * If handle_edge_cases is true the function will perform computations
 * to match the required D3D10+ behavior for each of the edge cases.
 * That means that if input is:
 * - less than zero (down to and including -inf) then NaN will be returned
 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
 * - +infinity, then +infinity will be returned
 * - NaN, then NaN will be returned
 *
 * Those checks are fairly expensive so if you don't need them make sure
 * handle_edge_cases is false.
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2,
                     boolean handle_edge_cases)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef y = NULL;
   LLVMValueRef z = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef p_z = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /*
       * We don't explicitly handle denormalized numbers. They will yield a
       * result in the neighbourhood of -127, which appears to be adequate
       * enough.
       */

      i = LLVMBuildBitCast(builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(builder, i, expmask, "");
   }

   if(p_floor_log2 || p_log2) {
      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   }

   if (p_log2) {
      /* mant = 1 + (float) mantissa(x) */
      mant = LLVMBuildAnd(builder, i, mantmask, "");
      mant = LLVMBuildOr(builder, mant, one, "");
      mant = LLVMBuildBitCast(builder, mant, vec_type, "");

      /* y = (mant - 1) / (mant + 1) */
      y = lp_build_div(bld,
                       lp_build_sub(bld, mant, bld->one),
                       lp_build_add(bld, mant, bld->one)
                       );

      /* z = y^2 */
      z = lp_build_mul(bld, y, y);

      /* compute P(z) */
      p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
                                ARRAY_SIZE(lp_build_log2_polynomial));

      /* y * P(z) + logexp */
      res = lp_build_mad(bld, y, p_z, logexp);

      if (type.floating && handle_edge_cases) {
         LLVMValueRef negmask, infmask, zmask;
         negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
                                lp_build_const_vec(bld->gallivm, type, 0.0f));
         zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
                              lp_build_const_vec(bld->gallivm, type, 0.0f));
         infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
                                lp_build_const_vec(bld->gallivm, type, INFINITY));

         /* If x is equal to inf make sure we return inf */
         res = lp_build_select(bld, infmask,
                               lp_build_const_vec(bld->gallivm, type, INFINITY),
                               res);
         /* If x is equal to 0, return -inf */
         res = lp_build_select(bld, zmask,
                               lp_build_const_vec(bld->gallivm, type, -INFINITY),
                               res);
         /* If x is NaN or less than 0, return NaN */
         res = lp_build_select(bld, negmask,
                               lp_build_const_vec(bld->gallivm, type, NAN),
                               res);
      }
   }

   if (p_exp) {
      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if (p_floor_log2)
      *p_floor_log2 = logexp;

   if (p_log2)
      *p_log2 = res;
}
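
/*
 * Illustrative scalar sketch (not compiled) of the core of
 * lp_build_log2_approx() above, without the edge-case handling: take the
 * unbiased exponent, renormalize the mantissa into [1, 2), then compute
 * log2(mant) as y * P(y^2) with y = (mant - 1) / (mant + 1), using the
 * degree-3 coefficients from the table above.
 */
#if 0
static float
ref_log2_f32(float x)
{
   union { float f; uint32_t u; } v = { x };
   float logexp = (float)((int32_t)((v.u >> 23) & 255) - 127);
   v.u = (v.u & 0x007fffffu) | 0x3f800000u;      /* mant in [1, 2) */
   float y = (v.f - 1.0f) / (v.f + 1.0f);
   float z = y * y;
   float p_z = 2.88538959748872753838f +
      z * (0.961932915889597772928f +
           z * (0.571118517972136195241f +
                z * 0.493997535084709500285f));
   return y * p_z + logexp;
}
#endif
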


/*
 * log2 implementation which doesn't have special code to
 * handle edge cases (-inf, 0, inf, NaN). It's faster but
 * the results for those cases are undefined.
 */
LLVMValueRef
lp_build_log2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
   return res;
}

/*
 * Version of log2 which handles all edge cases.
 * Look at documentation of lp_build_log2_approx for
 * description of the behavior for each of the edge cases.
 */
LLVMValueRef
lp_build_log2_safe(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
   return res;
}


/**
 * Faster (and less accurate) log2.
 *
 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
 *
 * Piece-wise linear approximation, with exact results when x is a
 * power of two.
 *
 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
 */
LLVMValueRef
lp_build_fast_log2(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef ipart;
   LLVMValueRef fpart;

   assert(lp_check_value(bld->type, x));

   assert(bld->type.floating);

   /* ipart = floor(log2(x)) - 1 */
   ipart = lp_build_extract_exponent(bld, x, -1);
   ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");

   /* fpart = x / 2**ipart */
   fpart = lp_build_extract_mantissa(bld, x);

   /* ipart + fpart */
   return LLVMBuildFAdd(builder, ipart, fpart, "");
}


/**
 * Fast implementation of iround(log2(x)).
 *
 * Not an approximation -- it should give accurate results all the time.
 */
LLVMValueRef
lp_build_ilog2(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
   LLVMValueRef ipart;

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, x));

   /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
   x = LLVMBuildFMul(builder, x, sqrt2, "");

   /* ipart = floor(log2(x) + 0.5)  */
   ipart = lp_build_extract_exponent(bld, x, 0);

   return ipart;
}
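
/*
 * Illustrative scalar sketch (not compiled) of the two functions above,
 * reusing the hypothetical ref_extract_* helpers sketched earlier:
 * fast_log2 adds the [1, 2) mantissa to floor(log2(x)) - 1, which is exact
 * at powers of two and linear in between; ilog2 multiplies by sqrt(2)
 * first, so extracting the exponent rounds log2(x) to nearest instead of
 * flooring it.
 */
#if 0
static float
ref_fast_log2(float x)
{
   return (float)ref_extract_exponent(x, -1) + ref_extract_mantissa(x);
}

static int32_t
ref_ilog2(float x)
{
   return ref_extract_exponent(x * (float)M_SQRT2, 0);
}
#endif
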

LLVMValueRef
lp_build_mod(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef res;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, y));

   if (type.floating)
      res = LLVMBuildFRem(builder, x, y, "");
   else if (type.sign)
      res = LLVMBuildSRem(builder, x, y, "");
   else
      res = LLVMBuildURem(builder, x, y, "");
   return res;
}


/*
 * For floating inputs it creates and returns a mask
 * which is all 1's for channels which are NaN.
 * Channels inside x which are not NaN will be 0.
 */
LLVMValueRef
lp_build_isnan(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMValueRef mask;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);

   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));

   mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
                        "isnotnan");
   mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
   mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
   return mask;
}

/* Returns all 1's for floating point numbers that are
 * finite numbers and returns all zeros for -inf,
 * inf and NaNs */
LLVMValueRef
lp_build_isfinite(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
   LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
                                                    0x7f800000);

   if (!bld->type.floating) {
      return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
   }
   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));
   assert(bld->type.width == 32);

   intx = LLVMBuildAnd(builder, intx, infornan32, "");
   return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
                           intx, infornan32);
}

/*
 * Returns true if the number is nan or inf and false otherwise.
 * The input has to be a floating point vector.
 */
LLVMValueRef
lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
                       const struct lp_type type,
                       LLVMValueRef x)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(type);
   LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
                                                0x7f800000);
   LLVMValueRef ret;

   assert(type.floating);

   ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
   ret = LLVMBuildAnd(builder, ret, const0, "");
   ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
                          ret, const0);

   return ret;
}


LLVMValueRef
lp_build_fpstate_get(struct gallivm_state *gallivm)
{
   if (util_get_cpu_caps()->has_sse) {
      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_alloca(
         gallivm,
         LLVMInt32TypeInContext(gallivm->context),
         "mxcsr_ptr");
      LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
          LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.stmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),
                         &mxcsr_ptr8, 1, 0);
      return mxcsr_ptr;
   }
   return 0;
}

void
lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
                                  boolean zero)
{
   if (util_get_cpu_caps()->has_sse) {
      /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
      int daz_ftz = _MM_FLUSH_ZERO_MASK;

      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
      LLVMValueRef mxcsr =
         LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");

      if (util_get_cpu_caps()->has_daz) {
         /* Enable denormals-are-zero mode */
         daz_ftz |= _MM_DENORMALS_ZERO_MASK;
      }
      if (zero) {
         mxcsr = LLVMBuildOr(builder, mxcsr,
                             LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
      } else {
         mxcsr = LLVMBuildAnd(builder, mxcsr,
                              LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
      }

      LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
      lp_build_fpstate_set(gallivm, mxcsr_ptr);
   }
}

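/*
 * Illustrative sketch (not compiled) of the MXCSR bits toggled by
 * lp_build_fpstate_set_denorms_zero() above, using the SSE intrinsics
 * directly instead of generating IR. As in the runtime path, the DAZ bit
 * is only set when the CPU reports support for it.
 */
#if 0
static void
ref_set_denorms_zero(boolean zero, boolean has_daz)
{
   unsigned daz_ftz = _MM_FLUSH_ZERO_MASK;     /* FTZ = 0x8000 */
   if (has_daz)
      daz_ftz |= _MM_DENORMALS_ZERO_MASK;      /* DAZ = 0x0040 */
   unsigned mxcsr = _mm_getcsr();
   mxcsr = zero ? (mxcsr | daz_ftz) : (mxcsr & ~daz_ftz);
   _mm_setcsr(mxcsr);
}
#endif
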
void
lp_build_fpstate_set(struct gallivm_state *gallivm,
                     LLVMValueRef mxcsr_ptr)
{
   if (util_get_cpu_caps()->has_sse) {
      LLVMBuilderRef builder = gallivm->builder;
      mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
          LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.ldmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),
                         &mxcsr_ptr, 1, 0);
   }
}