Path: blob/21.2-virgl/src/gallium/auxiliary/gallivm/lp_bld_conv.c
/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper functions for type conversions.
 *
 * We want to use the fastest type for a given computation whenever feasible.
 * The other side of this is that we need to be able to convert between
 * several types accurately and efficiently.
 *
 * Conversion between types of different bit width is quite complex, so
 * remember there are a few invariants in type conversions:
 *
 *   - register width must remain constant:
 *
 *       src_type.width * src_type.length == dst_type.width * dst_type.length
 *
 *   - total number of elements must remain constant:
 *
 *       src_type.length * num_srcs == dst_type.length * num_dsts
 *
 * It is not always possible to do the conversion both accurately and
 * efficiently, usually due to lack of adequate machine instructions. In these
 * cases it is important not to take shortcuts here and sacrifice accuracy, as
 * these functions can be used anywhere. In the future we might have a
 * precision parameter which can gauge the accuracy vs efficiency compromise,
 * but for now, if the data conversion between two stages happens to be the
 * bottleneck, then most likely one should just avoid converting at all and
 * run both stages with the same type.
 *
 * Make sure to run the lp_test_conv unit test after any change to this file.
 *
 * @author Jose Fonseca <[email protected]>
 */
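
/*
 * For example (an illustrative case, not tied to any particular caller):
 * converting 4 registers of 4 x float32 into 1 register of 16 x unorm8
 * satisfies both invariants:
 *
 *   register width:   32 * 4 == 8 * 16 == 128 bits
 *   element count:     4 * 4 == 16 * 1 == 16 elements
 *
 * so lp_build_conv() can be asked to do it in a single call with
 * num_srcs = 4 and num_dsts = 1 (see the 4x4x32 -> 1x16x8 special case
 * further down in this file).
 */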


#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/half_float.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_arit.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_pack.h"
#include "lp_bld_conv.h"
#include "lp_bld_logic.h"
#include "lp_bld_intr.h"
#include "lp_bld_printf.h"
#include "lp_bld_format.h"


/* The lp_test_format test fails on mingw/i686 at -O2 with gcc 10.x.
 * ref https://gitlab.freedesktop.org/mesa/mesa/-/issues/3906
 */
#if defined(__MINGW32__) && !defined(__MINGW64__) && (__GNUC__ == 10)
#warning "disabling caller-saves optimization for this file to work around compiler bug"
#pragma GCC optimize("-fno-caller-saves")
#endif


/**
 * Converts int16 half-float to float32.
 * Note this can be performed in 1 instruction if vcvtph2ps exists (f16c/cvt16)
 * [llvm.x86.vcvtph2ps / _mm_cvtph_ps].
 *
 * @param src  value to convert
 */
LLVMValueRef
lp_build_half_to_float(struct gallivm_state *gallivm,
                       LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef src_type = LLVMTypeOf(src);
   unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
                            LLVMGetVectorSize(src_type) : 1;

   struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
   struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
   LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
   LLVMValueRef h;

   if (util_get_cpu_caps()->has_f16c &&
       (src_length == 4 || src_length == 8)) {
      if (LLVM_VERSION_MAJOR < 11) {
         const char *intrinsic = NULL;
         if (src_length == 4) {
            src = lp_build_pad_vector(gallivm, src, 8);
            intrinsic = "llvm.x86.vcvtph2ps.128";
         }
         else {
            intrinsic = "llvm.x86.vcvtph2ps.256";
         }
         return lp_build_intrinsic_unary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, f32_type), src);
      } else {
         /*
          * XXX: could probably use this on other archs as well.
          * But if the cpu doesn't support it natively it looks like the
          * backends still can't lower it and will try to call out to
          * external libraries, which will crash.
          */
         /*
          * XXX: lp_build_vec_type() would use an int16 vector. Probably need
          * to revisit this at some point.
          */
         src = LLVMBuildBitCast(builder, src,
                                LLVMVectorType(LLVMHalfTypeInContext(gallivm->context), src_length), "");
         return LLVMBuildFPExt(builder, src, lp_build_vec_type(gallivm, f32_type), "");
      }
   }

   h = LLVMBuildZExt(builder, src, int_vec_type, "");
   return lp_build_smallfloat_to_float(gallivm, f32_type, h, 10, 5, 0, true);
}
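

#if 0
/*
 * A scalar sketch (deliberately not compiled) of the decode that the generic
 * lp_build_smallfloat_to_float() fallback above performs for the half-float
 * layout (10 mantissa bits, 5 exponent bits); denormals are handled by the
 * same exponent rescale.  Assumes <stdint.h> and IEEE binary32 floats;
 * Inf/NaN inputs come out as large finite values in this simplified version.
 */
static float
half_to_float_sketch(uint16_t h)
{
   union { uint32_t u; float f; } o;
   uint32_t sign = (uint32_t)(h & 0x8000) << 16;

   o.u = (uint32_t)(h & 0x7fff) << 13;   /* align exponent/mantissa to binary32 */
   o.f *= 0x1.0p+112f;                   /* 2^(127 - 15): fix up the exponent bias */
   o.u |= sign;
   return o.f;
}
#endif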


/**
 * Converts float32 to int16 half-float.
 * Note this can be performed in 1 instruction if vcvtps2ph exists (f16c/cvt16)
 * [llvm.x86.vcvtps2ph / _mm_cvtps_ph].
 *
 * @param src  value to convert
 *
 * Convert float32 to half floats, preserving Infs and NaNs,
 * with rounding towards zero (trunc).
 * XXX: For GL, would prefer rounding towards nearest(-even).
 */
LLVMValueRef
lp_build_float_to_half(struct gallivm_state *gallivm,
                       LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef f32_vec_type = LLVMTypeOf(src);
   unsigned length = LLVMGetTypeKind(f32_vec_type) == LLVMVectorTypeKind
                        ? LLVMGetVectorSize(f32_vec_type) : 1;
   struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
   struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
   LLVMValueRef result;

   /*
    * Note: Newer llvm versions (3.6 or so) support fptrunc to 16 bits
    * directly, without any (x86 or generic) intrinsics.
    * However, the rounding mode cannot be specified (and is undefined;
    * in practice on x86 it seems to do nearest-even, but that may depend
    * on instruction set support), so it is essentially useless.
    */

   if (util_get_cpu_caps()->has_f16c &&
       (length == 4 || length == 8)) {
      struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
      unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */
      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
      const char *intrinsic = NULL;
      if (length == 4) {
         intrinsic = "llvm.x86.vcvtps2ph.128";
      }
      else {
         intrinsic = "llvm.x86.vcvtps2ph.256";
      }
      result = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, i168_type),
                                         src, LLVMConstInt(i32t, mode, 0));
      if (length == 4) {
         result = lp_build_extract_range(gallivm, result, 0, 4);
      }
   }

   else {
      result = lp_build_float_to_smallfloat(gallivm, i32_type, src, 10, 5, 0, true);
      /* Convert int32 vector to int16 vector by trunc (might generate bad code). */
      result = LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, i16_type), "");
   }

   /*
    * Debugging code.
    */
   if (0) {
      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
      LLVMTypeRef i16t = LLVMInt16TypeInContext(gallivm->context);
      LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
      LLVMValueRef ref_result = LLVMGetUndef(LLVMVectorType(i16t, length));
      unsigned i;

      LLVMTypeRef func_type = LLVMFunctionType(i16t, &f32t, 1, 0);
      LLVMValueRef func = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer)_mesa_float_to_half));
      func = LLVMBuildBitCast(builder, func, LLVMPointerType(func_type, 0), "_mesa_float_to_half");

      for (i = 0; i < length; ++i) {
         LLVMValueRef index = LLVMConstInt(i32t, i, 0);
         LLVMValueRef f32 = LLVMBuildExtractElement(builder, src, index, "");
#if 0
         /*
          * XXX: not really supported by backends.
          * Even if they would be now, the rounding mode cannot be specified
          * and is undefined.
          */
         LLVMValueRef f16 = lp_build_intrinsic_unary(builder, "llvm.convert.to.fp16", i16t, f32);
#else
         LLVMValueRef f16 = LLVMBuildCall(builder, func, &f32, 1, "");
#endif
         ref_result = LLVMBuildInsertElement(builder, ref_result, f16, index, "");
      }

      lp_build_print_value(gallivm, "src = ", src);
      lp_build_print_value(gallivm, "llvm = ", result);
      lp_build_print_value(gallivm, "util = ", ref_result);
      lp_build_printf(gallivm, "\n");
   }

   return result;
}
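

#if 0
/*
 * A scalar sketch (deliberately not compiled) of the conversion the non-f16c
 * path above aims for: binary32 -> binary16 with truncation, preserving
 * Infs/NaNs.  Assumes <stdint.h>; as simplifications, denormal results are
 * flushed to zero and finite values above the half range become Inf here,
 * so this is only an illustration, not a drop-in replacement for
 * lp_build_float_to_smallfloat().
 */
static uint16_t
float_to_half_trunc_sketch(float f)
{
   union { float f; uint32_t u; } in;
   uint32_t sign, exp, mant;

   in.f = f;
   sign = (in.u >> 16) & 0x8000;
   exp  = (in.u >> 23) & 0xff;
   mant = in.u & 0x7fffff;

   if (exp == 0xff)               /* Inf / NaN: keep, make sure NaN stays NaN */
      return sign | 0x7c00 | (mant ? 0x200 | (mant >> 13) : 0);
   if (exp > 127 + 15)            /* too large for half (simplified to Inf) */
      return sign | 0x7c00;
   if (exp < 127 - 14)            /* too small for a normal half: flush to 0 */
      return sign;
   return sign | ((exp - 112) << 10) | (mant >> 13);   /* truncate mantissa */
}
#endif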


/**
 * Special case for converting clamped IEEE-754 floats to unsigned norms.
 *
 * The mathematical voodoo below may seem excessive but it is actually
 * paramount we do it this way for several reasons. First, there is no
 * single-precision FP to unsigned integer conversion Intel SSE instruction.
 * Second, even if there were, since the FP's mantissa takes only a fraction
 * of register bits the typical scale-and-cast approach would require double
 * precision for accurate results, and therefore half the throughput.
 *
 * Although the result values can be scaled to an arbitrary bit width specified
 * by dst_width, the actual result type will have the same width as the input.
 *
 * Ex: src = { float, float, float, float }
 *     return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
 */
LLVMValueRef
lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
                                        struct lp_type src_type,
                                        unsigned dst_width,
                                        LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type);
   LLVMValueRef res;
   unsigned mantissa;

   assert(src_type.floating);
   assert(dst_width <= src_type.width);
   src_type.sign = FALSE;

   mantissa = lp_mantissa(src_type);

   if (dst_width <= mantissa) {
      /*
       * Apply magic coefficients that will make the desired result appear
       * in the least significant bits of the mantissa, with correct rounding.
       *
       * This only works if the destination width fits in the mantissa.
       */

      unsigned long long ubound;
      unsigned long long mask;
      double scale;
      double bias;

      ubound = (1ULL << dst_width);
      mask = ubound - 1;
      scale = (double)mask/ubound;
      bias = (double)(1ULL << (mantissa - dst_width));

      res = LLVMBuildFMul(builder, src, lp_build_const_vec(gallivm, src_type, scale), "");
      /* instead of fadd/and could (with sse2) just use lp_build_iround */
      res = LLVMBuildFAdd(builder, res, lp_build_const_vec(gallivm, src_type, bias), "");
      res = LLVMBuildBitCast(builder, res, int_vec_type, "");
      res = LLVMBuildAnd(builder, res,
                         lp_build_const_int_vec(gallivm, src_type, mask), "");
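
      /*
       * To make the magic concrete (illustrative numbers, assuming a binary32
       * source, i.e. mantissa == 23, and dst_width == 8):
       *
       *   scale = 255/256,  bias = 2^(23 - 8) = 32768.0
       *
       * For x in [0,1], x*scale + bias lies in [2^15, 2^16), so its float
       * representation has a fixed exponent and an ulp of exactly 2^-8;
       * the fadd therefore rounds x*255 into the low 8 mantissa bits, which
       * the bitcast + and above then extract as the final unorm8 value.
       */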
   }
   else if (dst_width == (mantissa + 1)) {
      /*
       * The destination width matches exactly what can be represented in
       * floating point (i.e., mantissa + 1 bits). Even so, correct rounding
       * still needs to be applied (only for numbers in [0.5, 1.0] would
       * conversion using truncation after scaling be sufficient).
       */
      double scale;
      struct lp_build_context uf32_bld;

      lp_build_context_init(&uf32_bld, gallivm, src_type);
      scale = (double)((1ULL << dst_width) - 1);

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = lp_build_iround(&uf32_bld, res);
   }
   else {
      /*
       * The destination exceeds what can be represented in floating point.
       * So multiply by the largest power of two we can get away with, and
       * then subtract the most significant bit to rescale to normalized
       * values.
       *
       * The largest power of two factor we can get away with is
       * (1 << (src_type.width - 1)), because we need to use a signed
       * conversion. In theory it should be (1 << (src_type.width - 2)), but
       * IEEE 754 rules state INT_MIN should be returned in FPToSI, which is
       * the correct result for values near 1.0!
       *
       * This means we get (src_type.width - 1) correct bits for values near
       * 0.0, and (mantissa + 1) correct bits for values near 1.0. Equally or
       * more important, we also get exact results for 0.0 and 1.0.
       */

      unsigned n = MIN2(src_type.width - 1u, dst_width);

      double scale = (double)(1ULL << n);
      unsigned lshift = dst_width - n;
      unsigned rshift = n;
      LLVMValueRef lshifted;
      LLVMValueRef rshifted;

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      if (!src_type.sign && src_type.width == 32)
         res = LLVMBuildFPToUI(builder, res, int_vec_type, "");
      else
         res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

      /*
       * Align the most significant bit to its final place.
       *
       * This will cause 1.0 to overflow to 0, but the later adjustment will
       * get it right.
       */
      if (lshift) {
         lshifted = LLVMBuildShl(builder, res,
                                 lp_build_const_int_vec(gallivm, src_type,
                                                        lshift), "");
      } else {
         lshifted = res;
      }

      /*
       * Align the most significant bit to the right.
       */
      rshifted = LLVMBuildLShr(builder, res,
                               lp_build_const_int_vec(gallivm, src_type, rshift),
                               "");

      /*
       * Subtract the MSB (shifted down to the LSB), thereby rescaling from
       * (1 << dst_width) to ((1 << dst_width) - 1).
       */
      res = LLVMBuildSub(builder, lshifted, rshifted, "");
   }

   return res;
}
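

#if 0
/*
 * A scalar sketch (deliberately not compiled) of the dst_width <= mantissa
 * path of lp_build_clamped_float_to_unsigned_norm() above, specialized to
 * dst_width == 8.  Assumes <stdint.h>, IEEE binary32, and an input already
 * clamped to [0, 1]; the vector code generates the same fmul/fadd/bitcast/and
 * sequence with lp_build_const_vec() constants.
 */
static uint8_t
clamped_float_to_unorm8_sketch(float x)
{
   union { float f; uint32_t u; } magic;

   magic.f = x * (255.0f / 256.0f) + 32768.0f;  /* bias = 1 << (23 - 8) */
   return (uint8_t)(magic.u & 0xff);            /* rounded x*255 in the low bits */
}
#endif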


/**
 * Inverse of lp_build_clamped_float_to_unsigned_norm above.
 * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
 *     return { float, float, float, float } with values in range [0, 1].
 */
LLVMValueRef
lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
                                unsigned src_width,
                                struct lp_type dst_type,
                                LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type);
   LLVMValueRef bias_;
   LLVMValueRef res;
   unsigned mantissa;
   unsigned n;
   unsigned long long ubound;
   unsigned long long mask;
   double scale;
   double bias;

   assert(dst_type.floating);

   mantissa = lp_mantissa(dst_type);

   if (src_width <= (mantissa + 1)) {
      /*
       * The source width fits what can be represented in floating
       * point (i.e., mantissa + 1 bits). So do a straight multiplication
       * followed by casting. No further rounding is necessary.
       */

      scale = 1.0/(double)((1ULL << src_width) - 1);
      res = LLVMBuildSIToFP(builder, src, vec_type, "");
      res = LLVMBuildFMul(builder, res,
                          lp_build_const_vec(gallivm, dst_type, scale), "");
      return res;
   }
   else {
      /*
       * The source width exceeds what can be represented in floating
       * point. So truncate the incoming values.
       */

      n = MIN2(mantissa, src_width);

      ubound = ((unsigned long long)1 << n);
      mask = ubound - 1;
      scale = (double)ubound/mask;
      bias = (double)((unsigned long long)1 << (mantissa - n));

      res = src;

      if (src_width > mantissa) {
         int shift = src_width - mantissa;
         res = LLVMBuildLShr(builder, res,
                             lp_build_const_int_vec(gallivm, dst_type, shift), "");
      }

      bias_ = lp_build_const_vec(gallivm, dst_type, bias);

      res = LLVMBuildOr(builder,
                        res,
                        LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");

      res = LLVMBuildBitCast(builder, res, vec_type, "");

      res = LLVMBuildFSub(builder, res, bias_, "");
      res = LLVMBuildFMul(builder, res, lp_build_const_vec(gallivm, dst_type, scale), "");
   }

   return res;
}
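

#if 0
/*
 * A scalar sketch (deliberately not compiled) of the wide-source path of
 * lp_build_unsigned_norm_to_float() above, specialized to src_width == 32
 * and a binary32 destination (mantissa == 23, so n == 23, bias == 1.0 and
 * scale == 2^23 / (2^23 - 1)).  Assumes <stdint.h>; the low 9 source bits
 * are simply dropped, matching the "truncate the incoming values" comment.
 */
static float
unorm32_to_float_sketch(uint32_t u)
{
   union { uint32_t u; float f; } tmp;
   const float bias = 1.0f;
   const double scale = (double)(1 << 23) / (double)((1 << 23) - 1);

   tmp.f = bias;
   tmp.u |= u >> 9;                        /* OR the top 23 bits into the mantissa */
   return (float)((tmp.f - bias) * scale); /* rescale [0, 1 - 2^-23] to [0, 1] */
}
#endif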


/**
 * Pick a suitable num_dsts for lp_build_conv to ensure optimal cases are used.
 *
 * Returns the number of dsts created from src.
 */
int lp_build_conv_auto(struct gallivm_state *gallivm,
                       struct lp_type src_type,
                       struct lp_type* dst_type,
                       const LLVMValueRef *src,
                       unsigned num_srcs,
                       LLVMValueRef *dst)
{
   unsigned i;
   int num_dsts = num_srcs;

   if (src_type.floating == dst_type->floating &&
       src_type.width == dst_type->width &&
       src_type.length == dst_type->length &&
       src_type.fixed == dst_type->fixed &&
       src_type.norm == dst_type->norm &&
       src_type.sign == dst_type->sign)
      return num_dsts;

   /* Special case 4x4x32 -> 1x16x8 or 2x8x32 -> 1x16x8
    */
   if (src_type.norm == 0 &&
       src_type.width == 32 &&
       src_type.fixed == 0 &&

       dst_type->floating == 0 &&
       dst_type->fixed == 0 &&
       dst_type->width == 8 &&

       ((src_type.floating == 1 && src_type.sign == 1 && dst_type->norm == 1) ||
        (src_type.floating == 0 && dst_type->floating == 0 &&
         src_type.sign == dst_type->sign && dst_type->norm == 0))) {

      /* Special case 4x4x32 --> 1x16x8 */
      if (src_type.length == 4 &&
          (util_get_cpu_caps()->has_sse2 || util_get_cpu_caps()->has_altivec))
      {
         num_dsts = (num_srcs + 3) / 4;
         dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4;

         lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
         return num_dsts;
      }

      /* Special case 2x8x32 --> 1x16x8 */
      if (src_type.length == 8 &&
          util_get_cpu_caps()->has_avx)
      {
         num_dsts = (num_srcs + 1) / 2;
         dst_type->length = num_srcs * 8 >= 16 ? 16 : num_srcs * 8;

         lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
         return num_dsts;
      }
   }

   /* lp_build_resize does not support M:N */
   if (src_type.width == dst_type->width) {
      lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
   } else {
      /*
       * If dst_width is 16 bits and src_width 32 and the dst vector size is
       * 64 bit, try feeding 2 vectors at once so pack intrinsics can be used.
       * (For AVX this isn't needed, since we usually get 256bit src and
       * 128bit dst vectors, which work ok. If we do AVX2 pack this should
       * be extended, but we need to be able to tell the conversion code about
       * pack ordering first.)
       */
      unsigned ratio = 1;
      if (src_type.width == 2 * dst_type->width &&
          src_type.length == dst_type->length &&
          dst_type->floating == 0 && (num_srcs % 2 == 0) &&
          dst_type->width * dst_type->length == 64) {
         ratio = 2;
         num_dsts /= 2;
         dst_type->length *= 2;
      }
      for (i = 0; i < num_dsts; i++) {
         lp_build_conv(gallivm, src_type, *dst_type, &src[i*ratio], ratio, &dst[i], 1);
      }
   }

   return num_dsts;
}
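

/*
 * For example (an illustrative call, not taken from any particular caller):
 * with src_type = 4 x float32, num_srcs = 4 and *dst_type = 4 x unorm8 on
 * entry, the SSE2/Altivec special case above converts everything in one go,
 * rewrites dst_type->length to 16, and returns 1, i.e. a single 16 x unorm8
 * destination vector.
 */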


/**
 * Generic type conversion.
 *
 * TODO: Take a precision argument, or even better, add a new precision member
 * to the lp_type union.
 */
void
lp_build_conv(struct gallivm_state *gallivm,
              struct lp_type src_type,
              struct lp_type dst_type,
              const LLVMValueRef *src, unsigned num_srcs,
              LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type tmp_type;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned num_tmps;
   unsigned i;

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   tmp_type = src_type;
   for(i = 0; i < num_srcs; ++i) {
      assert(lp_check_value(src_type, src[i]));
      tmp[i] = src[i];
   }
   num_tmps = num_srcs;


   /*
    * Special case 4x4x32 --> 1x16x8, 2x4x32 -> 1x8x8, 1x4x32 -> 1x4x8
    * Only float -> s/unorm8 and (u)int32 -> (u)int8.
    * XXX: This should cover all interesting backend cases for 8 bit,
    * but should use the same strategy if dst is 16 bit.
    */
   if (src_type.norm == 0 &&
       src_type.width == 32 &&
       src_type.length == 4 &&
       src_type.fixed == 0 &&

       dst_type.floating == 0 &&
       dst_type.fixed == 0 &&
       dst_type.width == 8 &&

       ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) ||
        (src_type.floating == 0 && dst_type.floating == 0 &&
         src_type.sign == dst_type.sign && dst_type.norm == 0)) &&

       ((dst_type.length == 16 && 4 * num_dsts == num_srcs) ||
        (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) &&

       (util_get_cpu_caps()->has_sse2 || util_get_cpu_caps()->has_altivec))
   {
      struct lp_build_context bld;
      struct lp_type int16_type, int32_type;
      struct lp_type dst_type_ext = dst_type;
      LLVMValueRef const_scale;
      unsigned i, j;

      lp_build_context_init(&bld, gallivm, src_type);

      dst_type_ext.length = 16;
      int16_type = int32_type = dst_type_ext;

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type));

      for (i = 0; i < num_dsts; ++i, src += 4) {
         LLVMValueRef lo, hi;

         if (src_type.floating) {
            for (j = 0; j < dst_type.length / 4; ++j) {
               /*
                * XXX This is not actually fully correct. The float to int
                * conversion will produce 0x80000000 for everything
                * out of range and NaNs (on x86, llvm.x86.sse2.cvtps2dq).
                * Hence, NaNs and negatives will get clamped just fine to zero
                * (relying on clamping pack behavior) when converting to unorm;
                * however, too large values (both finite and infinite) will
                * also end up as zero, not 255.
                * For snorm, for now we'll keep bug compatibility with the
                * generic conversion path (meaning too large values are fine,
                * but NaNs get converted to -128 (purely by luck, as we don't
                * specify nan behavior for the max there) instead of 0).
                *
                * dEQP has GLES31 tests that expect +inf -> 255.0.
                */
               if (dst_type.sign) {
                  tmp[j] = lp_build_min(&bld, bld.one, src[j]);

               }
               else {
                  if (1) {
                     tmp[j] = lp_build_min_ext(&bld, bld.one, src[j],
                                               GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
                  }
                  tmp[j] = src[j];
               }
               tmp[j] = LLVMBuildFMul(builder, tmp[j], const_scale, "");
               tmp[j] = lp_build_iround(&bld, tmp[j]);
            }
         } else {
            for (j = 0; j < dst_type.length / 4; ++j) {
               if (!dst_type.sign) {
                  /*
                   * Pack clamp is always signed->unsigned (or signed->signed).
                   * Hence need min.
                   */
                  LLVMValueRef const_max;
                  const_max = lp_build_const_int_vec(gallivm, src_type, 255);
                  tmp[j] = lp_build_min(&bld, src[j], const_max);
               } else {
                  tmp[j] = src[j];
               }
            }
         }

         if (num_srcs == 1) {
            tmp[1] = tmp[0];
         }

         /* relying on clamping behavior of sse2 intrinsics here */
         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);

         if (num_srcs < 4) {
            hi = lo;
         }
         else {
            hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
         }
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
      }
      if (num_srcs < 4) {
         dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
      }

      return;
   }

   /* Special case 2x8x32 --> 1x16x8, 1x8x32 -> 1x8x8
    */
   else if (src_type.norm == 0 &&
            src_type.width == 32 &&
            src_type.length == 8 &&
            src_type.fixed == 0 &&

            dst_type.floating == 0 &&
            dst_type.fixed == 0 &&
            dst_type.width == 8 &&

            ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) ||
             (src_type.floating == 0 && dst_type.floating == 0 &&
              src_type.sign == dst_type.sign && dst_type.norm == 0)) &&

            ((dst_type.length == 16 && 2 * num_dsts == num_srcs) ||
             (num_dsts == 1 && dst_type.length * num_srcs == 8)) &&

            util_get_cpu_caps()->has_avx) {

      struct lp_build_context bld;
      struct lp_type int16_type, int32_type;
      struct lp_type dst_type_ext = dst_type;
      LLVMValueRef const_scale;
      unsigned i;

      lp_build_context_init(&bld, gallivm, src_type);

      dst_type_ext.length = 16;
      int16_type = int32_type = dst_type_ext;

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type));

      for (i = 0; i < num_dsts; ++i, src += 2) {
         unsigned j;
         for (j = 0; j < (num_srcs == 1 ? 1 : 2); j++) {
            LLVMValueRef lo, hi, a;

            a = src[j];
            if (src_type.floating) {
               if (dst_type.sign) {
                  a = lp_build_min(&bld, bld.one, a);

               }
               else {
                  if (1) {
                     a = lp_build_min_ext(&bld, bld.one, a,
                                          GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
                  }
               }
               a = LLVMBuildFMul(builder, a, const_scale, "");
               a = lp_build_iround(&bld, a);
            } else {
               if (!dst_type.sign) {
                  LLVMValueRef const_max;
                  const_max = lp_build_const_int_vec(gallivm, src_type, 255);
                  a = lp_build_min(&bld, a, const_max);
               }
            }
            lo = lp_build_extract_range(gallivm, a, 0, 4);
            hi = lp_build_extract_range(gallivm, a, 4, 4);
            /* relying on clamping behavior of sse2 intrinsics here */
            tmp[j] = lp_build_pack2(gallivm, int32_type, int16_type, lo, hi);
         }

         if (num_srcs == 1) {
            tmp[1] = tmp[0];
         }
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, tmp[0], tmp[1]);
      }

      if (num_srcs == 1) {
         dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
      }

      return;
   }

   /* Special case -> 16bit half-float
    */
   else if (dst_type.floating && dst_type.width == 16)
   {
      /* Only support src as 32bit float currently */
      assert(src_type.floating && src_type.width == 32);

      for(i = 0; i < num_tmps; ++i)
         dst[i] = lp_build_float_to_half(gallivm, tmp[i]);

      return;
   }

   /* Pre-convert half-floats to floats
    */
   else if (src_type.floating && src_type.width == 16)
   {
      for(i = 0; i < num_tmps; ++i)
         tmp[i] = lp_build_half_to_float(gallivm, tmp[i]);

      tmp_type.width = 32;
   }

   /*
    * Clamp if necessary
    */

   if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
      struct lp_build_context bld;
      double src_min = lp_const_min(src_type);
      double dst_min = lp_const_min(dst_type);
      double src_max = lp_const_max(src_type);
      double dst_max = lp_const_max(dst_type);
      LLVMValueRef thres;

      lp_build_context_init(&bld, gallivm, tmp_type);

      if(src_min < dst_min) {
         if(dst_min == 0.0)
            thres = bld.zero;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_min);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_max(&bld, tmp[i], thres);
      }

      if(src_max > dst_max) {
         if(dst_max == 1.0)
            thres = bld.one;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_max);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_min(&bld, tmp[i], thres);
      }
   }

   /*
    * Scale to the narrowest range
    */

   if(dst_type.floating) {
      /* Nothing to do */
   }
   else if(tmp_type.floating) {
      if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm,
                                                             tmp_type,
                                                             dst_type.width,
                                                             tmp[i]);
         }
         tmp_type.floating = FALSE;
      }
      else {
         double dst_scale = lp_const_scale(dst_type);

         if (dst_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }

         /*
          * These functions will use fptosi in some form, which won't work
          * with a 32bit uint dst. Causes lp_test_conv failures, though.
          */
         if (0)
            assert(dst_type.sign || dst_type.width < 32);

         if (dst_type.sign && dst_type.norm && !dst_type.fixed) {
            struct lp_build_context bld;

            lp_build_context_init(&bld, gallivm, tmp_type);
            for(i = 0; i < num_tmps; ++i) {
               tmp[i] = lp_build_iround(&bld, tmp[i]);
            }
            tmp_type.floating = FALSE;
         }
         else {
            LLVMTypeRef tmp_vec_type;

            tmp_type.floating = FALSE;
            tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
            for(i = 0; i < num_tmps; ++i) {
#if 0
               if(dst_type.sign)
                  tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
               else
                  tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
#else
               /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
               tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
#endif
            }
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);
      struct lp_build_context bld;
      lp_build_context_init(&bld, gallivm, tmp_type);

      /* Compensate for different offsets */
      /* sscaled -> unorm and similar would cause negative shift count, skip */
      if (dst_offset > src_offset && src_type.width > dst_type.width && src_shift > 0) {
         for (i = 0; i < num_tmps; ++i) {
            LLVMValueRef shifted;

            shifted = lp_build_shr_imm(&bld, tmp[i], src_shift - 1);
            tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, "");
         }
      }

      if(src_shift > dst_shift) {
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_shr_imm(&bld, tmp[i], src_shift - dst_shift);
      }
   }

   /*
    * Truncate or expand bit width
    *
    * No data conversion should happen here, although the sign bits are
    * crucial to avoid bad clamping.
    */

   {
      struct lp_type new_type;

      new_type = tmp_type;
      new_type.sign = dst_type.sign;
      new_type.width = dst_type.width;
      new_type.length = dst_type.length;

      /*
       * Note that resize when using packs can sometimes get min/max
       * clamping for free. Should be able to exploit this...
       */
      lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);

      tmp_type = new_type;
      num_tmps = num_dsts;
   }

   /*
    * Scale to the widest range
    */

   if(src_type.floating) {
      /* Nothing to do */
   }
   else if(!src_type.floating && dst_type.floating) {
      if(!src_type.fixed && !src_type.sign && src_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_unsigned_norm_to_float(gallivm,
                                                     src_type.width,
                                                     dst_type,
                                                     tmp[i]);
         }
         tmp_type.floating = TRUE;
      }
      else {
         double src_scale = lp_const_scale(src_type);
         LLVMTypeRef tmp_vec_type;

         /* Use an equally sized integer for intermediate computations */
         tmp_type.floating = TRUE;
         tmp_type.sign = TRUE;
         tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(dst_type.sign)
               tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
            tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
#endif
         }

         if (src_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }

         /* The formula above will produce values below -1.0 for the most
          * negative value, but everything seems happy with that, hence
          * disabled for now. */
         if (0 && !src_type.fixed && src_type.norm && src_type.sign) {
            struct lp_build_context bld;

            lp_build_context_init(&bld, gallivm, dst_type);
            for(i = 0; i < num_tmps; ++i) {
               tmp[i] = lp_build_max(&bld, tmp[i],
                                     lp_build_const_vec(gallivm, dst_type, -1.0f));
            }
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);
      struct lp_build_context bld;
      lp_build_context_init(&bld, gallivm, tmp_type);

      if (src_shift < dst_shift) {
         LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];

         if (dst_shift - src_shift < dst_type.width) {
            for (i = 0; i < num_tmps; ++i) {
               pre_shift[i] = tmp[i];
               tmp[i] = lp_build_shl_imm(&bld, tmp[i], dst_shift - src_shift);
            }
         }
         else {
            /*
             * This happens for things like sscaled -> unorm conversions. Shift
             * counts equal to the bit width cause undefined results, so hack
             * around it.
             */
            for (i = 0; i < num_tmps; ++i) {
               pre_shift[i] = tmp[i];
               tmp[i] = lp_build_zero(gallivm, dst_type);
            }
         }

         /* Compensate for different offsets */
         if (dst_offset > src_offset) {
            for (i = 0; i < num_tmps; ++i) {
               tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], "");
            }
         }
      }
   }

   for(i = 0; i < num_dsts; ++i) {
      dst[i] = tmp[i];
      assert(lp_check_value(dst_type, dst[i]));
   }
}
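

#if 0
/*
 * An SSE2 intrinsics sketch (deliberately not compiled) of what the
 * 4x4x32 -> 1x16x8 special case in lp_build_conv() above boils down to:
 * two signed 32->16 packs followed by a signed->unsigned 16->8 pack, with
 * the saturation of the pack instructions providing the final clamping.
 * Assumes <emmintrin.h>; a, b, c, d hold the already rounded int32 values.
 */
static __m128i
pack_4x_i32_to_16x_u8_sketch(__m128i a, __m128i b, __m128i c, __m128i d)
{
   __m128i lo = _mm_packs_epi32(a, b);    /* 8 x i16, signed saturation */
   __m128i hi = _mm_packs_epi32(c, d);    /* 8 x i16, signed saturation */
   return _mm_packus_epi16(lo, hi);       /* 16 x u8, unsigned saturation */
}
#endif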


/**
 * Bit mask conversion.
 *
 * This will convert the integer masks that match the given types.
 *
 * The mask values should be 0 or -1, i.e., all bits either set to zero or one.
 * Any other value will likely cause unpredictable results.
 *
 * This is basically a very trimmed down version of lp_build_conv.
 */
void
lp_build_conv_mask(struct gallivm_state *gallivm,
                   struct lp_type src_type,
                   struct lp_type dst_type,
                   const LLVMValueRef *src, unsigned num_srcs,
                   LLVMValueRef *dst, unsigned num_dsts)
{

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   /*
    * Drop the float/fixed/norm distinctions.
    *
    * We assume all values are 0 or -1.
    */

   src_type.floating = FALSE;
   src_type.fixed = FALSE;
   src_type.sign = TRUE;
   src_type.norm = FALSE;

   dst_type.floating = FALSE;
   dst_type.fixed = FALSE;
   dst_type.sign = TRUE;
   dst_type.norm = FALSE;

   /*
    * Truncate or expand bit width
    */

   lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
}