CoCalc -- lp_bld_format

GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
⁴⁵⁶⁵ views
1
/**************************************************************************
2
 *
3
 * Copyright 2009 VMware, Inc.
4
 * All Rights Reserved.
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a
7
 * copy of this software and associated documentation files (the
8
 * "Software"), to deal in the Software without restriction, including
9
 * without limitation the rights to use, copy, modify, merge, publish,
10
 * distribute, sub license, and/or sell copies of the Software, and to
11
 * permit persons to whom the Software is furnished to do so, subject to
12
 * the following conditions:
13
 *
14
 * The above copyright notice and this permission notice (including the
15
 * next paragraph) shall be included in all copies or substantial portions
16
 * of the Software.
17
 *
18
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
 *
26
 **************************************************************************/
27

28

29
#include "pipe/p_defines.h"
30

31
#include "util/format/u_format.h"
32
#include "util/u_memory.h"
33
#include "util/u_string.h"
34
#include "util/u_math.h"
35

36
#include "lp_bld_type.h"
37
#include "lp_bld_const.h"
38
#include "lp_bld_conv.h"
39
#include "lp_bld_swizzle.h"
40
#include "lp_bld_gather.h"
41
#include "lp_bld_debug.h"
42
#include "lp_bld_format.h"
43
#include "lp_bld_arit.h"
44
#include "lp_bld_pack.h"
45
#include "lp_bld_flow.h"
46
#include "lp_bld_printf.h"
47
#include "lp_bld_intr.h"
48

49
static void
50
convert_to_soa(struct gallivm_state *gallivm,
51
               LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
52
               LLVMValueRef dst_soa[4],
53
               const struct lp_type soa_type)
54
{
55
   unsigned j, k;
56
   struct lp_type aos_channel_type = soa_type;
57

58
   LLVMValueRef aos_channels[4];
59
   unsigned pixels_per_channel = soa_type.length / 4;
60

61
   debug_assert((soa_type.length % 4) == 0);
62

63
   aos_channel_type.length >>= 1;
64

65
   for (j = 0; j < 4; ++j) {
66
      LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
67

68
      assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
69

70
      for (k = 0; k < pixels_per_channel; ++k) {
71
         channel[k] = src_aos[j + 4 * k];
72
      }
73

74
      aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
75
   }
76

77
   lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
78
}
79

80

81
void
82
lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
83
                            struct lp_build_context *bld,
84
                            const LLVMValueRef unswizzled[4],
85
                            LLVMValueRef swizzled_out[4])
86
{
87
   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
88
      enum pipe_swizzle swizzle;
89
      LLVMValueRef depth_or_stencil;
90

91
      if (util_format_has_stencil(format_desc) &&
92
          !util_format_has_depth(format_desc)) {
93
         assert(!bld->type.floating);
94
         swizzle = format_desc->swizzle[1];
95
      }
96
      else {
97
         assert(bld->type.floating);
98
         swizzle = format_desc->swizzle[0];
99
      }
100
      /*
101
       * Return zzz1 or sss1 for depth-stencil formats here.
102
       * Correct swizzling will be handled by apply_sampler_swizzle() later.
103
       */
104
      depth_or_stencil = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
105

106
      swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depth_or_stencil;
107
      swizzled_out[3] = bld->one;
108
   }
109
   else {
110
      unsigned chan;
111
      for (chan = 0; chan < 4; ++chan) {
112
         enum pipe_swizzle swizzle = format_desc->swizzle[chan];
113
         swizzled_out[chan] = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
114
      }
115
   }
116
}
117

118

119

120
static LLVMValueRef
121
lp_build_extract_soa_chan(struct lp_build_context *bld,
122
                          unsigned blockbits,
123
                          boolean srgb_chan,
124
                          struct util_format_channel_description chan_desc,
125
                          LLVMValueRef packed)
126
{
127
   struct gallivm_state *gallivm = bld->gallivm;
128
   LLVMBuilderRef builder = gallivm->builder;
129
   struct lp_type type = bld->type;
130
   LLVMValueRef input = packed;
131
   const unsigned width = chan_desc.size;
132
   const unsigned start = chan_desc.shift;
133
   const unsigned stop = start + width;
134

135
   /* Decode the input vector component */
136

137
   switch(chan_desc.type) {
138
   case UTIL_FORMAT_TYPE_VOID:
139
      input = bld->undef;
140
      break;
141

142
   case UTIL_FORMAT_TYPE_UNSIGNED:
143
      /*
144
       * Align the LSB
145
       */
146
      if (start) {
147
         input = LLVMBuildLShr(builder, input,
148
                               lp_build_const_int_vec(gallivm, type, start), "");
149
      }
150

151
      /*
152
       * Zero the MSBs
153
       */
154
      if (stop < blockbits) {
155
         unsigned mask = ((unsigned long long)1 << width) - 1;
156
         input = LLVMBuildAnd(builder, input,
157
                              lp_build_const_int_vec(gallivm, type, mask), "");
158
      }
159

160
      /*
161
       * Type conversion
162
       */
163
      if (type.floating) {
164
         if (srgb_chan) {
165
            struct lp_type conv_type = lp_uint_type(type);
166
            input = lp_build_srgb_to_linear(gallivm, conv_type, width, input);
167
         }
168
         else {
169
            if(chan_desc.normalized)
170
               input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
171
            else
172
               input = LLVMBuildUIToFP(builder, input, bld->vec_type, "");
173
         }
174
      }
175
      else if (chan_desc.pure_integer) {
176
         /* Nothing to do */
177
      } else {
178
          /* FIXME */
179
          assert(0);
180
      }
181
      break;
182

183
   case UTIL_FORMAT_TYPE_SIGNED:
184
      /*
185
       * Align the sign bit first.
186
       */
187
      if (stop < type.width) {
188
         unsigned bits = type.width - stop;
189
         LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
190
         input = LLVMBuildShl(builder, input, bits_val, "");
191
      }
192

193
      /*
194
       * Align the LSB (with an arithmetic shift to preserve the sign)
195
       */
196
      if (chan_desc.size < type.width) {
197
         unsigned bits = type.width - chan_desc.size;
198
         LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
199
         input = LLVMBuildAShr(builder, input, bits_val, "");
200
      }
201

202
      /*
203
       * Type conversion
204
       */
205
      if (type.floating) {
206
         input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
207
         if (chan_desc.normalized) {
208
            double scale = 1.0 / ((1 << (chan_desc.size - 1)) - 1);
209
            LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
210
            input = LLVMBuildFMul(builder, input, scale_val, "");
211
            /*
212
             * The formula above will produce value below -1.0 for most negative values.
213
             * compliance requires clamping it.
214
             * GTF-GL45.gtf33.GL3Tests.vertex_type_2_10_10_10_rev.vertex_type_2_10_10_10_rev_conversion.
215
             */
216
            input = lp_build_max(bld, input,
217
                                 lp_build_const_vec(gallivm, type, -1.0f));
218
         }
219
      }
220
      else if (chan_desc.pure_integer) {
221
         /* Nothing to do */
222
      } else {
223
          /* FIXME */
224
          assert(0);
225
      }
226
      break;
227

228
   case UTIL_FORMAT_TYPE_FLOAT:
229
      if (type.floating) {
230
         if (chan_desc.size == 16) {
231
            struct lp_type f16i_type = type;
232
            f16i_type.width /= 2;
233
            f16i_type.floating = 0;
234
            if (start) {
235
               input = LLVMBuildLShr(builder, input,
236
                                     lp_build_const_int_vec(gallivm, type, start), "");
237
            }
238
            input = LLVMBuildTrunc(builder, input,
239
                                   lp_build_vec_type(gallivm, f16i_type), "");
240
            input = lp_build_half_to_float(gallivm, input);
241
         } else {
242
            assert(start == 0);
243
            assert(stop == 32);
244
            assert(type.width == 32);
245
         }
246
         input = LLVMBuildBitCast(builder, input, bld->vec_type, "");
247
      }
248
      else {
249
         /* FIXME */
250
         assert(0);
251
         input = bld->undef;
252
      }
253
      break;
254

255
   case UTIL_FORMAT_TYPE_FIXED:
256
      if (type.floating) {
257
         double scale = 1.0 / ((1 << (chan_desc.size/2)) - 1);
258
         LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
259
         input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
260
         input = LLVMBuildFMul(builder, input, scale_val, "");
261
      }
262
      else {
263
         /* FIXME */
264
         assert(0);
265
         input = bld->undef;
266
      }
267
      break;
268

269
   default:
270
      assert(0);
271
      input = bld->undef;
272
      break;
273
   }
274

275
   return input;
276
}
277

278

279
/**
280
 * Unpack several pixels in SoA.
281
 *
282
 * It takes a vector of packed pixels:
283
 *
284
 *   packed = {P0, P1, P2, P3, ..., Pn}
285
 *
286
 * And will produce four vectors:
287
 *
288
 *   red    = {R0, R1, R2, R3, ..., Rn}
289
 *   green  = {G0, G1, G2, G3, ..., Gn}
290
 *   blue   = {B0, B1, B2, B3, ..., Bn}
291
 *   alpha  = {A0, A1, A2, A3, ..., An}
292
 *
293
 * It requires that a packed pixel fits into an element of the output
294
 * channels. The common case is when converting pixel with a depth of 32 bit or
295
 * less into floats.
296
 *
297
 * \param format_desc  the format of the 'packed' incoming pixel vector
298
 * \param type  the desired type for rgba_out (type.length = n, above)
299
 * \param packed  the incoming vector of packed pixels
300
 * \param rgba_out  returns the SoA R,G,B,A vectors
301
 */
302
void
303
lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
304
                         const struct util_format_description *format_desc,
305
                         struct lp_type type,
306
                         LLVMValueRef packed,
307
                         LLVMValueRef rgba_out[4])
308
{
309
   struct lp_build_context bld;
310
   LLVMValueRef inputs[4];
311
   unsigned chan;
312

313
   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
314
   assert(format_desc->block.width == 1);
315
   assert(format_desc->block.height == 1);
316
   assert(format_desc->block.bits <= type.width);
317
   /* FIXME: Support more output types */
318
   assert(type.width == 32);
319

320
   lp_build_context_init(&bld, gallivm, type);
321

322
   /* Decode the input vector components */
323
   for (chan = 0; chan < format_desc->nr_channels; ++chan) {
324
      struct util_format_channel_description chan_desc = format_desc->channel[chan];
325
      boolean srgb_chan = FALSE;
326

327
      if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB &&
328
          format_desc->swizzle[3] != chan) {
329
         srgb_chan = TRUE;
330
      }
331

332
      inputs[chan] = lp_build_extract_soa_chan(&bld,
333
                                               format_desc->block.bits,
334
                                               srgb_chan,
335
                                               chan_desc,
336
                                               packed);
337
   }
338

339
   lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
340
}
341

342

343
/**
344
 * Convert a vector of rgba8 values into 32bit wide SoA vectors.
345
 *
346
 * \param dst_type  The desired return type. For pure integer formats
347
 *                  this should be a 32bit wide int or uint vector type,
348
 *                  otherwise a float vector type.
349
 *
350
 * \param packed    The rgba8 values to pack.
351
 *
352
 * \param rgba      The 4 SoA return vectors.
353
 */
354
void
355
lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm,
356
                           struct lp_type dst_type,
357
                           LLVMValueRef packed,
358
                           LLVMValueRef *rgba)
359
{
360
   LLVMBuilderRef builder = gallivm->builder;
361
   LLVMValueRef mask = lp_build_const_int_vec(gallivm, dst_type, 0xff);
362
   unsigned chan;
363

364
   /* XXX technically shouldn't use that for uint dst_type */
365
   packed = LLVMBuildBitCast(builder, packed,
366
                             lp_build_int_vec_type(gallivm, dst_type), "");
367

368
   /* Decode the input vector components */
369
   for (chan = 0; chan < 4; ++chan) {
370
#if UTIL_ARCH_LITTLE_ENDIAN
371
      unsigned start = chan*8;
372
#else
373
      unsigned start = (3-chan)*8;
374
#endif
375
      unsigned stop = start + 8;
376
      LLVMValueRef input;
377

378
      input = packed;
379

380
      if (start)
381
         input = LLVMBuildLShr(builder, input,
382
                               lp_build_const_int_vec(gallivm, dst_type, start), "");
383

384
      if (stop < 32)
385
         input = LLVMBuildAnd(builder, input, mask, "");
386

387
      if (dst_type.floating)
388
         input = lp_build_unsigned_norm_to_float(gallivm, 8, dst_type, input);
389

390
      rgba[chan] = input;
391
   }
392
}
393

394

395

396
/**
397
 * Fetch a texels from a texture, returning them in SoA layout.
398
 *
399
 * \param type  the desired return type for 'rgba'.  The vector length
400
 *              is the number of texels to fetch
401
 * \param aligned if the offset is guaranteed to be aligned to element width
402
 *
403
 * \param base_ptr  points to the base of the texture mip tree.
404
 * \param offset    offset to start of the texture image block.  For non-
405
 *                  compressed formats, this simply is an offset to the texel.
406
 *                  For compressed formats, it is an offset to the start of the
407
 *                  compressed data block.
408
 *
409
 * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
410
 *              these will always be (0,0).  For compressed formats, i will
411
 *              be in [0, block_width-1] and j will be in [0, block_height-1].
412
 * \param cache  optional value pointing to a lp_build_format_cache structure
413
 */
414
void
415
lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
416
                        const struct util_format_description *format_desc,
417
                        struct lp_type type,
418
                        boolean aligned,
419
                        LLVMValueRef base_ptr,
420
                        LLVMValueRef offset,
421
                        LLVMValueRef i,
422
                        LLVMValueRef j,
423
                        LLVMValueRef cache,
424
                        LLVMValueRef rgba_out[4])
425
{
426
   LLVMBuilderRef builder = gallivm->builder;
427
   enum pipe_format format = format_desc->format;
428
   struct lp_type fetch_type;
429

430
   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
431
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
432
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
433
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
434
       format_desc->block.width == 1 &&
435
       format_desc->block.height == 1 &&
436
       format_desc->block.bits <= type.width &&
437
       (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
438
        format_desc->channel[0].size == 32 ||
439
        format_desc->channel[0].size == 16))
440
   {
441
      /*
442
       * The packed pixel fits into an element of the destination format. Put
443
       * the packed pixels into a vector and extract each component for all
444
       * vector elements in parallel.
445
       */
446

447
      LLVMValueRef packed;
448

449
      /*
450
       * gather the texels from the texture
451
       * Ex: packed = {XYZW, XYZW, XYZW, XYZW}
452
       */
453
      assert(format_desc->block.bits <= type.width);
454
      fetch_type = lp_type_uint(type.width);
455
      packed = lp_build_gather(gallivm,
456
                               type.length,
457
                               format_desc->block.bits,
458
                               fetch_type,
459
                               aligned,
460
                               base_ptr, offset, FALSE);
461

462
      /*
463
       * convert texels to float rgba
464
       */
465
      lp_build_unpack_rgba_soa(gallivm,
466
                               format_desc,
467
                               type,
468
                               packed, rgba_out);
469
      return;
470
   }
471

472

473
   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
474
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
475
       format_desc->block.width == 1 &&
476
       format_desc->block.height == 1 &&
477
       format_desc->block.bits > type.width &&
478
       ((format_desc->block.bits <= type.width * type.length &&
479
         format_desc->channel[0].size <= type.width) ||
480
        (format_desc->channel[0].size == 64 &&
481
         format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
482
         type.floating)))
483
   {
484
      /*
485
       * Similar to above, but the packed pixel is larger than what fits
486
       * into an element of the destination format. The packed pixels will be
487
       * shuffled into SoA vectors appropriately, and then the extraction will
488
       * be done in parallel as much as possible.
489
       * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
490
       * the gathered vectors can be shuffled easily (even with avx).
491
       * 64xn float -> 32xn float is handled too but it's a bit special as
492
       * it does the conversion pre-shuffle.
493
       */
494

495
      LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32];
496
      struct lp_type fetch_type, gather_type = type;
497
      unsigned num_gather, fetch_width, i, j;
498
      struct lp_build_context bld;
499
      boolean fp64 = format_desc->channel[0].size == 64;
500

501
      lp_build_context_init(&bld, gallivm, type);
502

503
      assert(type.width == 32);
504
      assert(format_desc->block.bits > type.width);
505

506
      /*
507
       * First, figure out fetch order.
508
       */
509
      fetch_width = util_next_power_of_two(format_desc->block.bits);
510
      /*
511
       * fp64 are treated like fp32 except we fetch twice wide values
512
       * (as we shuffle after trunc). The shuffles for that work out
513
       * mostly fine (slightly suboptimal for 4-wide, perfect for AVX)
514
       * albeit we miss the potential opportunity for hw gather (as it
515
       * only handles native size).
516
       */
517
      num_gather = fetch_width / type.width;
518
      gather_type.width *= num_gather;
519
      if (fp64) {
520
         num_gather /= 2;
521
      }
522
      gather_type.length /= num_gather;
523

524
      for (i = 0; i < num_gather; i++) {
525
         LLVMValueRef offsetr, shuf_vec;
526
         if(num_gather == 4) {
527
            for (j = 0; j < gather_type.length; j++) {
528
               unsigned idx = i + 4*j;
529
               shuffles[j] = lp_build_const_int32(gallivm, idx);
530
            }
531
            shuf_vec = LLVMConstVector(shuffles, gather_type.length);
532
            offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
533

534
         }
535
         else if (num_gather == 2) {
536
            assert(num_gather == 2);
537
            for (j = 0; j < gather_type.length; j++) {
538
               unsigned idx = i*2 + (j%2) + (j/2)*4;
539
               shuffles[j] = lp_build_const_int32(gallivm, idx);
540
            }
541
            shuf_vec = LLVMConstVector(shuffles, gather_type.length);
542
            offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
543
         }
544
         else {
545
            assert(num_gather == 1);
546
            offsetr = offset;
547
         }
548
         if (gather_type.length == 1) {
549
            LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
550
            offsetr = LLVMBuildExtractElement(builder, offsetr, zero, "");
551
         }
552

553
         /*
554
          * Determine whether to use float or int loads. This is mostly
555
          * to outsmart the (stupid) llvm int/float shuffle logic, we
556
          * don't really care much if the data is floats or ints...
557
          * But llvm will refuse to use single float shuffle with int data
558
          * and instead use 3 int shuffles instead, the code looks atrocious.
559
          * (Note bitcasts often won't help, as llvm is too smart to be
560
          * fooled by that.)
561
          * Nobody cares about simd float<->int domain transition penalties,
562
          * which usually don't even exist for shuffles anyway.
563
          * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is
564
          * going into transpose, which is unpacks, so doesn't really matter
565
          * much).
566
          * With 2x32bit or 4x16bit fetch, we use float vec, since those
567
          * go into the weird channel separation shuffle. With floats,
568
          * this is (with 128bit vectors):
569
          * - 2 movq, 2 movhpd, 2 shufps
570
          * With ints it would be:
571
          * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw
572
          * I've seen texture functions increase in code size by 15% just due
573
          * to that (there's lots of such fetches in them...)
574
          * (We could chose a different gather order to improve this somewhat
575
          * for the int path, but it would basically just drop the blends,
576
          * so the float path with this order really is optimal.)
577
          * Albeit it is tricky sometimes llvm doesn't ignore the float->int
578
          * casts so must avoid them until we're done with the float shuffle...
579
          * 3x16bit formats (the same is also true for 3x8) are pretty bad but
580
          * there's nothing we can do about them (we could overallocate by
581
          * those couple bytes and use unaligned but pot sized load).
582
          * Note that this is very much x86 specific. I don't know if this
583
          * affect other archs at all.
584
          */
585
         if (num_gather > 1) {
586
            /*
587
             * We always want some float type here (with x86)
588
             * due to shuffles being float ones afterwards (albeit for
589
             * the num_gather == 4 case int should work fine too
590
             * (unless there's some problems with avx but not avx2).
591
             */
592
            if (format_desc->channel[0].size == 64) {
593
               fetch_type = lp_type_float_vec(64, gather_type.width);
594
            } else {
595
               fetch_type = lp_type_int_vec(32, gather_type.width);
596
            }
597
         }
598
         else {
599
            /* type doesn't matter much */
600
            if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
601
                (format_desc->channel[0].size == 32 ||
602
                 format_desc->channel[0].size == 64)) {
603
            fetch_type = lp_type_float(gather_type.width);
604
            } else {
605
               fetch_type = lp_type_uint(gather_type.width);
606
            }
607
         }
608

609
         /* Now finally gather the values */
610
         packed[i] = lp_build_gather(gallivm, gather_type.length,
611
                                     format_desc->block.bits,
612
                                     fetch_type, aligned,
613
                                     base_ptr, offsetr, FALSE);
614
         if (fp64) {
615
            struct lp_type conv_type = type;
616
            conv_type.width *= 2;
617
            packed[i] = LLVMBuildBitCast(builder, packed[i],
618
                                         lp_build_vec_type(gallivm, conv_type), "");
619
            packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, "");
620
         }
621
      }
622

623
      /* shuffle the gathered values to SoA */
624
      if (num_gather == 2) {
625
         for (i = 0; i < num_gather; i++) {
626
            for (j = 0; j < type.length; j++) {
627
               unsigned idx = (j%2)*2 + (j/4)*4 + i;
628
               if ((j/2)%2)
629
                  idx += type.length;
630
               shuffles[j] = lp_build_const_int32(gallivm, idx);
631
            }
632
            dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1],
633
                                            LLVMConstVector(shuffles, type.length), "");
634
         }
635
      }
636
      else if (num_gather == 4) {
637
         lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst);
638
      }
639
      else {
640
         assert(num_gather == 1);
641
         dst[0] = packed[0];
642
      }
643

644
      /*
645
       * And finally unpack exactly as above, except that
646
       * chan shift is adjusted and the right vector selected.
647
       */
648
      if (!fp64) {
649
         for (i = 0; i < num_gather; i++) {
650
            dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, "");
651
         }
652
         for (i = 0; i < format_desc->nr_channels; i++) {
653
            struct util_format_channel_description chan_desc = format_desc->channel[i];
654
            unsigned blockbits = type.width;
655
            unsigned vec_nr;
656

657
#if UTIL_ARCH_BIG_ENDIAN
658
            vec_nr = (format_desc->block.bits - (chan_desc.shift + chan_desc.size)) / type.width;
659
#else
660
            vec_nr = chan_desc.shift / type.width;
661
#endif
662
            chan_desc.shift %= type.width;
663

664
            output[i] = lp_build_extract_soa_chan(&bld,
665
                                                  blockbits,
666
                                                  FALSE,
667
                                                  chan_desc,
668
                                                  dst[vec_nr]);
669
         }
670
      }
671
      else {
672
         for (i = 0; i < format_desc->nr_channels; i++)  {
673
            output[i] = dst[i];
674
         }
675
      }
676

677
      lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out);
678
      return;
679
   }
680

681
   if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
682
       format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
683
      /*
684
       * similar conceptually to above but requiring special
685
       * AoS packed -> SoA float conversion code.
686
       */
687
      LLVMValueRef packed;
688
      struct lp_type fetch_type = lp_type_uint(type.width);
689

690
      assert(type.floating);
691
      assert(type.width == 32);
692

693
      packed = lp_build_gather(gallivm, type.length,
694
                               format_desc->block.bits,
695
                               fetch_type, aligned,
696
                               base_ptr, offset, FALSE);
697
      if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
698
         lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
699
      }
700
      else {
701
         lp_build_rgb9e5_to_float(gallivm, packed, rgba_out);
702
      }
703
      return;
704
   }
705

706
   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
707
       format_desc->block.bits == 64) {
708
      /*
709
       * special case the format is 64 bits but we only require
710
       * 32bit (or 8bit) from each block.
711
       */
712
      LLVMValueRef packed;
713
      struct lp_type fetch_type = lp_type_uint(type.width);
714

715
      if (format == PIPE_FORMAT_X32_S8X24_UINT) {
716
         /*
717
          * for stencil simply fix up offsets - could in fact change
718
          * base_ptr instead even outside the shader.
719
          */
720
         unsigned mask = (1 << 8) - 1;
721
         LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
722
         offset = LLVMBuildAdd(builder, offset, s_offset, "");
723
         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
724
                                  aligned, base_ptr, offset, FALSE);
725
         packed = LLVMBuildAnd(builder, packed,
726
                               lp_build_const_int_vec(gallivm, type, mask), "");
727
      }
728
      else {
729
         assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
730
         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
731
                                  aligned, base_ptr, offset, TRUE);
732
         packed = LLVMBuildBitCast(builder, packed,
733
                                   lp_build_vec_type(gallivm, type), "");
734
      }
735
      /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */
736
      rgba_out[0] = rgba_out[1] = rgba_out[2] = packed;
737
      rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f);
738
      return;
739
   }
740

741
   /*
742
    * Try calling lp_build_fetch_rgba_aos for all pixels.
743
    * Should only really hit subsampled, compressed
744
    * (for s3tc srgb and rgtc too).
745
    * (This is invalid for plain 8unorm formats because we're lazy with
746
    * the swizzle since some results would arrive swizzled, some not.)
747
    */
748

749
   if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) &&
750
       (util_format_fits_8unorm(format_desc) ||
751
        format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC ||
752
        format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) &&
753
       type.floating && type.width == 32 &&
754
       (type.length == 1 || (type.length % 4 == 0))) {
755
      struct lp_type tmp_type;
756
      struct lp_build_context bld;
757
      LLVMValueRef packed, rgba[4];
758
      const struct util_format_description *flinear_desc;
759
      const struct util_format_description *frgba8_desc;
760
      unsigned chan;
761
      bool is_signed = (format_desc->format == PIPE_FORMAT_RGTC1_SNORM ||
762
                        format_desc->format == PIPE_FORMAT_RGTC2_SNORM ||
763
                        format_desc->format == PIPE_FORMAT_LATC1_SNORM ||
764
                        format_desc->format == PIPE_FORMAT_LATC2_SNORM);
765

766
      lp_build_context_init(&bld, gallivm, type);
767

768
      /*
769
       * Make sure the conversion in aos really only does convert to rgba8
770
       * and not anything more (so use linear format, adjust type).
771
       */
772
      flinear_desc = util_format_description(util_format_linear(format));
773
      memset(&tmp_type, 0, sizeof tmp_type);
774
      tmp_type.width = 8;
775
      tmp_type.length = type.length * 4;
776
      tmp_type.norm = TRUE;
777
      tmp_type.sign = is_signed;
778

779
      packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type,
780
                                       aligned, base_ptr, offset, i, j, cache);
781
      packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, "");
782

783
      /*
784
       * The values are now packed so they match ordinary (srgb) RGBA8 format,
785
       * hence need to use matching format for unpack.
786
       */
787
      frgba8_desc = util_format_description(is_signed ? PIPE_FORMAT_R8G8B8A8_SNORM : PIPE_FORMAT_R8G8B8A8_UNORM);
788
      if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
789
         assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
790
         frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
791
      }
792
      lp_build_unpack_rgba_soa(gallivm,
793
                               frgba8_desc,
794
                               type,
795
                               packed, rgba);
796

797
      /*
798
       * We converted 4 channels. Make sure llvm can drop unneeded ones
799
       * (luckily the rgba order is fixed, only LA needs special case).
800
       */
801
      for (chan = 0; chan < 4; chan++) {
802
         enum pipe_swizzle swizzle = format_desc->swizzle[chan];
803
         if (chan == 3 && util_format_is_luminance_alpha(format)) {
804
            swizzle = PIPE_SWIZZLE_W;
805
         }
806
         rgba_out[chan] = lp_build_swizzle_soa_channel(&bld, rgba, swizzle);
807
      }
808
      return;
809
   }
810

811

812
   /*
813
    * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
814
    *
815
    * This is not the most efficient way of fetching pixels, as we
816
    * miss some opportunities to do vectorization, but this is
817
    * convenient for formats or scenarios for which there was no
818
    * opportunity or incentive to optimize.
819
    *
820
    * We do NOT want to end up here, this typically is quite terrible,
821
    * in particular if the formats have less than 4 channels.
822
    *
823
    * Right now, this should only be hit for:
824
    * - ETC formats
825
    *   (those miss fast fetch functions hence they are terrible anyway)
826
    */
827

828
   {
829
      unsigned k;
830
      struct lp_type tmp_type;
831
      LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];
832

833
      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
834
         debug_printf("%s: AoS fetch fallback for %s\n",
835
                      __FUNCTION__, format_desc->short_name);
836
      }
837

838
      tmp_type = type;
839
      tmp_type.length = 4;
840

841
      if (type.length == 1) {
842
         LLVMValueRef fetch = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
843
                                                      aligned, base_ptr, offset,
844
                                                      i, j, cache);
845

846
         for (k = 0; k < 4; k++)
847
            rgba_out[k] = LLVMBuildExtractElement(gallivm->builder, fetch, lp_build_const_int32(gallivm, k), "");
848
         return;
849
      }
850

851
      /*
852
       * Note that vector transpose can be worse compared to insert/extract
853
       * for aos->soa conversion (for formats with 1 or 2 channels). However,
854
       * we should try to avoid getting here for just about all formats, so
855
       * don't bother.
856
       */
857

858
      /* loop over number of pixels */
859
      for(k = 0; k < type.length; ++k) {
860
         LLVMValueRef index = lp_build_const_int32(gallivm, k);
861
         LLVMValueRef offset_elem;
862
         LLVMValueRef i_elem, j_elem;
863

864
         offset_elem = LLVMBuildExtractElement(builder, offset,
865
                                               index, "");
866

867
         i_elem = LLVMBuildExtractElement(builder, i, index, "");
868
         j_elem = LLVMBuildExtractElement(builder, j, index, "");
869

870
         /* Get a single float[4]={R,G,B,A} pixel */
871
         aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
872
                                                aligned, base_ptr, offset_elem,
873
                                                i_elem, j_elem, cache);
874

875
      }
876
      convert_to_soa(gallivm, aos_fetch, rgba_out, type);
877
   }
878
}
879

880
static void
881
lp_build_insert_soa_chan(struct lp_build_context *bld,
882
                         unsigned blockbits,
883
                         struct util_format_channel_description chan_desc,
884
                         LLVMValueRef *output,
885
                         LLVMValueRef rgba)
886
{
887
    struct gallivm_state *gallivm = bld->gallivm;
888
    LLVMBuilderRef builder = gallivm->builder;
889
    struct lp_type type = bld->type;
890
    const unsigned width = chan_desc.size;
891
    const unsigned start = chan_desc.shift;
892
    const uint32_t chan_mask = (1ULL << width) - 1;
893
    ASSERTED const unsigned stop = start + width;
894
    LLVMValueRef chan = NULL;
895
    switch(chan_desc.type) {
896
    case UTIL_FORMAT_TYPE_UNSIGNED:
897

898
       if (chan_desc.pure_integer) {
899
          chan = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");
900
          LLVMValueRef mask_val = lp_build_const_int_vec(gallivm, type, chan_mask);
901
          LLVMValueRef mask = LLVMBuildICmp(builder, LLVMIntUGT, chan, mask_val, "");
902
          chan = LLVMBuildSelect(builder, mask, mask_val, chan, "");
903
       }
904
       else if (type.floating) {
905
          if (chan_desc.normalized) {
906
             rgba = lp_build_clamp(bld, rgba, bld->zero, bld->one);
907
             chan = lp_build_clamped_float_to_unsigned_norm(gallivm, type, width, rgba);
908
          } else
909
             chan = LLVMBuildFPToSI(builder, rgba, bld->vec_type, "");
910
       }
911
       if (start)
912
          chan = LLVMBuildShl(builder, chan,
913
                              lp_build_const_int_vec(gallivm, type, start), "");
914
       if (!*output)
915
          *output = chan;
916
       else
917
          *output = LLVMBuildOr(builder, *output, chan, "");
918
       break;
919
    case UTIL_FORMAT_TYPE_SIGNED:
920
       if (chan_desc.pure_integer) {
921
          chan = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");
922
          chan = LLVMBuildAnd(builder, chan, lp_build_const_int_vec(gallivm, type, chan_mask), "");
923
       } else if (type.floating) {
924
          if (chan_desc.normalized) {
925
             char intrin[32];
926
             double scale = ((1 << (chan_desc.size - 1)) - 1);
927
             LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
928
             rgba = lp_build_clamp(bld, rgba, lp_build_negate(bld, bld->one), bld->one);
929
             rgba = LLVMBuildFMul(builder, rgba, scale_val, "");
930
             lp_format_intrinsic(intrin, sizeof intrin, "llvm.rint", bld->vec_type);
931
             rgba = lp_build_intrinsic_unary(builder, intrin, bld->vec_type, rgba);
932
          }
933
          chan = LLVMBuildFPToSI(builder, rgba, bld->int_vec_type, "");
934
          chan = LLVMBuildAnd(builder, chan, lp_build_const_int_vec(gallivm, type, chan_mask), "");
935
       }
936
       if (start)
937
          chan = LLVMBuildShl(builder, chan,
938
                              lp_build_const_int_vec(gallivm, type, start), "");
939
       if (!*output)
940
          *output = chan;
941
       else
942
          *output = LLVMBuildOr(builder, *output, chan, "");
943
       break;
944
    case UTIL_FORMAT_TYPE_FLOAT:
945
       if (type.floating) {
946
          if (chan_desc.size == 16) {
947
             chan = lp_build_float_to_half(gallivm, rgba);
948
             chan = LLVMBuildZExt(builder, chan, bld->int_vec_type, "");
949
             if (start)
950
                chan = LLVMBuildShl(builder, chan,
951
                                    lp_build_const_int_vec(gallivm, type, start), "");
952
             if (!*output)
953
                *output = chan;
954
             else
955
                *output = LLVMBuildOr(builder, *output, chan, "");
956
          } else {
957
             assert(start == 0);
958
             assert(stop == 32);
959
             assert(type.width == 32);
960
             *output = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");
961
          }
962
       } else
963
          assert(0);
964
       break;
965
    default:
966
       assert(0);
967
       *output = bld->undef;
968
    }
969
}
970

971
static void
972
lp_build_pack_rgba_soa(struct gallivm_state *gallivm,
973
                       const struct util_format_description *format_desc,
974
                       struct lp_type type,
975
                       const LLVMValueRef rgba_in[4],
976
                       LLVMValueRef *packed)
977
{
978
   unsigned chan;
979
   struct lp_build_context bld;
980
   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
981
   assert(format_desc->block.width == 1);
982
   assert(format_desc->block.height == 1);
983
   assert(format_desc->block.bits <= type.width);
984
   /* FIXME: Support more output types */
985
   assert(type.width == 32);
986

987
   lp_build_context_init(&bld, gallivm, type);
988
   for (chan = 0; chan < format_desc->nr_channels; ++chan) {
989
      struct util_format_channel_description chan_desc = format_desc->channel[chan];
990

991
      lp_build_insert_soa_chan(&bld, format_desc->block.bits,
992
                               chan_desc,
993
                               packed,
994
                               rgba_in[chan]);
995
   }
996
}
997

998
void
999
lp_build_store_rgba_soa(struct gallivm_state *gallivm,
1000
                        const struct util_format_description *format_desc,
1001
                        struct lp_type type,
1002
                        LLVMValueRef exec_mask,
1003
                        LLVMValueRef base_ptr,
1004
                        LLVMValueRef offset,
1005
                        LLVMValueRef out_of_bounds,
1006
                        const LLVMValueRef rgba_in[4])
1007
{
1008
   enum pipe_format format = format_desc->format;
1009
   LLVMValueRef packed[4];
1010
   unsigned num_stores = 0;
1011

1012
   memset(packed, 0, sizeof(LLVMValueRef) * 4);
1013
   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
1014
       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
1015
       format_desc->block.width == 1 &&
1016
       format_desc->block.height == 1 &&
1017
       format_desc->block.bits <= type.width &&
1018
       (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
1019
        format_desc->channel[0].size == 32 ||
1020
        format_desc->channel[0].size == 16))
1021
   {
1022
      lp_build_pack_rgba_soa(gallivm, format_desc, type, rgba_in, &packed[0]);
1023

1024
      num_stores = 1;
1025
   } else if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
1026
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
1027
       format_desc->block.width == 1 &&
1028
       format_desc->block.height == 1 &&
1029
       format_desc->block.bits > type.width &&
1030
       ((format_desc->block.bits <= type.width * type.length &&
1031
         format_desc->channel[0].size <= type.width) ||
1032
        (format_desc->channel[0].size == 64 &&
1033
         format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
1034
         type.floating)))
1035
   {
1036
      /*
1037
       * Similar to above, but the packed pixel is larger than what fits
1038
       * into an element of the destination format. The packed pixels will be
1039
       * shuffled into SoA vectors appropriately, and then the extraction will
1040
       * be done in parallel as much as possible.
1041
       * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
1042
       * the gathered vectors can be shuffled easily (even with avx).
1043
       * 64xn float -> 32xn float is handled too but it's a bit special as
1044
       * it does the conversion pre-shuffle.
1045
       */
1046
      struct lp_build_context bld;
1047

1048
      lp_build_context_init(&bld, gallivm, type);
1049
      assert(type.width == 32);
1050
      assert(format_desc->block.bits > type.width);
1051

1052
      unsigned store_width = util_next_power_of_two(format_desc->block.bits);
1053
      num_stores = store_width / type.width;
1054
      for (unsigned i = 0; i < format_desc->nr_channels; i++) {
1055
            struct util_format_channel_description chan_desc = format_desc->channel[i];
1056
            unsigned blockbits = type.width;
1057
            unsigned vec_nr;
1058

1059
            vec_nr = chan_desc.shift / type.width;
1060
            chan_desc.shift %= type.width;
1061

1062
            lp_build_insert_soa_chan(&bld, blockbits,
1063
                                     chan_desc,
1064
                                     &packed[vec_nr],
1065
                                     rgba_in[i]);
1066
      }
1067

1068
      assert(num_stores == 4 || num_stores == 2);
1069
      /* we can transpose and store at the same time */
1070
   } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
1071
      packed[0] = lp_build_float_to_r11g11b10(gallivm, rgba_in);
1072
      num_stores = 1;
1073
   } else
1074
      assert(0);
1075

1076
   assert(exec_mask);
1077

1078
   LLVMTypeRef int32_ptr_type = LLVMPointerType(LLVMInt32TypeInContext(gallivm->context), 0);
1079
   LLVMTypeRef int16_ptr_type = LLVMPointerType(LLVMInt16TypeInContext(gallivm->context), 0);
1080
   LLVMTypeRef int8_ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);
1081

1082
   LLVMValueRef should_store_mask = LLVMBuildAnd(gallivm->builder, exec_mask, LLVMBuildNot(gallivm->builder, out_of_bounds, ""), "store_mask");
1083
   should_store_mask = LLVMBuildICmp(gallivm->builder, LLVMIntNE, should_store_mask, lp_build_const_int_vec(gallivm, type, 0), "");
1084
   for (unsigned i = 0; i < num_stores; i++) {
1085
      struct lp_build_loop_state loop_state;
1086

1087
      LLVMValueRef store_offset = LLVMBuildAdd(gallivm->builder, offset, lp_build_const_int_vec(gallivm, type, i * 4), "");
1088
      store_offset = LLVMBuildGEP(gallivm->builder, base_ptr, &store_offset, 1, "");
1089

1090
      lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
1091

1092
      struct lp_build_if_state ifthen;
1093
      LLVMValueRef cond = LLVMBuildExtractElement(gallivm->builder, should_store_mask, loop_state.counter, "");
1094
      lp_build_if(&ifthen, gallivm, cond);
1095

1096
      LLVMValueRef data = LLVMBuildExtractElement(gallivm->builder, packed[i], loop_state.counter, "");
1097
      LLVMValueRef this_offset = LLVMBuildExtractElement(gallivm->builder, store_offset, loop_state.counter, "");
1098

1099
      if (format_desc->block.bits == 8) {
1100
         this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int8_ptr_type, "");
1101
         data = LLVMBuildTrunc(gallivm->builder, data, LLVMInt8TypeInContext(gallivm->context), "");
1102
      } else if (format_desc->block.bits == 16) {
1103
         this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int16_ptr_type, "");
1104
         data = LLVMBuildTrunc(gallivm->builder, data, LLVMInt16TypeInContext(gallivm->context), "");
1105
      } else
1106
         this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int32_ptr_type, "");
1107
      LLVMBuildStore(gallivm->builder, data, this_offset);
1108
      lp_build_endif(&ifthen);
1109
      lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, type.length),
1110
                             NULL, LLVMIntUGE);
1111
   }
1112
}
1113

1114
Product

Resources

Company