Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
4565 views
1
/**************************************************************************
2
*
3
* Copyright 2009 VMware, Inc.
4
* All Rights Reserved.
5
*
6
* Permission is hereby granted, free of charge, to any person obtaining a
7
* copy of this software and associated documentation files (the
8
* "Software"), to deal in the Software without restriction, including
9
* without limitation the rights to use, copy, modify, merge, publish,
10
* distribute, sub license, and/or sell copies of the Software, and to
11
* permit persons to whom the Software is furnished to do so, subject to
12
* the following conditions:
13
*
14
* The above copyright notice and this permission notice (including the
15
* next paragraph) shall be included in all copies or substantial portions
16
* of the Software.
17
*
18
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
*
26
**************************************************************************/
27
28
29
#include "pipe/p_defines.h"
30
31
#include "util/format/u_format.h"
32
#include "util/u_memory.h"
33
#include "util/u_string.h"
34
#include "util/u_math.h"
35
36
#include "lp_bld_type.h"
37
#include "lp_bld_const.h"
38
#include "lp_bld_conv.h"
39
#include "lp_bld_swizzle.h"
40
#include "lp_bld_gather.h"
41
#include "lp_bld_debug.h"
42
#include "lp_bld_format.h"
43
#include "lp_bld_arit.h"
44
#include "lp_bld_pack.h"
45
#include "lp_bld_flow.h"
46
#include "lp_bld_printf.h"
47
#include "lp_bld_intr.h"
48
49
static void
50
convert_to_soa(struct gallivm_state *gallivm,
51
LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
52
LLVMValueRef dst_soa[4],
53
const struct lp_type soa_type)
54
{
55
unsigned j, k;
56
struct lp_type aos_channel_type = soa_type;
57
58
LLVMValueRef aos_channels[4];
59
unsigned pixels_per_channel = soa_type.length / 4;
60
61
debug_assert((soa_type.length % 4) == 0);
62
63
aos_channel_type.length >>= 1;
64
65
for (j = 0; j < 4; ++j) {
66
LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
67
68
assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
69
70
for (k = 0; k < pixels_per_channel; ++k) {
71
channel[k] = src_aos[j + 4 * k];
72
}
73
74
aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
75
}
76
77
lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
78
}
79
80
81
void
82
lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
83
struct lp_build_context *bld,
84
const LLVMValueRef unswizzled[4],
85
LLVMValueRef swizzled_out[4])
86
{
87
if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
88
enum pipe_swizzle swizzle;
89
LLVMValueRef depth_or_stencil;
90
91
if (util_format_has_stencil(format_desc) &&
92
!util_format_has_depth(format_desc)) {
93
assert(!bld->type.floating);
94
swizzle = format_desc->swizzle[1];
95
}
96
else {
97
assert(bld->type.floating);
98
swizzle = format_desc->swizzle[0];
99
}
100
/*
101
* Return zzz1 or sss1 for depth-stencil formats here.
102
* Correct swizzling will be handled by apply_sampler_swizzle() later.
103
*/
104
depth_or_stencil = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
105
106
swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depth_or_stencil;
107
swizzled_out[3] = bld->one;
108
}
109
else {
110
unsigned chan;
111
for (chan = 0; chan < 4; ++chan) {
112
enum pipe_swizzle swizzle = format_desc->swizzle[chan];
113
swizzled_out[chan] = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
114
}
115
}
116
}
117
118
119
120
static LLVMValueRef
121
lp_build_extract_soa_chan(struct lp_build_context *bld,
122
unsigned blockbits,
123
boolean srgb_chan,
124
struct util_format_channel_description chan_desc,
125
LLVMValueRef packed)
126
{
127
struct gallivm_state *gallivm = bld->gallivm;
128
LLVMBuilderRef builder = gallivm->builder;
129
struct lp_type type = bld->type;
130
LLVMValueRef input = packed;
131
const unsigned width = chan_desc.size;
132
const unsigned start = chan_desc.shift;
133
const unsigned stop = start + width;
134
135
/* Decode the input vector component */
136
137
switch(chan_desc.type) {
138
case UTIL_FORMAT_TYPE_VOID:
139
input = bld->undef;
140
break;
141
142
case UTIL_FORMAT_TYPE_UNSIGNED:
143
/*
144
* Align the LSB
145
*/
146
if (start) {
147
input = LLVMBuildLShr(builder, input,
148
lp_build_const_int_vec(gallivm, type, start), "");
149
}
150
151
/*
152
* Zero the MSBs
153
*/
154
if (stop < blockbits) {
155
unsigned mask = ((unsigned long long)1 << width) - 1;
156
input = LLVMBuildAnd(builder, input,
157
lp_build_const_int_vec(gallivm, type, mask), "");
158
}
159
160
/*
161
* Type conversion
162
*/
163
if (type.floating) {
164
if (srgb_chan) {
165
struct lp_type conv_type = lp_uint_type(type);
166
input = lp_build_srgb_to_linear(gallivm, conv_type, width, input);
167
}
168
else {
169
if(chan_desc.normalized)
170
input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
171
else
172
input = LLVMBuildUIToFP(builder, input, bld->vec_type, "");
173
}
174
}
175
else if (chan_desc.pure_integer) {
176
/* Nothing to do */
177
} else {
178
/* FIXME */
179
assert(0);
180
}
181
break;
182
183
case UTIL_FORMAT_TYPE_SIGNED:
184
/*
185
* Align the sign bit first.
186
*/
187
if (stop < type.width) {
188
unsigned bits = type.width - stop;
189
LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
190
input = LLVMBuildShl(builder, input, bits_val, "");
191
}
192
193
/*
194
* Align the LSB (with an arithmetic shift to preserve the sign)
195
*/
196
if (chan_desc.size < type.width) {
197
unsigned bits = type.width - chan_desc.size;
198
LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
199
input = LLVMBuildAShr(builder, input, bits_val, "");
200
}
201
202
/*
203
* Type conversion
204
*/
205
if (type.floating) {
206
input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
207
if (chan_desc.normalized) {
208
double scale = 1.0 / ((1 << (chan_desc.size - 1)) - 1);
209
LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
210
input = LLVMBuildFMul(builder, input, scale_val, "");
211
/*
212
* The formula above will produce value below -1.0 for most negative values.
213
* compliance requires clamping it.
214
* GTF-GL45.gtf33.GL3Tests.vertex_type_2_10_10_10_rev.vertex_type_2_10_10_10_rev_conversion.
215
*/
216
input = lp_build_max(bld, input,
217
lp_build_const_vec(gallivm, type, -1.0f));
218
}
219
}
220
else if (chan_desc.pure_integer) {
221
/* Nothing to do */
222
} else {
223
/* FIXME */
224
assert(0);
225
}
226
break;
227
228
case UTIL_FORMAT_TYPE_FLOAT:
229
if (type.floating) {
230
if (chan_desc.size == 16) {
231
struct lp_type f16i_type = type;
232
f16i_type.width /= 2;
233
f16i_type.floating = 0;
234
if (start) {
235
input = LLVMBuildLShr(builder, input,
236
lp_build_const_int_vec(gallivm, type, start), "");
237
}
238
input = LLVMBuildTrunc(builder, input,
239
lp_build_vec_type(gallivm, f16i_type), "");
240
input = lp_build_half_to_float(gallivm, input);
241
} else {
242
assert(start == 0);
243
assert(stop == 32);
244
assert(type.width == 32);
245
}
246
input = LLVMBuildBitCast(builder, input, bld->vec_type, "");
247
}
248
else {
249
/* FIXME */
250
assert(0);
251
input = bld->undef;
252
}
253
break;
254
255
case UTIL_FORMAT_TYPE_FIXED:
256
if (type.floating) {
257
double scale = 1.0 / ((1 << (chan_desc.size/2)) - 1);
258
LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
259
input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
260
input = LLVMBuildFMul(builder, input, scale_val, "");
261
}
262
else {
263
/* FIXME */
264
assert(0);
265
input = bld->undef;
266
}
267
break;
268
269
default:
270
assert(0);
271
input = bld->undef;
272
break;
273
}
274
275
return input;
276
}
277
278
279
/**
280
* Unpack several pixels in SoA.
281
*
282
* It takes a vector of packed pixels:
283
*
284
* packed = {P0, P1, P2, P3, ..., Pn}
285
*
286
* And will produce four vectors:
287
*
288
* red = {R0, R1, R2, R3, ..., Rn}
289
* green = {G0, G1, G2, G3, ..., Gn}
290
* blue = {B0, B1, B2, B3, ..., Bn}
291
* alpha = {A0, A1, A2, A3, ..., An}
292
*
293
* It requires that a packed pixel fits into an element of the output
294
* channels. The common case is when converting pixel with a depth of 32 bit or
295
* less into floats.
296
*
297
* \param format_desc the format of the 'packed' incoming pixel vector
298
* \param type the desired type for rgba_out (type.length = n, above)
299
* \param packed the incoming vector of packed pixels
300
* \param rgba_out returns the SoA R,G,B,A vectors
301
*/
302
void
303
lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
304
const struct util_format_description *format_desc,
305
struct lp_type type,
306
LLVMValueRef packed,
307
LLVMValueRef rgba_out[4])
308
{
309
struct lp_build_context bld;
310
LLVMValueRef inputs[4];
311
unsigned chan;
312
313
assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
314
assert(format_desc->block.width == 1);
315
assert(format_desc->block.height == 1);
316
assert(format_desc->block.bits <= type.width);
317
/* FIXME: Support more output types */
318
assert(type.width == 32);
319
320
lp_build_context_init(&bld, gallivm, type);
321
322
/* Decode the input vector components */
323
for (chan = 0; chan < format_desc->nr_channels; ++chan) {
324
struct util_format_channel_description chan_desc = format_desc->channel[chan];
325
boolean srgb_chan = FALSE;
326
327
if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB &&
328
format_desc->swizzle[3] != chan) {
329
srgb_chan = TRUE;
330
}
331
332
inputs[chan] = lp_build_extract_soa_chan(&bld,
333
format_desc->block.bits,
334
srgb_chan,
335
chan_desc,
336
packed);
337
}
338
339
lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
340
}
341
342
343
/**
344
* Convert a vector of rgba8 values into 32bit wide SoA vectors.
345
*
346
* \param dst_type The desired return type. For pure integer formats
347
* this should be a 32bit wide int or uint vector type,
348
* otherwise a float vector type.
349
*
350
* \param packed The rgba8 values to pack.
351
*
352
* \param rgba The 4 SoA return vectors.
353
*/
354
void
355
lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm,
356
struct lp_type dst_type,
357
LLVMValueRef packed,
358
LLVMValueRef *rgba)
359
{
360
LLVMBuilderRef builder = gallivm->builder;
361
LLVMValueRef mask = lp_build_const_int_vec(gallivm, dst_type, 0xff);
362
unsigned chan;
363
364
/* XXX technically shouldn't use that for uint dst_type */
365
packed = LLVMBuildBitCast(builder, packed,
366
lp_build_int_vec_type(gallivm, dst_type), "");
367
368
/* Decode the input vector components */
369
for (chan = 0; chan < 4; ++chan) {
370
#if UTIL_ARCH_LITTLE_ENDIAN
371
unsigned start = chan*8;
372
#else
373
unsigned start = (3-chan)*8;
374
#endif
375
unsigned stop = start + 8;
376
LLVMValueRef input;
377
378
input = packed;
379
380
if (start)
381
input = LLVMBuildLShr(builder, input,
382
lp_build_const_int_vec(gallivm, dst_type, start), "");
383
384
if (stop < 32)
385
input = LLVMBuildAnd(builder, input, mask, "");
386
387
if (dst_type.floating)
388
input = lp_build_unsigned_norm_to_float(gallivm, 8, dst_type, input);
389
390
rgba[chan] = input;
391
}
392
}
393
394
395
396
/**
397
* Fetch a texels from a texture, returning them in SoA layout.
398
*
399
* \param type the desired return type for 'rgba'. The vector length
400
* is the number of texels to fetch
401
* \param aligned if the offset is guaranteed to be aligned to element width
402
*
403
* \param base_ptr points to the base of the texture mip tree.
404
* \param offset offset to start of the texture image block. For non-
405
* compressed formats, this simply is an offset to the texel.
406
* For compressed formats, it is an offset to the start of the
407
* compressed data block.
408
*
409
* \param i, j the sub-block pixel coordinates. For non-compressed formats
410
* these will always be (0,0). For compressed formats, i will
411
* be in [0, block_width-1] and j will be in [0, block_height-1].
412
* \param cache optional value pointing to a lp_build_format_cache structure
413
*/
414
void
415
lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
416
const struct util_format_description *format_desc,
417
struct lp_type type,
418
boolean aligned,
419
LLVMValueRef base_ptr,
420
LLVMValueRef offset,
421
LLVMValueRef i,
422
LLVMValueRef j,
423
LLVMValueRef cache,
424
LLVMValueRef rgba_out[4])
425
{
426
LLVMBuilderRef builder = gallivm->builder;
427
enum pipe_format format = format_desc->format;
428
struct lp_type fetch_type;
429
430
if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
431
(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
432
format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
433
format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
434
format_desc->block.width == 1 &&
435
format_desc->block.height == 1 &&
436
format_desc->block.bits <= type.width &&
437
(format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
438
format_desc->channel[0].size == 32 ||
439
format_desc->channel[0].size == 16))
440
{
441
/*
442
* The packed pixel fits into an element of the destination format. Put
443
* the packed pixels into a vector and extract each component for all
444
* vector elements in parallel.
445
*/
446
447
LLVMValueRef packed;
448
449
/*
450
* gather the texels from the texture
451
* Ex: packed = {XYZW, XYZW, XYZW, XYZW}
452
*/
453
assert(format_desc->block.bits <= type.width);
454
fetch_type = lp_type_uint(type.width);
455
packed = lp_build_gather(gallivm,
456
type.length,
457
format_desc->block.bits,
458
fetch_type,
459
aligned,
460
base_ptr, offset, FALSE);
461
462
/*
463
* convert texels to float rgba
464
*/
465
lp_build_unpack_rgba_soa(gallivm,
466
format_desc,
467
type,
468
packed, rgba_out);
469
return;
470
}
471
472
473
if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
474
(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
475
format_desc->block.width == 1 &&
476
format_desc->block.height == 1 &&
477
format_desc->block.bits > type.width &&
478
((format_desc->block.bits <= type.width * type.length &&
479
format_desc->channel[0].size <= type.width) ||
480
(format_desc->channel[0].size == 64 &&
481
format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
482
type.floating)))
483
{
484
/*
485
* Similar to above, but the packed pixel is larger than what fits
486
* into an element of the destination format. The packed pixels will be
487
* shuffled into SoA vectors appropriately, and then the extraction will
488
* be done in parallel as much as possible.
489
* Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
490
* the gathered vectors can be shuffled easily (even with avx).
491
* 64xn float -> 32xn float is handled too but it's a bit special as
492
* it does the conversion pre-shuffle.
493
*/
494
495
LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32];
496
struct lp_type fetch_type, gather_type = type;
497
unsigned num_gather, fetch_width, i, j;
498
struct lp_build_context bld;
499
boolean fp64 = format_desc->channel[0].size == 64;
500
501
lp_build_context_init(&bld, gallivm, type);
502
503
assert(type.width == 32);
504
assert(format_desc->block.bits > type.width);
505
506
/*
507
* First, figure out fetch order.
508
*/
509
fetch_width = util_next_power_of_two(format_desc->block.bits);
510
/*
511
* fp64 are treated like fp32 except we fetch twice wide values
512
* (as we shuffle after trunc). The shuffles for that work out
513
* mostly fine (slightly suboptimal for 4-wide, perfect for AVX)
514
* albeit we miss the potential opportunity for hw gather (as it
515
* only handles native size).
516
*/
517
num_gather = fetch_width / type.width;
518
gather_type.width *= num_gather;
519
if (fp64) {
520
num_gather /= 2;
521
}
522
gather_type.length /= num_gather;
523
524
for (i = 0; i < num_gather; i++) {
525
LLVMValueRef offsetr, shuf_vec;
526
if(num_gather == 4) {
527
for (j = 0; j < gather_type.length; j++) {
528
unsigned idx = i + 4*j;
529
shuffles[j] = lp_build_const_int32(gallivm, idx);
530
}
531
shuf_vec = LLVMConstVector(shuffles, gather_type.length);
532
offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
533
534
}
535
else if (num_gather == 2) {
536
assert(num_gather == 2);
537
for (j = 0; j < gather_type.length; j++) {
538
unsigned idx = i*2 + (j%2) + (j/2)*4;
539
shuffles[j] = lp_build_const_int32(gallivm, idx);
540
}
541
shuf_vec = LLVMConstVector(shuffles, gather_type.length);
542
offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
543
}
544
else {
545
assert(num_gather == 1);
546
offsetr = offset;
547
}
548
if (gather_type.length == 1) {
549
LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
550
offsetr = LLVMBuildExtractElement(builder, offsetr, zero, "");
551
}
552
553
/*
554
* Determine whether to use float or int loads. This is mostly
555
* to outsmart the (stupid) llvm int/float shuffle logic, we
556
* don't really care much if the data is floats or ints...
557
* But llvm will refuse to use single float shuffle with int data
558
* and instead use 3 int shuffles instead, the code looks atrocious.
559
* (Note bitcasts often won't help, as llvm is too smart to be
560
* fooled by that.)
561
* Nobody cares about simd float<->int domain transition penalties,
562
* which usually don't even exist for shuffles anyway.
563
* With 4x32bit (and 3x32bit) fetch, we use float vec (the data is
564
* going into transpose, which is unpacks, so doesn't really matter
565
* much).
566
* With 2x32bit or 4x16bit fetch, we use float vec, since those
567
* go into the weird channel separation shuffle. With floats,
568
* this is (with 128bit vectors):
569
* - 2 movq, 2 movhpd, 2 shufps
570
* With ints it would be:
571
* - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw
572
* I've seen texture functions increase in code size by 15% just due
573
* to that (there's lots of such fetches in them...)
574
* (We could chose a different gather order to improve this somewhat
575
* for the int path, but it would basically just drop the blends,
576
* so the float path with this order really is optimal.)
577
* Albeit it is tricky sometimes llvm doesn't ignore the float->int
578
* casts so must avoid them until we're done with the float shuffle...
579
* 3x16bit formats (the same is also true for 3x8) are pretty bad but
580
* there's nothing we can do about them (we could overallocate by
581
* those couple bytes and use unaligned but pot sized load).
582
* Note that this is very much x86 specific. I don't know if this
583
* affect other archs at all.
584
*/
585
if (num_gather > 1) {
586
/*
587
* We always want some float type here (with x86)
588
* due to shuffles being float ones afterwards (albeit for
589
* the num_gather == 4 case int should work fine too
590
* (unless there's some problems with avx but not avx2).
591
*/
592
if (format_desc->channel[0].size == 64) {
593
fetch_type = lp_type_float_vec(64, gather_type.width);
594
} else {
595
fetch_type = lp_type_int_vec(32, gather_type.width);
596
}
597
}
598
else {
599
/* type doesn't matter much */
600
if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
601
(format_desc->channel[0].size == 32 ||
602
format_desc->channel[0].size == 64)) {
603
fetch_type = lp_type_float(gather_type.width);
604
} else {
605
fetch_type = lp_type_uint(gather_type.width);
606
}
607
}
608
609
/* Now finally gather the values */
610
packed[i] = lp_build_gather(gallivm, gather_type.length,
611
format_desc->block.bits,
612
fetch_type, aligned,
613
base_ptr, offsetr, FALSE);
614
if (fp64) {
615
struct lp_type conv_type = type;
616
conv_type.width *= 2;
617
packed[i] = LLVMBuildBitCast(builder, packed[i],
618
lp_build_vec_type(gallivm, conv_type), "");
619
packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, "");
620
}
621
}
622
623
/* shuffle the gathered values to SoA */
624
if (num_gather == 2) {
625
for (i = 0; i < num_gather; i++) {
626
for (j = 0; j < type.length; j++) {
627
unsigned idx = (j%2)*2 + (j/4)*4 + i;
628
if ((j/2)%2)
629
idx += type.length;
630
shuffles[j] = lp_build_const_int32(gallivm, idx);
631
}
632
dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1],
633
LLVMConstVector(shuffles, type.length), "");
634
}
635
}
636
else if (num_gather == 4) {
637
lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst);
638
}
639
else {
640
assert(num_gather == 1);
641
dst[0] = packed[0];
642
}
643
644
/*
645
* And finally unpack exactly as above, except that
646
* chan shift is adjusted and the right vector selected.
647
*/
648
if (!fp64) {
649
for (i = 0; i < num_gather; i++) {
650
dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, "");
651
}
652
for (i = 0; i < format_desc->nr_channels; i++) {
653
struct util_format_channel_description chan_desc = format_desc->channel[i];
654
unsigned blockbits = type.width;
655
unsigned vec_nr;
656
657
#if UTIL_ARCH_BIG_ENDIAN
658
vec_nr = (format_desc->block.bits - (chan_desc.shift + chan_desc.size)) / type.width;
659
#else
660
vec_nr = chan_desc.shift / type.width;
661
#endif
662
chan_desc.shift %= type.width;
663
664
output[i] = lp_build_extract_soa_chan(&bld,
665
blockbits,
666
FALSE,
667
chan_desc,
668
dst[vec_nr]);
669
}
670
}
671
else {
672
for (i = 0; i < format_desc->nr_channels; i++) {
673
output[i] = dst[i];
674
}
675
}
676
677
lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out);
678
return;
679
}
680
681
if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
682
format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
683
/*
684
* similar conceptually to above but requiring special
685
* AoS packed -> SoA float conversion code.
686
*/
687
LLVMValueRef packed;
688
struct lp_type fetch_type = lp_type_uint(type.width);
689
690
assert(type.floating);
691
assert(type.width == 32);
692
693
packed = lp_build_gather(gallivm, type.length,
694
format_desc->block.bits,
695
fetch_type, aligned,
696
base_ptr, offset, FALSE);
697
if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
698
lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
699
}
700
else {
701
lp_build_rgb9e5_to_float(gallivm, packed, rgba_out);
702
}
703
return;
704
}
705
706
if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
707
format_desc->block.bits == 64) {
708
/*
709
* special case the format is 64 bits but we only require
710
* 32bit (or 8bit) from each block.
711
*/
712
LLVMValueRef packed;
713
struct lp_type fetch_type = lp_type_uint(type.width);
714
715
if (format == PIPE_FORMAT_X32_S8X24_UINT) {
716
/*
717
* for stencil simply fix up offsets - could in fact change
718
* base_ptr instead even outside the shader.
719
*/
720
unsigned mask = (1 << 8) - 1;
721
LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
722
offset = LLVMBuildAdd(builder, offset, s_offset, "");
723
packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
724
aligned, base_ptr, offset, FALSE);
725
packed = LLVMBuildAnd(builder, packed,
726
lp_build_const_int_vec(gallivm, type, mask), "");
727
}
728
else {
729
assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
730
packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
731
aligned, base_ptr, offset, TRUE);
732
packed = LLVMBuildBitCast(builder, packed,
733
lp_build_vec_type(gallivm, type), "");
734
}
735
/* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */
736
rgba_out[0] = rgba_out[1] = rgba_out[2] = packed;
737
rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f);
738
return;
739
}
740
741
/*
742
* Try calling lp_build_fetch_rgba_aos for all pixels.
743
* Should only really hit subsampled, compressed
744
* (for s3tc srgb and rgtc too).
745
* (This is invalid for plain 8unorm formats because we're lazy with
746
* the swizzle since some results would arrive swizzled, some not.)
747
*/
748
749
if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) &&
750
(util_format_fits_8unorm(format_desc) ||
751
format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC ||
752
format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) &&
753
type.floating && type.width == 32 &&
754
(type.length == 1 || (type.length % 4 == 0))) {
755
struct lp_type tmp_type;
756
struct lp_build_context bld;
757
LLVMValueRef packed, rgba[4];
758
const struct util_format_description *flinear_desc;
759
const struct util_format_description *frgba8_desc;
760
unsigned chan;
761
bool is_signed = (format_desc->format == PIPE_FORMAT_RGTC1_SNORM ||
762
format_desc->format == PIPE_FORMAT_RGTC2_SNORM ||
763
format_desc->format == PIPE_FORMAT_LATC1_SNORM ||
764
format_desc->format == PIPE_FORMAT_LATC2_SNORM);
765
766
lp_build_context_init(&bld, gallivm, type);
767
768
/*
769
* Make sure the conversion in aos really only does convert to rgba8
770
* and not anything more (so use linear format, adjust type).
771
*/
772
flinear_desc = util_format_description(util_format_linear(format));
773
memset(&tmp_type, 0, sizeof tmp_type);
774
tmp_type.width = 8;
775
tmp_type.length = type.length * 4;
776
tmp_type.norm = TRUE;
777
tmp_type.sign = is_signed;
778
779
packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type,
780
aligned, base_ptr, offset, i, j, cache);
781
packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, "");
782
783
/*
784
* The values are now packed so they match ordinary (srgb) RGBA8 format,
785
* hence need to use matching format for unpack.
786
*/
787
frgba8_desc = util_format_description(is_signed ? PIPE_FORMAT_R8G8B8A8_SNORM : PIPE_FORMAT_R8G8B8A8_UNORM);
788
if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
789
assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
790
frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
791
}
792
lp_build_unpack_rgba_soa(gallivm,
793
frgba8_desc,
794
type,
795
packed, rgba);
796
797
/*
798
* We converted 4 channels. Make sure llvm can drop unneeded ones
799
* (luckily the rgba order is fixed, only LA needs special case).
800
*/
801
for (chan = 0; chan < 4; chan++) {
802
enum pipe_swizzle swizzle = format_desc->swizzle[chan];
803
if (chan == 3 && util_format_is_luminance_alpha(format)) {
804
swizzle = PIPE_SWIZZLE_W;
805
}
806
rgba_out[chan] = lp_build_swizzle_soa_channel(&bld, rgba, swizzle);
807
}
808
return;
809
}
810
811
812
/*
813
* Fallback to calling lp_build_fetch_rgba_aos for each pixel.
814
*
815
* This is not the most efficient way of fetching pixels, as we
816
* miss some opportunities to do vectorization, but this is
817
* convenient for formats or scenarios for which there was no
818
* opportunity or incentive to optimize.
819
*
820
* We do NOT want to end up here, this typically is quite terrible,
821
* in particular if the formats have less than 4 channels.
822
*
823
* Right now, this should only be hit for:
824
* - ETC formats
825
* (those miss fast fetch functions hence they are terrible anyway)
826
*/
827
828
{
829
unsigned k;
830
struct lp_type tmp_type;
831
LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];
832
833
if (gallivm_debug & GALLIVM_DEBUG_PERF) {
834
debug_printf("%s: AoS fetch fallback for %s\n",
835
__FUNCTION__, format_desc->short_name);
836
}
837
838
tmp_type = type;
839
tmp_type.length = 4;
840
841
if (type.length == 1) {
842
LLVMValueRef fetch = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
843
aligned, base_ptr, offset,
844
i, j, cache);
845
846
for (k = 0; k < 4; k++)
847
rgba_out[k] = LLVMBuildExtractElement(gallivm->builder, fetch, lp_build_const_int32(gallivm, k), "");
848
return;
849
}
850
851
/*
852
* Note that vector transpose can be worse compared to insert/extract
853
* for aos->soa conversion (for formats with 1 or 2 channels). However,
854
* we should try to avoid getting here for just about all formats, so
855
* don't bother.
856
*/
857
858
/* loop over number of pixels */
859
for(k = 0; k < type.length; ++k) {
860
LLVMValueRef index = lp_build_const_int32(gallivm, k);
861
LLVMValueRef offset_elem;
862
LLVMValueRef i_elem, j_elem;
863
864
offset_elem = LLVMBuildExtractElement(builder, offset,
865
index, "");
866
867
i_elem = LLVMBuildExtractElement(builder, i, index, "");
868
j_elem = LLVMBuildExtractElement(builder, j, index, "");
869
870
/* Get a single float[4]={R,G,B,A} pixel */
871
aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
872
aligned, base_ptr, offset_elem,
873
i_elem, j_elem, cache);
874
875
}
876
convert_to_soa(gallivm, aos_fetch, rgba_out, type);
877
}
878
}
879
880
static void
881
lp_build_insert_soa_chan(struct lp_build_context *bld,
882
unsigned blockbits,
883
struct util_format_channel_description chan_desc,
884
LLVMValueRef *output,
885
LLVMValueRef rgba)
886
{
887
struct gallivm_state *gallivm = bld->gallivm;
888
LLVMBuilderRef builder = gallivm->builder;
889
struct lp_type type = bld->type;
890
const unsigned width = chan_desc.size;
891
const unsigned start = chan_desc.shift;
892
const uint32_t chan_mask = (1ULL << width) - 1;
893
ASSERTED const unsigned stop = start + width;
894
LLVMValueRef chan = NULL;
895
switch(chan_desc.type) {
896
case UTIL_FORMAT_TYPE_UNSIGNED:
897
898
if (chan_desc.pure_integer) {
899
chan = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");
900
LLVMValueRef mask_val = lp_build_const_int_vec(gallivm, type, chan_mask);
901
LLVMValueRef mask = LLVMBuildICmp(builder, LLVMIntUGT, chan, mask_val, "");
902
chan = LLVMBuildSelect(builder, mask, mask_val, chan, "");
903
}
904
else if (type.floating) {
905
if (chan_desc.normalized) {
906
rgba = lp_build_clamp(bld, rgba, bld->zero, bld->one);
907
chan = lp_build_clamped_float_to_unsigned_norm(gallivm, type, width, rgba);
908
} else
909
chan = LLVMBuildFPToSI(builder, rgba, bld->vec_type, "");
910
}
911
if (start)
912
chan = LLVMBuildShl(builder, chan,
913
lp_build_const_int_vec(gallivm, type, start), "");
914
if (!*output)
915
*output = chan;
916
else
917
*output = LLVMBuildOr(builder, *output, chan, "");
918
break;
919
case UTIL_FORMAT_TYPE_SIGNED:
920
if (chan_desc.pure_integer) {
921
chan = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");
922
chan = LLVMBuildAnd(builder, chan, lp_build_const_int_vec(gallivm, type, chan_mask), "");
923
} else if (type.floating) {
924
if (chan_desc.normalized) {
925
char intrin[32];
926
double scale = ((1 << (chan_desc.size - 1)) - 1);
927
LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
928
rgba = lp_build_clamp(bld, rgba, lp_build_negate(bld, bld->one), bld->one);
929
rgba = LLVMBuildFMul(builder, rgba, scale_val, "");
930
lp_format_intrinsic(intrin, sizeof intrin, "llvm.rint", bld->vec_type);
931
rgba = lp_build_intrinsic_unary(builder, intrin, bld->vec_type, rgba);
932
}
933
chan = LLVMBuildFPToSI(builder, rgba, bld->int_vec_type, "");
934
chan = LLVMBuildAnd(builder, chan, lp_build_const_int_vec(gallivm, type, chan_mask), "");
935
}
936
if (start)
937
chan = LLVMBuildShl(builder, chan,
938
lp_build_const_int_vec(gallivm, type, start), "");
939
if (!*output)
940
*output = chan;
941
else
942
*output = LLVMBuildOr(builder, *output, chan, "");
943
break;
944
case UTIL_FORMAT_TYPE_FLOAT:
945
if (type.floating) {
946
if (chan_desc.size == 16) {
947
chan = lp_build_float_to_half(gallivm, rgba);
948
chan = LLVMBuildZExt(builder, chan, bld->int_vec_type, "");
949
if (start)
950
chan = LLVMBuildShl(builder, chan,
951
lp_build_const_int_vec(gallivm, type, start), "");
952
if (!*output)
953
*output = chan;
954
else
955
*output = LLVMBuildOr(builder, *output, chan, "");
956
} else {
957
assert(start == 0);
958
assert(stop == 32);
959
assert(type.width == 32);
960
*output = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");
961
}
962
} else
963
assert(0);
964
break;
965
default:
966
assert(0);
967
*output = bld->undef;
968
}
969
}
970
971
static void
972
lp_build_pack_rgba_soa(struct gallivm_state *gallivm,
973
const struct util_format_description *format_desc,
974
struct lp_type type,
975
const LLVMValueRef rgba_in[4],
976
LLVMValueRef *packed)
977
{
978
unsigned chan;
979
struct lp_build_context bld;
980
assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
981
assert(format_desc->block.width == 1);
982
assert(format_desc->block.height == 1);
983
assert(format_desc->block.bits <= type.width);
984
/* FIXME: Support more output types */
985
assert(type.width == 32);
986
987
lp_build_context_init(&bld, gallivm, type);
988
for (chan = 0; chan < format_desc->nr_channels; ++chan) {
989
struct util_format_channel_description chan_desc = format_desc->channel[chan];
990
991
lp_build_insert_soa_chan(&bld, format_desc->block.bits,
992
chan_desc,
993
packed,
994
rgba_in[chan]);
995
}
996
}
997
998
/**
 * Store SoA rgba channels to memory in the given pixel format.
 *
 * The channels are first packed into up to four 32-bit-per-pixel vectors
 * (packed[]), then written out with one scalar store per active lane per
 * 32-bit word, guarded by the execution mask and the out-of-bounds mask.
 *
 * \param gallivm        LLVM state/builder context
 * \param format_desc    description of the destination pixel format
 * \param type           SoA vector type of the input channels
 * \param exec_mask      per-lane execution mask (lane stores only if nonzero)
 * \param base_ptr       base pointer of the destination resource
 * \param offset         per-lane byte offsets from base_ptr
 * \param out_of_bounds  per-lane OOB mask; nonzero lanes are skipped
 * \param rgba_in        per-channel SoA vectors to store
 */
void
lp_build_store_rgba_soa(struct gallivm_state *gallivm,
                        const struct util_format_description *format_desc,
                        struct lp_type type,
                        LLVMValueRef exec_mask,
                        LLVMValueRef base_ptr,
                        LLVMValueRef offset,
                        LLVMValueRef out_of_bounds,
                        const LLVMValueRef rgba_in[4])
{
   enum pipe_format format = format_desc->format;
   LLVMValueRef packed[4];
   unsigned num_stores = 0;

   memset(packed, 0, sizeof(LLVMValueRef) * 4);
   /* Case 1: the whole packed pixel fits in one element (<= type.width bits),
    * and any float channel is a size we can convert (32-bit or half-float). */
   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits <= type.width &&
       (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
        format_desc->channel[0].size == 32 ||
        format_desc->channel[0].size == 16))
   {
      lp_build_pack_rgba_soa(gallivm, format_desc, type, rgba_in, &packed[0]);

      num_stores = 1;
   } else if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
             (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
             format_desc->block.width == 1 &&
             format_desc->block.height == 1 &&

             format_desc->block.bits > type.width &&
             ((format_desc->block.bits <= type.width * type.length &&
               format_desc->channel[0].size <= type.width) ||
              (format_desc->channel[0].size == 64 &&
               format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
               type.floating)))
   {
      /*
       * Similar to above, but the packed pixel is larger than what fits
       * into an element of the destination format. The packed pixels will be
       * shuffled into SoA vectors appropriately, and then the extraction will
       * be done in parallel as much as possible.
       * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
       * the gathered vectors can be shuffled easily (even with avx).
       * 64xn float -> 32xn float is handled too but it's a bit special as
       * it does the conversion pre-shuffle.
       */
      struct lp_build_context bld;

      lp_build_context_init(&bld, gallivm, type);
      assert(type.width == 32);
      assert(format_desc->block.bits > type.width);

      /* One 32-bit store per word of the (power-of-two padded) packed pixel. */
      unsigned store_width = util_next_power_of_two(format_desc->block.bits);
      num_stores = store_width / type.width;
      for (unsigned i = 0; i < format_desc->nr_channels; i++) {
         struct util_format_channel_description chan_desc = format_desc->channel[i];
         unsigned blockbits = type.width;
         unsigned vec_nr;

         /* Route each channel to the 32-bit word containing it; the shift
          * becomes relative to that word. */
         vec_nr = chan_desc.shift / type.width;
         chan_desc.shift %= type.width;

         lp_build_insert_soa_chan(&bld, blockbits,
                                  chan_desc,
                                  &packed[vec_nr],
                                  rgba_in[i]);
      }

      assert(num_stores == 4 || num_stores == 2);
      /* we can transpose and store at the same time */
   } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
      packed[0] = lp_build_float_to_r11g11b10(gallivm, rgba_in);
      num_stores = 1;
   } else
      assert(0);

   assert(exec_mask);

   LLVMTypeRef int32_ptr_type = LLVMPointerType(LLVMInt32TypeInContext(gallivm->context), 0);
   LLVMTypeRef int16_ptr_type = LLVMPointerType(LLVMInt16TypeInContext(gallivm->context), 0);
   LLVMTypeRef int8_ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);

   /* Lane stores iff executing AND in bounds. */
   LLVMValueRef should_store_mask = LLVMBuildAnd(gallivm->builder, exec_mask, LLVMBuildNot(gallivm->builder, out_of_bounds, ""), "store_mask");
   should_store_mask = LLVMBuildICmp(gallivm->builder, LLVMIntNE, should_store_mask, lp_build_const_int_vec(gallivm, type, 0), "");
   for (unsigned i = 0; i < num_stores; i++) {
      struct lp_build_loop_state loop_state;

      /* Byte offset of the i-th 32-bit word of each pixel. */
      LLVMValueRef store_offset = LLVMBuildAdd(gallivm->builder, offset, lp_build_const_int_vec(gallivm, type, i * 4), "");
      store_offset = LLVMBuildGEP(gallivm->builder, base_ptr, &store_offset, 1, "");

      /* Scalarization loop over the vector lanes; each active lane does an
       * individual (possibly narrowed) store. */
      lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));

      struct lp_build_if_state ifthen;
      LLVMValueRef cond = LLVMBuildExtractElement(gallivm->builder, should_store_mask, loop_state.counter, "");
      lp_build_if(&ifthen, gallivm, cond);

      LLVMValueRef data = LLVMBuildExtractElement(gallivm->builder, packed[i], loop_state.counter, "");
      LLVMValueRef this_offset = LLVMBuildExtractElement(gallivm->builder, store_offset, loop_state.counter, "");

      /* Narrow the store for sub-32-bit formats (only reachable when
       * num_stores == 1, since multi-word pixels are > 32 bits). */
      if (format_desc->block.bits == 8) {
         this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int8_ptr_type, "");
         data = LLVMBuildTrunc(gallivm->builder, data, LLVMInt8TypeInContext(gallivm->context), "");
      } else if (format_desc->block.bits == 16) {
         this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int16_ptr_type, "");
         data = LLVMBuildTrunc(gallivm->builder, data, LLVMInt16TypeInContext(gallivm->context), "");
      } else
         this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int32_ptr_type, "");
      LLVMBuildStore(gallivm->builder, data, this_offset);
      lp_build_endif(&ifthen);
      lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, type.length),
                             NULL, LLVMIntUGE);
   }
}
