Path: blob/21.2-virgl/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
4565 views
/**************************************************************************1*2* Copyright 2009 VMware, Inc.3* All Rights Reserved.4*5* Permission is hereby granted, free of charge, to any person obtaining a6* copy of this software and associated documentation files (the7* "Software"), to deal in the Software without restriction, including8* without limitation the rights to use, copy, modify, merge, publish,9* distribute, sub license, and/or sell copies of the Software, and to10* permit persons to whom the Software is furnished to do so, subject to11* the following conditions:12*13* The above copyright notice and this permission notice (including the14* next paragraph) shall be included in all copies or substantial portions15* of the Software.16*17* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS18* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF19* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.20* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR21* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,22* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE23* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.24*25**************************************************************************/262728#include "pipe/p_defines.h"2930#include "util/format/u_format.h"31#include "util/u_memory.h"32#include "util/u_string.h"33#include "util/u_math.h"3435#include "lp_bld_type.h"36#include "lp_bld_const.h"37#include "lp_bld_conv.h"38#include "lp_bld_swizzle.h"39#include "lp_bld_gather.h"40#include "lp_bld_debug.h"41#include "lp_bld_format.h"42#include "lp_bld_arit.h"43#include "lp_bld_pack.h"44#include "lp_bld_flow.h"45#include "lp_bld_printf.h"46#include "lp_bld_intr.h"4748static void49convert_to_soa(struct gallivm_state *gallivm,50LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],51LLVMValueRef dst_soa[4],52const struct lp_type soa_type)53{54unsigned j, k;55struct lp_type aos_channel_type = soa_type;5657LLVMValueRef aos_channels[4];58unsigned pixels_per_channel = soa_type.length / 4;5960debug_assert((soa_type.length % 4) == 0);6162aos_channel_type.length >>= 1;6364for (j = 0; j < 4; ++j) {65LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };6667assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);6869for (k = 0; k < pixels_per_channel; ++k) {70channel[k] = src_aos[j + 4 * k];71}7273aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);74}7576lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);77}787980void81lp_build_format_swizzle_soa(const struct util_format_description *format_desc,82struct lp_build_context *bld,83const LLVMValueRef unswizzled[4],84LLVMValueRef swizzled_out[4])85{86if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {87enum pipe_swizzle swizzle;88LLVMValueRef depth_or_stencil;8990if (util_format_has_stencil(format_desc) &&91!util_format_has_depth(format_desc)) {92assert(!bld->type.floating);93swizzle = format_desc->swizzle[1];94}95else {96assert(bld->type.floating);97swizzle = format_desc->swizzle[0];98}99/*100* Return zzz1 or sss1 for depth-stencil formats here.101* Correct swizzling will be handled by apply_sampler_swizzle() later.102*/103depth_or_stencil = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);104105swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depth_or_stencil;106swizzled_out[3] = bld->one;107}108else {109unsigned chan;110for (chan = 0; chan < 4; ++chan) {111enum pipe_swizzle swizzle = format_desc->swizzle[chan];112swizzled_out[chan] = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);113}114}115}116117118119static LLVMValueRef120lp_build_extract_soa_chan(struct lp_build_context *bld,121unsigned blockbits,122boolean srgb_chan,123struct util_format_channel_description chan_desc,124LLVMValueRef packed)125{126struct gallivm_state *gallivm = bld->gallivm;127LLVMBuilderRef builder = gallivm->builder;128struct lp_type type = bld->type;129LLVMValueRef input = packed;130const unsigned width = chan_desc.size;131const unsigned start = chan_desc.shift;132const unsigned stop = start + width;133134/* Decode the input vector component */135136switch(chan_desc.type) {137case UTIL_FORMAT_TYPE_VOID:138input = bld->undef;139break;140141case UTIL_FORMAT_TYPE_UNSIGNED:142/*143* Align the LSB144*/145if (start) {146input = LLVMBuildLShr(builder, input,147lp_build_const_int_vec(gallivm, type, start), "");148}149150/*151* Zero the MSBs152*/153if (stop < blockbits) {154unsigned mask = ((unsigned long long)1 << width) - 1;155input = LLVMBuildAnd(builder, input,156lp_build_const_int_vec(gallivm, type, mask), "");157}158159/*160* Type conversion161*/162if (type.floating) {163if (srgb_chan) {164struct lp_type conv_type = lp_uint_type(type);165input = lp_build_srgb_to_linear(gallivm, conv_type, width, input);166}167else {168if(chan_desc.normalized)169input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);170else171input = LLVMBuildUIToFP(builder, input, bld->vec_type, "");172}173}174else if (chan_desc.pure_integer) {175/* Nothing to do */176} else {177/* FIXME */178assert(0);179}180break;181182case UTIL_FORMAT_TYPE_SIGNED:183/*184* Align the sign bit first.185*/186if (stop < type.width) {187unsigned bits = type.width - stop;188LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);189input = LLVMBuildShl(builder, input, bits_val, "");190}191192/*193* Align the LSB (with an arithmetic shift to preserve the sign)194*/195if (chan_desc.size < type.width) {196unsigned bits = type.width - chan_desc.size;197LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);198input = LLVMBuildAShr(builder, input, bits_val, "");199}200201/*202* Type conversion203*/204if (type.floating) {205input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");206if (chan_desc.normalized) {207double scale = 1.0 / ((1 << (chan_desc.size - 1)) - 1);208LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);209input = LLVMBuildFMul(builder, input, scale_val, "");210/*211* The formula above will produce value below -1.0 for most negative values.212* compliance requires clamping it.213* GTF-GL45.gtf33.GL3Tests.vertex_type_2_10_10_10_rev.vertex_type_2_10_10_10_rev_conversion.214*/215input = lp_build_max(bld, input,216lp_build_const_vec(gallivm, type, -1.0f));217}218}219else if (chan_desc.pure_integer) {220/* Nothing to do */221} else {222/* FIXME */223assert(0);224}225break;226227case UTIL_FORMAT_TYPE_FLOAT:228if (type.floating) {229if (chan_desc.size == 16) {230struct lp_type f16i_type = type;231f16i_type.width /= 2;232f16i_type.floating = 0;233if (start) {234input = LLVMBuildLShr(builder, input,235lp_build_const_int_vec(gallivm, type, start), "");236}237input = LLVMBuildTrunc(builder, input,238lp_build_vec_type(gallivm, f16i_type), "");239input = lp_build_half_to_float(gallivm, input);240} else {241assert(start == 0);242assert(stop == 32);243assert(type.width == 32);244}245input = LLVMBuildBitCast(builder, input, bld->vec_type, "");246}247else {248/* FIXME */249assert(0);250input = bld->undef;251}252break;253254case UTIL_FORMAT_TYPE_FIXED:255if (type.floating) {256double scale = 1.0 / ((1 << (chan_desc.size/2)) - 1);257LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);258input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");259input = LLVMBuildFMul(builder, input, scale_val, "");260}261else {262/* FIXME */263assert(0);264input = bld->undef;265}266break;267268default:269assert(0);270input = bld->undef;271break;272}273274return input;275}276277278/**279* Unpack several pixels in SoA.280*281* It takes a vector of packed pixels:282*283* packed = {P0, P1, P2, P3, ..., Pn}284*285* And will produce four vectors:286*287* red = {R0, R1, R2, R3, ..., Rn}288* green = {G0, G1, G2, G3, ..., Gn}289* blue = {B0, B1, B2, B3, ..., Bn}290* alpha = {A0, A1, A2, A3, ..., An}291*292* It requires that a packed pixel fits into an element of the output293* channels. The common case is when converting pixel with a depth of 32 bit or294* less into floats.295*296* \param format_desc the format of the 'packed' incoming pixel vector297* \param type the desired type for rgba_out (type.length = n, above)298* \param packed the incoming vector of packed pixels299* \param rgba_out returns the SoA R,G,B,A vectors300*/301void302lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,303const struct util_format_description *format_desc,304struct lp_type type,305LLVMValueRef packed,306LLVMValueRef rgba_out[4])307{308struct lp_build_context bld;309LLVMValueRef inputs[4];310unsigned chan;311312assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);313assert(format_desc->block.width == 1);314assert(format_desc->block.height == 1);315assert(format_desc->block.bits <= type.width);316/* FIXME: Support more output types */317assert(type.width == 32);318319lp_build_context_init(&bld, gallivm, type);320321/* Decode the input vector components */322for (chan = 0; chan < format_desc->nr_channels; ++chan) {323struct util_format_channel_description chan_desc = format_desc->channel[chan];324boolean srgb_chan = FALSE;325326if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB &&327format_desc->swizzle[3] != chan) {328srgb_chan = TRUE;329}330331inputs[chan] = lp_build_extract_soa_chan(&bld,332format_desc->block.bits,333srgb_chan,334chan_desc,335packed);336}337338lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);339}340341342/**343* Convert a vector of rgba8 values into 32bit wide SoA vectors.344*345* \param dst_type The desired return type. For pure integer formats346* this should be a 32bit wide int or uint vector type,347* otherwise a float vector type.348*349* \param packed The rgba8 values to pack.350*351* \param rgba The 4 SoA return vectors.352*/353void354lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm,355struct lp_type dst_type,356LLVMValueRef packed,357LLVMValueRef *rgba)358{359LLVMBuilderRef builder = gallivm->builder;360LLVMValueRef mask = lp_build_const_int_vec(gallivm, dst_type, 0xff);361unsigned chan;362363/* XXX technically shouldn't use that for uint dst_type */364packed = LLVMBuildBitCast(builder, packed,365lp_build_int_vec_type(gallivm, dst_type), "");366367/* Decode the input vector components */368for (chan = 0; chan < 4; ++chan) {369#if UTIL_ARCH_LITTLE_ENDIAN370unsigned start = chan*8;371#else372unsigned start = (3-chan)*8;373#endif374unsigned stop = start + 8;375LLVMValueRef input;376377input = packed;378379if (start)380input = LLVMBuildLShr(builder, input,381lp_build_const_int_vec(gallivm, dst_type, start), "");382383if (stop < 32)384input = LLVMBuildAnd(builder, input, mask, "");385386if (dst_type.floating)387input = lp_build_unsigned_norm_to_float(gallivm, 8, dst_type, input);388389rgba[chan] = input;390}391}392393394395/**396* Fetch a texels from a texture, returning them in SoA layout.397*398* \param type the desired return type for 'rgba'. The vector length399* is the number of texels to fetch400* \param aligned if the offset is guaranteed to be aligned to element width401*402* \param base_ptr points to the base of the texture mip tree.403* \param offset offset to start of the texture image block. For non-404* compressed formats, this simply is an offset to the texel.405* For compressed formats, it is an offset to the start of the406* compressed data block.407*408* \param i, j the sub-block pixel coordinates. For non-compressed formats409* these will always be (0,0). For compressed formats, i will410* be in [0, block_width-1] and j will be in [0, block_height-1].411* \param cache optional value pointing to a lp_build_format_cache structure412*/413void414lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,415const struct util_format_description *format_desc,416struct lp_type type,417boolean aligned,418LLVMValueRef base_ptr,419LLVMValueRef offset,420LLVMValueRef i,421LLVMValueRef j,422LLVMValueRef cache,423LLVMValueRef rgba_out[4])424{425LLVMBuilderRef builder = gallivm->builder;426enum pipe_format format = format_desc->format;427struct lp_type fetch_type;428429if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&430(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||431format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||432format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&433format_desc->block.width == 1 &&434format_desc->block.height == 1 &&435format_desc->block.bits <= type.width &&436(format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||437format_desc->channel[0].size == 32 ||438format_desc->channel[0].size == 16))439{440/*441* The packed pixel fits into an element of the destination format. Put442* the packed pixels into a vector and extract each component for all443* vector elements in parallel.444*/445446LLVMValueRef packed;447448/*449* gather the texels from the texture450* Ex: packed = {XYZW, XYZW, XYZW, XYZW}451*/452assert(format_desc->block.bits <= type.width);453fetch_type = lp_type_uint(type.width);454packed = lp_build_gather(gallivm,455type.length,456format_desc->block.bits,457fetch_type,458aligned,459base_ptr, offset, FALSE);460461/*462* convert texels to float rgba463*/464lp_build_unpack_rgba_soa(gallivm,465format_desc,466type,467packed, rgba_out);468return;469}470471472if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&473(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&474format_desc->block.width == 1 &&475format_desc->block.height == 1 &&476format_desc->block.bits > type.width &&477((format_desc->block.bits <= type.width * type.length &&478format_desc->channel[0].size <= type.width) ||479(format_desc->channel[0].size == 64 &&480format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&481type.floating)))482{483/*484* Similar to above, but the packed pixel is larger than what fits485* into an element of the destination format. The packed pixels will be486* shuffled into SoA vectors appropriately, and then the extraction will487* be done in parallel as much as possible.488* Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so489* the gathered vectors can be shuffled easily (even with avx).490* 64xn float -> 32xn float is handled too but it's a bit special as491* it does the conversion pre-shuffle.492*/493494LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32];495struct lp_type fetch_type, gather_type = type;496unsigned num_gather, fetch_width, i, j;497struct lp_build_context bld;498boolean fp64 = format_desc->channel[0].size == 64;499500lp_build_context_init(&bld, gallivm, type);501502assert(type.width == 32);503assert(format_desc->block.bits > type.width);504505/*506* First, figure out fetch order.507*/508fetch_width = util_next_power_of_two(format_desc->block.bits);509/*510* fp64 are treated like fp32 except we fetch twice wide values511* (as we shuffle after trunc). The shuffles for that work out512* mostly fine (slightly suboptimal for 4-wide, perfect for AVX)513* albeit we miss the potential opportunity for hw gather (as it514* only handles native size).515*/516num_gather = fetch_width / type.width;517gather_type.width *= num_gather;518if (fp64) {519num_gather /= 2;520}521gather_type.length /= num_gather;522523for (i = 0; i < num_gather; i++) {524LLVMValueRef offsetr, shuf_vec;525if(num_gather == 4) {526for (j = 0; j < gather_type.length; j++) {527unsigned idx = i + 4*j;528shuffles[j] = lp_build_const_int32(gallivm, idx);529}530shuf_vec = LLVMConstVector(shuffles, gather_type.length);531offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");532533}534else if (num_gather == 2) {535assert(num_gather == 2);536for (j = 0; j < gather_type.length; j++) {537unsigned idx = i*2 + (j%2) + (j/2)*4;538shuffles[j] = lp_build_const_int32(gallivm, idx);539}540shuf_vec = LLVMConstVector(shuffles, gather_type.length);541offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");542}543else {544assert(num_gather == 1);545offsetr = offset;546}547if (gather_type.length == 1) {548LLVMValueRef zero = lp_build_const_int32(gallivm, 0);549offsetr = LLVMBuildExtractElement(builder, offsetr, zero, "");550}551552/*553* Determine whether to use float or int loads. This is mostly554* to outsmart the (stupid) llvm int/float shuffle logic, we555* don't really care much if the data is floats or ints...556* But llvm will refuse to use single float shuffle with int data557* and instead use 3 int shuffles instead, the code looks atrocious.558* (Note bitcasts often won't help, as llvm is too smart to be559* fooled by that.)560* Nobody cares about simd float<->int domain transition penalties,561* which usually don't even exist for shuffles anyway.562* With 4x32bit (and 3x32bit) fetch, we use float vec (the data is563* going into transpose, which is unpacks, so doesn't really matter564* much).565* With 2x32bit or 4x16bit fetch, we use float vec, since those566* go into the weird channel separation shuffle. With floats,567* this is (with 128bit vectors):568* - 2 movq, 2 movhpd, 2 shufps569* With ints it would be:570* - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw571* I've seen texture functions increase in code size by 15% just due572* to that (there's lots of such fetches in them...)573* (We could chose a different gather order to improve this somewhat574* for the int path, but it would basically just drop the blends,575* so the float path with this order really is optimal.)576* Albeit it is tricky sometimes llvm doesn't ignore the float->int577* casts so must avoid them until we're done with the float shuffle...578* 3x16bit formats (the same is also true for 3x8) are pretty bad but579* there's nothing we can do about them (we could overallocate by580* those couple bytes and use unaligned but pot sized load).581* Note that this is very much x86 specific. I don't know if this582* affect other archs at all.583*/584if (num_gather > 1) {585/*586* We always want some float type here (with x86)587* due to shuffles being float ones afterwards (albeit for588* the num_gather == 4 case int should work fine too589* (unless there's some problems with avx but not avx2).590*/591if (format_desc->channel[0].size == 64) {592fetch_type = lp_type_float_vec(64, gather_type.width);593} else {594fetch_type = lp_type_int_vec(32, gather_type.width);595}596}597else {598/* type doesn't matter much */599if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&600(format_desc->channel[0].size == 32 ||601format_desc->channel[0].size == 64)) {602fetch_type = lp_type_float(gather_type.width);603} else {604fetch_type = lp_type_uint(gather_type.width);605}606}607608/* Now finally gather the values */609packed[i] = lp_build_gather(gallivm, gather_type.length,610format_desc->block.bits,611fetch_type, aligned,612base_ptr, offsetr, FALSE);613if (fp64) {614struct lp_type conv_type = type;615conv_type.width *= 2;616packed[i] = LLVMBuildBitCast(builder, packed[i],617lp_build_vec_type(gallivm, conv_type), "");618packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, "");619}620}621622/* shuffle the gathered values to SoA */623if (num_gather == 2) {624for (i = 0; i < num_gather; i++) {625for (j = 0; j < type.length; j++) {626unsigned idx = (j%2)*2 + (j/4)*4 + i;627if ((j/2)%2)628idx += type.length;629shuffles[j] = lp_build_const_int32(gallivm, idx);630}631dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1],632LLVMConstVector(shuffles, type.length), "");633}634}635else if (num_gather == 4) {636lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst);637}638else {639assert(num_gather == 1);640dst[0] = packed[0];641}642643/*644* And finally unpack exactly as above, except that645* chan shift is adjusted and the right vector selected.646*/647if (!fp64) {648for (i = 0; i < num_gather; i++) {649dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, "");650}651for (i = 0; i < format_desc->nr_channels; i++) {652struct util_format_channel_description chan_desc = format_desc->channel[i];653unsigned blockbits = type.width;654unsigned vec_nr;655656#if UTIL_ARCH_BIG_ENDIAN657vec_nr = (format_desc->block.bits - (chan_desc.shift + chan_desc.size)) / type.width;658#else659vec_nr = chan_desc.shift / type.width;660#endif661chan_desc.shift %= type.width;662663output[i] = lp_build_extract_soa_chan(&bld,664blockbits,665FALSE,666chan_desc,667dst[vec_nr]);668}669}670else {671for (i = 0; i < format_desc->nr_channels; i++) {672output[i] = dst[i];673}674}675676lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out);677return;678}679680if (format == PIPE_FORMAT_R11G11B10_FLOAT ||681format == PIPE_FORMAT_R9G9B9E5_FLOAT) {682/*683* similar conceptually to above but requiring special684* AoS packed -> SoA float conversion code.685*/686LLVMValueRef packed;687struct lp_type fetch_type = lp_type_uint(type.width);688689assert(type.floating);690assert(type.width == 32);691692packed = lp_build_gather(gallivm, type.length,693format_desc->block.bits,694fetch_type, aligned,695base_ptr, offset, FALSE);696if (format == PIPE_FORMAT_R11G11B10_FLOAT) {697lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);698}699else {700lp_build_rgb9e5_to_float(gallivm, packed, rgba_out);701}702return;703}704705if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&706format_desc->block.bits == 64) {707/*708* special case the format is 64 bits but we only require709* 32bit (or 8bit) from each block.710*/711LLVMValueRef packed;712struct lp_type fetch_type = lp_type_uint(type.width);713714if (format == PIPE_FORMAT_X32_S8X24_UINT) {715/*716* for stencil simply fix up offsets - could in fact change717* base_ptr instead even outside the shader.718*/719unsigned mask = (1 << 8) - 1;720LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);721offset = LLVMBuildAdd(builder, offset, s_offset, "");722packed = lp_build_gather(gallivm, type.length, 32, fetch_type,723aligned, base_ptr, offset, FALSE);724packed = LLVMBuildAnd(builder, packed,725lp_build_const_int_vec(gallivm, type, mask), "");726}727else {728assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);729packed = lp_build_gather(gallivm, type.length, 32, fetch_type,730aligned, base_ptr, offset, TRUE);731packed = LLVMBuildBitCast(builder, packed,732lp_build_vec_type(gallivm, type), "");733}734/* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */735rgba_out[0] = rgba_out[1] = rgba_out[2] = packed;736rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f);737return;738}739740/*741* Try calling lp_build_fetch_rgba_aos for all pixels.742* Should only really hit subsampled, compressed743* (for s3tc srgb and rgtc too).744* (This is invalid for plain 8unorm formats because we're lazy with745* the swizzle since some results would arrive swizzled, some not.)746*/747748if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) &&749(util_format_fits_8unorm(format_desc) ||750format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC ||751format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) &&752type.floating && type.width == 32 &&753(type.length == 1 || (type.length % 4 == 0))) {754struct lp_type tmp_type;755struct lp_build_context bld;756LLVMValueRef packed, rgba[4];757const struct util_format_description *flinear_desc;758const struct util_format_description *frgba8_desc;759unsigned chan;760bool is_signed = (format_desc->format == PIPE_FORMAT_RGTC1_SNORM ||761format_desc->format == PIPE_FORMAT_RGTC2_SNORM ||762format_desc->format == PIPE_FORMAT_LATC1_SNORM ||763format_desc->format == PIPE_FORMAT_LATC2_SNORM);764765lp_build_context_init(&bld, gallivm, type);766767/*768* Make sure the conversion in aos really only does convert to rgba8769* and not anything more (so use linear format, adjust type).770*/771flinear_desc = util_format_description(util_format_linear(format));772memset(&tmp_type, 0, sizeof tmp_type);773tmp_type.width = 8;774tmp_type.length = type.length * 4;775tmp_type.norm = TRUE;776tmp_type.sign = is_signed;777778packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type,779aligned, base_ptr, offset, i, j, cache);780packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, "");781782/*783* The values are now packed so they match ordinary (srgb) RGBA8 format,784* hence need to use matching format for unpack.785*/786frgba8_desc = util_format_description(is_signed ? PIPE_FORMAT_R8G8B8A8_SNORM : PIPE_FORMAT_R8G8B8A8_UNORM);787if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {788assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);789frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);790}791lp_build_unpack_rgba_soa(gallivm,792frgba8_desc,793type,794packed, rgba);795796/*797* We converted 4 channels. Make sure llvm can drop unneeded ones798* (luckily the rgba order is fixed, only LA needs special case).799*/800for (chan = 0; chan < 4; chan++) {801enum pipe_swizzle swizzle = format_desc->swizzle[chan];802if (chan == 3 && util_format_is_luminance_alpha(format)) {803swizzle = PIPE_SWIZZLE_W;804}805rgba_out[chan] = lp_build_swizzle_soa_channel(&bld, rgba, swizzle);806}807return;808}809810811/*812* Fallback to calling lp_build_fetch_rgba_aos for each pixel.813*814* This is not the most efficient way of fetching pixels, as we815* miss some opportunities to do vectorization, but this is816* convenient for formats or scenarios for which there was no817* opportunity or incentive to optimize.818*819* We do NOT want to end up here, this typically is quite terrible,820* in particular if the formats have less than 4 channels.821*822* Right now, this should only be hit for:823* - ETC formats824* (those miss fast fetch functions hence they are terrible anyway)825*/826827{828unsigned k;829struct lp_type tmp_type;830LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];831832if (gallivm_debug & GALLIVM_DEBUG_PERF) {833debug_printf("%s: AoS fetch fallback for %s\n",834__FUNCTION__, format_desc->short_name);835}836837tmp_type = type;838tmp_type.length = 4;839840if (type.length == 1) {841LLVMValueRef fetch = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,842aligned, base_ptr, offset,843i, j, cache);844845for (k = 0; k < 4; k++)846rgba_out[k] = LLVMBuildExtractElement(gallivm->builder, fetch, lp_build_const_int32(gallivm, k), "");847return;848}849850/*851* Note that vector transpose can be worse compared to insert/extract852* for aos->soa conversion (for formats with 1 or 2 channels). However,853* we should try to avoid getting here for just about all formats, so854* don't bother.855*/856857/* loop over number of pixels */858for(k = 0; k < type.length; ++k) {859LLVMValueRef index = lp_build_const_int32(gallivm, k);860LLVMValueRef offset_elem;861LLVMValueRef i_elem, j_elem;862863offset_elem = LLVMBuildExtractElement(builder, offset,864index, "");865866i_elem = LLVMBuildExtractElement(builder, i, index, "");867j_elem = LLVMBuildExtractElement(builder, j, index, "");868869/* Get a single float[4]={R,G,B,A} pixel */870aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,871aligned, base_ptr, offset_elem,872i_elem, j_elem, cache);873874}875convert_to_soa(gallivm, aos_fetch, rgba_out, type);876}877}878879static void880lp_build_insert_soa_chan(struct lp_build_context *bld,881unsigned blockbits,882struct util_format_channel_description chan_desc,883LLVMValueRef *output,884LLVMValueRef rgba)885{886struct gallivm_state *gallivm = bld->gallivm;887LLVMBuilderRef builder = gallivm->builder;888struct lp_type type = bld->type;889const unsigned width = chan_desc.size;890const unsigned start = chan_desc.shift;891const uint32_t chan_mask = (1ULL << width) - 1;892ASSERTED const unsigned stop = start + width;893LLVMValueRef chan = NULL;894switch(chan_desc.type) {895case UTIL_FORMAT_TYPE_UNSIGNED:896897if (chan_desc.pure_integer) {898chan = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");899LLVMValueRef mask_val = lp_build_const_int_vec(gallivm, type, chan_mask);900LLVMValueRef mask = LLVMBuildICmp(builder, LLVMIntUGT, chan, mask_val, "");901chan = LLVMBuildSelect(builder, mask, mask_val, chan, "");902}903else if (type.floating) {904if (chan_desc.normalized) {905rgba = lp_build_clamp(bld, rgba, bld->zero, bld->one);906chan = lp_build_clamped_float_to_unsigned_norm(gallivm, type, width, rgba);907} else908chan = LLVMBuildFPToSI(builder, rgba, bld->vec_type, "");909}910if (start)911chan = LLVMBuildShl(builder, chan,912lp_build_const_int_vec(gallivm, type, start), "");913if (!*output)914*output = chan;915else916*output = LLVMBuildOr(builder, *output, chan, "");917break;918case UTIL_FORMAT_TYPE_SIGNED:919if (chan_desc.pure_integer) {920chan = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");921chan = LLVMBuildAnd(builder, chan, lp_build_const_int_vec(gallivm, type, chan_mask), "");922} else if (type.floating) {923if (chan_desc.normalized) {924char intrin[32];925double scale = ((1 << (chan_desc.size - 1)) - 1);926LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);927rgba = lp_build_clamp(bld, rgba, lp_build_negate(bld, bld->one), bld->one);928rgba = LLVMBuildFMul(builder, rgba, scale_val, "");929lp_format_intrinsic(intrin, sizeof intrin, "llvm.rint", bld->vec_type);930rgba = lp_build_intrinsic_unary(builder, intrin, bld->vec_type, rgba);931}932chan = LLVMBuildFPToSI(builder, rgba, bld->int_vec_type, "");933chan = LLVMBuildAnd(builder, chan, lp_build_const_int_vec(gallivm, type, chan_mask), "");934}935if (start)936chan = LLVMBuildShl(builder, chan,937lp_build_const_int_vec(gallivm, type, start), "");938if (!*output)939*output = chan;940else941*output = LLVMBuildOr(builder, *output, chan, "");942break;943case UTIL_FORMAT_TYPE_FLOAT:944if (type.floating) {945if (chan_desc.size == 16) {946chan = lp_build_float_to_half(gallivm, rgba);947chan = LLVMBuildZExt(builder, chan, bld->int_vec_type, "");948if (start)949chan = LLVMBuildShl(builder, chan,950lp_build_const_int_vec(gallivm, type, start), "");951if (!*output)952*output = chan;953else954*output = LLVMBuildOr(builder, *output, chan, "");955} else {956assert(start == 0);957assert(stop == 32);958assert(type.width == 32);959*output = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");960}961} else962assert(0);963break;964default:965assert(0);966*output = bld->undef;967}968}969970static void971lp_build_pack_rgba_soa(struct gallivm_state *gallivm,972const struct util_format_description *format_desc,973struct lp_type type,974const LLVMValueRef rgba_in[4],975LLVMValueRef *packed)976{977unsigned chan;978struct lp_build_context bld;979assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);980assert(format_desc->block.width == 1);981assert(format_desc->block.height == 1);982assert(format_desc->block.bits <= type.width);983/* FIXME: Support more output types */984assert(type.width == 32);985986lp_build_context_init(&bld, gallivm, type);987for (chan = 0; chan < format_desc->nr_channels; ++chan) {988struct util_format_channel_description chan_desc = format_desc->channel[chan];989990lp_build_insert_soa_chan(&bld, format_desc->block.bits,991chan_desc,992packed,993rgba_in[chan]);994}995}996997void998lp_build_store_rgba_soa(struct gallivm_state *gallivm,999const struct util_format_description *format_desc,1000struct lp_type type,1001LLVMValueRef exec_mask,1002LLVMValueRef base_ptr,1003LLVMValueRef offset,1004LLVMValueRef out_of_bounds,1005const LLVMValueRef rgba_in[4])1006{1007enum pipe_format format = format_desc->format;1008LLVMValueRef packed[4];1009unsigned num_stores = 0;10101011memset(packed, 0, sizeof(LLVMValueRef) * 4);1012if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&1013format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&1014format_desc->block.width == 1 &&1015format_desc->block.height == 1 &&1016format_desc->block.bits <= type.width &&1017(format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||1018format_desc->channel[0].size == 32 ||1019format_desc->channel[0].size == 16))1020{1021lp_build_pack_rgba_soa(gallivm, format_desc, type, rgba_in, &packed[0]);10221023num_stores = 1;1024} else if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&1025(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&1026format_desc->block.width == 1 &&1027format_desc->block.height == 1 &&1028format_desc->block.bits > type.width &&1029((format_desc->block.bits <= type.width * type.length &&1030format_desc->channel[0].size <= type.width) ||1031(format_desc->channel[0].size == 64 &&1032format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&1033type.floating)))1034{1035/*1036* Similar to above, but the packed pixel is larger than what fits1037* into an element of the destination format. The packed pixels will be1038* shuffled into SoA vectors appropriately, and then the extraction will1039* be done in parallel as much as possible.1040* Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so1041* the gathered vectors can be shuffled easily (even with avx).1042* 64xn float -> 32xn float is handled too but it's a bit special as1043* it does the conversion pre-shuffle.1044*/1045struct lp_build_context bld;10461047lp_build_context_init(&bld, gallivm, type);1048assert(type.width == 32);1049assert(format_desc->block.bits > type.width);10501051unsigned store_width = util_next_power_of_two(format_desc->block.bits);1052num_stores = store_width / type.width;1053for (unsigned i = 0; i < format_desc->nr_channels; i++) {1054struct util_format_channel_description chan_desc = format_desc->channel[i];1055unsigned blockbits = type.width;1056unsigned vec_nr;10571058vec_nr = chan_desc.shift / type.width;1059chan_desc.shift %= type.width;10601061lp_build_insert_soa_chan(&bld, blockbits,1062chan_desc,1063&packed[vec_nr],1064rgba_in[i]);1065}10661067assert(num_stores == 4 || num_stores == 2);1068/* we can transpose and store at the same time */1069} else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {1070packed[0] = lp_build_float_to_r11g11b10(gallivm, rgba_in);1071num_stores = 1;1072} else1073assert(0);10741075assert(exec_mask);10761077LLVMTypeRef int32_ptr_type = LLVMPointerType(LLVMInt32TypeInContext(gallivm->context), 0);1078LLVMTypeRef int16_ptr_type = LLVMPointerType(LLVMInt16TypeInContext(gallivm->context), 0);1079LLVMTypeRef int8_ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);10801081LLVMValueRef should_store_mask = LLVMBuildAnd(gallivm->builder, exec_mask, LLVMBuildNot(gallivm->builder, out_of_bounds, ""), "store_mask");1082should_store_mask = LLVMBuildICmp(gallivm->builder, LLVMIntNE, should_store_mask, lp_build_const_int_vec(gallivm, type, 0), "");1083for (unsigned i = 0; i < num_stores; i++) {1084struct lp_build_loop_state loop_state;10851086LLVMValueRef store_offset = LLVMBuildAdd(gallivm->builder, offset, lp_build_const_int_vec(gallivm, type, i * 4), "");1087store_offset = LLVMBuildGEP(gallivm->builder, base_ptr, &store_offset, 1, "");10881089lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));10901091struct lp_build_if_state ifthen;1092LLVMValueRef cond = LLVMBuildExtractElement(gallivm->builder, should_store_mask, loop_state.counter, "");1093lp_build_if(&ifthen, gallivm, cond);10941095LLVMValueRef data = LLVMBuildExtractElement(gallivm->builder, packed[i], loop_state.counter, "");1096LLVMValueRef this_offset = LLVMBuildExtractElement(gallivm->builder, store_offset, loop_state.counter, "");10971098if (format_desc->block.bits == 8) {1099this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int8_ptr_type, "");1100data = LLVMBuildTrunc(gallivm->builder, data, LLVMInt8TypeInContext(gallivm->context), "");1101} else if (format_desc->block.bits == 16) {1102this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int16_ptr_type, "");1103data = LLVMBuildTrunc(gallivm->builder, data, LLVMInt16TypeInContext(gallivm->context), "");1104} else1105this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int32_ptr_type, "");1106LLVMBuildStore(gallivm->builder, data, this_offset);1107lp_build_endif(&ifthen);1108lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, type.length),1109NULL, LLVMIntUGE);1110}1111}111211131114