Path: blob/21.2-virgl/src/gallium/auxiliary/gallivm/lp_bld_conv.c
/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper functions for type conversions.
 *
 * We want to use the fastest type for a given computation whenever feasible.
 * The other side of this is that we need to be able to convert between
 * several types accurately and efficiently.
 *
 * Conversion between types of different bit width is quite complex, so
 * remember there are a few invariants in type conversions:
 *
 *   - register width must remain constant:
 *
 *       src_type.width * src_type.length == dst_type.width * dst_type.length
 *
 *   - total number of elements must remain constant:
 *
 *       src_type.length * num_srcs == dst_type.length * num_dsts
 *
 * It is not always possible to do the conversion both accurately and
 * efficiently, usually due to lack of adequate machine instructions. In these
 * cases it is important not to take shortcuts here and sacrifice accuracy, as
 * these functions can be used anywhere. In the future we might have a
 * precision parameter which can gauge the accuracy vs efficiency compromise,
 * but for now, if the data conversion between two stages happens to be the
 * bottleneck, then most likely one should just avoid converting at all and
 * run both stages with the same type.
 *
 * Make sure to run the lp_test_conv unit test after any change to this file.
 *
 * @author Jose Fonseca <[email protected]>
 */
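
/*
 * For example (an illustrative case, not tied to any particular caller):
 * converting 4 registers of 4 x float32 into 1 register of 16 x unorm8
 * satisfies both invariants:
 *
 *   register width:   32 * 4 == 8 * 16 == 128 bits
 *   element count:     4 * 4 == 16 * 1 == 16 elements
 *
 * so lp_build_conv() can be asked to do it in a single call with
 * num_srcs = 4 and num_dsts = 1 (see the 4x4x32 -> 1x16x8 special case
 * further down in this file).
 */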


#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/half_float.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_arit.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_pack.h"
#include "lp_bld_conv.h"
#include "lp_bld_logic.h"
#include "lp_bld_intr.h"
#include "lp_bld_printf.h"
#include "lp_bld_format.h"


/* The lp_test_format test fails on mingw/i686 at -O2 with gcc 10.x.
 * ref https://gitlab.freedesktop.org/mesa/mesa/-/issues/3906
 */
#if defined(__MINGW32__) && !defined(__MINGW64__) && (__GNUC__ == 10)
#warning "disabling caller-saves optimization for this file to work around compiler bug"
#pragma GCC optimize("-fno-caller-saves")
#endif


/**
 * Converts int16 half-float to float32.
 * Note this can be performed in 1 instruction if vcvtph2ps exists (f16c/cvt16)
 * [llvm.x86.vcvtph2ps / _mm_cvtph_ps].
 *
 * @param src  value to convert
 */
LLVMValueRef
lp_build_half_to_float(struct gallivm_state *gallivm,
                       LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef src_type = LLVMTypeOf(src);
   unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
                            LLVMGetVectorSize(src_type) : 1;

   struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
   struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
   LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
   LLVMValueRef h;

   if (util_get_cpu_caps()->has_f16c &&
       (src_length == 4 || src_length == 8)) {
      if (LLVM_VERSION_MAJOR < 11) {
         const char *intrinsic = NULL;
         if (src_length == 4) {
            src = lp_build_pad_vector(gallivm, src, 8);
            intrinsic = "llvm.x86.vcvtph2ps.128";
         }
         else {
            intrinsic = "llvm.x86.vcvtph2ps.256";
         }
         return lp_build_intrinsic_unary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, f32_type), src);
      } else {
         /*
          * XXX: could probably use this on other archs as well.
          * But if the cpu doesn't support it natively it looks like the
          * backends still can't lower it and will try to call out to
          * external libraries, which will crash.
          */
         /*
          * XXX: lp_build_vec_type() would use an int16 vector. Probably need
          * to revisit this at some point.
          */
         src = LLVMBuildBitCast(builder, src,
                                LLVMVectorType(LLVMHalfTypeInContext(gallivm->context), src_length), "");
         return LLVMBuildFPExt(builder, src, lp_build_vec_type(gallivm, f32_type), "");
      }
   }

   h = LLVMBuildZExt(builder, src, int_vec_type, "");
   return lp_build_smallfloat_to_float(gallivm, f32_type, h, 10, 5, 0, true);
}
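

#if 0
/*
 * A scalar sketch (deliberately not compiled) of the decode that the generic
 * lp_build_smallfloat_to_float() fallback above performs for the half-float
 * layout (10 mantissa bits, 5 exponent bits); denormals are handled by the
 * same exponent rescale.  Assumes <stdint.h> and IEEE binary32 floats;
 * Inf/NaN inputs come out as large finite values in this simplified version.
 */
static float
half_to_float_sketch(uint16_t h)
{
   union { uint32_t u; float f; } o;
   uint32_t sign = (uint32_t)(h & 0x8000) << 16;

   o.u = (uint32_t)(h & 0x7fff) << 13;   /* align exponent/mantissa to binary32 */
   o.f *= 0x1.0p+112f;                   /* 2^(127 - 15): fix up the exponent bias */
   o.u |= sign;
   return o.f;
}
#endif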


/**
 * Converts float32 to int16 half-float.
 * Note this can be performed in 1 instruction if vcvtps2ph exists (f16c/cvt16)
 * [llvm.x86.vcvtps2ph / _mm_cvtps_ph].
 *
 * @param src  value to convert
 *
 * Convert float32 to half floats, preserving Infs and NaNs,
 * with rounding towards zero (trunc).
 * XXX: For GL, would prefer rounding towards nearest(-even).
 */
LLVMValueRef
lp_build_float_to_half(struct gallivm_state *gallivm,
                       LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef f32_vec_type = LLVMTypeOf(src);
   unsigned length = LLVMGetTypeKind(f32_vec_type) == LLVMVectorTypeKind
                        ? LLVMGetVectorSize(f32_vec_type) : 1;
   struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
   struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
   LLVMValueRef result;

   /*
    * Note: Newer llvm versions (3.6 or so) support fptrunc to 16 bits
    * directly, without any (x86 or generic) intrinsics.
    * However, the rounding mode cannot be specified (and is undefined;
    * in practice on x86 it seems to do nearest-even, but that may depend
    * on instruction set support), so it is essentially useless.
    */

   if (util_get_cpu_caps()->has_f16c &&
       (length == 4 || length == 8)) {
      struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
      unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */
      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
      const char *intrinsic = NULL;
      if (length == 4) {
         intrinsic = "llvm.x86.vcvtps2ph.128";
      }
      else {
         intrinsic = "llvm.x86.vcvtps2ph.256";
      }
      result = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, i168_type),
                                         src, LLVMConstInt(i32t, mode, 0));
      if (length == 4) {
         result = lp_build_extract_range(gallivm, result, 0, 4);
      }
   }

   else {
      result = lp_build_float_to_smallfloat(gallivm, i32_type, src, 10, 5, 0, true);
      /* Convert int32 vector to int16 vector by trunc (might generate bad code). */
      result = LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, i16_type), "");
   }

   /*
    * Debugging code.
    */
   if (0) {
      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
      LLVMTypeRef i16t = LLVMInt16TypeInContext(gallivm->context);
      LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
      LLVMValueRef ref_result = LLVMGetUndef(LLVMVectorType(i16t, length));
      unsigned i;

      LLVMTypeRef func_type = LLVMFunctionType(i16t, &f32t, 1, 0);
      LLVMValueRef func = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer)_mesa_float_to_half));
      func = LLVMBuildBitCast(builder, func, LLVMPointerType(func_type, 0), "_mesa_float_to_half");

      for (i = 0; i < length; ++i) {
         LLVMValueRef index = LLVMConstInt(i32t, i, 0);
         LLVMValueRef f32 = LLVMBuildExtractElement(builder, src, index, "");
#if 0
         /*
          * XXX: not really supported by backends.
          * Even if they would be now, the rounding mode cannot be specified
          * and is undefined.
          */
         LLVMValueRef f16 = lp_build_intrinsic_unary(builder, "llvm.convert.to.fp16", i16t, f32);
#else
         LLVMValueRef f16 = LLVMBuildCall(builder, func, &f32, 1, "");
#endif
         ref_result = LLVMBuildInsertElement(builder, ref_result, f16, index, "");
      }

      lp_build_print_value(gallivm, "src = ", src);
      lp_build_print_value(gallivm, "llvm = ", result);
      lp_build_print_value(gallivm, "util = ", ref_result);
      lp_build_printf(gallivm, "\n");
   }

   return result;
}
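

#if 0
/*
 * A scalar sketch (deliberately not compiled) of the conversion the non-f16c
 * path above aims for: binary32 -> binary16 with truncation, preserving
 * Infs/NaNs.  Assumes <stdint.h>; as simplifications, denormal results are
 * flushed to zero and finite values above the half range become Inf here,
 * so this is only an illustration, not a drop-in replacement for
 * lp_build_float_to_smallfloat().
 */
static uint16_t
float_to_half_trunc_sketch(float f)
{
   union { float f; uint32_t u; } in;
   uint32_t sign, exp, mant;

   in.f = f;
   sign = (in.u >> 16) & 0x8000;
   exp  = (in.u >> 23) & 0xff;
   mant = in.u & 0x7fffff;

   if (exp == 0xff)               /* Inf / NaN: keep, make sure NaN stays NaN */
      return sign | 0x7c00 | (mant ? 0x200 | (mant >> 13) : 0);
   if (exp > 127 + 15)            /* too large for half (simplified to Inf) */
      return sign | 0x7c00;
   if (exp < 127 - 14)            /* too small for a normal half: flush to 0 */
      return sign;
   return sign | ((exp - 112) << 10) | (mant >> 13);   /* truncate mantissa */
}
#endif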


/**
 * Special case for converting clamped IEEE-754 floats to unsigned norms.
 *
 * The mathematical voodoo below may seem excessive but it is actually
 * paramount we do it this way for several reasons. First, there is no
 * single-precision FP to unsigned integer conversion Intel SSE instruction.
 * Second, even if there were, since the FP's mantissa takes only a fraction
 * of register bits the typical scale-and-cast approach would require double
 * precision for accurate results, and therefore half the throughput.
 *
 * Although the result values can be scaled to an arbitrary bit width specified
 * by dst_width, the actual result type will have the same width as the input.
 *
 * Ex: src = { float, float, float, float }
 *     return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
 */
LLVMValueRef
lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
                                        struct lp_type src_type,
                                        unsigned dst_width,
                                        LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type);
   LLVMValueRef res;
   unsigned mantissa;

   assert(src_type.floating);
   assert(dst_width <= src_type.width);
   src_type.sign = FALSE;

   mantissa = lp_mantissa(src_type);

   if (dst_width <= mantissa) {
      /*
       * Apply magic coefficients that will make the desired result appear
       * in the least significant bits of the mantissa, with correct rounding.
       *
       * This only works if the destination width fits in the mantissa.
       */

      unsigned long long ubound;
      unsigned long long mask;
      double scale;
      double bias;

      ubound = (1ULL << dst_width);
      mask = ubound - 1;
      scale = (double)mask/ubound;
      bias = (double)(1ULL << (mantissa - dst_width));

      res = LLVMBuildFMul(builder, src, lp_build_const_vec(gallivm, src_type, scale), "");
      /* instead of fadd/and could (with sse2) just use lp_build_iround */
      res = LLVMBuildFAdd(builder, res, lp_build_const_vec(gallivm, src_type, bias), "");
      res = LLVMBuildBitCast(builder, res, int_vec_type, "");
      res = LLVMBuildAnd(builder, res,
                         lp_build_const_int_vec(gallivm, src_type, mask), "");
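
      /*
       * To make the magic concrete (illustrative numbers, assuming a binary32
       * source, i.e. mantissa == 23, and dst_width == 8):
       *
       *   scale = 255/256,  bias = 2^(23 - 8) = 32768.0
       *
       * For x in [0,1], x*scale + bias lies in [2^15, 2^16), so its float
       * representation has a fixed exponent and an ulp of exactly 2^-8;
       * the fadd therefore rounds x*255 into the low 8 mantissa bits, which
       * the bitcast + and above then extract as the final unorm8 value.
       */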
   }
   else if (dst_width == (mantissa + 1)) {
      /*
       * The destination width matches exactly what can be represented in
       * floating point (i.e., mantissa + 1 bits). Even so, correct rounding
       * still needs to be applied (only for numbers in [0.5, 1.0] would
       * conversion using truncation after scaling be sufficient).
       */
      double scale;
      struct lp_build_context uf32_bld;

      lp_build_context_init(&uf32_bld, gallivm, src_type);
      scale = (double)((1ULL << dst_width) - 1);

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = lp_build_iround(&uf32_bld, res);
   }
   else {
      /*
       * The destination exceeds what can be represented in floating point.
       * So multiply by the largest power of two we can get away with, and
       * then subtract the most significant bit to rescale to normalized
       * values.
       *
       * The largest power of two factor we can get away with is
       * (1 << (src_type.width - 1)), because we need to use a signed
       * conversion. In theory it should be (1 << (src_type.width - 2)), but
       * IEEE 754 rules state INT_MIN should be returned in FPToSI, which is
       * the correct result for values near 1.0!
       *
       * This means we get (src_type.width - 1) correct bits for values near
       * 0.0, and (mantissa + 1) correct bits for values near 1.0. Equally or
       * more important, we also get exact results for 0.0 and 1.0.
       */

      unsigned n = MIN2(src_type.width - 1u, dst_width);

      double scale = (double)(1ULL << n);
      unsigned lshift = dst_width - n;
      unsigned rshift = n;
      LLVMValueRef lshifted;
      LLVMValueRef rshifted;

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      if (!src_type.sign && src_type.width == 32)
         res = LLVMBuildFPToUI(builder, res, int_vec_type, "");
      else
         res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

      /*
       * Align the most significant bit to its final place.
       *
       * This will cause 1.0 to overflow to 0, but the later adjustment will
       * get it right.
       */
      if (lshift) {
         lshifted = LLVMBuildShl(builder, res,
                                 lp_build_const_int_vec(gallivm, src_type,
                                                        lshift), "");
      } else {
         lshifted = res;
      }

      /*
       * Align the most significant bit to the right.
       */
      rshifted = LLVMBuildLShr(builder, res,
                               lp_build_const_int_vec(gallivm, src_type, rshift),
                               "");

      /*
       * Subtract the MSB (shifted down to the LSB), thereby rescaling from
       * (1 << dst_width) to ((1 << dst_width) - 1).
       */
      res = LLVMBuildSub(builder, lshifted, rshifted, "");
   }

   return res;
}
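

#if 0
/*
 * A scalar sketch (deliberately not compiled) of the dst_width <= mantissa
 * path of lp_build_clamped_float_to_unsigned_norm() above, specialized to
 * dst_width == 8.  Assumes <stdint.h>, IEEE binary32, and an input already
 * clamped to [0, 1]; the vector code generates the same fmul/fadd/bitcast/and
 * sequence with lp_build_const_vec() constants.
 */
static uint8_t
clamped_float_to_unorm8_sketch(float x)
{
   union { float f; uint32_t u; } magic;

   magic.f = x * (255.0f / 256.0f) + 32768.0f;  /* bias = 1 << (23 - 8) */
   return (uint8_t)(magic.u & 0xff);            /* rounded x*255 in the low bits */
}
#endif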


/**
 * Inverse of lp_build_clamped_float_to_unsigned_norm above.
 * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
 *     return { float, float, float, float } with values in range [0, 1].
 */
LLVMValueRef
lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
                                unsigned src_width,
                                struct lp_type dst_type,
                                LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type);
   LLVMValueRef bias_;
   LLVMValueRef res;
   unsigned mantissa;
   unsigned n;
   unsigned long long ubound;
   unsigned long long mask;
   double scale;
   double bias;

   assert(dst_type.floating);

   mantissa = lp_mantissa(dst_type);

   if (src_width <= (mantissa + 1)) {
      /*
       * The source width fits what can be represented in floating
       * point (i.e., mantissa + 1 bits). So do a straight multiplication
       * followed by casting. No further rounding is necessary.
       */

      scale = 1.0/(double)((1ULL << src_width) - 1);
      res = LLVMBuildSIToFP(builder, src, vec_type, "");
      res = LLVMBuildFMul(builder, res,
                          lp_build_const_vec(gallivm, dst_type, scale), "");
      return res;
   }
   else {
      /*
       * The source width exceeds what can be represented in floating
       * point. So truncate the incoming values.
       */

      n = MIN2(mantissa, src_width);

      ubound = ((unsigned long long)1 << n);
      mask = ubound - 1;
      scale = (double)ubound/mask;
      bias = (double)((unsigned long long)1 << (mantissa - n));

      res = src;

      if (src_width > mantissa) {
         int shift = src_width - mantissa;
         res = LLVMBuildLShr(builder, res,
                             lp_build_const_int_vec(gallivm, dst_type, shift), "");
      }

      bias_ = lp_build_const_vec(gallivm, dst_type, bias);

      res = LLVMBuildOr(builder,
                        res,
                        LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");

      res = LLVMBuildBitCast(builder, res, vec_type, "");

      res = LLVMBuildFSub(builder, res, bias_, "");
      res = LLVMBuildFMul(builder, res, lp_build_const_vec(gallivm, dst_type, scale), "");
   }

   return res;
}
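

#if 0
/*
 * A scalar sketch (deliberately not compiled) of the wide-source path of
 * lp_build_unsigned_norm_to_float() above, specialized to src_width == 32
 * and a binary32 destination (mantissa == 23, so n == 23, bias == 1.0 and
 * scale == 2^23 / (2^23 - 1)).  Assumes <stdint.h>; the low 9 source bits
 * are simply dropped, matching the "truncate the incoming values" comment.
 */
static float
unorm32_to_float_sketch(uint32_t u)
{
   union { uint32_t u; float f; } tmp;
   const float bias = 1.0f;
   const double scale = (double)(1 << 23) / (double)((1 << 23) - 1);

   tmp.f = bias;
   tmp.u |= u >> 9;                        /* OR the top 23 bits into the mantissa */
   return (float)((tmp.f - bias) * scale); /* rescale [0, 1 - 2^-23] to [0, 1] */
}
#endif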


/**
 * Pick a suitable num_dsts for lp_build_conv to ensure optimal cases are used.
 *
 * Returns the number of dsts created from src.
 */
int lp_build_conv_auto(struct gallivm_state *gallivm,
                       struct lp_type src_type,
                       struct lp_type* dst_type,
                       const LLVMValueRef *src,
                       unsigned num_srcs,
                       LLVMValueRef *dst)
{
   unsigned i;
   int num_dsts = num_srcs;

   if (src_type.floating == dst_type->floating &&
       src_type.width == dst_type->width &&
       src_type.length == dst_type->length &&
       src_type.fixed == dst_type->fixed &&
       src_type.norm == dst_type->norm &&
       src_type.sign == dst_type->sign)
      return num_dsts;

   /* Special case 4x4x32 -> 1x16x8 or 2x8x32 -> 1x16x8
    */
   if (src_type.norm == 0 &&
       src_type.width == 32 &&
       src_type.fixed == 0 &&

       dst_type->floating == 0 &&
       dst_type->fixed == 0 &&
       dst_type->width == 8 &&

       ((src_type.floating == 1 && src_type.sign == 1 && dst_type->norm == 1) ||
        (src_type.floating == 0 && dst_type->floating == 0 &&
         src_type.sign == dst_type->sign && dst_type->norm == 0))) {

      /* Special case 4x4x32 --> 1x16x8 */
      if (src_type.length == 4 &&
          (util_get_cpu_caps()->has_sse2 || util_get_cpu_caps()->has_altivec))
      {
         num_dsts = (num_srcs + 3) / 4;
         dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4;

         lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
         return num_dsts;
      }

      /* Special case 2x8x32 --> 1x16x8 */
      if (src_type.length == 8 &&
          util_get_cpu_caps()->has_avx)
      {
         num_dsts = (num_srcs + 1) / 2;
         dst_type->length = num_srcs * 8 >= 16 ? 16 : num_srcs * 8;

         lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
         return num_dsts;
      }
   }

   /* lp_build_resize does not support M:N */
   if (src_type.width == dst_type->width) {
      lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
   } else {
      /*
       * If dst_width is 16 bits and src_width 32 and the dst vector size is
       * 64 bit, try feeding 2 vectors at once so pack intrinsics can be used.
       * (For AVX this isn't needed, since we usually get 256bit src and
       * 128bit dst vectors, which work ok. If we do AVX2 pack this should
       * be extended, but we need to be able to tell the conversion code about
       * pack ordering first.)
       */
      unsigned ratio = 1;
      if (src_type.width == 2 * dst_type->width &&
          src_type.length == dst_type->length &&
          dst_type->floating == 0 && (num_srcs % 2 == 0) &&
          dst_type->width * dst_type->length == 64) {
         ratio = 2;
         num_dsts /= 2;
         dst_type->length *= 2;
      }
      for (i = 0; i < num_dsts; i++) {
         lp_build_conv(gallivm, src_type, *dst_type, &src[i*ratio], ratio, &dst[i], 1);
      }
   }

   return num_dsts;
}
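

/*
 * For example (an illustrative call, not taken from any particular caller):
 * with src_type = 4 x float32, num_srcs = 4 and *dst_type = 4 x unorm8 on
 * entry, the SSE2/Altivec special case above converts everything in one go,
 * rewrites dst_type->length to 16, and returns 1, i.e. a single 16 x unorm8
 * destination vector.
 */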


/**
 * Generic type conversion.
 *
 * TODO: Take a precision argument, or even better, add a new precision member
 * to the lp_type union.
 */
void
lp_build_conv(struct gallivm_state *gallivm,
              struct lp_type src_type,
              struct lp_type dst_type,
              const LLVMValueRef *src, unsigned num_srcs,
              LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type tmp_type;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned num_tmps;
   unsigned i;

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   tmp_type = src_type;
   for(i = 0; i < num_srcs; ++i) {
      assert(lp_check_value(src_type, src[i]));
      tmp[i] = src[i];
   }
   num_tmps = num_srcs;


   /*
    * Special case 4x4x32 --> 1x16x8, 2x4x32 -> 1x8x8, 1x4x32 -> 1x4x8
    * Only float -> s/unorm8 and (u)int32 -> (u)int8.
    * XXX: This should cover all interesting backend cases for 8 bit,
    * but should use the same strategy if dst is 16 bit.
    */
   if (src_type.norm == 0 &&
       src_type.width == 32 &&
       src_type.length == 4 &&
       src_type.fixed == 0 &&

       dst_type.floating == 0 &&
       dst_type.fixed == 0 &&
       dst_type.width == 8 &&

       ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) ||
        (src_type.floating == 0 && dst_type.floating == 0 &&
         src_type.sign == dst_type.sign && dst_type.norm == 0)) &&

       ((dst_type.length == 16 && 4 * num_dsts == num_srcs) ||
        (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) &&

       (util_get_cpu_caps()->has_sse2 || util_get_cpu_caps()->has_altivec))
   {
      struct lp_build_context bld;
      struct lp_type int16_type, int32_type;
      struct lp_type dst_type_ext = dst_type;
      LLVMValueRef const_scale;
      unsigned i, j;

      lp_build_context_init(&bld, gallivm, src_type);

      dst_type_ext.length = 16;
      int16_type = int32_type = dst_type_ext;

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type));

      for (i = 0; i < num_dsts; ++i, src += 4) {
         LLVMValueRef lo, hi;

         if (src_type.floating) {
            for (j = 0; j < dst_type.length / 4; ++j) {
               /*
                * XXX This is not actually fully correct. The float to int
                * conversion will produce 0x80000000 for everything
                * out of range and NaNs (on x86, llvm.x86.sse2.cvtps2dq).
                * Hence, NaNs and negatives will get clamped just fine to zero
                * (relying on clamping pack behavior) when converting to unorm;
                * however, too large values (both finite and infinite) will
                * also end up as zero, not 255.
                * For snorm, for now we'll keep bug compatibility with the
                * generic conversion path (meaning too large values are fine,
                * but NaNs get converted to -128 (purely by luck, as we don't
                * specify nan behavior for the max there) instead of 0).
                *
                * dEQP has GLES31 tests that expect +inf -> 255.0.
                */
               if (dst_type.sign) {
                  tmp[j] = lp_build_min(&bld, bld.one, src[j]);

               }
               else {
                  if (1) {
                     tmp[j] = lp_build_min_ext(&bld, bld.one, src[j],
                                               GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
                  }
                  tmp[j] = src[j];
               }
               tmp[j] = LLVMBuildFMul(builder, tmp[j], const_scale, "");
               tmp[j] = lp_build_iround(&bld, tmp[j]);
            }
         } else {
            for (j = 0; j < dst_type.length / 4; ++j) {
               if (!dst_type.sign) {
                  /*
                   * Pack clamp is always signed->unsigned (or signed->signed).
                   * Hence need min.
                   */
                  LLVMValueRef const_max;
                  const_max = lp_build_const_int_vec(gallivm, src_type, 255);
                  tmp[j] = lp_build_min(&bld, src[j], const_max);
               } else {
                  tmp[j] = src[j];
               }
            }
         }

         if (num_srcs == 1) {
            tmp[1] = tmp[0];
         }

         /* relying on clamping behavior of sse2 intrinsics here */
         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);

         if (num_srcs < 4) {
            hi = lo;
         }
         else {
            hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
         }
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
      }
      if (num_srcs < 4) {
         dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
      }

      return;
   }

   /* Special case 2x8x32 --> 1x16x8, 1x8x32 -> 1x8x8
    */
   else if (src_type.norm == 0 &&
            src_type.width == 32 &&
            src_type.length == 8 &&
            src_type.fixed == 0 &&

            dst_type.floating == 0 &&
            dst_type.fixed == 0 &&
            dst_type.width == 8 &&

            ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) ||
             (src_type.floating == 0 && dst_type.floating == 0 &&
              src_type.sign == dst_type.sign && dst_type.norm == 0)) &&

            ((dst_type.length == 16 && 2 * num_dsts == num_srcs) ||
             (num_dsts == 1 && dst_type.length * num_srcs == 8)) &&

            util_get_cpu_caps()->has_avx) {

      struct lp_build_context bld;
      struct lp_type int16_type, int32_type;
      struct lp_type dst_type_ext = dst_type;
      LLVMValueRef const_scale;
      unsigned i;

      lp_build_context_init(&bld, gallivm, src_type);

      dst_type_ext.length = 16;
      int16_type = int32_type = dst_type_ext;

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type));

      for (i = 0; i < num_dsts; ++i, src += 2) {
         unsigned j;
         for (j = 0; j < (num_srcs == 1 ? 1 : 2); j++) {
            LLVMValueRef lo, hi, a;

            a = src[j];
            if (src_type.floating) {
               if (dst_type.sign) {
                  a = lp_build_min(&bld, bld.one, a);

               }
               else {
                  if (1) {
                     a = lp_build_min_ext(&bld, bld.one, a,
                                          GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
                  }
               }
               a = LLVMBuildFMul(builder, a, const_scale, "");
               a = lp_build_iround(&bld, a);
            } else {
               if (!dst_type.sign) {
                  LLVMValueRef const_max;
                  const_max = lp_build_const_int_vec(gallivm, src_type, 255);
                  a = lp_build_min(&bld, a, const_max);
               }
            }
            lo = lp_build_extract_range(gallivm, a, 0, 4);
            hi = lp_build_extract_range(gallivm, a, 4, 4);
            /* relying on clamping behavior of sse2 intrinsics here */
            tmp[j] = lp_build_pack2(gallivm, int32_type, int16_type, lo, hi);
         }

         if (num_srcs == 1) {
            tmp[1] = tmp[0];
         }
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, tmp[0], tmp[1]);
      }

      if (num_srcs == 1) {
         dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
      }

      return;
   }

   /* Special case -> 16bit half-float
    */
   else if (dst_type.floating && dst_type.width == 16)
   {
      /* Only support src as 32bit float currently */
      assert(src_type.floating && src_type.width == 32);

      for(i = 0; i < num_tmps; ++i)
         dst[i] = lp_build_float_to_half(gallivm, tmp[i]);

      return;
   }

   /* Pre-convert half-floats to floats
    */
   else if (src_type.floating && src_type.width == 16)
   {
      for(i = 0; i < num_tmps; ++i)
         tmp[i] = lp_build_half_to_float(gallivm, tmp[i]);

      tmp_type.width = 32;
   }

   /*
    * Clamp if necessary
    */

   if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
      struct lp_build_context bld;
      double src_min = lp_const_min(src_type);
      double dst_min = lp_const_min(dst_type);
      double src_max = lp_const_max(src_type);
      double dst_max = lp_const_max(dst_type);
      LLVMValueRef thres;

      lp_build_context_init(&bld, gallivm, tmp_type);

      if(src_min < dst_min) {
         if(dst_min == 0.0)
            thres = bld.zero;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_min);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_max(&bld, tmp[i], thres);
      }

      if(src_max > dst_max) {
         if(dst_max == 1.0)
            thres = bld.one;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_max);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_min(&bld, tmp[i], thres);
      }
   }

   /*
    * Scale to the narrowest range
    */

   if(dst_type.floating) {
      /* Nothing to do */
   }
   else if(tmp_type.floating) {
      if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm,
                                                             tmp_type,
                                                             dst_type.width,
                                                             tmp[i]);
         }
         tmp_type.floating = FALSE;
      }
      else {
         double dst_scale = lp_const_scale(dst_type);

         if (dst_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }

         /*
          * These functions will use fptosi in some form, which won't work
          * with a 32bit uint dst. Causes lp_test_conv failures, though.
          */
         if (0)
            assert(dst_type.sign || dst_type.width < 32);

         if (dst_type.sign && dst_type.norm && !dst_type.fixed) {
            struct lp_build_context bld;

            lp_build_context_init(&bld, gallivm, tmp_type);
            for(i = 0; i < num_tmps; ++i) {
               tmp[i] = lp_build_iround(&bld, tmp[i]);
            }
            tmp_type.floating = FALSE;
         }
         else {
            LLVMTypeRef tmp_vec_type;

            tmp_type.floating = FALSE;
            tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
            for(i = 0; i < num_tmps; ++i) {
#if 0
               if(dst_type.sign)
                  tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
               else
                  tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
#else
               /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
               tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
#endif
            }
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);
      struct lp_build_context bld;
      lp_build_context_init(&bld, gallivm, tmp_type);

      /* Compensate for different offsets */
      /* sscaled -> unorm and similar would cause negative shift count, skip */
      if (dst_offset > src_offset && src_type.width > dst_type.width && src_shift > 0) {
         for (i = 0; i < num_tmps; ++i) {
            LLVMValueRef shifted;

            shifted = lp_build_shr_imm(&bld, tmp[i], src_shift - 1);
            tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, "");
         }
      }

      if(src_shift > dst_shift) {
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_shr_imm(&bld, tmp[i], src_shift - dst_shift);
      }
   }

   /*
    * Truncate or expand bit width
    *
    * No data conversion should happen here, although the sign bits are
    * crucial to avoid bad clamping.
    */

   {
      struct lp_type new_type;

      new_type = tmp_type;
      new_type.sign = dst_type.sign;
      new_type.width = dst_type.width;
      new_type.length = dst_type.length;

      /*
       * Note that resize when using packs can sometimes get min/max
       * clamping for free. Should be able to exploit this...
       */
      lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);

      tmp_type = new_type;
      num_tmps = num_dsts;
   }

   /*
    * Scale to the widest range
    */

   if(src_type.floating) {
      /* Nothing to do */
   }
   else if(!src_type.floating && dst_type.floating) {
      if(!src_type.fixed && !src_type.sign && src_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_unsigned_norm_to_float(gallivm,
                                                     src_type.width,
                                                     dst_type,
                                                     tmp[i]);
         }
         tmp_type.floating = TRUE;
      }
      else {
         double src_scale = lp_const_scale(src_type);
         LLVMTypeRef tmp_vec_type;

         /* Use an equally sized integer for intermediate computations */
         tmp_type.floating = TRUE;
         tmp_type.sign = TRUE;
         tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(dst_type.sign)
               tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
            tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
#endif
         }

         if (src_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }

         /* The formula above will produce values below -1.0 for the most
          * negative value, but everything seems happy with that, hence
          * disabled for now. */
         if (0 && !src_type.fixed && src_type.norm && src_type.sign) {
            struct lp_build_context bld;

            lp_build_context_init(&bld, gallivm, dst_type);
            for(i = 0; i < num_tmps; ++i) {
               tmp[i] = lp_build_max(&bld, tmp[i],
                                     lp_build_const_vec(gallivm, dst_type, -1.0f));
            }
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);
      struct lp_build_context bld;
      lp_build_context_init(&bld, gallivm, tmp_type);

      if (src_shift < dst_shift) {
         LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];

         if (dst_shift - src_shift < dst_type.width) {
            for (i = 0; i < num_tmps; ++i) {
               pre_shift[i] = tmp[i];
               tmp[i] = lp_build_shl_imm(&bld, tmp[i], dst_shift - src_shift);
            }
         }
         else {
            /*
             * This happens for things like sscaled -> unorm conversions. Shift
             * counts equal to the bit width cause undefined results, so hack
             * around it.
             */
            for (i = 0; i < num_tmps; ++i) {
               pre_shift[i] = tmp[i];
               tmp[i] = lp_build_zero(gallivm, dst_type);
            }
         }

         /* Compensate for different offsets */
         if (dst_offset > src_offset) {
            for (i = 0; i < num_tmps; ++i) {
               tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], "");
            }
         }
      }
   }

   for(i = 0; i < num_dsts; ++i) {
      dst[i] = tmp[i];
      assert(lp_check_value(dst_type, dst[i]));
   }
}
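

#if 0
/*
 * An SSE2 intrinsics sketch (deliberately not compiled) of what the
 * 4x4x32 -> 1x16x8 special case in lp_build_conv() above boils down to:
 * two signed 32->16 packs followed by a signed->unsigned 16->8 pack, with
 * the saturation of the pack instructions providing the final clamping.
 * Assumes <emmintrin.h>; a, b, c, d hold the already rounded int32 values.
 */
static __m128i
pack_4x_i32_to_16x_u8_sketch(__m128i a, __m128i b, __m128i c, __m128i d)
{
   __m128i lo = _mm_packs_epi32(a, b);    /* 8 x i16, signed saturation */
   __m128i hi = _mm_packs_epi32(c, d);    /* 8 x i16, signed saturation */
   return _mm_packus_epi16(lo, hi);       /* 16 x u8, unsigned saturation */
}
#endif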


/**
 * Bit mask conversion.
 *
 * This will convert the integer masks that match the given types.
 *
 * The mask values should be 0 or -1, i.e., all bits either set to zero or one.
 * Any other value will likely cause unpredictable results.
 *
 * This is basically a very trimmed down version of lp_build_conv.
 */
void
lp_build_conv_mask(struct gallivm_state *gallivm,
                   struct lp_type src_type,
                   struct lp_type dst_type,
                   const LLVMValueRef *src, unsigned num_srcs,
                   LLVMValueRef *dst, unsigned num_dsts)
{

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   /*
    * Drop the float/fixed/norm distinctions.
    *
    * We assume all values are 0 or -1.
    */

   src_type.floating = FALSE;
   src_type.fixed = FALSE;
   src_type.sign = TRUE;
   src_type.norm = FALSE;

   dst_type.floating = FALSE;
   dst_type.fixed = FALSE;
   dst_type.sign = TRUE;
   dst_type.norm = FALSE;

   /*
    * Truncate or expand bit width
    */

   lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
}