Path: blob/21.2-virgl/src/gallium/auxiliary/gallivm/lp_bld_pack.c
/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper functions for packing/unpacking.
 *
 * Packing/unpacking is necessary for conversion between types of different
 * bit width.
 *
 * They are also commonly used when a computation needs higher precision for
 * the intermediate values. For example, if one needs the function:
 *
 *   c = compute(a, b);
 *
 * to use more precision for intermediate results, then one should implement
 * it as:
 *
 *   LLVMValueRef
 *   compute(struct gallivm_state *gallivm, struct lp_type type,
 *           LLVMValueRef a, LLVMValueRef b)
 *   {
 *      struct lp_type wide_type = lp_wider_type(type);
 *      LLVMValueRef al, ah, bl, bh, cl, ch, c;
 *
 *      lp_build_unpack2(gallivm, type, wide_type, a, &al, &ah);
 *      lp_build_unpack2(gallivm, type, wide_type, b, &bl, &bh);
 *
 *      cl = compute_half(al, bl);
 *      ch = compute_half(ah, bh);
 *
 *      c = lp_build_pack2(gallivm, wide_type, type, cl, ch);
 *
 *      return c;
 *   }
 *
 * where compute_half() would do the computation for half the elements with
 * twice the precision.
 *
 * @author Jose Fonseca <[email protected]>
 */


#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"
#include "util/u_memory.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_arit.h"
#include "lp_bld_pack.h"
#include "lp_bld_swizzle.h"


/**
 * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
 */
static LLVMValueRef
lp_build_const_unpack_shuffle(struct gallivm_state *gallivm,
                              unsigned n, unsigned lo_hi)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i, j;

   assert(n <= LP_MAX_VECTOR_LENGTH);
   assert(lo_hi < 2);

   /* TODO: cache results in a static table */

   for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
      elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
      elems[i + 1] = lp_build_const_int32(gallivm, n + j);
   }

   return LLVMConstVector(elems, n);
}
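/*
 * For example, with n = 4 the loop above produces the indices
 * {0, 4, 1, 5} for lo_hi = 0 and {2, 6, 3, 7} for lo_hi = 1, so with
 * a = {a0 a1 a2 a3} and b = {b0 b1 b2 b3}:
 *
 *    shuffle = lp_build_const_unpack_shuffle(gallivm, 4, 0);
 *    lo = LLVMBuildShuffleVector(builder, a, b, shuffle, "");
 *
 * yields {a0 b0 a1 b1}, exactly the PUNPCKLDQ pattern; lo_hi = 1 would
 * yield {a2 b2 a3 b3} (PUNPCKHDQ).
 */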
/**
 * Similar to lp_build_const_unpack_shuffle but for special AVX 256bit unpack.
 * See comment above lp_build_interleave2_half for more details.
 */
static LLVMValueRef
lp_build_const_unpack_shuffle_half(struct gallivm_state *gallivm,
                                   unsigned n, unsigned lo_hi)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i, j;

   assert(n <= LP_MAX_VECTOR_LENGTH);
   assert(lo_hi < 2);

   for (i = 0, j = lo_hi*(n/4); i < n; i += 2, ++j) {
      if (i == (n / 2))
         j += n / 4;

      elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
      elems[i + 1] = lp_build_const_int32(gallivm, n + j);
   }

   return LLVMConstVector(elems, n);
}


/**
 * Similar to lp_build_const_unpack_shuffle_half, but for AVX512.
 * See comment above lp_build_interleave2_half for more details.
 */
static LLVMValueRef
lp_build_const_unpack_shuffle_16wide(struct gallivm_state *gallivm,
                                     unsigned lo_hi)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i, j;

   assert(lo_hi < 2);

   // for the following lo_hi setting, convert 0 -> f to:
   // 0: 0 16 4 20 8 24 12 28 1 17 5 21 9 25 13 29
   // 1: 2 18 6 22 10 26 14 30 3 19 7 23 11 27 15 31
   for (i = 0; i < 16; i++) {
      j = ((i&0x06)<<1) + ((i&1)<<4) + (i>>3) + (lo_hi<<1);

      elems[i] = lp_build_const_int32(gallivm, j);
   }

   return LLVMConstVector(elems, 16);
}


/**
 * Build shuffle vectors that match PACKxx (SSE) instructions or
 * VPERM (Altivec).
 */
static LLVMValueRef
lp_build_const_pack_shuffle(struct gallivm_state *gallivm, unsigned n)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   assert(n <= LP_MAX_VECTOR_LENGTH);

   for(i = 0; i < n; ++i)
#if UTIL_ARCH_LITTLE_ENDIAN
      elems[i] = lp_build_const_int32(gallivm, 2*i);
#else
      elems[i] = lp_build_const_int32(gallivm, 2*i+1);
#endif

   return LLVMConstVector(elems, n);
}


/**
 * Return a vector with elements src[start:start+size].
 * Most useful for getting half the values out of a 256bit sized vector,
 * otherwise may cause data rearrangement to happen.
 */
LLVMValueRef
lp_build_extract_range(struct gallivm_state *gallivm,
                       LLVMValueRef src,
                       unsigned start,
                       unsigned size)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   assert(size <= ARRAY_SIZE(elems));

   for (i = 0; i < size; ++i)
      elems[i] = lp_build_const_int32(gallivm, i + start);

   if (size == 1) {
      return LLVMBuildExtractElement(gallivm->builder, src, elems[0], "");
   }
   else {
      return LLVMBuildShuffleVector(gallivm->builder, src, src,
                                    LLVMConstVector(elems, size), "");
   }
}


/**
 * Concatenates several (must be a power of 2) vectors (of same type)
 * into a larger one.
 * Most useful for building up a 256bit sized vector out of two 128bit ones.
 */
LLVMValueRef
lp_build_concat(struct gallivm_state *gallivm,
                LLVMValueRef src[],
                struct lp_type src_type,
                unsigned num_vectors)
{
   unsigned new_length, i;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH/2];
   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];

   assert(src_type.length * num_vectors <= ARRAY_SIZE(shuffles));
   assert(util_is_power_of_two_or_zero(num_vectors));

   new_length = src_type.length;

   for (i = 0; i < num_vectors; i++)
      tmp[i] = src[i];

   while (num_vectors > 1) {
      num_vectors >>= 1;
      new_length <<= 1;
      for (i = 0; i < new_length; i++) {
         shuffles[i] = lp_build_const_int32(gallivm, i);
      }
      for (i = 0; i < num_vectors; i++) {
         tmp[i] = LLVMBuildShuffleVector(gallivm->builder, tmp[i*2], tmp[i*2 + 1],
                                         LLVMConstVector(shuffles, new_length), "");
      }
   }

   return tmp[0];
}
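/*
 * For example, splitting an 8x32 vector into its two 128bit halves and
 * joining them again is simply:
 *
 *    LLVMValueRef halves[2];
 *    halves[0] = lp_build_extract_range(gallivm, v, 0, 4);
 *    halves[1] = lp_build_extract_range(gallivm, v, 4, 4);
 *    v = lp_build_concat(gallivm, halves, half_type, 2);
 *
 * where half_type describes the 4x32 type of each half.
 */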
/**
 * Combines vectors to reduce from num_srcs to num_dsts.
 * Returns the number of src vectors concatenated in a single dst.
 *
 * num_srcs must be exactly divisible by num_dsts.
 *
 * e.g. For num_srcs = 4 and src = [x, y, z, w]
 *      num_dsts = 1  dst = [xyzw]    return = 4
 *      num_dsts = 2  dst = [xy, zw]  return = 2
 */
int
lp_build_concat_n(struct gallivm_state *gallivm,
                  struct lp_type src_type,
                  LLVMValueRef *src,
                  unsigned num_srcs,
                  LLVMValueRef *dst,
                  unsigned num_dsts)
{
   int size = num_srcs / num_dsts;
   unsigned i;

   assert(num_srcs >= num_dsts);
   assert((num_srcs % size) == 0);

   if (num_srcs == num_dsts) {
      for (i = 0; i < num_dsts; ++i) {
         dst[i] = src[i];
      }
      return 1;
   }

   for (i = 0; i < num_dsts; ++i) {
      dst[i] = lp_build_concat(gallivm, &src[i * size], src_type, size);
   }

   return size;
}


/**
 * Un-interleave vector.
 * This will return a vector consisting of every second element
 * (depending on lo_hi, beginning at 0 or 1).
 * The returned vector has only half the elements (and thus half the
 * total width) of the source vector.
 */
LLVMValueRef
lp_build_uninterleave1(struct gallivm_state *gallivm,
                       unsigned num_elems,
                       LLVMValueRef a,
                       unsigned lo_hi)
{
   LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
   unsigned i;
   assert(num_elems <= LP_MAX_VECTOR_LENGTH);

   for (i = 0; i < num_elems / 2; ++i)
      elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi);

   shuffle = LLVMConstVector(elems, num_elems / 2);

   return LLVMBuildShuffleVector(gallivm->builder, a, a, shuffle, "");
}
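/*
 * For example, with num_elems = 4 and a = {a0 a1 a2 a3}:
 *
 *    lp_build_uninterleave1(gallivm, 4, a, 0);   returns {a0 a2}
 *    lp_build_uninterleave1(gallivm, 4, a, 1);   returns {a1 a3}
 */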
/**
 * Interleave vector elements.
 *
 * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions
 * (but not for 256bit AVX vectors).
 */
LLVMValueRef
lp_build_interleave2(struct gallivm_state *gallivm,
                     struct lp_type type,
                     LLVMValueRef a,
                     LLVMValueRef b,
                     unsigned lo_hi)
{
   LLVMValueRef shuffle;

   if (type.length == 2 && type.width == 128 && util_get_cpu_caps()->has_avx) {
      /*
       * XXX: This is a workaround for llvm code generation deficiency. Strangely
       * enough, while this needs vinsertf128/vextractf128 instructions (hence
       * a natural match when using 2x128bit vectors) the "normal" unpack shuffle
       * generates code ranging from atrocious (llvm 3.1) to terrible (llvm 3.2, 3.3).
       * So use some different shuffles instead (the exact shuffles don't seem to
       * matter, as long as not using 128bit wide vectors; works with 8x32 or 4x64).
       */
      struct lp_type tmp_type = type;
      LLVMValueRef srchalf[2], tmpdst;
      tmp_type.length = 4;
      tmp_type.width = 64;
      a = LLVMBuildBitCast(gallivm->builder, a, lp_build_vec_type(gallivm, tmp_type), "");
      b = LLVMBuildBitCast(gallivm->builder, b, lp_build_vec_type(gallivm, tmp_type), "");
      srchalf[0] = lp_build_extract_range(gallivm, a, lo_hi * 2, 2);
      srchalf[1] = lp_build_extract_range(gallivm, b, lo_hi * 2, 2);
      tmp_type.length = 2;
      tmpdst = lp_build_concat(gallivm, srchalf, tmp_type, 2);
      return LLVMBuildBitCast(gallivm->builder, tmpdst, lp_build_vec_type(gallivm, type), "");
   }

   shuffle = lp_build_const_unpack_shuffle(gallivm, type.length, lo_hi);

   return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
}


/**
 * Interleave vector elements but with 256 (or 512) bit,
 * treats it as interleave with 2 concatenated 128 (or 256) bit vectors.
 *
 * This differs from lp_build_interleave2, which would do the following (for lo):
 *   a0 b0 a1 b1 a2 b2 a3 b3
 * an order that does not compile into an AVX unpack instruction.
 *
 * An example: interleaving 8x float with 8x float on an AVX 256bit unpack:
 *   a0 a1 a2 a3 a4 a5 a6 a7 <-> b0 b1 b2 b3 b4 b5 b6 b7
 *
 * Equivalent to interleaving 2x 128 bit vectors
 *   a0 a1 a2 a3 <-> b0 b1 b2 b3
 * concatenated with
 *   a4 a5 a6 a7 <-> b4 b5 b6 b7
 *
 * So interleave-lo would result in:
 *   a0 b0 a1 b1 a4 b4 a5 b5
 *
 * And interleave-hi would result in:
 *   a2 b2 a3 b3 a6 b6 a7 b7
 *
 * For 512 bits, the following are true:
 *
 * Interleave-lo would result in (capital letters denote hex indices):
 *   a0 b0 a1 b1 a4 b4 a5 b5 a8 b8 a9 b9 aC bC aD bD
 *
 * Interleave-hi would result in:
 *   a2 b2 a3 b3 a6 b6 a7 b7 aA bA aB bB aE bE aF bF
 */
LLVMValueRef
lp_build_interleave2_half(struct gallivm_state *gallivm,
                          struct lp_type type,
                          LLVMValueRef a,
                          LLVMValueRef b,
                          unsigned lo_hi)
{
   if (type.length * type.width == 256) {
      LLVMValueRef shuffle = lp_build_const_unpack_shuffle_half(gallivm, type.length, lo_hi);
      return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
   } else if ((type.length == 16) && (type.width == 32)) {
      LLVMValueRef shuffle = lp_build_const_unpack_shuffle_16wide(gallivm, lo_hi);
      return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
   } else {
      return lp_build_interleave2(gallivm, type, a, b, lo_hi);
   }
}
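/*
 * Usage sketch: for an 8x32 type on AVX,
 *
 *    lo = lp_build_interleave2_half(gallivm, type, a, b, 0);
 *    hi = lp_build_interleave2_half(gallivm, type, a, b, 1);
 *
 * should map to single vunpcklps/vunpckhps instructions, giving the
 * lane-split ordering documented above.
 */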
/**
 * Double the bit width.
 *
 * This only changes the number of bits with which the values are
 * represented, not the values themselves.
 */
void
lp_build_unpack2(struct gallivm_state *gallivm,
                 struct lp_type src_type,
                 struct lp_type dst_type,
                 LLVMValueRef src,
                 LLVMValueRef *dst_lo,
                 LLVMValueRef *dst_hi)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef msb;
   LLVMTypeRef dst_vec_type;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(dst_type.width == src_type.width * 2);
   assert(dst_type.length * 2 == src_type.length);

   if(dst_type.sign && src_type.sign) {
      /* Replicate the sign bit in the most significant bits */
      msb = LLVMBuildAShr(builder, src,
                          lp_build_const_int_vec(gallivm, src_type, src_type.width - 1), "");
   }
   else
      /* Most significant bits always zero */
      msb = lp_build_zero(gallivm, src_type);

   /* Interleave bits */
#if UTIL_ARCH_LITTLE_ENDIAN
   *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
   *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);
#else
   *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
   *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
#endif

   /* Cast the result into the new type (twice as wide) */

   dst_vec_type = lp_build_vec_type(gallivm, dst_type);

   *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
   *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
}


/**
 * Double the bit width, with an order which fits the cpu nicely.
 *
 * This only changes the number of bits with which the values are
 * represented, not the values themselves.
 *
 * The order of the results is not guaranteed, other than it will match
 * the corresponding lp_build_pack2_native call.
 */
void
lp_build_unpack2_native(struct gallivm_state *gallivm,
                        struct lp_type src_type,
                        struct lp_type dst_type,
                        LLVMValueRef src,
                        LLVMValueRef *dst_lo,
                        LLVMValueRef *dst_hi)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef msb;
   LLVMTypeRef dst_vec_type;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(dst_type.width == src_type.width * 2);
   assert(dst_type.length * 2 == src_type.length);

   if(dst_type.sign && src_type.sign) {
      /* Replicate the sign bit in the most significant bits */
      msb = LLVMBuildAShr(builder, src,
                          lp_build_const_int_vec(gallivm, src_type, src_type.width - 1), "");
   }
   else
      /* Most significant bits always zero */
      msb = lp_build_zero(gallivm, src_type);

   /* Interleave bits */
#if UTIL_ARCH_LITTLE_ENDIAN
   if (src_type.length * src_type.width == 256 && util_get_cpu_caps()->has_avx2) {
      *dst_lo = lp_build_interleave2_half(gallivm, src_type, src, msb, 0);
      *dst_hi = lp_build_interleave2_half(gallivm, src_type, src, msb, 1);
   } else {
      *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
      *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);
   }
#else
   *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
   *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
#endif

   /* Cast the result into the new type (twice as wide) */

   dst_vec_type = lp_build_vec_type(gallivm, dst_type);

   *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
   *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
}
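/*
 * Note how the interleave above is effectively a per-element sext/zext:
 * each narrow element is paired with either its arithmetic-shift sign
 * replica or with zero, and the bitcast then reads each pair as one wide
 * element. E.g. for int8 -> int16 (little endian) the element 0xff (-1)
 * is paired with msb 0xff and read back as 0xffff (-1); for unsigned
 * types the pair is 0xff/0x00, read back as 0x00ff (255).
 */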
/**
 * Expand the bit width.
 *
 * This only changes the number of bits with which the values are
 * represented, not the values themselves.
 */
void
lp_build_unpack(struct gallivm_state *gallivm,
                struct lp_type src_type,
                struct lp_type dst_type,
                LLVMValueRef src,
                LLVMValueRef *dst, unsigned num_dsts)
{
   unsigned num_tmps;
   unsigned i;

   /* Register width must remain constant */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length == dst_type.length * num_dsts);

   num_tmps = 1;
   dst[0] = src;

   while(src_type.width < dst_type.width) {
      struct lp_type tmp_type = src_type;

      tmp_type.width *= 2;
      tmp_type.length /= 2;

      for(i = num_tmps; i--; ) {
         lp_build_unpack2(gallivm, src_type, tmp_type, dst[i], &dst[2*i + 0],
                          &dst[2*i + 1]);
      }

      src_type = tmp_type;

      num_tmps *= 2;
   }

   assert(num_tmps == num_dsts);
}


/**
 * Non-interleaved pack.
 *
 * This will move values as
 *          (LSB)                             (MSB)
 *   lo =   l0 __ l1 __ l2 __ ..  __ ln __
 *   hi =   h0 __ h1 __ h2 __ ..  __ hn __
 *   res =  l0 l1 l2 .. ln h0 h1 h2 .. hn
 *
 * This only changes the number of bits with which the values are
 * represented, not the values themselves.
 *
 * It is assumed the values are already clamped into the destination type
 * range. Values outside that range will produce undefined results; use
 * lp_build_packs2 instead if that cannot be guaranteed.
 */
LLVMValueRef
lp_build_pack2(struct gallivm_state *gallivm,
               struct lp_type src_type,
               struct lp_type dst_type,
               LLVMValueRef lo,
               LLVMValueRef hi)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMValueRef shuffle;
   LLVMValueRef res = NULL;
   struct lp_type intr_type = dst_type;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   /* Check for special cases first */
   if ((util_get_cpu_caps()->has_sse2 || util_get_cpu_caps()->has_altivec) &&
       src_type.width * src_type.length >= 128) {
      const char *intrinsic = NULL;
      boolean swap_intrinsic_operands = FALSE;

      switch(src_type.width) {
      case 32:
         if (util_get_cpu_caps()->has_sse2) {
            if (dst_type.sign) {
               intrinsic = "llvm.x86.sse2.packssdw.128";
            } else {
               if (util_get_cpu_caps()->has_sse4_1) {
                  intrinsic = "llvm.x86.sse41.packusdw";
               }
            }
         } else if (util_get_cpu_caps()->has_altivec) {
            if (dst_type.sign) {
               intrinsic = "llvm.ppc.altivec.vpkswss";
            } else {
               intrinsic = "llvm.ppc.altivec.vpkuwus";
            }
#if UTIL_ARCH_LITTLE_ENDIAN
            swap_intrinsic_operands = TRUE;
#endif
         }
         break;
      case 16:
         if (dst_type.sign) {
            if (util_get_cpu_caps()->has_sse2) {
               intrinsic = "llvm.x86.sse2.packsswb.128";
            } else if (util_get_cpu_caps()->has_altivec) {
               intrinsic = "llvm.ppc.altivec.vpkshss";
#if UTIL_ARCH_LITTLE_ENDIAN
               swap_intrinsic_operands = TRUE;
#endif
            }
         } else {
            if (util_get_cpu_caps()->has_sse2) {
               intrinsic = "llvm.x86.sse2.packuswb.128";
            } else if (util_get_cpu_caps()->has_altivec) {
               intrinsic = "llvm.ppc.altivec.vpkshus";
#if UTIL_ARCH_LITTLE_ENDIAN
               swap_intrinsic_operands = TRUE;
#endif
            }
         }
         break;
      /* default uses generic shuffle below */
      }
      if (intrinsic) {
         if (src_type.width * src_type.length == 128) {
            LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
            if (swap_intrinsic_operands) {
               res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, hi, lo);
            } else {
               res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi);
            }
            if (dst_vec_type != intr_vec_type) {
               res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
            }
         }
         else {
            int num_split = src_type.width * src_type.length / 128;
            int i;
            int nlen = 128 / src_type.width;
            int lo_off = swap_intrinsic_operands ? nlen : 0;
            int hi_off = swap_intrinsic_operands ? 0 : nlen;
            struct lp_type ndst_type = lp_type_unorm(dst_type.width, 128);
            struct lp_type nintr_type = lp_type_unorm(intr_type.width, 128);
            LLVMValueRef tmpres[LP_MAX_VECTOR_WIDTH / 128];
            LLVMValueRef tmplo, tmphi;
            LLVMTypeRef ndst_vec_type = lp_build_vec_type(gallivm, ndst_type);
            LLVMTypeRef nintr_vec_type = lp_build_vec_type(gallivm, nintr_type);

            assert(num_split <= LP_MAX_VECTOR_WIDTH / 128);

            for (i = 0; i < num_split / 2; i++) {
               tmplo = lp_build_extract_range(gallivm,
                                              lo, i*nlen*2 + lo_off, nlen);
               tmphi = lp_build_extract_range(gallivm,
                                              lo, i*nlen*2 + hi_off, nlen);
               tmpres[i] = lp_build_intrinsic_binary(builder, intrinsic,
                                                     nintr_vec_type, tmplo, tmphi);
               if (ndst_vec_type != nintr_vec_type) {
                  tmpres[i] = LLVMBuildBitCast(builder, tmpres[i], ndst_vec_type, "");
               }
            }
            for (i = 0; i < num_split / 2; i++) {
               tmplo = lp_build_extract_range(gallivm,
                                              hi, i*nlen*2 + lo_off, nlen);
               tmphi = lp_build_extract_range(gallivm,
                                              hi, i*nlen*2 + hi_off, nlen);
               tmpres[i+num_split/2] = lp_build_intrinsic_binary(builder, intrinsic,
                                                                 nintr_vec_type,
                                                                 tmplo, tmphi);
               if (ndst_vec_type != nintr_vec_type) {
                  tmpres[i+num_split/2] = LLVMBuildBitCast(builder, tmpres[i+num_split/2],
                                                           ndst_vec_type, "");
               }
            }
            res = lp_build_concat(gallivm, tmpres, ndst_type, num_split);
         }
         return res;
      }
   }

   /* generic shuffle */
   lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
   hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");

   shuffle = lp_build_const_pack_shuffle(gallivm, dst_type.length);

   res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");

   return res;
}
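/*
 * For example, packing two 8x16 vectors whose values already fit into
 * 8 bits down to a single 16x8 vector is just:
 *
 *    res = lp_build_pack2(gallivm, type16, type8, lo, hi);
 *
 * which on SSE2 becomes a single packsswb/packuswb; type16 and type8
 * here stand for the corresponding 8x16 and 16x8 lp_type values.
 */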
/**
 * Non-interleaved native pack.
 *
 * Similar to lp_build_pack2, but the ordering of values is not
 * guaranteed, other than it will match lp_build_unpack2_native.
 *
 * In particular, with avx2, the lower and upper 128bits of the vectors will
 * be packed independently, so that (with 32bit->16bit values)
 *          (LSB)                                             (MSB)
 *   lo =   l0 __ l1 __ l2 __ l3 __ l4 __ l5 __ l6 __ l7 __
 *   hi =   h0 __ h1 __ h2 __ h3 __ h4 __ h5 __ h6 __ h7 __
 *   res =  l0 l1 l2 l3 h0 h1 h2 h3 l4 l5 l6 l7 h4 h5 h6 h7
 *
 * This only changes the number of bits with which the values are
 * represented, not the values themselves.
 *
 * It is assumed the values are already clamped into the destination type
 * range. Values outside that range will produce undefined results.
 */
LLVMValueRef
lp_build_pack2_native(struct gallivm_state *gallivm,
                      struct lp_type src_type,
                      struct lp_type dst_type,
                      LLVMValueRef lo,
                      LLVMValueRef hi)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type intr_type = dst_type;
   const char *intrinsic = NULL;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   /* At this point we only have a special case for avx2 */
   if (src_type.length * src_type.width == 256 &&
       util_get_cpu_caps()->has_avx2) {
      switch(src_type.width) {
      case 32:
         if (dst_type.sign) {
            intrinsic = "llvm.x86.avx2.packssdw";
         } else {
            intrinsic = "llvm.x86.avx2.packusdw";
         }
         break;
      case 16:
         if (dst_type.sign) {
            intrinsic = "llvm.x86.avx2.packsswb";
         } else {
            intrinsic = "llvm.x86.avx2.packuswb";
         }
         break;
      }
   }
   if (intrinsic) {
      LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
      return lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type,
                                       lo, hi);
   }
   else {
      return lp_build_pack2(gallivm, src_type, dst_type, lo, hi);
   }
}
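/*
 * The _native variants are meant to be used as a matched pair; the
 * intermediate lane order is unspecified but consistent:
 *
 *    lp_build_unpack2_native(gallivm, type8, type16, src, &lo, &hi);
 *    ... compute on lo/hi with 16 bits of precision ...
 *    res = lp_build_pack2_native(gallivm, type16, type8, lo, hi);
 *
 * res ends up in the original element order even though lo/hi need not;
 * type8/type16 stand for the corresponding lp_type values.
 */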
/**
 * Non-interleaved pack and saturate.
 *
 * Same as lp_build_pack2 but will saturate values so that they fit into the
 * destination type.
 */
LLVMValueRef
lp_build_packs2(struct gallivm_state *gallivm,
                struct lp_type src_type,
                struct lp_type dst_type,
                LLVMValueRef lo,
                LLVMValueRef hi)
{
   boolean clamp;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.sign == dst_type.sign);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   clamp = TRUE;

   /* All X86 SSE non-interleaved pack instructions take signed inputs and
    * saturate them, so no need to clamp for those cases. */
   if(util_get_cpu_caps()->has_sse2 &&
      src_type.width * src_type.length >= 128 &&
      src_type.sign &&
      (src_type.width == 32 || src_type.width == 16))
      clamp = FALSE;

   if(clamp) {
      struct lp_build_context bld;
      unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
      LLVMValueRef dst_max = lp_build_const_int_vec(gallivm, src_type,
                                ((unsigned long long)1 << dst_bits) - 1);
      lp_build_context_init(&bld, gallivm, src_type);
      lo = lp_build_min(&bld, lo, dst_max);
      hi = lp_build_min(&bld, hi, dst_max);
      /* FIXME: What about the lower bound? */
   }

   return lp_build_pack2(gallivm, src_type, dst_type, lo, hi);
}


/**
 * Truncate the bit width.
 *
 * TODO: Handle saturation consistently.
 */
LLVMValueRef
lp_build_pack(struct gallivm_state *gallivm,
              struct lp_type src_type,
              struct lp_type dst_type,
              boolean clamped,
              const LLVMValueRef *src, unsigned num_srcs)
{
   LLVMValueRef (*pack2)(struct gallivm_state *gallivm,
                         struct lp_type src_type,
                         struct lp_type dst_type,
                         LLVMValueRef lo,
                         LLVMValueRef hi);
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   /* Register width must remain constant */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length);

   if(clamped)
      pack2 = &lp_build_pack2;
   else
      pack2 = &lp_build_packs2;

   for(i = 0; i < num_srcs; ++i)
      tmp[i] = src[i];

   while(src_type.width > dst_type.width) {
      struct lp_type tmp_type = src_type;

      tmp_type.width /= 2;
      tmp_type.length *= 2;

      /* Take into consideration the sign changes only in the last step */
      if(tmp_type.width == dst_type.width)
         tmp_type.sign = dst_type.sign;

      num_srcs /= 2;

      for(i = 0; i < num_srcs; ++i)
         tmp[i] = pack2(gallivm, src_type, tmp_type,
                        tmp[2*i + 0], tmp[2*i + 1]);

      src_type = tmp_type;
   }

   assert(num_srcs == 1);

   return tmp[0];
}
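/*
 * For example, reducing four 4x32 vectors to a single 16x8 vector:
 *
 *    res = lp_build_pack(gallivm, type32, type8, FALSE, src, 4);
 *
 * runs two pack2 rounds internally (4x32 -> 8x16 twice, then
 * 8x16 -> 16x8), using the saturating lp_build_packs2 at each step
 * since clamped is FALSE; type32/type8 stand for the corresponding
 * 4x32 and 16x8 lp_type values.
 */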
/**
 * Truncate or expand the bitwidth.
 *
 * NOTE: Getting the right sign flags is crucial here, as we employ some
 * intrinsics that do saturation.
 */
void
lp_build_resize(struct gallivm_state *gallivm,
                struct lp_type src_type,
                struct lp_type dst_type,
                const LLVMValueRef *src, unsigned num_srcs,
                LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   /*
    * We don't support float <-> int conversion here. That must be done
    * before/after calling this function.
    */
   assert(src_type.floating == dst_type.floating);

   /*
    * We don't support double <-> float conversion yet, although it could be
    * added with little effort.
    */
   assert((!src_type.floating && !dst_type.floating) ||
          src_type.width == dst_type.width);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   if (src_type.width > dst_type.width) {
      /*
       * Truncate bit width.
       */

      /* Conversion must be M:1 */
      assert(num_dsts == 1);

      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
         /*
          * Register width remains constant -- use vector packing intrinsics
          */
         tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs);
      }
      else {
         if (src_type.width / dst_type.width > num_srcs) {
            /*
             * First change src vectors size (with shuffle) so they have the
             * same size as the destination vector, then pack normally.
             * Note: cannot use cast/extract because llvm generates atrocious code.
             */
            unsigned size_ratio = (src_type.width * src_type.length) /
                                  (dst_type.length * dst_type.width);
            unsigned new_length = src_type.length / size_ratio;

            for (i = 0; i < size_ratio * num_srcs; i++) {
               unsigned start_index = (i % size_ratio) * new_length;
               tmp[i] = lp_build_extract_range(gallivm, src[i / size_ratio],
                                               start_index, new_length);
            }
            num_srcs *= size_ratio;
            src_type.length = new_length;
            tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, tmp, num_srcs);
         }
         else {
            /*
             * Truncate bit width but expand vector size - first pack
             * then expand simply because this should be more AVX-friendly
             * for the cases we probably hit.
             */
            unsigned size_ratio = (dst_type.width * dst_type.length) /
                                  (src_type.length * src_type.width);
            unsigned num_pack_srcs = num_srcs / size_ratio;
            dst_type.length = dst_type.length / size_ratio;

            for (i = 0; i < size_ratio; i++) {
               tmp[i] = lp_build_pack(gallivm, src_type, dst_type, TRUE,
                                      &src[i*num_pack_srcs], num_pack_srcs);
            }
            tmp[0] = lp_build_concat(gallivm, tmp, dst_type, size_ratio);
         }
      }
   }
   else if (src_type.width < dst_type.width) {
      /*
       * Expand bit width.
       */

      /* Conversion must be 1:N */
      assert(num_srcs == 1);

      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
         /*
          * Register width remains constant -- use vector unpack intrinsics
          */
         lp_build_unpack(gallivm, src_type, dst_type, src[0], tmp, num_dsts);
      }
      else {
         /*
          * Do it element-wise.
          */
         assert(src_type.length * num_srcs == dst_type.length * num_dsts);

         for (i = 0; i < num_dsts; i++) {
            tmp[i] = lp_build_undef(gallivm, dst_type);
         }

         for (i = 0; i < src_type.length; ++i) {
            unsigned j = i / dst_type.length;
            LLVMValueRef srcindex = lp_build_const_int32(gallivm, i);
            LLVMValueRef dstindex = lp_build_const_int32(gallivm, i % dst_type.length);
            LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], srcindex, "");

            if (src_type.sign && dst_type.sign) {
               val = LLVMBuildSExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
            } else {
               val = LLVMBuildZExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
            }
            tmp[j] = LLVMBuildInsertElement(builder, tmp[j], val, dstindex, "");
         }
      }
   }
   else {
      /*
       * No-op
       */

      /* "Conversion" must be N:N */
      assert(num_srcs == num_dsts);

      for(i = 0; i < num_dsts; ++i)
         tmp[i] = src[i];
   }

   for(i = 0; i < num_dsts; ++i)
      dst[i] = tmp[i];
}
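/*
 * Usage sketch: widening one 16x8 vector into four 4x32 vectors
 * (e.g. 8bit color channels to 32bit integers) is a 1:4 expansion:
 *
 *    LLVMValueRef wide[4];
 *    lp_build_resize(gallivm, type8, type32, &v, 1, wide, 4);
 *
 * The register width stays at 128 bits, so this takes the unpack path
 * above; type8/type32 stand for the 16x8 and 4x32 lp_type values.
 */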
"");1002}1003}1004}1005else {1006/*1007* No-op1008*/10091010/* "Conversion" must be N:N */1011assert(num_srcs == num_dsts);10121013for(i = 0; i < num_dsts; ++i)1014tmp[i] = src[i];1015}10161017for(i = 0; i < num_dsts; ++i)1018dst[i] = tmp[i];1019}102010211022/**1023* Expands src vector from src.length to dst_length1024*/1025LLVMValueRef1026lp_build_pad_vector(struct gallivm_state *gallivm,1027LLVMValueRef src,1028unsigned dst_length)1029{1030LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];1031LLVMValueRef undef;1032LLVMTypeRef type;1033unsigned i, src_length;10341035type = LLVMTypeOf(src);10361037if (LLVMGetTypeKind(type) != LLVMVectorTypeKind) {1038/* Can't use ShuffleVector on non-vector type */1039undef = LLVMGetUndef(LLVMVectorType(type, dst_length));1040return LLVMBuildInsertElement(gallivm->builder, undef, src, lp_build_const_int32(gallivm, 0), "");1041}10421043undef = LLVMGetUndef(type);1044src_length = LLVMGetVectorSize(type);10451046assert(dst_length <= ARRAY_SIZE(elems));1047assert(dst_length >= src_length);10481049if (src_length == dst_length)1050return src;10511052/* All elements from src vector */1053for (i = 0; i < src_length; ++i)1054elems[i] = lp_build_const_int32(gallivm, i);10551056/* Undef fill remaining space */1057for (i = src_length; i < dst_length; ++i)1058elems[i] = lp_build_const_int32(gallivm, src_length);10591060/* Combine the two vectors */1061return LLVMBuildShuffleVector(gallivm->builder, src, undef, LLVMConstVector(elems, dst_length), "");1062}106310641065