Path: blob/21.2-virgl/src/gallium/auxiliary/gallivm/lp_bld_gather.c
4565 views
/**************************************************************************1*2* Copyright 2010 VMware, Inc.3* All Rights Reserved.4*5* Permission is hereby granted, free of charge, to any person obtaining a6* copy of this software and associated documentation files (the7* "Software"), to deal in the Software without restriction, including8* without limitation the rights to use, copy, modify, merge, publish,9* distribute, sub license, and/or sell copies of the Software, and to10* permit persons to whom the Software is furnished to do so, subject to11* the following conditions:12*13* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR14* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,15* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL16* THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,17* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR18* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE19* USE OR OTHER DEALINGS IN THE SOFTWARE.20*21* The above copyright notice and this permission notice (including the22* next paragraph) shall be included in all copies or substantial portions23* of the Software.24*25**************************************************************************/262728#include "util/u_debug.h"29#include "util/u_cpu_detect.h"30#include "util/u_math.h"31#include "lp_bld_debug.h"32#include "lp_bld_const.h"33#include "lp_bld_format.h"34#include "lp_bld_gather.h"35#include "lp_bld_swizzle.h"36#include "lp_bld_type.h"37#include "lp_bld_init.h"38#include "lp_bld_intr.h"39#include "lp_bld_pack.h"404142/**43* Get the pointer to one element from scatter positions in memory.44*45* @sa lp_build_gather()46*/47LLVMValueRef48lp_build_gather_elem_ptr(struct gallivm_state *gallivm,49unsigned length,50LLVMValueRef base_ptr,51LLVMValueRef offsets,52unsigned i)53{54LLVMValueRef offset;55LLVMValueRef ptr;5657assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));5859if (length == 1) {60assert(i == 0);61offset = offsets;62} else {63LLVMValueRef index = lp_build_const_int32(gallivm, i);64offset = LLVMBuildExtractElement(gallivm->builder, offsets, index, "");65}6667ptr = LLVMBuildGEP(gallivm->builder, base_ptr, &offset, 1, "");6869return ptr;70}717273/**74* Gather one element from scatter positions in memory.75*76* @sa lp_build_gather()77*/78LLVMValueRef79lp_build_gather_elem(struct gallivm_state *gallivm,80unsigned length,81unsigned src_width,82unsigned dst_width,83boolean aligned,84LLVMValueRef base_ptr,85LLVMValueRef offsets,86unsigned i,87boolean vector_justify)88{89LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, src_width);90LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);91LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width);92LLVMValueRef ptr;93LLVMValueRef res;9495assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));9697ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);98ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, "");99res = LLVMBuildLoad(gallivm->builder, ptr, "");100101/* XXX102* On some archs we probably really want to avoid having to deal103* with alignments lower than 4 bytes (if fetch size is a power of104* two >= 32). On x86 it doesn't matter, however.105* We should be able to guarantee full alignment for any kind of texture106* fetch (except ARB_texture_buffer_range, oops), but not vertex fetch107* (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends108* but I don't think that's quite what we wanted).109* For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT110* looks like a good fit, but it seems this cap bit (and OpenGL) aren't111* enforcing what we want (which is what d3d10 does, the offset needs to112* be aligned to element size, but GL has bytes regardless of element113* size which would only leave us with minimum alignment restriction of 16114* which doesn't make much sense if the type isn't 4x32bit). Due to115* translation of offsets to first_elem in sampler_views it actually seems116* gallium could not do anything else except 16 no matter what...117*/118if (!aligned) {119LLVMSetAlignment(res, 1);120} else if (!util_is_power_of_two_or_zero(src_width)) {121/*122* Full alignment is impossible, assume the caller really meant123* the individual elements were aligned (e.g. 3x32bit format).124* And yes the generated code may otherwise crash, llvm will125* really assume 128bit alignment with a 96bit fetch (I suppose126* that makes sense as it can just assume the upper 32bit to be127* whatever).128* Maybe the caller should be able to explicitly set this, but129* this should cover all the 3-channel formats.130*/131if (((src_width / 24) * 24 == src_width) &&132util_is_power_of_two_or_zero(src_width / 24)) {133LLVMSetAlignment(res, src_width / 24);134} else {135LLVMSetAlignment(res, 1);136}137}138139assert(src_width <= dst_width);140if (src_width < dst_width) {141res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");142if (vector_justify) {143#if UTIL_ARCH_BIG_ENDIAN144res = LLVMBuildShl(gallivm->builder, res,145LLVMConstInt(dst_elem_type, dst_width - src_width, 0), "");146#endif147}148}149150return res;151}152153154/**155* Gather one element from scatter positions in memory.156* Nearly the same as above, however the individual elements157* may be vectors themselves, and fetches may be float type.158* Can also do pad vector instead of ZExt.159*160* @sa lp_build_gather()161*/162static LLVMValueRef163lp_build_gather_elem_vec(struct gallivm_state *gallivm,164unsigned length,165unsigned src_width,166LLVMTypeRef src_type,167struct lp_type dst_type,168boolean aligned,169LLVMValueRef base_ptr,170LLVMValueRef offsets,171unsigned i,172boolean vector_justify)173{174LLVMValueRef ptr, res;175LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);176assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));177178ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);179ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, "");180res = LLVMBuildLoad(gallivm->builder, ptr, "");181182/* XXX183* On some archs we probably really want to avoid having to deal184* with alignments lower than 4 bytes (if fetch size is a power of185* two >= 32). On x86 it doesn't matter, however.186* We should be able to guarantee full alignment for any kind of texture187* fetch (except ARB_texture_buffer_range, oops), but not vertex fetch188* (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends189* but I don't think that's quite what we wanted).190* For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT191* looks like a good fit, but it seems this cap bit (and OpenGL) aren't192* enforcing what we want (which is what d3d10 does, the offset needs to193* be aligned to element size, but GL has bytes regardless of element194* size which would only leave us with minimum alignment restriction of 16195* which doesn't make much sense if the type isn't 4x32bit). Due to196* translation of offsets to first_elem in sampler_views it actually seems197* gallium could not do anything else except 16 no matter what...198*/199if (!aligned) {200LLVMSetAlignment(res, 1);201} else if (!util_is_power_of_two_or_zero(src_width)) {202/*203* Full alignment is impossible, assume the caller really meant204* the individual elements were aligned (e.g. 3x32bit format).205* And yes the generated code may otherwise crash, llvm will206* really assume 128bit alignment with a 96bit fetch (I suppose207* that makes sense as it can just assume the upper 32bit to be208* whatever).209* Maybe the caller should be able to explicitly set this, but210* this should cover all the 3-channel formats.211*/212if (((src_width / 24) * 24 == src_width) &&213util_is_power_of_two_or_zero(src_width / 24)) {214LLVMSetAlignment(res, src_width / 24);215} else {216LLVMSetAlignment(res, 1);217}218}219220assert(src_width <= dst_type.width * dst_type.length);221if (src_width < dst_type.width * dst_type.length) {222if (dst_type.length > 1) {223res = lp_build_pad_vector(gallivm, res, dst_type.length);224/*225* vector_justify hopefully a non-issue since we only deal226* with src_width >= 32 here?227*/228} else {229LLVMTypeRef dst_elem_type = lp_build_vec_type(gallivm, dst_type);230231/*232* Only valid if src_ptr_type is int type...233*/234res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");235236#if UTIL_ARCH_BIG_ENDIAN237if (vector_justify) {238res = LLVMBuildShl(gallivm->builder, res,239LLVMConstInt(dst_elem_type,240dst_type.width - src_width, 0), "");241}242if (src_width == 48) {243/* Load 3x16 bit vector.244* The sequence of loads on big-endian hardware proceeds as follows.245* 16-bit fields are denoted by X, Y, Z, and 0. In memory, the sequence246* of three fields appears in the order X, Y, Z.247*248* Load 32-bit word: 0.0.X.Y249* Load 16-bit halfword: 0.0.0.Z250* Rotate left: 0.X.Y.0251* Bitwise OR: 0.X.Y.Z252*253* The order in which we need the fields in the result is 0.Z.Y.X,254* the same as on little-endian; permute 16-bit fields accordingly255* within 64-bit register:256*/257LLVMValueRef shuffles[4] = {258lp_build_const_int32(gallivm, 2),259lp_build_const_int32(gallivm, 1),260lp_build_const_int32(gallivm, 0),261lp_build_const_int32(gallivm, 3),262};263res = LLVMBuildBitCast(gallivm->builder, res,264lp_build_vec_type(gallivm, lp_type_uint_vec(16, 4*16)), "");265res = LLVMBuildShuffleVector(gallivm->builder, res, res, LLVMConstVector(shuffles, 4), "");266res = LLVMBuildBitCast(gallivm->builder, res, dst_elem_type, "");267}268#endif269}270}271return res;272}273274275276277static LLVMValueRef278lp_build_gather_avx2(struct gallivm_state *gallivm,279unsigned length,280unsigned src_width,281struct lp_type dst_type,282LLVMValueRef base_ptr,283LLVMValueRef offsets)284{285LLVMBuilderRef builder = gallivm->builder;286LLVMTypeRef src_type, src_vec_type;287LLVMValueRef res;288struct lp_type res_type = dst_type;289res_type.length *= length;290291if (dst_type.floating) {292src_type = src_width == 64 ? LLVMDoubleTypeInContext(gallivm->context) :293LLVMFloatTypeInContext(gallivm->context);294} else {295src_type = LLVMIntTypeInContext(gallivm->context, src_width);296}297src_vec_type = LLVMVectorType(src_type, length);298299/* XXX should allow hw scaling (can handle i8, i16, i32, i64 for x86) */300assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));301302if (0) {303/*304* XXX: This will cause LLVM pre 3.7 to hang; it works on LLVM 3.8 but305* will not use the AVX2 gather instrinsics (even with llvm 4.0), at306* least with Haswell. See307* http://lists.llvm.org/pipermail/llvm-dev/2016-January/094448.html308* And the generated code doing the emulation is quite a bit worse309* than what we get by doing it ourselves too.310*/311LLVMTypeRef i32_type = LLVMIntTypeInContext(gallivm->context, 32);312LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length);313LLVMTypeRef i1_type = LLVMIntTypeInContext(gallivm->context, 1);314LLVMTypeRef i1_vec_type = LLVMVectorType(i1_type, length);315LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);316LLVMValueRef src_ptr;317318base_ptr = LLVMBuildBitCast(builder, base_ptr, src_ptr_type, "");319320/* Rescale offsets from bytes to elements */321LLVMValueRef scale = LLVMConstInt(i32_type, src_width/8, 0);322scale = lp_build_broadcast(gallivm, i32_vec_type, scale);323assert(LLVMTypeOf(offsets) == i32_vec_type);324offsets = LLVMBuildSDiv(builder, offsets, scale, "");325326src_ptr = LLVMBuildGEP(builder, base_ptr, &offsets, 1, "vector-gep");327328char intrinsic[64];329snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%u%s%u",330length, dst_type.floating ? "f" : "i", src_width);331LLVMValueRef alignment = LLVMConstInt(i32_type, src_width/8, 0);332LLVMValueRef mask = LLVMConstAllOnes(i1_vec_type);333LLVMValueRef passthru = LLVMGetUndef(src_vec_type);334335LLVMValueRef args[] = { src_ptr, alignment, mask, passthru };336337res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 4, 0);338} else {339LLVMTypeRef i8_type = LLVMIntTypeInContext(gallivm->context, 8);340const char *intrinsic = NULL;341unsigned l_idx = 0;342343assert(src_width == 32 || src_width == 64);344if (src_width == 32) {345assert(length == 4 || length == 8);346} else {347assert(length == 2 || length == 4);348}349350static const char *intrinsics[2][2][2] = {351352{{"llvm.x86.avx2.gather.d.d",353"llvm.x86.avx2.gather.d.d.256"},354{"llvm.x86.avx2.gather.d.q",355"llvm.x86.avx2.gather.d.q.256"}},356357{{"llvm.x86.avx2.gather.d.ps",358"llvm.x86.avx2.gather.d.ps.256"},359{"llvm.x86.avx2.gather.d.pd",360"llvm.x86.avx2.gather.d.pd.256"}},361};362363if ((src_width == 32 && length == 8) ||364(src_width == 64 && length == 4)) {365l_idx = 1;366}367intrinsic = intrinsics[dst_type.floating][src_width == 64][l_idx];368369LLVMValueRef passthru = LLVMGetUndef(src_vec_type);370LLVMValueRef mask = LLVMConstAllOnes(src_vec_type);371mask = LLVMConstBitCast(mask, src_vec_type);372LLVMValueRef scale = LLVMConstInt(i8_type, 1, 0);373374LLVMValueRef args[] = { passthru, base_ptr, offsets, mask, scale };375376res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 5, 0);377}378res = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, res_type), "");379380return res;381}382383384/**385* Gather elements from scatter positions in memory into a single vector.386* Use for fetching texels from a texture.387* For SSE, typical values are length=4, src_width=32, dst_width=32.388*389* When src_width < dst_width, the return value can be justified in390* one of two ways:391* "integer justification" is used when the caller treats the destination392* as a packed integer bitmask, as described by the channels' "shift" and393* "width" fields;394* "vector justification" is used when the caller casts the destination395* to a vector and needs channel X to be in vector element 0.396*397* @param length length of the offsets398* @param src_width src element width in bits399* @param dst_type result element type (src will be expanded to fit,400* but truncation is not allowed)401* (this may be a vector, must be pot sized)402* @param aligned whether the data is guaranteed to be aligned (to src_width)403* @param base_ptr base pointer, needs to be a i8 pointer type.404* @param offsets vector with offsets405* @param vector_justify select vector rather than integer justification406*/407LLVMValueRef408lp_build_gather(struct gallivm_state *gallivm,409unsigned length,410unsigned src_width,411struct lp_type dst_type,412boolean aligned,413LLVMValueRef base_ptr,414LLVMValueRef offsets,415boolean vector_justify)416{417LLVMValueRef res;418boolean need_expansion = src_width < dst_type.width * dst_type.length;419boolean vec_fetch;420struct lp_type fetch_type, fetch_dst_type;421LLVMTypeRef src_type;422423assert(src_width <= dst_type.width * dst_type.length);424425/*426* This is quite a mess...427* Figure out if the fetch should be done as:428* a) scalar or vector429* b) float or int430*431* As an example, for a 96bit fetch expanded into 4x32bit, it is better432* to use (3x32bit) vector type (then pad the vector). Otherwise, the433* zext will cause extra instructions.434* However, the same isn't true for 3x16bit (the codegen for that is435* completely worthless on x86 simd, and for 3x8bit is is way worse436* still, don't try that... (To get really good code out of llvm for437* these cases, the only way is to decompose the fetches manually438* into 1x32bit/1x16bit, or 1x16/1x8bit respectively, although the latter439* case requires sse41, otherwise simple scalar zext is way better.440* But probably not important enough, so don't bother.)441* Also, we try to honor the floating bit of destination (but isn't442* possible if caller asks for instance for 2x32bit dst_type with443* 48bit fetch - the idea would be to use 3x16bit fetch, pad and444* cast to 2x32f type, so the fetch is always int and on top of that445* we avoid the vec pad and use scalar zext due the above mentioned446* issue).447* Note this is optimized for x86 sse2 and up backend. Could be tweaked448* for other archs if necessary...449*/450if (((src_width % 32) == 0) && ((src_width % dst_type.width) == 0) &&451(dst_type.length > 1)) {452/* use vector fetch (if dst_type is vector) */453vec_fetch = TRUE;454if (dst_type.floating) {455fetch_type = lp_type_float_vec(dst_type.width, src_width);456} else {457fetch_type = lp_type_int_vec(dst_type.width, src_width);458}459/* intentionally not using lp_build_vec_type here */460src_type = LLVMVectorType(lp_build_elem_type(gallivm, fetch_type),461fetch_type.length);462fetch_dst_type = fetch_type;463fetch_dst_type.length = dst_type.length;464} else {465/* use scalar fetch */466vec_fetch = FALSE;467if (dst_type.floating && ((src_width == 32) || (src_width == 64))) {468fetch_type = lp_type_float(src_width);469} else {470fetch_type = lp_type_int(src_width);471}472src_type = lp_build_vec_type(gallivm, fetch_type);473fetch_dst_type = fetch_type;474fetch_dst_type.width = dst_type.width * dst_type.length;475}476477if (length == 1) {478/* Scalar */479res = lp_build_gather_elem_vec(gallivm, length,480src_width, src_type, fetch_dst_type,481aligned, base_ptr, offsets, 0,482vector_justify);483return LLVMBuildBitCast(gallivm->builder, res,484lp_build_vec_type(gallivm, dst_type), "");485/*486* Excluding expansion from these paths because if you need it for487* 32bit/64bit fetches you're doing it wrong (this is gather, not488* conversion) and it would be awkward for floats.489*/490} else if (util_get_cpu_caps()->has_avx2 && !need_expansion &&491src_width == 32 && (length == 4 || length == 8)) {492return lp_build_gather_avx2(gallivm, length, src_width, dst_type,493base_ptr, offsets);494/*495* This looks bad on paper wrt throughtput/latency on Haswell.496* Even on Broadwell it doesn't look stellar.497* Albeit no measurements were done (but tested to work).498* Should definitely enable on Skylake.499* (In general, should be more of a win if the fetch is 256bit wide -500* this is true for the 32bit case above too.)501*/502} else if (0 && util_get_cpu_caps()->has_avx2 && !need_expansion &&503src_width == 64 && (length == 2 || length == 4)) {504return lp_build_gather_avx2(gallivm, length, src_width, dst_type,505base_ptr, offsets);506} else {507/* Vector */508509LLVMValueRef elems[LP_MAX_VECTOR_WIDTH / 8];510unsigned i;511boolean vec_zext = FALSE;512struct lp_type res_type, gather_res_type;513LLVMTypeRef res_t, gather_res_t;514515res_type = fetch_dst_type;516res_type.length *= length;517gather_res_type = res_type;518519if (src_width == 16 && dst_type.width == 32 && dst_type.length == 1) {520/*521* Note that llvm is never able to optimize zext/insert combos522* directly (i.e. zero the simd reg, then place the elements into523* the appropriate place directly). (I think this has to do with524* scalar/vector transition.) And scalar 16->32bit zext simd loads525* aren't possible (instead loading to scalar reg first).526* No idea about other archs...527* We could do this manually, but instead we just use a vector528* zext, which is simple enough (and, in fact, llvm might optimize529* this away).530* (We're not trying that with other bit widths as that might not be531* easier, in particular with 8 bit values at least with only sse2.)532*/533assert(vec_fetch == FALSE);534gather_res_type.width /= 2;535fetch_dst_type = fetch_type;536src_type = lp_build_vec_type(gallivm, fetch_type);537vec_zext = TRUE;538}539res_t = lp_build_vec_type(gallivm, res_type);540gather_res_t = lp_build_vec_type(gallivm, gather_res_type);541res = LLVMGetUndef(gather_res_t);542for (i = 0; i < length; ++i) {543LLVMValueRef index = lp_build_const_int32(gallivm, i);544elems[i] = lp_build_gather_elem_vec(gallivm, length,545src_width, src_type, fetch_dst_type,546aligned, base_ptr, offsets, i,547vector_justify);548if (!vec_fetch) {549res = LLVMBuildInsertElement(gallivm->builder, res, elems[i], index, "");550}551}552if (vec_zext) {553res = LLVMBuildZExt(gallivm->builder, res, res_t, "");554if (vector_justify) {555#if UTIL_ARCH_BIG_ENDIAN556unsigned sv = dst_type.width - src_width;557res = LLVMBuildShl(gallivm->builder, res,558lp_build_const_int_vec(gallivm, res_type, sv), "");559#endif560}561}562if (vec_fetch) {563/*564* Do bitcast now otherwise llvm might get some funny ideas wrt565* float/int types...566*/567for (i = 0; i < length; i++) {568elems[i] = LLVMBuildBitCast(gallivm->builder, elems[i],569lp_build_vec_type(gallivm, dst_type), "");570}571res = lp_build_concat(gallivm, elems, dst_type, length);572} else {573struct lp_type really_final_type = dst_type;574assert(res_type.length * res_type.width ==575dst_type.length * dst_type.width * length);576really_final_type.length *= length;577res = LLVMBuildBitCast(gallivm->builder, res,578lp_build_vec_type(gallivm, really_final_type), "");579}580}581582return res;583}584585LLVMValueRef586lp_build_gather_values(struct gallivm_state * gallivm,587LLVMValueRef * values,588unsigned value_count)589{590LLVMTypeRef vec_type = LLVMVectorType(LLVMTypeOf(values[0]), value_count);591LLVMBuilderRef builder = gallivm->builder;592LLVMValueRef vec = LLVMGetUndef(vec_type);593unsigned i;594595for (i = 0; i < value_count; i++) {596LLVMValueRef index = lp_build_const_int32(gallivm, i);597vec = LLVMBuildInsertElement(builder, vec, values[i], index, "");598}599return vec;600}601602603