Path: blob/21.2-virgl/src/gallium/auxiliary/gallivm/lp_bld_arit.c
/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. Reasons are:
 * - it is very easy given we have all necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - We often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in [0, 1] range.
 *
 * @author Jose Fonseca <[email protected]>
 */


#include <float.h>

#include <llvm/Config/llvm-config.h>

#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_arit.h"
#include "lp_bld_flow.h"

#if defined(PIPE_ARCH_SSE)
#include <xmmintrin.h>
#endif

#ifndef _MM_DENORMALS_ZERO_MASK
#define _MM_DENORMALS_ZERO_MASK 0x0040
#endif

#ifndef _MM_FLUSH_ZERO_MASK
#define _MM_FLUSH_ZERO_MASK 0x8000
#endif

#define EXP_POLY_DEGREE 5

#define LOG_POLY_DEGREE 4


/**
 * Generate min(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_get_cpu_caps()->has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.min.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
            intrinsic = "llvm.x86.sse.min.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.min.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
            intrinsic = "llvm.x86.sse2.min.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_get_cpu_caps()->has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vminfp";
         intr_size = 128;
      }
   } else if (util_get_cpu_caps()->has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminub";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsw";
         }
      }
   }

   if (intrinsic) {
      /* We need to handle NaNs for floating point numbers. If one of the
       * inputs is NaN the other should be returned (required by both D3D10+
       * and OpenCL).
       * The SSE intrinsics return the second operand in case of NaN by
       * default, so we need special code to handle those.
       */
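      /*
       * Minimal worked example of the select below (illustrative comment,
       * not from the original source): SSE minps(a, b) returns b whenever
       * either input is NaN. So for min = minps(a, b):
       *   a = NaN, b = 2.0  ->  min = 2.0, the non-NaN, already correct;
       *   a = 2.0, b = NaN  ->  min = NaN, so we must detect isnan(b) and
       *                         select a instead.
       * Hence a single isnan(b) test suffices to get "return the other
       * operand" semantics.
       */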
      if (util_get_cpu_caps()->has_sse && type.floating &&
          nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
         LLVMValueRef isnan, min;
         min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type,
                                                   intr_size, a, b);
         isnan = lp_build_isnan(bld, b);
         return lp_build_select(bld, isnan, a, min);
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type,
                                                    intr_size, a, b);
      }
   }

   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
         break;
      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
         return lp_build_select(bld, cond, b, a);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
         break;
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}


LLVMValueRef
lp_build_fmuladd(LLVMBuilderRef builder,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 LLVMValueRef c)
{
   LLVMTypeRef type = LLVMTypeOf(a);
   assert(type == LLVMTypeOf(b));
   assert(type == LLVMTypeOf(c));

   char intrinsic[32];
   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
   LLVMValueRef args[] = { a, b, c };
   return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
}
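/*
 * Illustrative note (not from the original source): lp_format_intrinsic
 * mangles the argument type into the name, so for a <4 x float> argument
 * the call above resolves to the overloaded "llvm.fmuladd.v4f32" intrinsic,
 * and for a plain float to "llvm.fmuladd.f32".
 */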
/**
 * Generate max(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_get_cpu_caps()->has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.max.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
            intrinsic = "llvm.x86.sse.max.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.max.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
            intrinsic = "llvm.x86.sse2.max.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_get_cpu_caps()->has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vmaxfp";
         intr_size = 128;
      }
   } else if (util_get_cpu_caps()->has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxub";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsw";
         }
      }
   }

   if (intrinsic) {
      if (util_get_cpu_caps()->has_sse && type.floating &&
          nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
         LLVMValueRef isnan, max;
         max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type,
                                                   intr_size, a, b);
         isnan = lp_build_isnan(bld, b);
         return lp_build_select(bld, isnan, a, max);
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type,
                                                    intr_size, a, b);
      }
   }

   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
         break;
      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
         return lp_build_select(bld, cond, b, a);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
         break;
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}


/**
 * Generate 1 - a, or ~a depending on bld->type.
 */
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->one)
      return bld->zero;
   if(a == bld->zero)
      return bld->one;

   if(type.norm && !type.floating && !type.fixed && !type.sign) {
      if(LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(builder, a, "");
   }

   if(LLVMIsConstant(a))
      if (type.floating)
          return LLVMConstFSub(bld->one, a);
      else
          return LLVMConstSub(bld->one, a);
   else
      if (type.floating)
         return LLVMBuildFSub(builder, bld->one, a, "");
      else
         return LLVMBuildSub(builder, bld->one, a, "");
}


/**
 * Generate a + b
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == bld->zero)
      return b;
   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (type.norm) {
      const char *intrinsic = NULL;

      if (!type.sign && (a == bld->one || b == bld->one))
        return bld->one;

      if (!type.floating && !type.fixed) {
         if (LLVM_VERSION_MAJOR >= 8) {
            char intrin[32];
            intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
            lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
            return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
         }
         if (type.width * type.length == 128) {
            if (util_get_cpu_caps()->has_sse2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
            } else if (util_get_cpu_caps()->has_altivec) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
            }
         }
         if (type.width * type.length == 256) {
            if (util_get_cpu_caps()->has_avx2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
            }
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(type.norm && !type.floating && !type.fixed) {
      if (type.sign) {
         uint64_t sign = (uint64_t)1 << (type.width - 1);
         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
         /* a_clamp_max is the maximum a for positive b,
            a_clamp_min is the minimum a for negative b. */
         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
      }
   }
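   /*
    * Worked example of the clamping above (illustrative comment, not from
    * the original source), for signed normalized 8-bit values where
    * max_val = 127 and min_val = -128:
    *   a = 100, b = 50:  a_clamp_max = min(100, 127 - 50) = 77, and since
    *   b > 0 it is chosen, so the subsequent add yields 77 + 50 = 127,
    *   i.e. the correctly saturated result instead of the wrapped -106.
    */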
   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);

   if (type.norm && !type.floating && !type.fixed) {
      if (!type.sign) {
         /*
          * Newer llvm versions no longer support the intrinsics, but recognize
          * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
          * code, it is important we match the pattern llvm uses (and pray llvm
          * doesn't change it - and hope they decide on the same pattern for
          * all backends supporting it...).
          * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
          * interfere with llvm's ability to recognize the pattern but seems
          * a bit brittle.
          * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
          */
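         /*
          * Illustrative example (comment added for clarity, not from the
          * original source), for unsigned normalized 8-bit values:
          *   a = 200, b = 100:  the plain add wraps to res = 44, so the
          *   "overflowed" test a > res (200 > 44) fires and the select
          *   replaces res with all-ones, i.e. the saturated value 255.
          */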
         LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
         res = lp_build_select(bld, overflowed,
                               LLVMConstAllOnes(bld->int_vec_type), res);
      }
   }

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}


/** Return the scalar sum of the elements of a.
 * Should avoid this operation whenever possible.
 */
LLVMValueRef
lp_build_horizontal_add(struct lp_build_context *bld,
                        LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i, length;
   LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef vecres, elem2;

   assert(lp_check_value(type, a));

   if (type.length == 1) {
      return a;
   }

   assert(!bld->type.norm);

   /*
    * For byte vectors can do much better with psadbw.
    * Using repeated shuffle/adds here. Note with multiple vectors
    * this can be done more efficiently as outlined in the intel
    * optimization manual.
    * Note: could cause data rearrangement if used with smaller element
    * sizes.
    */
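   /*
    * Minimal worked example of the halving loop below (illustrative comment,
    * not from the original source), for a length-4 vector [a0 a1 a2 a3]:
    *   pass 1: [a0 a1] + [a2 a3] -> [a0+a2  a1+a3]
    * then the two remaining elements are extracted and added as scalars,
    * giving (a0+a2) + (a1+a3).
    */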
   vecres = a;
   length = type.length / 2;
   while (length > 1) {
      LLVMValueRef vec1, vec2;
      for (i = 0; i < length; i++) {
         shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
         shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
      }
      vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles1, length), "");
      vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles2, length), "");
      if (type.floating) {
         vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
      }
      else {
         vecres = LLVMBuildAdd(builder, vec1, vec2, "");
      }
      length = length >> 1;
   }

   /* always have vector of size 2 here */
   assert(length == 1);

   index = lp_build_const_int32(bld->gallivm, 0);
   res = LLVMBuildExtractElement(builder, vecres, index, "");
   index = lp_build_const_int32(bld->gallivm, 1);
   elem2 = LLVMBuildExtractElement(builder, vecres, index, "");

   if (type.floating)
      res = LLVMBuildFAdd(builder, res, elem2, "");
   else
      res = LLVMBuildAdd(builder, res, elem2, "");

   return res;
}

/**
 * Return the horizontal sums of 4 float vectors as a float4 vector.
 * This uses the technique as outlined in Intel Optimization Manual.
 */
static LLVMValueRef
lp_build_horizontal_add4x4f(struct lp_build_context *bld,
                            LLVMValueRef src[4])
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef shuffles[4];
   LLVMValueRef tmp[4];
   LLVMValueRef sumtmp[2], shuftmp[2];

   /* lower half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 1);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 5);
   tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   /* upper half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 2);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 6);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
   sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");

   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 2);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 6);
   shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   shuffles[0] = lp_build_const_int32(gallivm, 1);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 5);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
}


/*
 * Partially horizontally add 2-4 float vectors with length nx4,
 * i.e. only four adjacent values in each vector will be added,
 * assuming values are really grouped in 4 which also determines
 * output order.
 *
 * Return a vector of the same length as the initial vectors,
 * with the excess elements (if any) being undefined.
 * The element order is independent of number of input vectors.
 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 * the output order thus will be
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
 */
LLVMValueRef
lp_build_hadd_partial4(struct lp_build_context *bld,
                       LLVMValueRef vectors[],
                       unsigned num_vecs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ret_vec;
   LLVMValueRef tmp[4];
   const char *intrinsic = NULL;

   assert(num_vecs >= 2 && num_vecs <= 4);
   assert(bld->type.floating);

   /* only use this with at least 2 vectors, as it is sort of expensive
    * (depending on cpu) and we always need two horizontal adds anyway,
    * so a shuffle/add approach might be better.
    */

   tmp[0] = vectors[0];
   tmp[1] = vectors[1];

   tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
   tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];

   if (util_get_cpu_caps()->has_sse3 && bld->type.width == 32 &&
       bld->type.length == 4) {
      intrinsic = "llvm.x86.sse3.hadd.ps";
   }
   else if (util_get_cpu_caps()->has_avx && bld->type.width == 32 &&
            bld->type.length == 8) {
      intrinsic = "llvm.x86.avx.hadd.ps.256";
   }
   if (intrinsic) {
      tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, bld->type),
                                         tmp[0], tmp[1]);
      if (num_vecs > 2) {
         tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                            lp_build_vec_type(gallivm, bld->type),
                                            tmp[2], tmp[3]);
      }
      else {
         tmp[1] = tmp[0];
      }
      return lp_build_intrinsic_binary(builder, intrinsic,
                                       lp_build_vec_type(gallivm, bld->type),
                                       tmp[0], tmp[1]);
   }

   if (bld->type.length == 4) {
      ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
   }
   else {
      LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
      unsigned j;
      unsigned num_iter = bld->type.length / 4;
      struct lp_type parttype = bld->type;
      parttype.length = 4;
      for (j = 0; j < num_iter; j++) {
         LLVMValueRef partsrc[4];
         unsigned i;
         for (i = 0; i < 4; i++) {
            partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
         }
         partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
      }
      ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
   }
   return ret_vec;
}

/**
 * Generate a - b
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;
   if (a == b)
      return bld->zero;

   if (type.norm) {
      const char *intrinsic = NULL;

      if (!type.sign && b == bld->one)
        return bld->zero;

      if (!type.floating && !type.fixed) {
         if (LLVM_VERSION_MAJOR >= 8) {
            char intrin[32];
            intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
            lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
            return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
         }
         if (type.width * type.length == 128) {
            if (util_get_cpu_caps()->has_sse2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
            } else if (util_get_cpu_caps()->has_altivec) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
            }
         }
         if (type.width * type.length == 256) {
            if (util_get_cpu_caps()->has_avx2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
            }
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(type.norm && !type.floating && !type.fixed) {
      if (type.sign) {
         uint64_t sign = (uint64_t)1 << (type.width - 1);
         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
         /* a_clamp_max is the maximum a for negative b,
            a_clamp_min is the minimum a for positive b. */
         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
      } else {
         /*
          * This must match llvm pattern for saturated unsigned sub.
          * (lp_build_max_simple actually does the job with its current
          * definition but do it explicitly here.)
          * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
          * interfere with llvm's ability to recognize the pattern but seems
          * a bit brittle.
          * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
          */
         LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         a = lp_build_select(bld, no_ov, a, b);
      }
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");

   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);

   return res;
}



/**
 * Normalized multiplication.
 *
 * There are several approaches for (using 8-bit normalized multiplication as
 * an example):
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL
 *     criteria of
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms to fit in 16bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that just by itself it doesn't satisfy the OpenGL criteria, as
 *     255*255 = 254, so the special case b = 255 must be accounted for or
 *     roundoff must be used.
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the result
 *     use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving the exact results.
 *
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
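/*
 * Worked example of the three approximations above (illustrative comment,
 * not from the original source), for a = b = 255, t = a*b = 65025:
 *   alpha plus one:        (255 * 256) >> 8                    = 255 (exact)
 *   geometric series:      (65025 + (65025 >> 8)) >> 8         = 254 (off by one)
 *   series plus rounding:  (65025 + (65025 >> 8) + 0x80) >> 8  = 255 (exact)
 * The function below uses the rounding variant, generalized to 2**n - 1.
 */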
LLVMValueRef
lp_build_mul_norm(struct gallivm_state *gallivm,
                  struct lp_type wide_type,
                  LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context bld;
   unsigned n;
   LLVMValueRef half;
   LLVMValueRef ab;

   assert(!wide_type.floating);
   assert(lp_check_value(wide_type, a));
   assert(lp_check_value(wide_type, b));

   lp_build_context_init(&bld, gallivm, wide_type);

   n = wide_type.width / 2;
   if (wide_type.sign) {
      --n;
   }

   /*
    * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
    * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
    */

   /*
    * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
    */

   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");

   /*
    * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
    */

   half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
   if (wide_type.sign) {
      LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
      LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
      half = lp_build_select(&bld, sign, minus_half, half);
   }
   ab = LLVMBuildAdd(builder, ab, half, "");

   /* Final division */
   ab = lp_build_shr_imm(&bld, ab, n);

   return ab;
}

/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if (!type.floating && !type.fixed && type.norm) {
      struct lp_type wide_type = lp_wider_type(type);
      LLVMValueRef al, ah, bl, bh, abl, abh, ab;

      lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);

      /* PMULLW, PSRLW, PADDW */
      abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
      abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);

      ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);

      return ab;
   }

   if(type.fixed)
      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   else
      shift = NULL;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if(shift) {
         if(type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(builder, a, b, "");
      else
         res = LLVMBuildMul(builder, a, b, "");
      if(shift) {
         if(type.sign)
            res = LLVMBuildAShr(builder, res, shift, "");
         else
            res = LLVMBuildLShr(builder, res, shift, "");
      }
   }

   return res;
}

/*
 * Widening mul, valid for 32x32 bit -> 64bit only.
 * Result is low 32bits, high bits returned in res_hi.
 *
 * Emits code that is meant to be compiled for the host CPU.
 */
LLVMValueRef
lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
                         LLVMValueRef a,
                         LLVMValueRef b,
                         LLVMValueRef *res_hi)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;

   assert(bld->type.width == 32);
   assert(bld->type.floating == 0);
   assert(bld->type.fixed == 0);
   assert(bld->type.norm == 0);

   /*
    * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
    * for x86 simd is atrocious (even if the high bits weren't required),
    * trying to handle real 64bit inputs (which of course can't happen due
    * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
    * apparently llvm does not recognize this widening mul). This includes
    * 6 (instead of 2) pmuludq plus extra adds and shifts.
    * The same story applies to signed mul, albeit fixing this requires sse41.
    * https://llvm.org/bugs/show_bug.cgi?id=30845
    * So, whip up our own code, albeit only for length 4 and 8 (which
    * should be good enough)...
    * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
    * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
    * for signed), which the fallback code does not, without this llvm
    * will likely still produce atrocious code.
    */
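   /*
    * Layout sketch for the even/odd split below (illustrative comment, not
    * from the original source): pmuludq/pmuldq multiply the 32-bit elements
    * in the even lanes and produce 64-bit results, so for a = [a0 a1 a2 a3]:
    *   muleven = a0*b0, a2*b2   (two 64-bit lanes)
    *   mulodd  = a1*b1, a3*b3   (after shuffling the odd lanes into even
    *                             positions)
    * The final shuffles interleave the 32-bit halves back into lane order.
    */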
   if (LLVM_VERSION_MAJOR < 7 &&
       (bld->type.length == 4 || bld->type.length == 8) &&
       ((util_get_cpu_caps()->has_sse2 && (bld->type.sign == 0)) ||
        util_get_cpu_caps()->has_sse4_1)) {
      const char *intrinsic = NULL;
      LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
      LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
      struct lp_type type_wide = lp_wider_type(bld->type);
      LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
      unsigned i;
      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i+1);
         shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      aeven = a;
      beven = b;
      aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
      bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");

      if (util_get_cpu_caps()->has_avx2 && bld->type.length == 8) {
         if (bld->type.sign) {
            intrinsic = "llvm.x86.avx2.pmul.dq";
         } else {
            intrinsic = "llvm.x86.avx2.pmulu.dq";
         }
         muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                             wider_type, aeven, beven);
         mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                            wider_type, aodd, bodd);
      }
      else {
         /* for consistent naming look elsewhere... */
         if (bld->type.sign) {
            intrinsic = "llvm.x86.sse41.pmuldq";
         } else {
            intrinsic = "llvm.x86.sse2.pmulu.dq";
         }
         /*
          * XXX If we only have AVX but not AVX2 this is a pain.
          * lp_build_intrinsic_binary_anylength() can't handle it
          * (due to src and dst type not being identical).
          */
         if (bld->type.length == 8) {
            LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
            LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
            LLVMValueRef muleven2[2], mulodd2[2];
            struct lp_type type_wide_half = type_wide;
            LLVMTypeRef wtype_half;
            type_wide_half.length = 2;
            wtype_half = lp_build_vec_type(gallivm, type_wide_half);
            aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
            aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
            bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
            bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
            aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
            aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
            boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
            boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
            muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenlo, bevenlo);
            mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddlo, boddlo);
            muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenhi, bevenhi);
            mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddhi, boddhi);
            muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
            mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);

         }
         else {
            muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                                wider_type, aeven, beven);
            mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                               wider_type, aodd, bodd);
         }
      }
      muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
      mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");

      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i + 1);
         shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");

      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i);
         shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
   }
   else {
      return lp_build_mul_32_lohi(bld, a, b, res_hi);
   }
}


/*
 * Widening mul, valid for <= 32 (8, 16, 32) -> 64
 * Result is low N bits, high bits returned in res_hi.
 *
 * Emits generic code.
 */
LLVMValueRef
lp_build_mul_32_lohi(struct lp_build_context *bld,
                     LLVMValueRef a,
                     LLVMValueRef b,
                     LLVMValueRef *res_hi)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef tmp, shift, res_lo;
   struct lp_type type_tmp;
   LLVMTypeRef wide_type, narrow_type;

   type_tmp = bld->type;
   narrow_type = lp_build_vec_type(gallivm, type_tmp);
   if (bld->type.width < 32)
      type_tmp.width = 32;
   else
      type_tmp.width *= 2;
   wide_type = lp_build_vec_type(gallivm, type_tmp);
   shift = lp_build_const_vec(gallivm, type_tmp, bld->type.width);

   if (bld->type.sign) {
      a = LLVMBuildSExt(builder, a, wide_type, "");
      b = LLVMBuildSExt(builder, b, wide_type, "");
   } else {
      a = LLVMBuildZExt(builder, a, wide_type, "");
      b = LLVMBuildZExt(builder, b, wide_type, "");
   }
   tmp = LLVMBuildMul(builder, a, b, "");

   res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");

   /* Since we truncate anyway, LShr and AShr are equivalent. */
   tmp = LLVMBuildLShr(builder, tmp, shift, "");
   *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");

   return res_lo;
}

/* a * b + c */
LLVMValueRef
lp_build_mad(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b,
             LLVMValueRef c)
{
   const struct lp_type type = bld->type;
   if (type.floating) {
      return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
   } else {
      return lp_build_add(bld, lp_build_mul(bld, a, b), c);
   }
}


/**
 * Small vector x scale multiplication optimization.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   if(b == 0)
      return bld->zero;

   if(b == 1)
      return a;

   if(b == -1)
      return lp_build_negate(bld, a);

   if(b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if(util_is_power_of_two_or_zero(b)) {
      unsigned shift = ffs(b) - 1;

      if(bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the exponent.
          *
          * XXX: This might not be always faster, it will introduce a small error
          * for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
      }
      else {
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}
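/*
 * Worked example for the disabled exponent trick above (illustrative
 * comment, not from the original source): an IEEE float stores its exponent
 * in bits 23..30, so adding shift << 23 to the bit pattern of a normal float
 * increments the exponent by "shift", i.e. multiplies by 2**shift:
 *   1.5f = 0x3FC00000;  0x3FC00000 + (3 << 23) = 0x41400000 = 12.0f = 1.5 * 8
 * which is also why it misbehaves for 0.0, Inf and NaN (their encodings are
 * not normal numbers).
 */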

/**
 * Generate a / b
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one && type.floating)
      return lp_build_rcp(bld, b);
   if(b == bld->zero)
      return bld->undef;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         return LLVMConstFDiv(a, b);
      else if (type.sign)
         return LLVMConstSDiv(a, b);
      else
         return LLVMConstUDiv(a, b);
   }

   /* fast rcp is disabled (just uses div), so makes no sense to try that */
   if(FALSE &&
      ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
       (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) &&
      type.floating)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   if (type.floating)
      return LLVMBuildFDiv(builder, a, b, "");
   else if (type.sign)
      return LLVMBuildSDiv(builder, a, b, "");
   else
      return LLVMBuildUDiv(builder, a, b, "");
}


/**
 * Linear interpolation helper.
 *
 * @param normalized whether we are interpolating normalized values,
 *        encoded in normalized integers, twice as wide.
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static inline LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1,
                     unsigned flags)
{
   unsigned half_width = bld->type.width/2;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   delta = lp_build_sub(bld, v1, v0);

   if (bld->type.floating) {
      assert(flags == 0);
      return lp_build_mad(bld, x, delta, v0);
   }

   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
      if (!bld->type.sign) {
         if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
            /*
             * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
             * most-significant-bit to the least-significant-bit, so that
             * later we can just divide by 2**n instead of 2**n - 1.
             */
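            /*
             * E.g. for n = 8 (illustrative comment, not from the original
             * source): x = 255 becomes 255 + (255 >> 7) = 256, so the
             * subsequent (x * delta) >> 8 yields exactly delta, and the
             * lerp reaches v1 precisely at the top of the weight range.
             */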
            x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
         }

         /* (x * delta) >> n */
         res = lp_build_mul(bld, x, delta);
         res = lp_build_shr_imm(bld, res, half_width);
      } else {
         /*
          * The rescaling trick above doesn't work for signed numbers, so
          * use the 2**n - 1 division approximation in lp_build_mul_norm
          * instead.
          */
         assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
         res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
      }
   } else {
      assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
      res = lp_build_mul(bld, x, delta);
   }

   if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
      /*
       * At this point both res and v0 only use the lower half of the bits,
       * the rest is zero. Instead of add / mask, do add with half wide type.
       */
      struct lp_type narrow_type;
      struct lp_build_context narrow_bld;

      memset(&narrow_type, 0, sizeof narrow_type);
      narrow_type.sign   = bld->type.sign;
      narrow_type.width  = bld->type.width/2;
      narrow_type.length = bld->type.length*2;

      lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
      res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
      v0  = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
      res = lp_build_add(&narrow_bld, v0, res);
      res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
   } else {
      res = lp_build_add(bld, v0, res);

      if (bld->type.fixed) {
         /*
          * We need to mask out the high order bits when lerping 8bit
          * normalized colors stored on 16bits
          */
         /* XXX: This step is necessary for lerping 8bit colors stored on
          * 16bits, but it will be wrong for true fixed point use cases.
          * Basically we need a more powerful lp_type, capable of further
          * distinguishing the values interpretation from the value storage.
          */
         LLVMValueRef low_bits;
         low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
         res = LLVMBuildAnd(builder, res, low_bits, "");
      }
   }

   return res;
}


/**
 * Linear interpolation.
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1,
              unsigned flags)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;

      assert(type.length >= 2);

      /*
       * Create a wider integer type, enough to hold the
       * intermediate result of the multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.sign   = type.sign;
      wide_type.width  = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);

      lp_build_unpack2_native(bld->gallivm, type, wide_type, x,  &xl,  &xh);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);

      /*
       * Lerp both halves.
       */

      flags |= LP_BLD_LERP_WIDE_NORMALIZED;

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);

      res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
   } else {
      res = lp_build_lerp_simple(bld, x, v0, v1, flags);
   }

   return res;
}


/**
 * Bilinear interpolation.
 *
 * Value indices are in v_{yx}.
 */
LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef v00,
                 LLVMValueRef v01,
                 LLVMValueRef v10,
                 LLVMValueRef v11,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
   return lp_build_lerp(bld, y, v0, v1, flags);
}


LLVMValueRef
lp_build_lerp_3d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef z,
                 LLVMValueRef v000,
                 LLVMValueRef v001,
                 LLVMValueRef v010,
                 LLVMValueRef v011,
                 LLVMValueRef v100,
                 LLVMValueRef v101,
                 LLVMValueRef v110,
                 LLVMValueRef v111,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
   LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
   return lp_build_lerp(bld, z, v0, v1, flags);
}


/**
 * Generate min(a, b)
 * Do checks for special cases but not for NaNs.
 */
LLVMValueRef
lp_build_min(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
}

/**
 * Generate min(a, b)
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
LLVMValueRef
lp_build_min_ext(struct lp_build_context *bld,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 enum gallivm_nan_behavior nan_behavior)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b, nan_behavior);
}

/**
 * Generate max(a, b)
 * Do checks for special cases, but NaN behavior is undefined.
 */
LLVMValueRef
lp_build_max(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
}


/**
 * Generate max(a, b)
 * Checks for special cases.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
LLVMValueRef
lp_build_max_ext(struct lp_build_context *bld,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 enum gallivm_nan_behavior nan_behavior)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b, nan_behavior);
}

/**
 * Generate clamp(a, min, max)
 * NaN behavior (for any of a, min, max) is undefined.
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_clamp(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef min,
               LLVMValueRef max)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, min));
   assert(lp_check_value(bld->type, max));

   a = lp_build_min(bld, a, max);
   a = lp_build_max(bld, a, min);
   return a;
}


/**
 * Generate clamp(a, 0, 1)
 * A NaN will get converted to zero.
 */
LLVMValueRef
lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
                                LLVMValueRef a)
{
   a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
   a = lp_build_min(bld, a, bld->one);
   return a;
}


/**
 * Generate abs(a)
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(lp_check_value(type, a));

   if(!type.sign)
      return a;

   if(type.floating) {
      char intrinsic[32];
      lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
      return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
   }

   if(type.width*type.length == 128 && util_get_cpu_caps()->has_ssse3 && LLVM_VERSION_MAJOR < 6) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }
   else if (type.width*type.length == 256 && util_get_cpu_caps()->has_avx2 && LLVM_VERSION_MAJOR < 6) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
      }
   }

   return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
                          a, LLVMBuildNeg(builder, a, ""));
}


LLVMValueRef
lp_build_negate(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;

   assert(lp_check_value(bld->type, a));

   if (bld->type.floating)
      a = LLVMBuildFNeg(builder, a, "");
   else
      a = LLVMBuildNeg(builder, a, "");

   return a;
}


/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));

   /* Handle non-zero case */
   if(!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if(type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      int_type = lp_build_int_vec_type(bld->gallivm, type);
      vec_type = lp_build_vec_type(bld->gallivm, type);
      mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);

      /* Take the sign bit and add it to 1 constant */
      sign = LLVMBuildBitCast(builder, a, int_type, "");
      sign = LLVMBuildAnd(builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      res = LLVMBuildOr(builder, sign, one, "");
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }
   else
   {
      /* signed int/norm/fixed point */
      /* could use psign with sse3 and appropriate vectors here */
      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}


/**
 * Set the sign of float vector 'a' according to 'sign'.
 * If sign==0, return abs(a).
 * If sign==1, return -abs(a);
 * Other values for sign produce undefined results.
 */
LLVMValueRef
lp_build_set_sign(struct lp_build_context *bld,
                  LLVMValueRef a, LLVMValueRef sign)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
   LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                             ~((unsigned long long) 1 << (type.width - 1)));
   LLVMValueRef val, res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   /* val = reinterpret_cast<int>(a) */
   val = LLVMBuildBitCast(builder, a, int_vec_type, "");
   /* val = val & mask */
   val = LLVMBuildAnd(builder, val, mask, "");
   /* sign = sign << shift */
   sign = LLVMBuildShl(builder, sign, shift, "");
   /* res = val | sign */
   res = LLVMBuildOr(builder, val, sign, "");
   /* res = reinterpret_cast<float>(res) */
   res = LLVMBuildBitCast(builder, res, vec_type, "");

   return res;
}


/**
 * Convert vector of (or scalar) int to vector of (or scalar) float.
 */
LLVMValueRef
lp_build_int_to_float(struct lp_build_context *bld,
                      LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(type.floating);

   return LLVMBuildSIToFP(builder, a, vec_type, "");
}

static boolean
arch_rounding_available(const struct lp_type type)
{
   if ((util_get_cpu_caps()->has_sse4_1 &&
        (type.length == 1 || type.width*type.length == 128)) ||
       (util_get_cpu_caps()->has_avx && type.width*type.length == 256) ||
       (util_get_cpu_caps()->has_avx512f && type.width*type.length == 512))
      return TRUE;
   else if ((util_get_cpu_caps()->has_altivec &&
            (type.width == 32 && type.length == 4)))
      return TRUE;
   else if (util_get_cpu_caps()->has_neon)
      return TRUE;

   return FALSE;
}

enum lp_build_round_mode
{
   LP_BUILD_ROUND_NEAREST = 0,
   LP_BUILD_ROUND_FLOOR = 1,
   LP_BUILD_ROUND_CEIL = 2,
   LP_BUILD_ROUND_TRUNCATE = 3
};

static inline LLVMValueRef
lp_build_iround_nearest_sse2(struct lp_build_context *bld,
                             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);
   /* using the double precision conversions is a bit more complicated */
   assert(type.width == 32);

   assert(lp_check_value(type, a));
   assert(util_get_cpu_caps()->has_sse2);

   /* This is relying on MXCSR rounding mode, which should always be nearest. */
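   /*
    * With the default round-to-nearest-even MXCSR mode, cvtss2si/cvtps2dq
    * give e.g. 0.5 -> 0, 1.5 -> 2, -1.5 -> -2 (illustrative comment, not
    * from the original source).
    */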
   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef arg;
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      vec_type = LLVMVectorType(bld->elem_type, 4);

      intrinsic = "llvm.x86.sse.cvtss2si";

      undef = LLVMGetUndef(vec_type);

      arg = LLVMBuildInsertElement(builder, undef, a, index0, "");

      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, arg);
   }
   else {
      if (type.width * type.length == 128) {
         intrinsic = "llvm.x86.sse2.cvtps2dq";
      }
      else {
         assert(type.width * type.length == 256);
         assert(util_get_cpu_caps()->has_avx);

         intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
      }
      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, a);
   }

   return res;
}


/*
 */
static inline LLVMValueRef
lp_build_round_altivec(struct lp_build_context *bld,
                       LLVMValueRef a,
                       enum lp_build_round_mode mode)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_get_cpu_caps()->has_altivec);

   (void)type;

   switch (mode) {
   case LP_BUILD_ROUND_NEAREST:
      intrinsic = "llvm.ppc.altivec.vrfin";
      break;
   case LP_BUILD_ROUND_FLOOR:
      intrinsic = "llvm.ppc.altivec.vrfim";
      break;
   case LP_BUILD_ROUND_CEIL:
      intrinsic = "llvm.ppc.altivec.vrfip";
      break;
   case LP_BUILD_ROUND_TRUNCATE:
      intrinsic = "llvm.ppc.altivec.vrfiz";
      break;
   }

   return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
}

static inline LLVMValueRef
lp_build_round_arch(struct lp_build_context *bld,
                    LLVMValueRef a,
                    enum lp_build_round_mode mode)
{
   if (util_get_cpu_caps()->has_sse4_1 || util_get_cpu_caps()->has_neon) {
      LLVMBuilderRef builder = bld->gallivm->builder;
      const struct lp_type type = bld->type;
      const char *intrinsic_root;
      char intrinsic[32];

      assert(type.floating);
      assert(lp_check_value(type, a));
      (void)type;

      switch (mode) {
      case LP_BUILD_ROUND_NEAREST:
         intrinsic_root = "llvm.nearbyint";
         break;
      case LP_BUILD_ROUND_FLOOR:
         intrinsic_root = "llvm.floor";
         break;
      case LP_BUILD_ROUND_CEIL:
         intrinsic_root = "llvm.ceil";
         break;
      case LP_BUILD_ROUND_TRUNCATE:
         intrinsic_root = "llvm.trunc";
         break;
      default:
         unreachable("unhandled lp_build_round_mode");
      }

      lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   }
   else /* (util_get_cpu_caps()->has_altivec) */
      return lp_build_round_altivec(bld, a, mode);
}

/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is a float (vector).
 * Ex: trunc(-1.5) = -1.0
 */
LLVMValueRef
lp_build_trunc(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef trunc, res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
       */
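      /*
       * For instance (illustrative comment, not from the original source):
       * a = 3.0e7 has |a| > 2^24 = 16777216, so the select below returns a
       * unchanged, which is correct since every float of that magnitude is
       * already an integer; the same path also catches NaN and Inf, whose
       * FPToSI result would be garbage.
       */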
      LLVMTypeRef vec_type = bld->vec_type;

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}


/**
 * Return float (vector) rounded to nearest integer (vector). The returned
 * value is a float (vector).
 * Ex: round(0.9) = 1.0
 * Ex: round(-1.5) = -2.0
 */
LLVMValueRef
lp_build_round(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      res = lp_build_iround(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}
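
/*
 * Illustrative scalar sketch (not compiled): the convert-and-guard trick
 * used by the non-arch paths of lp_build_trunc()/lp_build_round() above,
 * assuming 32-bit IEEE floats and <stdint.h>. Anything whose magnitude
 * exceeds 2^24 (including Inf/NaN, which have the maximum exponent) is
 * passed through unchanged, since such floats are already integral.
 */
#if 0
static float
ref_trunc_f32(float a)
{
   union { float f; uint32_t u; } bits = { a };
   uint32_t anosign = bits.u & 0x7fffffffu;   /* like lp_build_abs() */
   if (anosign > 0x4b800000u)                 /* |a| > 2^24, or Inf/NaN */
      return a;
   return (float)(int32_t)a;                  /* round toward zero */
}
#endif
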
/**
 * Return floor of float (vector), result is a float (vector)
 * Ex: floor(1.1) = 1.0
 * Ex: floor(-1.1) = -2.0
 */
LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef trunc, res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      if (type.width != 32) {
         char intrinsic[32];
         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
      }

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      if (type.sign) {
         LLVMValueRef tmp;

         /*
          * fix values if rounding is wrong (for non-special cases)
          * - this is the case if trunc > a
          */
         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
         /* tmp = trunc > a ? 1.0 : 0.0 */
         tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
         tmp = lp_build_and(&intbld, mask, tmp);
         tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
         res = lp_build_sub(bld, res, tmp);
      }

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}


/**
 * Return ceiling of float (vector), returning float (vector).
 * Ex: ceil( 1.1) = 2.0
 * Ex: ceil(-1.1) = -1.0
 */
LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef trunc, res, anosign, mask, tmp;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      if (type.width != 32) {
         char intrinsic[32];
         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
      }

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");

      /*
       * fix values if rounding is wrong (for non-special cases)
       * - this is the case if trunc < a
       */
      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
      /* tmp = trunc < a ? 1.0 : 0.0 */
      tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
      tmp = lp_build_and(&intbld, mask, tmp);
      tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
      res = lp_build_add(bld, trunc, tmp);

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}


/**
 * Return fractional part of 'a' computed as a - floor(a)
 * Typically used in texture coord arithmetic.
 */
LLVMValueRef
lp_build_fract(struct lp_build_context *bld,
               LLVMValueRef a)
{
   assert(bld->type.floating);
   return lp_build_sub(bld, a, lp_build_floor(bld, a));
}


/**
 * Prevent returning 1.0 for very small negative values of 'a' by clamping
 * against 0.99999(9). (Will also return that value for NaNs.)
 */
static inline LLVMValueRef
clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
{
   LLVMValueRef max;

   /* this is the largest number smaller than 1.0 representable as float */
   max = lp_build_const_vec(bld->gallivm, bld->type,
                            1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
   return lp_build_min_ext(bld, fract, max,
                           GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
}


/**
 * Same as lp_build_fract, but guarantees that the result is always smaller
 * than one. Will also return the smaller-than-one value for infs, NaNs.
 */
LLVMValueRef
lp_build_fract_safe(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   return clamp_fract(bld, lp_build_fract(bld, a));
}


/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is an integer (vector).
 * Ex: itrunc(-1.5) = -1
 */
LLVMValueRef
lp_build_itrunc(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   assert(type.floating);
   assert(lp_check_value(type, a));

   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
}


/**
 * Return float (vector) rounded to nearest integer (vector). The returned
 * value is an integer (vector).
 * Ex: iround(0.9) = 1
 * Ex: iround(-1.5) = -2
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));

   if ((util_get_cpu_caps()->has_sse2 &&
        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
       (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
      return lp_build_iround_nearest_sse2(bld, a);
   }
   if (arch_rounding_available(type)) {
      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      LLVMValueRef half;

      half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));

      if (type.sign) {
         LLVMTypeRef vec_type = bld->vec_type;
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                                    (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* get sign bit */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");

         /* sign * 0.5 */
         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
         half = LLVMBuildOr(builder, sign, half, "");
         half = LLVMBuildBitCast(builder, half, vec_type, "");
      }

      res = LLVMBuildFAdd(builder, a, half, "");
   }

   res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

   return res;
}

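
/*
 * Illustrative scalar sketch (not compiled) of the fallback path in
 * lp_build_iround() above: copy the sign of 'a' onto 0.5, add, and let
 * the float->int conversion truncate. nextafterf(0.5, 0.0) is used just
 * like in the builder, so rounding inside the addition can't bump values
 * slightly below .5 up to a whole number.
 */
#if 0
static int32_t
ref_iround_f32(float a)
{
   union { float f; uint32_t u; } half = { nextafterf(0.5f, 0.0f) };
   union { float f; uint32_t u; } bits = { a };
   half.u |= bits.u & 0x80000000u;   /* half = copysign(half, a) */
   return (int32_t)(a + half.f);     /* truncation now rounds to nearest */
}
#endif
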

/**
 * Return floor of float (vector), result is an int (vector)
 * Ex: ifloor(1.1) = 1
 * Ex: ifloor(-1.1) = -2
 */
LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   res = a;
   if (type.sign) {
      if (arch_rounding_available(type)) {
         res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
      }
      else {
         struct lp_type inttype;
         struct lp_build_context intbld;
         LLVMValueRef trunc, itrunc, mask;

         assert(type.floating);
         assert(lp_check_value(type, a));

         inttype = type;
         inttype.floating = 0;
         lp_build_context_init(&intbld, bld->gallivm, inttype);

         /* round by truncation */
         itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
         trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");

         /*
          * fix values if rounding is wrong (for non-special cases)
          * - this is the case if trunc > a
          * The results of doing this with NaNs, very large values etc.
          * are undefined but this seems to be the case anyway.
          */
         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
         /* cheapie minus one with mask since the mask is minus one / zero */
         return lp_build_add(&intbld, itrunc, mask);
      }
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");

   return res;
}


/**
 * Return ceiling of float (vector), returning int (vector).
 * Ex: iceil( 1.1) = 2
 * Ex: iceil(-1.1) = -1
 */
LLVMValueRef
lp_build_iceil(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }
   else {
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef trunc, itrunc, mask;

      assert(type.floating);
      assert(lp_check_value(type, a));

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");

      /*
       * fix values if rounding is wrong (for non-special cases)
       * - this is the case if trunc < a
       * The results of doing this with NaNs, very large values etc.
       * are undefined but this seems to be the case anyway.
       */
      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
      /* cheapie plus one with mask since the mask is minus one / zero */
      return lp_build_sub(&intbld, itrunc, mask);
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");

   return res;
}


/**
 * Combined ifloor() & fract().
 *
 * Preferred to calling the functions separately, as it will ensure that the
 * strategy (floor() vs ifloor()) that results in less redundant work is used.
 */
void
lp_build_ifloor_fract(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef *out_ipart,
                      LLVMValueRef *out_fpart)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef ipart;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      /*
       * floor() is easier.
       */

      ipart = lp_build_floor(bld, a);
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
      *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
   }
   else {
      /*
       * ifloor() is easier.
       */

      *out_ipart = lp_build_ifloor(bld, a);
      ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
   }
}


/**
 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
 * always smaller than one.
 */
void
lp_build_ifloor_fract_safe(struct lp_build_context *bld,
                           LLVMValueRef a,
                           LLVMValueRef *out_ipart,
                           LLVMValueRef *out_fpart)
{
   lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
   *out_fpart = clamp_fract(bld, *out_fpart);
}


LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   char intrinsic[32];

   assert(lp_check_value(type, a));

   assert(type.floating);
   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);

   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}

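
/*
 * Illustrative scalar sketch (not compiled) of the refinement step defined
 * below: one Newton-Raphson iteration roughly doubles the number of
 * correct bits of an initial reciprocal estimate rcp_a ~= 1/a.
 */
#if 0
static float
ref_rcp_refine(float a, float rcp_a)
{
   float err = 1.0f - a * rcp_a;     /* residual error of the estimate */
   return rcp_a + rcp_a * err;       /* x1 = x0 + x0 * (1 - a * x0) */
}
#endif
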

/**
 * Do one Newton-Raphson step to improve reciprocal precision:
 *
 *    x_{i+1} = x_i + x_i * (1 - a * x_i)
 *
 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
 * halo. It would be necessary to clamp the argument to prevent this.
 *
 * See also:
 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static inline LLVMValueRef
lp_build_rcp_refine(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef rcp_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef neg_a;
   LLVMValueRef res;

   neg_a = LLVMBuildFNeg(builder, a, "");
   res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
   res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);

   return res;
}


LLVMValueRef
lp_build_rcp(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->zero)
      return bld->undef;
   if(a == bld->one)
      return bld->one;
   if(a == bld->undef)
      return bld->undef;

   assert(type.floating);

   if(LLVMIsConstant(a))
      return LLVMConstFDiv(bld->one, a);

   /*
    * We don't use RCPPS because:
    * - it only has 10 bits of precision
    * - it doesn't even get the reciprocal of 1.0 exactly
    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal, and case
    *   dependent
    *
    * We could still use it on certain processors if benchmarks show that the
    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
    * particular uses that require fewer workarounds.
    */

   if (FALSE && ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
                 (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8))){
      const unsigned num_iterations = 0;
      LLVMValueRef res;
      unsigned i;
      const char *intrinsic = NULL;

      if (type.length == 4) {
         intrinsic = "llvm.x86.sse.rcp.ps";
      }
      else {
         intrinsic = "llvm.x86.avx.rcp.ps.256";
      }

      res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rcp_refine(bld, a, res);
      }

      return res;
   }

   return LLVMBuildFDiv(builder, bld->one, a, "");
}


/**
 * Do one Newton-Raphson step to improve rsqrt precision:
 *
 *    x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
 *
 * See also Intel 64 and IA-32 Architectures Optimization Manual.
 */
static inline LLVMValueRef
lp_build_rsqrt_refine(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef rsqrt_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
   LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
   LLVMValueRef res;

   res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
   res = LLVMBuildFMul(builder, a, res, "");
   res = LLVMBuildFSub(builder, three, res, "");
   res = LLVMBuildFMul(builder, rsqrt_a, res, "");
   res = LLVMBuildFMul(builder, half, res, "");

   return res;
}

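
/*
 * Illustrative scalar sketch (not compiled) of the step above:
 * one Newton-Raphson iteration for rsqrt.
 */
#if 0
static float
ref_rsqrt_refine(float a, float rsqrt_a)
{
   /* x1 = 0.5 * x0 * (3 - a * x0 * x0) */
   return 0.5f * rsqrt_a * (3.0f - a * rsqrt_a * rsqrt_a);
}
#endif
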

/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 */
LLVMValueRef
lp_build_rsqrt(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   assert(type.floating);

   /*
    * This should be faster but all denormals will end up as infinity.
    */
   if (0 && lp_build_fast_rsqrt_available(type)) {
      const unsigned num_iterations = 1;
      LLVMValueRef res;
      unsigned i;

      /* rsqrt(1.0) != 1.0 here */
      res = lp_build_fast_rsqrt(bld, a);

      if (num_iterations) {
         /*
          * Newton-Raphson will result in NaN instead of infinity for zero,
          * and NaN instead of zero for infinity.
          * Also, need to ensure rsqrt(1.0) == 1.0.
          * All numbers smaller than FLT_MIN will result in +infinity
          * (rsqrtps treats all denormals as zero).
          */
         LLVMValueRef cmp;
         LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
         LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);

         for (i = 0; i < num_iterations; ++i) {
            res = lp_build_rsqrt_refine(bld, a, res);
         }
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
         res = lp_build_select(bld, cmp, inf, res);
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
         res = lp_build_select(bld, cmp, bld->zero, res);
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
         res = lp_build_select(bld, cmp, bld->one, res);
      }

      return res;
   }

   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}

/**
 * Return whether a fast (but inaccurate) rsqrt instruction is available.
 * (The caller may want to avoid calling rsqrt_fast if it's not available:
 * for calculating x^0.5 it can do rsqrt_fast(x) * x, but if that is
 * unavailable it would result in sqrt/div/mul, so it's obviously much
 * better to just call sqrt, skipping both div and mul.)
 */
boolean
lp_build_fast_rsqrt_available(struct lp_type type)
{
   assert(type.floating);

   if ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
       (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
      return true;
   }
   return false;
}


/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 * Precision is limited, only ~10 bits guaranteed
 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
 */
LLVMValueRef
lp_build_fast_rsqrt(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if (lp_build_fast_rsqrt_available(type)) {
      const char *intrinsic = NULL;

      if (type.length == 4) {
         intrinsic = "llvm.x86.sse.rsqrt.ps";
      }
      else {
         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
      }
      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   }
   else {
      debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
   }
   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}


/**
 * Generate sin(a) or cos(a) using polynomial approximation.
 * TODO: it might be worth recognizing sin and cos using same source
 * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
 * would be way cheaper than calculating (nearly) everything twice...
 * Not sure it's common enough to be worth bothering, however; the scs
 * opcode could also benefit from calculating both, though.
 */
static LLVMValueRef
lp_build_sin_or_cos(struct lp_build_context *bld,
                    LLVMValueRef a,
                    boolean cos)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef b = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");

   LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);

   /*
    * Argument used for poly selection and sign bit determination
    * is different for sin vs. cos.
    */
   LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
                               emm2_and;

   LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
                                                              LLVMBuildNot(b, emm2_2, ""), ""),
                                              const_29, "sign_bit") :
                                 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
                                                              LLVMBuildShl(b, emm2_add,
                                                                           const_29, ""), ""),
                                              sign_mask, "sign_bit");

   /*
    * get the polynomial selection mask
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    */
   LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
   LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
   LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);

   /*
    * Evaluate the first polynomial (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
   LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");


   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
   LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);

   /*
    * select the correct result from the two polynomials
    * xmm3 = poly_mask;
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_or_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");

   LLVMValueRef isfinite = lp_build_isfinite(bld, a);

   /* clamp output to be within [-1, 1] */
   y_result = lp_build_clamp(bld, y_result,
                             lp_build_const_vec(bld->gallivm, bld->type, -1.f),
                             lp_build_const_vec(bld->gallivm, bld->type, 1.f));
   /* If a is -inf, inf or NaN then return NaN */
   y_result = lp_build_select(bld, isfinite, y_result,
                              lp_build_const_vec(bld->gallivm, bld->type, NAN));
   return y_result;
}
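
/*
 * Illustrative scalar sketch (not compiled) of the Cephes-style range
 * reduction used above, shown for sin only and assuming <math.h>: scale by
 * 4/Pi, round the quotient up to an even integer j, then subtract j*Pi/4
 * in three steps ("extended precision modular arithmetic"). Afterwards the
 * 2s bit of j selects between the two polynomials and the 4s bit feeds the
 * sign flip, exactly as the masks above do. The polynomial evaluation
 * itself is elided here.
 */
#if 0
static float
ref_sin_reduce(float a, int *j_out)
{
   float x = fabsf(a);
   int j = (int)(x * 1.27323954473516f);   /* x * 4/Pi, truncated */
   j = (j + 1) & ~1;                       /* j = (j+1) & (~1) */
   float y = (float)j;
   x = x - y * 0.78515625f;                /* -DP1 */
   x = x - y * 2.4187564849853515625e-4f;  /* -DP2 */
   x = x - y * 3.77489497744594108e-8f;    /* -DP3 */
   *j_out = j;
   return x;   /* reduced argument, roughly in [-Pi/4, Pi/4] */
}
#endif
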


/**
 * Generate sin(a)
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   return lp_build_sin_or_cos(bld, a, FALSE);
}


/**
 * Generate cos(a)
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   return lp_build_sin_or_cos(bld, a, TRUE);
}


/**
 * Generate pow(x, y)
 */
LLVMValueRef
lp_build_pow(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x) && LLVMIsConstant(y)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   LLVMValueRef cmp = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x, lp_build_const_vec(bld->gallivm, bld->type, 0.0f));
   LLVMValueRef res = lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));

   res = lp_build_select(bld, cmp, lp_build_const_vec(bld->gallivm, bld->type, 0.0f), res);
   return res;
}


/**
 * Generate exp(x)
 */
LLVMValueRef
lp_build_exp(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log2(e) = 1/log(2) */
   LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
                                           1.4426950408889634);

   assert(lp_check_value(bld->type, x));

   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
}


/**
 * Generate log(x)
 * Behavior is undefined with infs, 0s and nans
 */
LLVMValueRef
lp_build_log(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2(bld, x));
}

/**
 * Generate log(x) that handles edge cases (infs, 0s and nans)
 */
LLVMValueRef
lp_build_log_safe(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
}


/**
 * Generate polynomial.
 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
 */
LLVMValueRef
lp_build_polynomial(struct lp_build_context *bld,
                    LLVMValueRef x,
                    const double *coeffs,
                    unsigned num_coeffs)
{
   const struct lp_type type = bld->type;
   LLVMValueRef even = NULL, odd = NULL;
   LLVMValueRef x2;
   unsigned i;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   /*
    * Calculate odd and even terms separately to decrease data dependency
    * Ex:
    *     c[0] + x^2 * c[2] + x^4 * c[4] ...
    *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
    */
   x2 = lp_build_mul(bld, x, x);

   for (i = num_coeffs; i--; ) {
      LLVMValueRef coeff;

      coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);

      if (i % 2 == 0) {
         if (even)
            even = lp_build_mad(bld, x2, even, coeff);
         else
            even = coeff;
      } else {
         if (odd)
            odd = lp_build_mad(bld, x2, odd, coeff);
         else
            odd = coeff;
      }
   }

   if (odd)
      return lp_build_mad(bld, odd, x, even);
   else if (even)
      return even;
   else
      return bld->undef;
}


/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
   0.693153073200168932794,
   0.240153617044375388211,
   0.0558263180532956664775,
   0.00898934009049466391101,
   0.00187757667519147912699
#elif EXP_POLY_DEGREE == 4
   1.00000259337069434683,
   0.693003834469974940458,
   0.24144275689150793076,
   0.0520114606103070150235,
   0.0135341679161270268764
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.337189434619687207043
#else
#error
#endif
};


LLVMValueRef
lp_build_exp2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   assert(type.floating && type.width == 32);

   /* We want to preserve NaN and make sure that for exp2 if x > 128,
    * the result is INF and if it's smaller than -126.9 the result is 0 */
   x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
                        GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
   x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
                        x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);

   /* ipart = floor(x) */
   /* fpart = x - ipart */
   lp_build_ifloor_fract(bld, x, &ipart, &fpart);

   /* expipart = (float) (1 << ipart) */
   expipart = LLVMBuildAdd(builder, ipart,
                           lp_build_const_int_vec(bld->gallivm, type, 127), "");
   expipart = LLVMBuildShl(builder, expipart,
                           lp_build_const_int_vec(bld->gallivm, type, 23), "");
   expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");

   expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                  ARRAY_SIZE(lp_build_exp2_polynomial));

   res = LLVMBuildFMul(builder, expipart, expfpart, "");

   return res;
}
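
/*
 * Illustrative scalar sketch (not compiled) of the exp2 scheme above,
 * assuming x was already clamped to [-126.99999, 128] as the builder does:
 * split x into integer and fractional parts, build 2^ipart directly in the
 * exponent field, and approximate 2^fpart with a polynomial (here the
 * degree-2 coefficients from the table above).
 */
#if 0
static float
ref_exp2_f32(float x)
{
   int32_t ipart = (int32_t)floorf(x);
   float fpart = x - (float)ipart;                   /* in [0, 1) */
   union { uint32_t u; float f; } expipart =
      { (uint32_t)(ipart + 127) << 23 };             /* 2^ipart */
   float expfpart = 1.00172476321474503578f +
      fpart * (0.657636275736077639316f +
               fpart * 0.337189434619687207043f);    /* ~2^fpart */
   return expipart.f * expfpart;
}
#endif
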


/**
 * Extract the exponent of an IEEE-754 floating point value.
 *
 * Optionally apply an integer bias.
 *
 * Result is an integer value with
 *
 *   ifloor(log2(x)) + bias
 */
LLVMValueRef
lp_build_extract_exponent(struct lp_build_context *bld,
                          LLVMValueRef x,
                          int bias)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(bld->type, x));

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   res = LLVMBuildLShr(builder, x,
                       lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
   res = LLVMBuildAnd(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 255), "");
   res = LLVMBuildSub(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");

   return res;
}


/**
 * Extract the mantissa of a floating point value.
 *
 * Result is a floating point value with
 *
 *   x / 2**floor(log2(x))
 */
LLVMValueRef
lp_build_extract_mantissa(struct lp_build_context *bld,
                          LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
                                                  (1ULL << mantissa) - 1);
   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));

   assert(type.floating);

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   /* res = x / 2**ipart */
   res = LLVMBuildAnd(builder, x, mantmask, "");
   res = LLVMBuildOr(builder, res, one, "");
   res = LLVMBuildBitCast(builder, res, bld->vec_type, "");

   return res;
}

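
/*
 * Illustrative scalar sketch (not compiled) of the two helpers above for
 * 32-bit floats: the exponent is bits 23..30 minus the 127 bias, and the
 * mantissa is renormalized into [1, 2) by substituting the exponent field
 * of 1.0.
 */
#if 0
static int32_t
ref_extract_exponent(float x, int bias)
{
   union { float f; uint32_t u; } v = { x };
   return (int32_t)((v.u >> 23) & 255) - (127 - bias);
}

static float
ref_extract_mantissa(float x)
{
   union { float f; uint32_t u; } v = { x };
   v.u = (v.u & 0x007fffffu) | 0x3f800000u;   /* mantissa, exponent of 1.0 */
   return v.f;                                /* x / 2**floor(log2(x)) */
}
#endif
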


/**
 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x),
 * for x in range of [0, 1/9[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 5
   2.88539008148777786488L,
   0.961796878841293367824L,
   0.577058946784739859012L,
   0.412914355135828735411L,
   0.308591899232910175289L,
   0.352376952300281371868L,
#elif LOG_POLY_DEGREE == 4
   2.88539009343309178325L,
   0.961791550404184197881L,
   0.577440339438736392009L,
   0.403343858251329912514L,
   0.406718052498846252698L,
#elif LOG_POLY_DEGREE == 3
   2.88538959748872753838L,
   0.961932915889597772928L,
   0.571118517972136195241L,
   0.493997535084709500285L,
#else
#error
#endif
};

/**
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 * http://en.wikipedia.org/wiki/Logarithm#Calculation
 * http://www.nezumi.demon.co.uk/consult/logx.htm
 *
 * If handle_edge_cases is true the function will perform computations
 * to match the required D3D10+ behavior for each of the edge cases.
 * That means that if input is:
 * - less than zero (down to and including -inf) then NaN will be returned
 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
 * - +infinity, then +infinity will be returned
 * - NaN, then NaN will be returned
 *
 * Those checks are fairly expensive so if you don't need them make sure
 * handle_edge_cases is false.
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2,
                     boolean handle_edge_cases)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef y = NULL;
   LLVMValueRef z = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef p_z = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /*
       * We don't explicitly handle denormalized numbers. They will yield a
       * result in the neighbourhood of -127, which appears to be adequate
       * enough.
       */

      i = LLVMBuildBitCast(builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(builder, i, expmask, "");
   }

   if(p_floor_log2 || p_log2) {
      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   }

   if (p_log2) {
      /* mant = 1 + (float) mantissa(x) */
      mant = LLVMBuildAnd(builder, i, mantmask, "");
      mant = LLVMBuildOr(builder, mant, one, "");
      mant = LLVMBuildBitCast(builder, mant, vec_type, "");

      /* y = (mant - 1) / (mant + 1) */
      y = lp_build_div(bld,
                       lp_build_sub(bld, mant, bld->one),
                       lp_build_add(bld, mant, bld->one)
                       );

      /* z = y^2 */
      z = lp_build_mul(bld, y, y);

      /* compute P(z) */
      p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
                                ARRAY_SIZE(lp_build_log2_polynomial));

      /* y * P(z) + logexp */
      res = lp_build_mad(bld, y, p_z, logexp);

      if (type.floating && handle_edge_cases) {
         LLVMValueRef negmask, infmask, zmask;
         negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
                                lp_build_const_vec(bld->gallivm, type, 0.0f));
         zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
                              lp_build_const_vec(bld->gallivm, type, 0.0f));
         infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
                                lp_build_const_vec(bld->gallivm, type, INFINITY));

         /* If x is equal to inf make sure we return inf */
         res = lp_build_select(bld, infmask,
                               lp_build_const_vec(bld->gallivm, type, INFINITY),
                               res);
         /* If x is equal to 0, return -inf */
         res = lp_build_select(bld, zmask,
                               lp_build_const_vec(bld->gallivm, type, -INFINITY),
                               res);
         /* If x is NaN or less than 0, return NaN */
         res = lp_build_select(bld, negmask,
                               lp_build_const_vec(bld->gallivm, type, NAN),
                               res);
      }
   }

   if (p_exp) {
      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if (p_floor_log2)
      *p_floor_log2 = logexp;

   if (p_log2)
      *p_log2 = res;
}
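
/*
 * Illustrative scalar sketch (not compiled) of the core of
 * lp_build_log2_approx() above, without the edge-case handling: take the
 * unbiased exponent, renormalize the mantissa into [1, 2), then compute
 * log2(mant) as y * P(y^2) with y = (mant - 1) / (mant + 1), using the
 * degree-3 coefficients from the table above.
 */
#if 0
static float
ref_log2_f32(float x)
{
   union { float f; uint32_t u; } v = { x };
   float logexp = (float)((int32_t)((v.u >> 23) & 255) - 127);
   v.u = (v.u & 0x007fffffu) | 0x3f800000u;      /* mant in [1, 2) */
   float y = (v.f - 1.0f) / (v.f + 1.0f);
   float z = y * y;
   float p_z = 2.88538959748872753838f +
      z * (0.961932915889597772928f +
           z * (0.571118517972136195241f +
                z * 0.493997535084709500285f));
   return y * p_z + logexp;
}
#endif
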


/*
 * log2 implementation which doesn't have special code to
 * handle edge cases (-inf, 0, inf, NaN). It's faster but
 * the results for those cases are undefined.
 */
LLVMValueRef
lp_build_log2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
   return res;
}

/*
 * Version of log2 which handles all edge cases.
 * Look at documentation of lp_build_log2_approx for
 * description of the behavior for each of the edge cases.
 */
LLVMValueRef
lp_build_log2_safe(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
   return res;
}


/**
 * Faster (and less accurate) log2.
 *
 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
 *
 * Piece-wise linear approximation, with exact results when x is a
 * power of two.
 *
 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
 */
LLVMValueRef
lp_build_fast_log2(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef ipart;
   LLVMValueRef fpart;

   assert(lp_check_value(bld->type, x));

   assert(bld->type.floating);

   /* ipart = floor(log2(x)) - 1 */
   ipart = lp_build_extract_exponent(bld, x, -1);
   ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");

   /* fpart = x / 2**ipart */
   fpart = lp_build_extract_mantissa(bld, x);

   /* ipart + fpart */
   return LLVMBuildFAdd(builder, ipart, fpart, "");
}


/**
 * Fast implementation of iround(log2(x)).
 *
 * Not an approximation -- it should give accurate results all the time.
 */
LLVMValueRef
lp_build_ilog2(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
   LLVMValueRef ipart;

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, x));

   /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
   x = LLVMBuildFMul(builder, x, sqrt2, "");

   /* ipart = floor(log2(x) + 0.5)  */
   ipart = lp_build_extract_exponent(bld, x, 0);

   return ipart;
}
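
/*
 * Illustrative scalar sketch (not compiled) of the two functions above,
 * reusing the hypothetical ref_extract_* helpers sketched earlier:
 * fast_log2 adds the [1, 2) mantissa to floor(log2(x)) - 1, which is exact
 * at powers of two and linear in between; ilog2 multiplies by sqrt(2)
 * first, so extracting the exponent rounds log2(x) to nearest instead of
 * flooring it.
 */
#if 0
static float
ref_fast_log2(float x)
{
   return (float)ref_extract_exponent(x, -1) + ref_extract_mantissa(x);
}

static int32_t
ref_ilog2(float x)
{
   return ref_extract_exponent(x * (float)M_SQRT2, 0);
}
#endif
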

LLVMValueRef
lp_build_mod(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef res;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, y));

   if (type.floating)
      res = LLVMBuildFRem(builder, x, y, "");
   else if (type.sign)
      res = LLVMBuildSRem(builder, x, y, "");
   else
      res = LLVMBuildURem(builder, x, y, "");
   return res;
}


/*
 * For floating inputs it creates and returns a mask
 * which is all 1's for channels which are NaN.
 * Channels inside x which are not NaN will be 0.
 */
LLVMValueRef
lp_build_isnan(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMValueRef mask;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);

   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));

   mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
                        "isnotnan");
   mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
   mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
   return mask;
}

/* Returns all 1's for floating point numbers that are
 * finite numbers and returns all zeros for -inf,
 * inf and NaNs */
LLVMValueRef
lp_build_isfinite(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
   LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
                                                    0x7f800000);

   if (!bld->type.floating) {
      return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
   }
   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));
   assert(bld->type.width == 32);

   intx = LLVMBuildAnd(builder, intx, infornan32, "");
   return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
                           intx, infornan32);
}

/*
 * Returns true if the number is nan or inf and false otherwise.
 * The input has to be a floating point vector.
 */
LLVMValueRef
lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
                       const struct lp_type type,
                       LLVMValueRef x)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(type);
   LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
                                                0x7f800000);
   LLVMValueRef ret;

   assert(type.floating);

   ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
   ret = LLVMBuildAnd(builder, ret, const0, "");
   ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
                          ret, const0);

   return ret;
}


LLVMValueRef
lp_build_fpstate_get(struct gallivm_state *gallivm)
{
   if (util_get_cpu_caps()->has_sse) {
      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_alloca(
         gallivm,
         LLVMInt32TypeInContext(gallivm->context),
         "mxcsr_ptr");
      LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
          LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.stmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),
                         &mxcsr_ptr8, 1, 0);
      return mxcsr_ptr;
   }
   return 0;
}

void
lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
                                  boolean zero)
{
   if (util_get_cpu_caps()->has_sse) {
      /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
      int daz_ftz = _MM_FLUSH_ZERO_MASK;

      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
      LLVMValueRef mxcsr =
         LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");

      if (util_get_cpu_caps()->has_daz) {
         /* Enable denormals-are-zero mode */
         daz_ftz |= _MM_DENORMALS_ZERO_MASK;
      }
      if (zero) {
         mxcsr = LLVMBuildOr(builder, mxcsr,
                             LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
      } else {
         mxcsr = LLVMBuildAnd(builder, mxcsr,
                              LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
      }

      LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
      lp_build_fpstate_set(gallivm, mxcsr_ptr);
   }
}

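/*
 * Illustrative sketch (not compiled) of the MXCSR bits toggled by
 * lp_build_fpstate_set_denorms_zero() above, using the SSE intrinsics
 * directly instead of generating IR. As in the runtime path, the DAZ bit
 * is only set when the CPU reports support for it.
 */
#if 0
static void
ref_set_denorms_zero(boolean zero, boolean has_daz)
{
   unsigned daz_ftz = _MM_FLUSH_ZERO_MASK;     /* FTZ = 0x8000 */
   if (has_daz)
      daz_ftz |= _MM_DENORMALS_ZERO_MASK;      /* DAZ = 0x0040 */
   unsigned mxcsr = _mm_getcsr();
   mxcsr = zero ? (mxcsr | daz_ftz) : (mxcsr & ~daz_ftz);
   _mm_setcsr(mxcsr);
}
#endif
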
void
lp_build_fpstate_set(struct gallivm_state *gallivm,
                     LLVMValueRef mxcsr_ptr)
{
   if (util_get_cpu_caps()->has_sse) {
      LLVMBuilderRef builder = gallivm->builder;
      mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
          LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.ldmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),
                         &mxcsr_ptr, 1, 0);
   }
}