Path: blob/21.2-virgl/src/gallium/drivers/llvmpipe/lp_bld_depth.c
4570 views
/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Depth/stencil testing to LLVM IR translation.
 *
 * To be done accurately/efficiently the depth/stencil test must be done with
 * the same type/format of the depth/stencil buffer, which implies massaging
 * the incoming depths to fit into place.
Using a more straightforward34* type/format for depth/stencil values internally and only convert when35* flushing would avoid this, but it would most likely result in depth fighting36* artifacts.37*38* Since we're using linear layout for everything, but we need to deal with39* 2x2 quads, we need to load/store multiple values and swizzle them into40* place (we could avoid this by doing depth/stencil testing in linear format,41* which would be easy for late depth/stencil test as we could do that after42* the fragment shader loop just as we do for color buffers, but more tricky43* for early depth test as we'd need both masks and interpolated depth in44* linear format).45*46*47* @author Jose Fonseca <[email protected]>48* @author Brian Paul <[email protected]>49*/5051#include "pipe/p_state.h"52#include "util/format/u_format.h"53#include "util/u_cpu_detect.h"5455#include "gallivm/lp_bld_type.h"56#include "gallivm/lp_bld_arit.h"57#include "gallivm/lp_bld_bitarit.h"58#include "gallivm/lp_bld_const.h"59#include "gallivm/lp_bld_conv.h"60#include "gallivm/lp_bld_logic.h"61#include "gallivm/lp_bld_flow.h"62#include "gallivm/lp_bld_intr.h"63#include "gallivm/lp_bld_debug.h"64#include "gallivm/lp_bld_swizzle.h"65#include "gallivm/lp_bld_pack.h"6667#include "lp_bld_depth.h"68#include "lp_state_fs.h"697071/** Used to select fields from pipe_stencil_state */72enum stencil_op {73S_FAIL_OP,74Z_FAIL_OP,75Z_PASS_OP76};77787980/**81* Do the stencil test comparison (compare FB stencil values against ref value).82* This will be used twice when generating two-sided stencil code.83* \param stencil the front/back stencil state84* \param stencilRef the stencil reference value, replicated as a vector85* \param stencilVals vector of stencil values from framebuffer86* \return vector mask of pass/fail values (~0 or 0)87*/88static LLVMValueRef89lp_build_stencil_test_single(struct lp_build_context *bld,90const struct pipe_stencil_state *stencil,91LLVMValueRef stencilRef,92LLVMValueRef 
stencilVals)93{94LLVMBuilderRef builder = bld->gallivm->builder;95const unsigned stencilMax = 255; /* XXX fix */96struct lp_type type = bld->type;97LLVMValueRef res;9899/*100* SSE2 has intrinsics for signed comparisons, but not unsigned ones. Values101* are between 0..255 so ensure we generate the fastest comparisons for102* wider elements.103*/104if (type.width <= 8) {105assert(!type.sign);106} else {107assert(type.sign);108}109110assert(stencil->enabled);111112if (stencil->valuemask != stencilMax) {113/* compute stencilRef = stencilRef & valuemask */114LLVMValueRef valuemask = lp_build_const_int_vec(bld->gallivm, type, stencil->valuemask);115stencilRef = LLVMBuildAnd(builder, stencilRef, valuemask, "");116/* compute stencilVals = stencilVals & valuemask */117stencilVals = LLVMBuildAnd(builder, stencilVals, valuemask, "");118}119120res = lp_build_cmp(bld, stencil->func, stencilRef, stencilVals);121122return res;123}124125126/**127* Do the one or two-sided stencil test comparison.128* \sa lp_build_stencil_test_single129* \param front_facing an integer vector mask, indicating front (~0) or back130* (0) facing polygon. 
If NULL, assume front-facing.131*/132static LLVMValueRef133lp_build_stencil_test(struct lp_build_context *bld,134const struct pipe_stencil_state stencil[2],135LLVMValueRef stencilRefs[2],136LLVMValueRef stencilVals,137LLVMValueRef front_facing)138{139LLVMValueRef res;140141assert(stencil[0].enabled);142143/* do front face test */144res = lp_build_stencil_test_single(bld, &stencil[0],145stencilRefs[0], stencilVals);146147if (stencil[1].enabled && front_facing != NULL) {148/* do back face test */149LLVMValueRef back_res;150151back_res = lp_build_stencil_test_single(bld, &stencil[1],152stencilRefs[1], stencilVals);153154res = lp_build_select(bld, front_facing, res, back_res);155}156157return res;158}159160161/**162* Apply the stencil operator (add/sub/keep/etc) to the given vector163* of stencil values.164* \return new stencil values vector165*/166static LLVMValueRef167lp_build_stencil_op_single(struct lp_build_context *bld,168const struct pipe_stencil_state *stencil,169enum stencil_op op,170LLVMValueRef stencilRef,171LLVMValueRef stencilVals)172173{174LLVMBuilderRef builder = bld->gallivm->builder;175struct lp_type type = bld->type;176LLVMValueRef res;177LLVMValueRef max = lp_build_const_int_vec(bld->gallivm, type, 0xff);178unsigned stencil_op;179180assert(type.sign);181182switch (op) {183case S_FAIL_OP:184stencil_op = stencil->fail_op;185break;186case Z_FAIL_OP:187stencil_op = stencil->zfail_op;188break;189case Z_PASS_OP:190stencil_op = stencil->zpass_op;191break;192default:193assert(0 && "Invalid stencil_op mode");194stencil_op = PIPE_STENCIL_OP_KEEP;195}196197switch (stencil_op) {198case PIPE_STENCIL_OP_KEEP:199res = stencilVals;200/* we can return early for this case */201return res;202case PIPE_STENCIL_OP_ZERO:203res = bld->zero;204break;205case PIPE_STENCIL_OP_REPLACE:206res = stencilRef;207break;208case PIPE_STENCIL_OP_INCR:209res = lp_build_add(bld, stencilVals, bld->one);210res = lp_build_min(bld, res, max);211break;212case PIPE_STENCIL_OP_DECR:213res = 
lp_build_sub(bld, stencilVals, bld->one);214res = lp_build_max(bld, res, bld->zero);215break;216case PIPE_STENCIL_OP_INCR_WRAP:217res = lp_build_add(bld, stencilVals, bld->one);218res = LLVMBuildAnd(builder, res, max, "");219break;220case PIPE_STENCIL_OP_DECR_WRAP:221res = lp_build_sub(bld, stencilVals, bld->one);222res = LLVMBuildAnd(builder, res, max, "");223break;224case PIPE_STENCIL_OP_INVERT:225res = LLVMBuildNot(builder, stencilVals, "");226res = LLVMBuildAnd(builder, res, max, "");227break;228default:229assert(0 && "bad stencil op mode");230res = bld->undef;231}232233return res;234}235236237/**238* Do the one or two-sided stencil test op/update.239*/240static LLVMValueRef241lp_build_stencil_op(struct lp_build_context *bld,242const struct pipe_stencil_state stencil[2],243enum stencil_op op,244LLVMValueRef stencilRefs[2],245LLVMValueRef stencilVals,246LLVMValueRef mask,247LLVMValueRef front_facing)248249{250LLVMBuilderRef builder = bld->gallivm->builder;251LLVMValueRef res;252253assert(stencil[0].enabled);254255/* do front face op */256res = lp_build_stencil_op_single(bld, &stencil[0], op,257stencilRefs[0], stencilVals);258259if (stencil[1].enabled && front_facing != NULL) {260/* do back face op */261LLVMValueRef back_res;262263back_res = lp_build_stencil_op_single(bld, &stencil[1], op,264stencilRefs[1], stencilVals);265266res = lp_build_select(bld, front_facing, res, back_res);267}268269if (stencil[0].writemask != 0xff ||270(stencil[1].enabled && front_facing != NULL && stencil[1].writemask != 0xff)) {271/* mask &= stencil[0].writemask */272LLVMValueRef writemask = lp_build_const_int_vec(bld->gallivm, bld->type,273stencil[0].writemask);274if (stencil[1].enabled && stencil[1].writemask != stencil[0].writemask && front_facing != NULL) {275LLVMValueRef back_writemask = lp_build_const_int_vec(bld->gallivm, bld->type,276stencil[1].writemask);277writemask = lp_build_select(bld, front_facing, writemask, back_writemask);278}279280mask = LLVMBuildAnd(builder, mask, 
writemask, "");281/* res = (res & mask) | (stencilVals & ~mask) */282res = lp_build_select_bitwise(bld, mask, res, stencilVals);283}284else {285/* res = mask ? res : stencilVals */286res = lp_build_select(bld, mask, res, stencilVals);287}288289return res;290}291292293294/**295* Return a type that matches the depth/stencil format.296*/297struct lp_type298lp_depth_type(const struct util_format_description *format_desc,299unsigned length)300{301struct lp_type type;302unsigned z_swizzle;303304assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);305assert(format_desc->block.width == 1);306assert(format_desc->block.height == 1);307308memset(&type, 0, sizeof type);309type.width = format_desc->block.bits;310311z_swizzle = format_desc->swizzle[0];312if (z_swizzle < 4) {313if (format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_FLOAT) {314type.floating = TRUE;315assert(z_swizzle == 0);316assert(format_desc->channel[z_swizzle].size == 32);317}318else if(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) {319assert(format_desc->block.bits <= 32);320assert(format_desc->channel[z_swizzle].normalized);321if (format_desc->channel[z_swizzle].size < format_desc->block.bits) {322/* Prefer signed integers when possible, as SSE has less support323* for unsigned comparison;324*/325type.sign = TRUE;326}327}328else329assert(0);330}331332type.length = length;333334return type;335}336337338/**339* Compute bitmask and bit shift to apply to the incoming fragment Z values340* and the Z buffer values needed before doing the Z comparison.341*342* Note that we leave the Z bits in the position that we find them343* in the Z buffer (typically 0xffffff00 or 0x00ffffff). 
That lets us344* get by with fewer bit twiddling steps.345*/346static boolean347get_z_shift_and_mask(const struct util_format_description *format_desc,348unsigned *shift, unsigned *width, unsigned *mask)349{350unsigned total_bits;351unsigned z_swizzle;352353assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);354assert(format_desc->block.width == 1);355assert(format_desc->block.height == 1);356357/* 64bit d/s format is special already extracted 32 bits */358total_bits = format_desc->block.bits > 32 ? 32 : format_desc->block.bits;359360z_swizzle = format_desc->swizzle[0];361362if (z_swizzle == PIPE_SWIZZLE_NONE)363return FALSE;364365*width = format_desc->channel[z_swizzle].size;366/* & 31 is for the same reason as the 32-bit limit above */367*shift = format_desc->channel[z_swizzle].shift & 31;368369if (*width == total_bits) {370*mask = 0xffffffff;371} else {372*mask = ((1 << *width) - 1) << *shift;373}374375return TRUE;376}377378379/**380* Compute bitmask and bit shift to apply to the framebuffer pixel values381* to put the stencil bits in the least significant position.382* (i.e. 0x000000ff)383*/384static boolean385get_s_shift_and_mask(const struct util_format_description *format_desc,386unsigned *shift, unsigned *mask)387{388unsigned s_swizzle;389unsigned sz;390391s_swizzle = format_desc->swizzle[1];392393if (s_swizzle == PIPE_SWIZZLE_NONE)394return FALSE;395396/* just special case 64bit d/s format */397if (format_desc->block.bits > 32) {398/* XXX big-endian? */399assert(format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);400*shift = 0;401*mask = 0xff;402return TRUE;403}404405*shift = format_desc->channel[s_swizzle].shift;406sz = format_desc->channel[s_swizzle].size;407*mask = (1U << sz) - 1U;408409return TRUE;410}411412413/**414* Perform the occlusion test and increase the counter.415* Test the depth mask. Add the number of channel which has none zero mask416* into the occlusion counter. e.g. 
maskvalue is {-1, -1, -1, -1}.417* The counter will add 4.418* TODO: could get that out of the fs loop.419*420* \param type holds element type of the mask vector.421* \param maskvalue is the depth test mask.422* \param counter is a pointer of the uint32 counter.423*/424void425lp_build_occlusion_count(struct gallivm_state *gallivm,426struct lp_type type,427LLVMValueRef maskvalue,428LLVMValueRef counter)429{430LLVMBuilderRef builder = gallivm->builder;431LLVMContextRef context = gallivm->context;432LLVMValueRef countmask = lp_build_const_int_vec(gallivm, type, 1);433LLVMValueRef count, newcount;434435assert(type.length <= 16);436assert(type.floating);437438if(util_get_cpu_caps()->has_sse && type.length == 4) {439const char *movmskintr = "llvm.x86.sse.movmsk.ps";440const char *popcntintr = "llvm.ctpop.i32";441LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,442lp_build_vec_type(gallivm, type), "");443bits = lp_build_intrinsic_unary(builder, movmskintr,444LLVMInt32TypeInContext(context), bits);445count = lp_build_intrinsic_unary(builder, popcntintr,446LLVMInt32TypeInContext(context), bits);447count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");448}449else if(util_get_cpu_caps()->has_avx && type.length == 8) {450const char *movmskintr = "llvm.x86.avx.movmsk.ps.256";451const char *popcntintr = "llvm.ctpop.i32";452LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,453lp_build_vec_type(gallivm, type), "");454bits = lp_build_intrinsic_unary(builder, movmskintr,455LLVMInt32TypeInContext(context), bits);456count = lp_build_intrinsic_unary(builder, popcntintr,457LLVMInt32TypeInContext(context), bits);458count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");459}460else {461unsigned i;462LLVMValueRef countv = LLVMBuildAnd(builder, maskvalue, countmask, "countv");463LLVMTypeRef counttype = LLVMIntTypeInContext(context, type.length * 8);464LLVMTypeRef i8vntype = LLVMVectorType(LLVMInt8TypeInContext(context), type.length * 
4);465LLVMValueRef shufflev, countd;466LLVMValueRef shuffles[16];467const char *popcntintr = NULL;468469countv = LLVMBuildBitCast(builder, countv, i8vntype, "");470471for (i = 0; i < type.length; i++) {472#if UTIL_ARCH_LITTLE_ENDIAN473shuffles[i] = lp_build_const_int32(gallivm, 4*i);474#else475shuffles[i] = lp_build_const_int32(gallivm, (4*i) + 3);476#endif477}478479shufflev = LLVMConstVector(shuffles, type.length);480countd = LLVMBuildShuffleVector(builder, countv, LLVMGetUndef(i8vntype), shufflev, "");481countd = LLVMBuildBitCast(builder, countd, counttype, "countd");482483/*484* XXX FIXME485* this is bad on cpus without popcount (on x86 supported by intel486* nehalem, amd barcelona, and up - not tied to sse42).487* Would be much faster to just sum the 4 elements of the vector with488* some horizontal add (shuffle/add/shuffle/add after the initial and).489*/490switch (type.length) {491case 4:492popcntintr = "llvm.ctpop.i32";493break;494case 8:495popcntintr = "llvm.ctpop.i64";496break;497case 16:498popcntintr = "llvm.ctpop.i128";499break;500default:501assert(0);502}503count = lp_build_intrinsic_unary(builder, popcntintr, counttype, countd);504505if (type.length > 8) {506count = LLVMBuildTrunc(builder, count, LLVMIntTypeInContext(context, 64), "");507}508else if (type.length < 8) {509count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");510}511}512newcount = LLVMBuildLoad(builder, counter, "origcount");513newcount = LLVMBuildAdd(builder, newcount, count, "newcount");514LLVMBuildStore(builder, newcount, counter);515}516517518/**519* Load depth/stencil values.520* The stored values are linear, swizzle them.521*522* \param type the data type of the fragment depth/stencil values523* \param format_desc description of the depth/stencil surface524* \param is_1d whether this resource has only one dimension525* \param loop_counter the current loop iteration526* \param depth_ptr pointer to the depth/stencil values of this 4x4 block527* \param 
depth_stride stride of the depth/stencil buffer528* \param z_fb contains z values loaded from fb (may include padding)529* \param s_fb contains s values loaded from fb (may include padding)530*/531void532lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm,533struct lp_type z_src_type,534const struct util_format_description *format_desc,535boolean is_1d,536LLVMValueRef depth_ptr,537LLVMValueRef depth_stride,538LLVMValueRef *z_fb,539LLVMValueRef *s_fb,540LLVMValueRef loop_counter)541{542LLVMBuilderRef builder = gallivm->builder;543LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4];544LLVMValueRef zs_dst1, zs_dst2;545LLVMValueRef zs_dst_ptr;546LLVMValueRef depth_offset1, depth_offset2;547LLVMTypeRef load_ptr_type;548unsigned depth_bytes = format_desc->block.bits / 8;549struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length);550struct lp_type zs_load_type = zs_type;551552zs_load_type.length = zs_load_type.length / 2;553load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0);554555if (z_src_type.length == 4) {556unsigned i;557LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter,558lp_build_const_int32(gallivm, 1), "");559LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter,560lp_build_const_int32(gallivm, 2), "");561LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb,562depth_stride, "");563depth_offset1 = LLVMBuildMul(builder, looplsb,564lp_build_const_int32(gallivm, depth_bytes * 2), "");565depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, "");566567/* just concatenate the loaded 2x2 values into 4-wide vector */568for (i = 0; i < 4; i++) {569shuffles[i] = lp_build_const_int32(gallivm, i);570}571}572else {573unsigned i;574LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter,575lp_build_const_int32(gallivm, 1), "");576assert(z_src_type.length == 8);577depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, "");578/*579* We load 2x4 values, and need to swizzle them (order580* 0,1,4,5,2,3,6,7) 
- not so hot with avx unfortunately.581*/582for (i = 0; i < 8; i++) {583shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);584}585}586587depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, "");588589/* Load current z/stencil values from z/stencil buffer */590zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, "");591zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, "");592zs_dst1 = LLVMBuildLoad(builder, zs_dst_ptr, "");593if (is_1d) {594zs_dst2 = lp_build_undef(gallivm, zs_load_type);595}596else {597zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, "");598zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, "");599zs_dst2 = LLVMBuildLoad(builder, zs_dst_ptr, "");600}601602*z_fb = LLVMBuildShuffleVector(builder, zs_dst1, zs_dst2,603LLVMConstVector(shuffles, zs_type.length), "");604*s_fb = *z_fb;605606if (format_desc->block.bits == 8) {607/* Extend stencil-only 8 bit values (S8_UINT) */608*s_fb = LLVMBuildZExt(builder, *s_fb,609lp_build_int_vec_type(gallivm, z_src_type), "");610}611612if (format_desc->block.bits < z_src_type.width) {613/* Extend destination ZS values (e.g., when reading from Z16_UNORM) */614*z_fb = LLVMBuildZExt(builder, *z_fb,615lp_build_int_vec_type(gallivm, z_src_type), "");616}617618else if (format_desc->block.bits > 32) {619/* rely on llvm to handle too wide vector we have here nicely */620unsigned i;621struct lp_type typex2 = zs_type;622struct lp_type s_type = zs_type;623LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 4];624LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 4];625LLVMValueRef tmp;626627typex2.width = typex2.width / 2;628typex2.length = typex2.length * 2;629s_type.width = s_type.width / 2;630s_type.floating = 0;631632tmp = LLVMBuildBitCast(builder, *z_fb,633lp_build_vec_type(gallivm, typex2), "");634635for (i = 0; i < zs_type.length; i++) {636shuffles1[i] = lp_build_const_int32(gallivm, i * 2);637shuffles2[i] = lp_build_const_int32(gallivm, 
i * 2 + 1);638}639*z_fb = LLVMBuildShuffleVector(builder, tmp, tmp,640LLVMConstVector(shuffles1, zs_type.length), "");641*s_fb = LLVMBuildShuffleVector(builder, tmp, tmp,642LLVMConstVector(shuffles2, zs_type.length), "");643*s_fb = LLVMBuildBitCast(builder, *s_fb,644lp_build_vec_type(gallivm, s_type), "");645lp_build_name(*s_fb, "s_dst");646}647648lp_build_name(*z_fb, "z_dst");649lp_build_name(*s_fb, "s_dst");650lp_build_name(*z_fb, "z_dst");651}652653/**654* Store depth/stencil values.655* Incoming values are swizzled (typically n 2x2 quads), stored linear.656* If there's a mask it will do select/store otherwise just store.657*658* \param type the data type of the fragment depth/stencil values659* \param format_desc description of the depth/stencil surface660* \param is_1d whether this resource has only one dimension661* \param mask_value the alive/dead pixel mask for the quad (vector)662* \param z_fb z values read from fb (with padding)663* \param s_fb s values read from fb (with padding)664* \param loop_counter the current loop iteration665* \param depth_ptr pointer to the depth/stencil values of this 4x4 block666* \param depth_stride stride of the depth/stencil buffer667* \param z_value the depth values to store (with padding)668* \param s_value the stencil values to store (with padding)669*/670void671lp_build_depth_stencil_write_swizzled(struct gallivm_state *gallivm,672struct lp_type z_src_type,673const struct util_format_description *format_desc,674boolean is_1d,675LLVMValueRef mask_value,676LLVMValueRef z_fb,677LLVMValueRef s_fb,678LLVMValueRef loop_counter,679LLVMValueRef depth_ptr,680LLVMValueRef depth_stride,681LLVMValueRef z_value,682LLVMValueRef s_value)683{684struct lp_build_context z_bld;685LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4];686LLVMBuilderRef builder = gallivm->builder;687LLVMValueRef zs_dst1, zs_dst2;688LLVMValueRef zs_dst_ptr1, zs_dst_ptr2;689LLVMValueRef depth_offset1, depth_offset2;690LLVMTypeRef load_ptr_type;691unsigned depth_bytes 
= format_desc->block.bits / 8;692struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length);693struct lp_type z_type = zs_type;694struct lp_type zs_load_type = zs_type;695696zs_load_type.length = zs_load_type.length / 2;697load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0);698699z_type.width = z_src_type.width;700701lp_build_context_init(&z_bld, gallivm, z_type);702703/*704* This is far from ideal, at least for late depth write we should do this705* outside the fs loop to avoid all the swizzle stuff.706*/707if (z_src_type.length == 4) {708LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter,709lp_build_const_int32(gallivm, 1), "");710LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter,711lp_build_const_int32(gallivm, 2), "");712LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb,713depth_stride, "");714depth_offset1 = LLVMBuildMul(builder, looplsb,715lp_build_const_int32(gallivm, depth_bytes * 2), "");716depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, "");717}718else {719unsigned i;720LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter,721lp_build_const_int32(gallivm, 1), "");722assert(z_src_type.length == 8);723depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, "");724/*725* We load 2x4 values, and need to swizzle them (order726* 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately.727*/728for (i = 0; i < 8; i++) {729shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);730}731}732733depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, "");734735zs_dst_ptr1 = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, "");736zs_dst_ptr1 = LLVMBuildBitCast(builder, zs_dst_ptr1, load_ptr_type, "");737zs_dst_ptr2 = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, "");738zs_dst_ptr2 = LLVMBuildBitCast(builder, zs_dst_ptr2, load_ptr_type, "");739740if (format_desc->block.bits > 32) {741s_value = LLVMBuildBitCast(builder, s_value, z_bld.vec_type, "");742}743744if 
(mask_value) {745z_value = lp_build_select(&z_bld, mask_value, z_value, z_fb);746if (format_desc->block.bits > 32) {747s_fb = LLVMBuildBitCast(builder, s_fb, z_bld.vec_type, "");748s_value = lp_build_select(&z_bld, mask_value, s_value, s_fb);749}750}751752if (zs_type.width < z_src_type.width) {753/* Truncate ZS values (e.g., when writing to Z16_UNORM) */754z_value = LLVMBuildTrunc(builder, z_value,755lp_build_int_vec_type(gallivm, zs_type), "");756}757758if (format_desc->block.bits <= 32) {759if (z_src_type.length == 4) {760zs_dst1 = lp_build_extract_range(gallivm, z_value, 0, 2);761zs_dst2 = lp_build_extract_range(gallivm, z_value, 2, 2);762}763else {764assert(z_src_type.length == 8);765zs_dst1 = LLVMBuildShuffleVector(builder, z_value, z_value,766LLVMConstVector(&shuffles[0],767zs_load_type.length), "");768zs_dst2 = LLVMBuildShuffleVector(builder, z_value, z_value,769LLVMConstVector(&shuffles[4],770zs_load_type.length), "");771}772}773else {774if (z_src_type.length == 4) {775zs_dst1 = lp_build_interleave2(gallivm, z_type,776z_value, s_value, 0);777zs_dst2 = lp_build_interleave2(gallivm, z_type,778z_value, s_value, 1);779}780else {781unsigned i;782LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 2];783assert(z_src_type.length == 8);784for (i = 0; i < 8; i++) {785shuffles[i*2] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);786shuffles[i*2+1] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2 +787z_src_type.length);788}789zs_dst1 = LLVMBuildShuffleVector(builder, z_value, s_value,790LLVMConstVector(&shuffles[0],791z_src_type.length), "");792zs_dst2 = LLVMBuildShuffleVector(builder, z_value, s_value,793LLVMConstVector(&shuffles[8],794z_src_type.length), "");795}796zs_dst1 = LLVMBuildBitCast(builder, zs_dst1,797lp_build_vec_type(gallivm, zs_load_type), "");798zs_dst2 = LLVMBuildBitCast(builder, zs_dst2,799lp_build_vec_type(gallivm, zs_load_type), "");800}801802LLVMBuildStore(builder, zs_dst1, zs_dst_ptr1);803if (!is_1d) 
{804LLVMBuildStore(builder, zs_dst2, zs_dst_ptr2);805}806}807808/**809* Generate code for performing depth and/or stencil tests.810* We operate on a vector of values (typically n 2x2 quads).811*812* \param depth the depth test state813* \param stencil the front/back stencil state814* \param type the data type of the fragment depth/stencil values815* \param format_desc description of the depth/stencil surface816* \param mask the alive/dead pixel mask for the quad (vector)817* \param cov_mask coverage mask818* \param stencil_refs the front/back stencil ref values (scalar)819* \param z_src the incoming depth/stencil values (n 2x2 quad values, float32)820* \param zs_dst the depth/stencil values in framebuffer821* \param face contains boolean value indicating front/back facing polygon822*/823void824lp_build_depth_stencil_test(struct gallivm_state *gallivm,825const struct lp_depth_state *depth,826const struct pipe_stencil_state stencil[2],827struct lp_type z_src_type,828const struct util_format_description *format_desc,829struct lp_build_mask_context *mask,830LLVMValueRef *cov_mask,831LLVMValueRef stencil_refs[2],832LLVMValueRef z_src,833LLVMValueRef z_fb,834LLVMValueRef s_fb,835LLVMValueRef face,836LLVMValueRef *z_value,837LLVMValueRef *s_value,838boolean do_branch)839{840LLVMBuilderRef builder = gallivm->builder;841struct lp_type z_type;842struct lp_build_context z_bld;843struct lp_build_context s_bld;844struct lp_type s_type;845unsigned z_shift = 0, z_width = 0, z_mask = 0;846LLVMValueRef z_dst = NULL;847LLVMValueRef stencil_vals = NULL;848LLVMValueRef z_bitmask = NULL, stencil_shift = NULL;849LLVMValueRef z_pass = NULL, s_pass_mask = NULL;850LLVMValueRef current_mask = mask ? lp_build_mask_value(mask) : *cov_mask;851LLVMValueRef front_facing = NULL;852boolean have_z, have_s;853854/*855* Depths are expected to be between 0 and 1, even if they are stored in856* floats. 
Setting these bits here will ensure that the lp_build_conv() call857* below won't try to unnecessarily clamp the incoming values.858*/859if(z_src_type.floating) {860z_src_type.sign = FALSE;861z_src_type.norm = TRUE;862}863else {864assert(!z_src_type.sign);865assert(z_src_type.norm);866}867868/* Pick the type matching the depth-stencil format. */869z_type = lp_depth_type(format_desc, z_src_type.length);870871/* Pick the intermediate type for depth operations. */872z_type.width = z_src_type.width;873assert(z_type.length == z_src_type.length);874875/* FIXME: for non-float depth/stencil might generate better code876* if we'd always split it up to use 128bit operations.877* For stencil we'd almost certainly want to pack to 8xi16 values,878* for z just run twice.879*/880881/* Sanity checking */882{883ASSERTED const unsigned z_swizzle = format_desc->swizzle[0];884ASSERTED const unsigned s_swizzle = format_desc->swizzle[1];885886assert(z_swizzle != PIPE_SWIZZLE_NONE ||887s_swizzle != PIPE_SWIZZLE_NONE);888889assert(depth->enabled || stencil[0].enabled);890891assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);892assert(format_desc->block.width == 1);893assert(format_desc->block.height == 1);894895if (stencil[0].enabled) {896assert(s_swizzle < 4);897assert(format_desc->channel[s_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED);898assert(format_desc->channel[s_swizzle].pure_integer);899assert(!format_desc->channel[s_swizzle].normalized);900assert(format_desc->channel[s_swizzle].size == 8);901}902903if (depth->enabled) {904assert(z_swizzle < 4);905if (z_type.floating) {906assert(z_swizzle == 0);907assert(format_desc->channel[z_swizzle].type ==908UTIL_FORMAT_TYPE_FLOAT);909assert(format_desc->channel[z_swizzle].size == 32);910}911else {912assert(format_desc->channel[z_swizzle].type ==913UTIL_FORMAT_TYPE_UNSIGNED);914assert(format_desc->channel[z_swizzle].normalized);915assert(!z_type.fixed);916}917}918}919920921/* Setup build context for Z vals 
*/922lp_build_context_init(&z_bld, gallivm, z_type);923924/* Setup build context for stencil vals */925s_type = lp_int_type(z_type);926lp_build_context_init(&s_bld, gallivm, s_type);927928/* Compute and apply the Z/stencil bitmasks and shifts.929*/930{931unsigned s_shift, s_mask;932933z_dst = z_fb;934stencil_vals = s_fb;935936have_z = get_z_shift_and_mask(format_desc, &z_shift, &z_width, &z_mask);937have_s = get_s_shift_and_mask(format_desc, &s_shift, &s_mask);938939if (have_z) {940if (z_mask != 0xffffffff) {941z_bitmask = lp_build_const_int_vec(gallivm, z_type, z_mask);942}943944/*945* Align the framebuffer Z 's LSB to the right.946*/947if (z_shift) {948LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);949z_dst = LLVMBuildLShr(builder, z_dst, shift, "z_dst");950} else if (z_bitmask) {951z_dst = LLVMBuildAnd(builder, z_dst, z_bitmask, "z_dst");952} else {953lp_build_name(z_dst, "z_dst");954}955}956957if (have_s) {958if (s_shift) {959LLVMValueRef shift = lp_build_const_int_vec(gallivm, s_type, s_shift);960stencil_vals = LLVMBuildLShr(builder, stencil_vals, shift, "");961stencil_shift = shift; /* used below */962}963964if (s_mask != 0xffffffff) {965LLVMValueRef mask = lp_build_const_int_vec(gallivm, s_type, s_mask);966stencil_vals = LLVMBuildAnd(builder, stencil_vals, mask, "");967}968969lp_build_name(stencil_vals, "s_dst");970}971}972973if (stencil[0].enabled) {974975if (face) {976if (0) {977/*978* XXX: the scalar expansion below produces atrocious code979* (basically producing a 64bit scalar value, then moving the 2980* 32bit pieces separately to simd, plus 4 shuffles, which is981* seriously lame). 
But the scalar-simd transitions are always982* tricky, so no big surprise there.983* This here would be way better, however llvm has some serious984* trouble later using it in the select, probably because it will985* recognize the expression as constant and move the simd value986* away (out of the loop) - and then it will suddenly try987* constructing i1 high-bit masks out of it later...988* (Try piglit stencil-twoside.)989* Note this is NOT due to using SExt/Trunc, it fails exactly the990* same even when using native compare/select.991* I cannot reproduce this problem when using stand-alone compiler992* though, suggesting some problem with optimization passes...993* (With stand-alone compilation, the construction of this mask994* value, no matter if the easy 3 instruction here or the complex995* 16+ one below, never gets separated from where it's used.)996* The scalar code still has the same problem, but the generated997* code looks a bit better at least for some reason, even if998* mostly by luck (the fundamental issue clearly is the same).999*/1000front_facing = lp_build_broadcast(gallivm, s_bld.vec_type, face);1001/* front_facing = face != 0 ? ~0 : 0 */1002front_facing = lp_build_compare(gallivm, s_bld.type,1003PIPE_FUNC_NOTEQUAL,1004front_facing, s_bld.zero);1005} else {1006LLVMValueRef zero = lp_build_const_int32(gallivm, 0);10071008/* front_facing = face != 0 ? 
~0 : 0 */1009front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, "");1010front_facing = LLVMBuildSExt(builder, front_facing,1011LLVMIntTypeInContext(gallivm->context,1012s_bld.type.length*s_bld.type.width),1013"");1014front_facing = LLVMBuildBitCast(builder, front_facing,1015s_bld.int_vec_type, "");10161017}1018}10191020s_pass_mask = lp_build_stencil_test(&s_bld, stencil,1021stencil_refs, stencil_vals,1022front_facing);10231024/* apply stencil-fail operator */1025{1026LLVMValueRef s_fail_mask = lp_build_andnot(&s_bld, current_mask, s_pass_mask);1027stencil_vals = lp_build_stencil_op(&s_bld, stencil, S_FAIL_OP,1028stencil_refs, stencil_vals,1029s_fail_mask, front_facing);1030}1031}10321033if (depth->enabled) {1034/*1035* Convert fragment Z to the desired type, aligning the LSB to the right.1036*/10371038assert(z_type.width == z_src_type.width);1039assert(z_type.length == z_src_type.length);1040assert(lp_check_value(z_src_type, z_src));1041if (z_src_type.floating) {1042/*1043* Convert from floating point values1044*/10451046if (!z_type.floating) {1047z_src = lp_build_clamped_float_to_unsigned_norm(gallivm,1048z_src_type,1049z_width,1050z_src);1051}1052} else {1053/*1054* Convert from unsigned normalized values.1055*/10561057assert(!z_src_type.sign);1058assert(!z_src_type.fixed);1059assert(z_src_type.norm);1060assert(!z_type.floating);1061if (z_src_type.width > z_width) {1062LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_src_type,1063z_src_type.width - z_width);1064z_src = LLVMBuildLShr(builder, z_src, shift, "");1065}1066}1067assert(lp_check_value(z_type, z_src));10681069lp_build_name(z_src, "z_src");10701071/* compare src Z to dst Z, returning 'pass' mask */1072z_pass = lp_build_cmp(&z_bld, depth->func, z_src, z_dst);10731074/* mask off bits that failed stencil test */1075if (s_pass_mask) {1076current_mask = LLVMBuildAnd(builder, current_mask, s_pass_mask, "");1077}10781079if (!stencil[0].enabled && mask) {1080/* We can potentially skip all 
remaining operations here, but only1081* if stencil is disabled because we still need to update the stencil1082* buffer values. Don't need to update Z buffer values.1083*/1084lp_build_mask_update(mask, z_pass);10851086if (do_branch) {1087lp_build_mask_check(mask);1088}1089}10901091if (depth->writemask) {1092LLVMValueRef z_pass_mask;10931094/* mask off bits that failed Z test */1095z_pass_mask = LLVMBuildAnd(builder, current_mask, z_pass, "");10961097/* Mix the old and new Z buffer values.1098* z_dst[i] = zselectmask[i] ? z_src[i] : z_dst[i]1099*/1100z_dst = lp_build_select(&z_bld, z_pass_mask, z_src, z_dst);1101}11021103if (stencil[0].enabled) {1104/* update stencil buffer values according to z pass/fail result */1105LLVMValueRef z_fail_mask, z_pass_mask;11061107/* apply Z-fail operator */1108z_fail_mask = lp_build_andnot(&s_bld, current_mask, z_pass);1109stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_FAIL_OP,1110stencil_refs, stencil_vals,1111z_fail_mask, front_facing);11121113/* apply Z-pass operator */1114z_pass_mask = LLVMBuildAnd(builder, current_mask, z_pass, "");1115stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,1116stencil_refs, stencil_vals,1117z_pass_mask, front_facing);1118}1119}1120else {1121/* No depth test: apply Z-pass operator to stencil buffer values which1122* passed the stencil test.1123*/1124s_pass_mask = LLVMBuildAnd(builder, current_mask, s_pass_mask, "");1125stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,1126stencil_refs, stencil_vals,1127s_pass_mask, front_facing);1128}11291130/* Put Z and stencil bits in the right place */1131if (have_z && z_shift) {1132LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);1133z_dst = LLVMBuildShl(builder, z_dst, shift, "");1134}1135if (stencil_vals && stencil_shift)1136stencil_vals = LLVMBuildShl(builder, stencil_vals,1137stencil_shift, "");11381139/* Finally, merge the z/stencil values */1140if (format_desc->block.bits <= 32) {1141if (have_z && 
have_s)1142*z_value = LLVMBuildOr(builder, z_dst, stencil_vals, "");1143else if (have_z)1144*z_value = z_dst;1145else1146*z_value = stencil_vals;1147*s_value = *z_value;1148}1149else {1150*z_value = z_dst;1151*s_value = stencil_vals;1152}11531154if (mask) {1155if (s_pass_mask)1156lp_build_mask_update(mask, s_pass_mask);11571158if (depth->enabled && stencil[0].enabled)1159lp_build_mask_update(mask, z_pass);1160} else {1161LLVMValueRef tmp_mask = *cov_mask;1162if (s_pass_mask)1163tmp_mask = LLVMBuildAnd(builder, tmp_mask, s_pass_mask, "");11641165/* for multisample we don't do the stencil optimisation so update always */1166if (depth->enabled)1167tmp_mask = LLVMBuildAnd(builder, tmp_mask, z_pass, "");1168*cov_mask = tmp_mask;1169}1170}1171117211731174