/* Path: blob/21.2-virgl/src/gallium/drivers/llvmpipe/lp_rast_tri.c */
/* 4570 views */
/**************************************************************************1*2* Copyright 2007-2009 VMware, Inc.3* All Rights Reserved.4*5* Permission is hereby granted, free of charge, to any person obtaining a6* copy of this software and associated documentation files (the7* "Software"), to deal in the Software without restriction, including8* without limitation the rights to use, copy, modify, merge, publish,9* distribute, sub license, and/or sell copies of the Software, and to10* permit persons to whom the Software is furnished to do so, subject to11* the following conditions:12*13* The above copyright notice and this permission notice (including the14* next paragraph) shall be included in all copies or substantial portions15* of the Software.16*17* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS18* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF19* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.20* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR21* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,22* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE23* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.24*25**************************************************************************/2627/*28* Rasterization for binned triangles within a tile29*/3031#include <limits.h>32#include "util/u_math.h"33#include "lp_debug.h"34#include "lp_perf.h"35#include "lp_rast_priv.h"3637/**38* Shade all pixels in a 4x4 block.39*/40static void41block_full_4(struct lp_rasterizer_task *task,42const struct lp_rast_triangle *tri,43int x, int y)44{45lp_rast_shade_quads_all(task, &tri->inputs, x, y);46}474849/**50* Shade all pixels in a 16x16 block.51*/52static void53block_full_16(struct lp_rasterizer_task *task,54const struct lp_rast_triangle *tri,55int x, int y)56{57unsigned ix, iy;58assert(x % 16 == 0);59assert(y % 16 == 0);60for (iy = 0; iy < 16; iy += 4)61for (ix = 0; ix < 
16; ix += 4)62block_full_4(task, tri, x + ix, y + iy);63}6465static inline unsigned66build_mask_linear(int32_t c, int32_t dcdx, int32_t dcdy)67{68unsigned mask = 0;6970int32_t c0 = c;71int32_t c1 = c0 + dcdy;72int32_t c2 = c1 + dcdy;73int32_t c3 = c2 + dcdy;7475mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0);76mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1);77mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2);78mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3);79mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4);80mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5);81mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6);82mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7);83mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8);84mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9);85mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10);86mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11);87mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12);88mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13);89mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14);90mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15);9192return mask;93}949596static inline void97build_masks(int32_t c,98int32_t cdiff,99int32_t dcdx,100int32_t dcdy,101unsigned *outmask,102unsigned *partmask)103{104*outmask |= build_mask_linear(c, dcdx, dcdy);105*partmask |= build_mask_linear(c + cdiff, dcdx, dcdy);106}107108void109lp_rast_triangle_3_16(struct lp_rasterizer_task *task,110const union lp_rast_cmd_arg arg)111{112union lp_rast_cmd_arg arg2;113arg2.triangle.tri = arg.triangle.tri;114arg2.triangle.plane_mask = (1<<3)-1;115lp_rast_triangle_3(task, arg2);116}117118void119lp_rast_triangle_3_4(struct lp_rasterizer_task *task,120const union lp_rast_cmd_arg arg)121{122lp_rast_triangle_3_16(task, arg);123}124125void126lp_rast_triangle_4_16(struct lp_rasterizer_task *task,127const union lp_rast_cmd_arg arg)128{129union lp_rast_cmd_arg arg2;130arg2.triangle.tri = arg.triangle.tri;131arg2.triangle.plane_mask = (1<<4)-1;132lp_rast_triangle_4(task, arg2);133}134135void136lp_rast_triangle_ms_3_16(struct lp_rasterizer_task *task,137const union 
lp_rast_cmd_arg arg)138{139union lp_rast_cmd_arg arg2;140arg2.triangle.tri = arg.triangle.tri;141arg2.triangle.plane_mask = (1<<3)-1;142lp_rast_triangle_ms_3(task, arg2);143}144145void146lp_rast_triangle_ms_3_4(struct lp_rasterizer_task *task,147const union lp_rast_cmd_arg arg)148{149lp_rast_triangle_ms_3_16(task, arg);150}151152void153lp_rast_triangle_ms_4_16(struct lp_rasterizer_task *task,154const union lp_rast_cmd_arg arg)155{156union lp_rast_cmd_arg arg2;157arg2.triangle.tri = arg.triangle.tri;158arg2.triangle.plane_mask = (1<<4)-1;159lp_rast_triangle_ms_4(task, arg2);160}161162#if defined(PIPE_ARCH_SSE)163164#include <emmintrin.h>165#include "util/u_sse.h"166167168static inline void169build_masks_sse(int c,170int cdiff,171int dcdx,172int dcdy,173unsigned *outmask,174unsigned *partmask)175{176__m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);177__m128i xdcdy = _mm_set1_epi32(dcdy);178179/* Get values across the quad180*/181__m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);182__m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);183__m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);184185{186__m128i cstep01, cstep23, result;187188cstep01 = _mm_packs_epi32(cstep0, cstep1);189cstep23 = _mm_packs_epi32(cstep2, cstep3);190result = _mm_packs_epi16(cstep01, cstep23);191192*outmask |= _mm_movemask_epi8(result);193}194195196{197__m128i cio4 = _mm_set1_epi32(cdiff);198__m128i cstep01, cstep23, result;199200cstep0 = _mm_add_epi32(cstep0, cio4);201cstep1 = _mm_add_epi32(cstep1, cio4);202cstep2 = _mm_add_epi32(cstep2, cio4);203cstep3 = _mm_add_epi32(cstep3, cio4);204205cstep01 = _mm_packs_epi32(cstep0, cstep1);206cstep23 = _mm_packs_epi32(cstep2, cstep3);207result = _mm_packs_epi16(cstep01, cstep23);208209*partmask |= _mm_movemask_epi8(result);210}211}212213214static inline unsigned215build_mask_linear_sse(int c, int dcdx, int dcdy)216{217__m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);218__m128i xdcdy = _mm_set1_epi32(dcdy);219220/* Get values across the 
quad221*/222__m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);223__m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);224__m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);225226/* pack pairs of results into epi16227*/228__m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);229__m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);230231/* pack into epi8, preserving sign bits232*/233__m128i result = _mm_packs_epi16(cstep01, cstep23);234235/* extract sign bits to create mask236*/237return _mm_movemask_epi8(result);238}239240static inline unsigned241sign_bits4(const __m128i *cstep, int cdiff)242{243244/* Adjust the step values245*/246__m128i cio4 = _mm_set1_epi32(cdiff);247__m128i cstep0 = _mm_add_epi32(cstep[0], cio4);248__m128i cstep1 = _mm_add_epi32(cstep[1], cio4);249__m128i cstep2 = _mm_add_epi32(cstep[2], cio4);250__m128i cstep3 = _mm_add_epi32(cstep[3], cio4);251252/* Pack down to epi8253*/254__m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);255__m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);256__m128i result = _mm_packs_epi16(cstep01, cstep23);257258/* Extract the sign bits259*/260return _mm_movemask_epi8(result);261}262263264#define NR_PLANES 3265266void267lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,268const union lp_rast_cmd_arg arg)269{270const struct lp_rast_triangle *tri = arg.triangle.tri;271const struct lp_rast_plane *plane = GET_PLANES(tri);272int x = (arg.triangle.plane_mask & 0xff) + task->x;273int y = (arg.triangle.plane_mask >> 8) + task->y;274unsigned i, j;275276struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];277unsigned nr = 0;278279/* p0 and p2 are aligned, p1 is not (plane size 24 bytes). 
*/280__m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */281__m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);282__m128i p2 = _mm_load_si128((__m128i *)&plane[2]);283__m128i zero = _mm_setzero_si128();284285__m128i c, dcdx, dcdy, rej4;286__m128i dcdx_neg_mask, dcdy_neg_mask;287__m128i dcdx2, dcdx3;288289__m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */290__m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */291__m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */292__m128i unused;293294transpose4_epi32(&p0, &p1, &p2, &zero,295&c, &unused, &dcdx, &dcdy);296297/* recalc eo - easier than trying to load as scalars / shuffle... */298dcdx_neg_mask = _mm_srai_epi32(dcdx, 31);299dcdy_neg_mask = _mm_srai_epi32(dcdy, 31);300rej4 = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),301_mm_and_si128(dcdx_neg_mask, dcdx));302303/* Adjust dcdx;304*/305dcdx = _mm_sub_epi32(zero, dcdx);306307c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));308c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));309rej4 = _mm_slli_epi32(rej4, 2);310311/* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */312c = _mm_sub_epi32(c, _mm_set1_epi32(1));313rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1));314315dcdx2 = _mm_add_epi32(dcdx, dcdx);316dcdx3 = _mm_add_epi32(dcdx2, dcdx);317318transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,319&span_0, &span_1, &span_2, &unused);320321for (i = 0; i < 4; i++) {322__m128i cx = c;323324for (j = 0; j < 4; j++) {325__m128i c4rej = _mm_add_epi32(cx, rej4);326__m128i rej_masks = _mm_srai_epi32(c4rej, 31);327328/* if (is_zero(rej_masks)) */329if (_mm_movemask_epi8(rej_masks) == 0) {330__m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);331__m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);332__m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);333334__m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);335336__m128i c0_1 = 
_mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));337__m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));338__m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));339340__m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);341__m128i c_01 = _mm_packs_epi32(c_0, c_1);342343__m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));344__m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));345__m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));346347__m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);348349__m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));350__m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));351__m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));352353__m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);354__m128i c_23 = _mm_packs_epi32(c_2, c_3);355__m128i c_0123 = _mm_packs_epi16(c_01, c_23);356357unsigned mask = _mm_movemask_epi8(c_0123);358359out[nr].i = i;360out[nr].j = j;361out[nr].mask = mask;362if (mask != 0xffff)363nr++;364}365cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));366}367368c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));369}370371for (i = 0; i < nr; i++)372lp_rast_shade_quads_mask(task,373&tri->inputs,374x + 4 * out[i].j,375y + 4 * out[i].i,3760xffff & ~out[i].mask);377}378379void380lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,381const union lp_rast_cmd_arg arg)382{383const struct lp_rast_triangle *tri = arg.triangle.tri;384const struct lp_rast_plane *plane = GET_PLANES(tri);385unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;386unsigned y = (arg.triangle.plane_mask >> 8) + task->y;387388/* p0 and p2 are aligned, p1 is not (plane size 24 bytes). 
*/389__m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */390__m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);391__m128i p2 = _mm_load_si128((__m128i *)&plane[2]);392__m128i zero = _mm_setzero_si128();393394__m128i c, dcdx, dcdy;395__m128i dcdx2, dcdx3;396397__m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */398__m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */399__m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */400__m128i unused;401402transpose4_epi32(&p0, &p1, &p2, &zero,403&c, &unused, &dcdx, &dcdy);404405/* Adjust dcdx;406*/407dcdx = _mm_sub_epi32(zero, dcdx);408409c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));410c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));411412/* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */413c = _mm_sub_epi32(c, _mm_set1_epi32(1));414415dcdx2 = _mm_add_epi32(dcdx, dcdx);416dcdx3 = _mm_add_epi32(dcdx2, dcdx);417418transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,419&span_0, &span_1, &span_2, &unused);420421422{423__m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);424__m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);425__m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);426427__m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);428429__m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));430__m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));431__m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));432433__m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);434__m128i c_01 = _mm_packs_epi32(c_0, c_1);435436__m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));437__m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));438__m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));439440__m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);441442__m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));443__m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 
1));444__m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));445446__m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);447__m128i c_23 = _mm_packs_epi32(c_2, c_3);448__m128i c_0123 = _mm_packs_epi16(c_01, c_23);449450unsigned mask = _mm_movemask_epi8(c_0123);451452if (mask != 0xffff)453lp_rast_shade_quads_mask(task,454&tri->inputs,455x,456y,4570xffff & ~mask);458}459}460461#undef NR_PLANES462463#else464465#if defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN466467#include <altivec.h>468#include "util/u_pwr8.h"469470static inline void471build_masks_ppc(int c,472int cdiff,473int dcdx,474int dcdy,475unsigned *outmask,476unsigned *partmask)477{478__m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);479__m128i xdcdy = (__m128i) vec_splats(dcdy);480481/* Get values across the quad482*/483__m128i cstep1 = vec_add_epi32(cstep0, xdcdy);484__m128i cstep2 = vec_add_epi32(cstep1, xdcdy);485__m128i cstep3 = vec_add_epi32(cstep2, xdcdy);486487{488__m128i cstep01, cstep23, result;489490cstep01 = vec_packs_epi32(cstep0, cstep1);491cstep23 = vec_packs_epi32(cstep2, cstep3);492result = vec_packs_epi16(cstep01, cstep23);493494*outmask |= vec_movemask_epi8(result);495}496497498{499__m128i cio4 = (__m128i) vec_splats(cdiff);500__m128i cstep01, cstep23, result;501502cstep0 = vec_add_epi32(cstep0, cio4);503cstep1 = vec_add_epi32(cstep1, cio4);504cstep2 = vec_add_epi32(cstep2, cio4);505cstep3 = vec_add_epi32(cstep3, cio4);506507cstep01 = vec_packs_epi32(cstep0, cstep1);508cstep23 = vec_packs_epi32(cstep2, cstep3);509result = vec_packs_epi16(cstep01, cstep23);510511*partmask |= vec_movemask_epi8(result);512}513}514515static inline unsigned516build_mask_linear_ppc(int c, int dcdx, int dcdy)517{518__m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);519__m128i xdcdy = (__m128i) vec_splats(dcdy);520521/* Get values across the quad522*/523__m128i cstep1 = vec_add_epi32(cstep0, xdcdy);524__m128i cstep2 = vec_add_epi32(cstep1, xdcdy);525__m128i cstep3 = 
vec_add_epi32(cstep2, xdcdy);526527/* pack pairs of results into epi16528*/529__m128i cstep01 = vec_packs_epi32(cstep0, cstep1);530__m128i cstep23 = vec_packs_epi32(cstep2, cstep3);531532/* pack into epi8, preserving sign bits533*/534__m128i result = vec_packs_epi16(cstep01, cstep23);535536/* extract sign bits to create mask537*/538return vec_movemask_epi8(result);539}540541static inline __m128i542lp_plane_to_m128i(const struct lp_rast_plane *plane)543{544return vec_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,545(int32_t)plane->dcdy, (int32_t)plane->eo);546}547548#define NR_PLANES 3549550void551lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,552const union lp_rast_cmd_arg arg)553{554const struct lp_rast_triangle *tri = arg.triangle.tri;555const struct lp_rast_plane *plane = GET_PLANES(tri);556int x = (arg.triangle.plane_mask & 0xff) + task->x;557int y = (arg.triangle.plane_mask >> 8) + task->y;558unsigned i, j;559560struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];561unsigned nr = 0;562563__m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */564__m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */565__m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */566__m128i zero = vec_splats((unsigned char) 0);567568__m128i c;569__m128i dcdx;570__m128i dcdy;571__m128i rej4;572573__m128i dcdx2;574__m128i dcdx3;575576__m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */577__m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */578__m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */579__m128i unused;580581__m128i vshuf_mask0;582__m128i vshuf_mask1;583__m128i vshuf_mask2;584585#if UTIL_ARCH_LITTLE_ENDIAN586vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x03020100);587vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x07060504);588vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x0B0A0908);589#else590vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x0C0D0E0F);591vshuf_mask1 = (__m128i) vec_splats((unsigned 
int) 0x08090A0B);592vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x04050607);593#endif594595transpose4_epi32(&p0, &p1, &p2, &zero,596&c, &dcdx, &dcdy, &rej4);597598/* Adjust dcdx;599*/600dcdx = vec_sub_epi32(zero, dcdx);601602c = vec_add_epi32(c, vec_mullo_epi32(dcdx, (__m128i) vec_splats(x)));603c = vec_add_epi32(c, vec_mullo_epi32(dcdy, (__m128i) vec_splats(y)));604rej4 = vec_slli_epi32(rej4, 2);605606/*607* Adjust so we can just check the sign bit (< 0 comparison),608* instead of having to do a less efficient <= 0 comparison609*/610c = vec_sub_epi32(c, (__m128i) vec_splats((unsigned int) 1));611rej4 = vec_add_epi32(rej4, (__m128i) vec_splats((unsigned int) 1));612613dcdx2 = vec_add_epi32(dcdx, dcdx);614dcdx3 = vec_add_epi32(dcdx2, dcdx);615616transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,617&span_0, &span_1, &span_2, &unused);618619for (i = 0; i < 4; i++) {620__m128i cx = c;621622for (j = 0; j < 4; j++) {623__m128i c4rej = vec_add_epi32(cx, rej4);624__m128i rej_masks = vec_srai_epi32(c4rej, 31);625626/* if (is_zero(rej_masks)) */627if (vec_movemask_epi8(rej_masks) == 0) {628__m128i c0_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask0), span_0);629__m128i c1_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask1), span_1);630__m128i c2_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask2), span_2);631632__m128i c_0 = vec_or(vec_or(c0_0, c1_0), c2_0);633634__m128i c0_1 = vec_add_epi32(c0_0, vec_perm(dcdy, dcdy, vshuf_mask0));635__m128i c1_1 = vec_add_epi32(c1_0, vec_perm(dcdy, dcdy, vshuf_mask1));636__m128i c2_1 = vec_add_epi32(c2_0, vec_perm(dcdy, dcdy, vshuf_mask2));637638__m128i c_1 = vec_or(vec_or(c0_1, c1_1), c2_1);639__m128i c_01 = vec_packs_epi32(c_0, c_1);640641__m128i c0_2 = vec_add_epi32(c0_1, vec_perm(dcdy, dcdy, vshuf_mask0));642__m128i c1_2 = vec_add_epi32(c1_1, vec_perm(dcdy, dcdy, vshuf_mask1));643__m128i c2_2 = vec_add_epi32(c2_1, vec_perm(dcdy, dcdy, vshuf_mask2));644645__m128i c_2 = vec_or(vec_or(c0_2, c1_2), c2_2);646647__m128i c0_3 = vec_add_epi32(c0_2, 
vec_perm(dcdy, dcdy, vshuf_mask0));648__m128i c1_3 = vec_add_epi32(c1_2, vec_perm(dcdy, dcdy, vshuf_mask1));649__m128i c2_3 = vec_add_epi32(c2_2, vec_perm(dcdy, dcdy, vshuf_mask2));650651__m128i c_3 = vec_or(vec_or(c0_3, c1_3), c2_3);652__m128i c_23 = vec_packs_epi32(c_2, c_3);653__m128i c_0123 = vec_packs_epi16(c_01, c_23);654655unsigned mask = vec_movemask_epi8(c_0123);656657out[nr].i = i;658out[nr].j = j;659out[nr].mask = mask;660if (mask != 0xffff)661nr++;662}663cx = vec_add_epi32(cx, vec_slli_epi32(dcdx, 2));664}665666c = vec_add_epi32(c, vec_slli_epi32(dcdy, 2));667}668669for (i = 0; i < nr; i++)670lp_rast_shade_quads_mask(task,671&tri->inputs,672x + 4 * out[i].j,673y + 4 * out[i].i,6740xffff & ~out[i].mask);675}676677#undef NR_PLANES678679#else680681void682lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,683const union lp_rast_cmd_arg arg)684{685union lp_rast_cmd_arg arg2;686arg2.triangle.tri = arg.triangle.tri;687arg2.triangle.plane_mask = (1<<3)-1;688lp_rast_triangle_32_3(task, arg2);689}690691#endif /* _ARCH_PWR8 && UTIL_ARCH_LITTLE_ENDIAN */692693void694lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,695const union lp_rast_cmd_arg arg)696{697union lp_rast_cmd_arg arg2;698arg2.triangle.tri = arg.triangle.tri;699arg2.triangle.plane_mask = (1<<4)-1;700lp_rast_triangle_32_4(task, arg2);701}702703void704lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,705const union lp_rast_cmd_arg arg)706{707lp_rast_triangle_32_3_16(task, arg);708}709710#endif711712#if defined PIPE_ARCH_SSE713#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_sse((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)714#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_sse((int)c, dcdx, dcdy)715#elif (defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN)716#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_ppc((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)717#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_ppc((int)c, dcdx, 
dcdy)718#else719#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks(c, cdiff, dcdx, dcdy, omask, pmask)720#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear(c, dcdx, dcdy)721#endif722723#define RASTER_64 1724725#define TAG(x) x##_1726#define NR_PLANES 1727#include "lp_rast_tri_tmp.h"728729#define TAG(x) x##_2730#define NR_PLANES 2731#include "lp_rast_tri_tmp.h"732733#define TAG(x) x##_3734#define NR_PLANES 3735/*#define TRI_4 lp_rast_triangle_3_4*/736/*#define TRI_16 lp_rast_triangle_3_16*/737#include "lp_rast_tri_tmp.h"738739#define TAG(x) x##_4740#define NR_PLANES 4741/*#define TRI_16 lp_rast_triangle_4_16*/742#include "lp_rast_tri_tmp.h"743744#define TAG(x) x##_5745#define NR_PLANES 5746#include "lp_rast_tri_tmp.h"747748#define TAG(x) x##_6749#define NR_PLANES 6750#include "lp_rast_tri_tmp.h"751752#define TAG(x) x##_7753#define NR_PLANES 7754#include "lp_rast_tri_tmp.h"755756#define TAG(x) x##_8757#define NR_PLANES 8758#include "lp_rast_tri_tmp.h"759760#undef RASTER_64761762#define TAG(x) x##_32_1763#define NR_PLANES 1764#include "lp_rast_tri_tmp.h"765766#define TAG(x) x##_32_2767#define NR_PLANES 2768#include "lp_rast_tri_tmp.h"769770#define TAG(x) x##_32_3771#define NR_PLANES 3772/*#define TRI_4 lp_rast_triangle_3_4*/773/*#define TRI_16 lp_rast_triangle_3_16*/774#include "lp_rast_tri_tmp.h"775776#define TAG(x) x##_32_4777#define NR_PLANES 4778#ifdef PIPE_ARCH_SSE779#define TRI_16 lp_rast_triangle_32_4_16780#endif781#include "lp_rast_tri_tmp.h"782783#define TAG(x) x##_32_5784#define NR_PLANES 5785#include "lp_rast_tri_tmp.h"786787#define TAG(x) x##_32_6788#define NR_PLANES 6789#include "lp_rast_tri_tmp.h"790791#define TAG(x) x##_32_7792#define NR_PLANES 7793#include "lp_rast_tri_tmp.h"794795#define TAG(x) x##_32_8796#define NR_PLANES 8797#include "lp_rast_tri_tmp.h"798799#define MULTISAMPLE 1800#define RASTER_64 1801802#define TAG(x) x##_ms_1803#define NR_PLANES 1804#include "lp_rast_tri_tmp.h"805806#define TAG(x) x##_ms_2807#define 
NR_PLANES 2808#include "lp_rast_tri_tmp.h"809810#define TAG(x) x##_ms_3811#define NR_PLANES 3812/*#define TRI_4 lp_rast_triangle_3_4*/813/*#define TRI_16 lp_rast_triangle_3_16*/814#include "lp_rast_tri_tmp.h"815816#define TAG(x) x##_ms_4817#define NR_PLANES 4818/*#define TRI_16 lp_rast_triangle_4_16*/819#include "lp_rast_tri_tmp.h"820821#define TAG(x) x##_ms_5822#define NR_PLANES 5823#include "lp_rast_tri_tmp.h"824825#define TAG(x) x##_ms_6826#define NR_PLANES 6827#include "lp_rast_tri_tmp.h"828829#define TAG(x) x##_ms_7830#define NR_PLANES 7831#include "lp_rast_tri_tmp.h"832833#define TAG(x) x##_ms_8834#define NR_PLANES 8835#include "lp_rast_tri_tmp.h"836837#undef RASTER_64838839840