Path: blob/21.2-virgl/src/gallium/auxiliary/translate/translate_sse.c
4565 views
/*1* Copyright 2003 VMware, Inc.2* All Rights Reserved.3*4* Permission is hereby granted, free of charge, to any person obtaining a5* copy of this software and associated documentation files (the "Software"),6* to deal in the Software without restriction, including without limitation7* on the rights to use, copy, modify, merge, publish, distribute, sub8* license, and/or sell copies of the Software, and to permit persons to whom9* the Software is furnished to do so, subject to the following conditions:10*11* The above copyright notice and this permission notice (including the next12* paragraph) shall be included in all copies or substantial portions of the13* Software.14*15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,17* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL18* VMWARE AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,19* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR20* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE21* USE OR OTHER DEALINGS IN THE SOFTWARE.22*23* Authors:24* Keith Whitwell <[email protected]>25*/262728#include "pipe/p_config.h"29#include "pipe/p_compiler.h"30#include "util/u_memory.h"31#include "util/u_math.h"32#include "util/format/u_format.h"3334#include "translate.h"353637#if (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)) && !defined(EMBEDDED_DEVICE)3839#include "rtasm/rtasm_cpu.h"40#include "rtasm/rtasm_x86sse.h"414243#define X 044#define Y 145#define Z 246#define W 3474849struct translate_buffer50{51const void *base_ptr;52uintptr_t stride;53unsigned max_index;54};5556struct translate_buffer_variant57{58unsigned buffer_index;59unsigned instance_divisor;60void *ptr; /* updated either per vertex or per instance */61};626364#define ELEMENT_BUFFER_INSTANCE_ID 10016566#define NUM_CONSTS 76768enum69{70CONST_IDENTITY,71CONST_INV_127,72CONST_INV_255,73CONST_INV_32767,74CONST_INV_65535,75CONST_INV_2147483647,76CONST_25577};7879#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}80static float consts[NUM_CONSTS][4] = {81{0, 0, 0, 1},82C(1.0 / 127.0),83C(1.0 / 255.0),84C(1.0 / 32767.0),85C(1.0 / 65535.0),86C(1.0 / 2147483647.0),87C(255.0)88};8990#undef C9192struct translate_sse93{94struct translate translate;9596struct x86_function linear_func;97struct x86_function elt_func;98struct x86_function elt16_func;99struct x86_function elt8_func;100struct x86_function *func;101102PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];103int8_t reg_to_const[16];104int8_t const_to_reg[NUM_CONSTS];105106struct translate_buffer buffer[TRANSLATE_MAX_ATTRIBS];107unsigned nr_buffers;108109/* Multiple buffer variants can map to a single buffer. */110struct translate_buffer_variant buffer_variant[TRANSLATE_MAX_ATTRIBS];111unsigned nr_buffer_variants;112113/* Multiple elements can map to a single buffer variant. */114unsigned element_to_buffer_variant[TRANSLATE_MAX_ATTRIBS];115116boolean use_instancing;117unsigned instance_id;118unsigned start_instance;119120/* these are actually known values, but putting them in a struct121* like this is helpful to keep them in sync across the file.122*/123struct x86_reg tmp_EAX;124struct x86_reg tmp2_EDX;125struct x86_reg src_ECX;126struct x86_reg idx_ESI; /* either start+i or &elt[i] */127struct x86_reg machine_EDI;128struct x86_reg outbuf_EBX;129struct x86_reg count_EBP; /* decrements to zero */130};131132133static int134get_offset(const void *a, const void *b)135{136return (const char *) b - (const char *) a;137}138139140static struct x86_reg141get_const(struct translate_sse *p, unsigned id)142{143struct x86_reg reg;144unsigned i;145146if (p->const_to_reg[id] >= 0)147return x86_make_reg(file_XMM, p->const_to_reg[id]);148149for (i = 2; i < 8; ++i) {150if (p->reg_to_const[i] < 0)151break;152}153154/* TODO: be smarter here */155if (i == 8)156--i;157158reg = x86_make_reg(file_XMM, i);159160if (p->reg_to_const[i] >= 0)161p->const_to_reg[p->reg_to_const[i]] = -1;162163p->reg_to_const[i] = id;164p->const_to_reg[id] = i;165166/* TODO: this should happen outside the loop, if possible */167sse_movaps(p->func, reg,168x86_make_disp(p->machine_EDI,169get_offset(p, &p->consts[id][0])));170171return reg;172}173174175/* load the data in a SSE2 register, padding with zeros */176static boolean177emit_load_sse2(struct translate_sse *p,178struct x86_reg data, struct x86_reg src, unsigned size)179{180struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);181struct x86_reg tmp = p->tmp_EAX;182switch (size) {183case 1:184x86_movzx8(p->func, tmp, src);185sse2_movd(p->func, data, tmp);186break;187case 2:188x86_movzx16(p->func, tmp, src);189sse2_movd(p->func, data, tmp);190break;191case 3:192x86_movzx8(p->func, tmp, x86_make_disp(src, 2));193x86_shl_imm(p->func, tmp, 16);194x86_mov16(p->func, tmp, src);195sse2_movd(p->func, data, tmp);196break;197case 4:198sse2_movd(p->func, data, src);199break;200case 6:201sse2_movd(p->func, data, src);202x86_movzx16(p->func, tmp, x86_make_disp(src, 4));203sse2_movd(p->func, tmpXMM, tmp);204sse2_punpckldq(p->func, data, tmpXMM);205break;206case 8:207sse2_movq(p->func, data, src);208break;209case 12:210sse2_movq(p->func, data, src);211sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));212sse2_punpcklqdq(p->func, data, tmpXMM);213break;214case 16:215sse2_movdqu(p->func, data, src);216break;217default:218return FALSE;219}220return TRUE;221}222223224/* this value can be passed for the out_chans argument */225#define CHANNELS_0001 5226227228/* this function will load #chans float values, and will229* pad the register with zeroes at least up to out_chans.230*231* If out_chans is set to CHANNELS_0001, then the fourth232* value will be padded with 1. Only pass this value if233* chans < 4 or results are undefined.234*/235static void236emit_load_float32(struct translate_sse *p, struct x86_reg data,237struct x86_reg arg0, unsigned out_chans, unsigned chans)238{239switch (chans) {240case 1:241/* a 0 0 0242* a 0 0 1243*/244sse_movss(p->func, data, arg0);245if (out_chans == CHANNELS_0001)246sse_orps(p->func, data, get_const(p, CONST_IDENTITY));247break;248case 2:249/* 0 0 0 1250* a b 0 1251*/252if (out_chans == CHANNELS_0001)253sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),254SHUF(X, Y, Z, W));255else if (out_chans > 2)256sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));257sse_movlps(p->func, data, arg0);258break;259case 3:260/* Have to jump through some hoops:261*262* c 0 0 0263* c 0 0 1 if out_chans == CHANNELS_0001264* 0 0 c 0/1265* a b c 0/1266*/267sse_movss(p->func, data, x86_make_disp(arg0, 8));268if (out_chans == CHANNELS_0001)269sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),270SHUF(X, Y, Z, W));271sse_shufps(p->func, data, data, SHUF(Y, Z, X, W));272sse_movlps(p->func, data, arg0);273break;274case 4:275sse_movups(p->func, data, arg0);276break;277}278}279280/* this function behaves like emit_load_float32, but loads28164-bit floating point numbers, converting them to 32-bit282ones */283static void284emit_load_float64to32(struct translate_sse *p, struct x86_reg data,285struct x86_reg arg0, unsigned out_chans, unsigned chans)286{287struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);288switch (chans) {289case 1:290sse2_movsd(p->func, data, arg0);291if (out_chans > 1)292sse2_cvtpd2ps(p->func, data, data);293else294sse2_cvtsd2ss(p->func, data, data);295if (out_chans == CHANNELS_0001)296sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),297SHUF(X, Y, Z, W));298break;299case 2:300sse2_movupd(p->func, data, arg0);301sse2_cvtpd2ps(p->func, data, data);302if (out_chans == CHANNELS_0001)303sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),304SHUF(X, Y, Z, W));305else if (out_chans > 2)306sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));307break;308case 3:309sse2_movupd(p->func, data, arg0);310sse2_cvtpd2ps(p->func, data, data);311sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));312if (out_chans > 3)313sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);314else315sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);316sse_movlhps(p->func, data, tmpXMM);317if (out_chans == CHANNELS_0001)318sse_orps(p->func, data, get_const(p, CONST_IDENTITY));319break;320case 4:321sse2_movupd(p->func, data, arg0);322sse2_cvtpd2ps(p->func, data, data);323sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));324sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);325sse_movlhps(p->func, data, tmpXMM);326break;327}328}329330331static void332emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr,333struct x86_reg dst_xmm, struct x86_reg src_gpr,334struct x86_reg src_xmm)335{336if (x86_target(p->func) != X86_32)337x64_mov64(p->func, dst_gpr, src_gpr);338else {339/* TODO: when/on which CPUs is SSE2 actually better than SSE? */340if (x86_target_caps(p->func) & X86_SSE2)341sse2_movq(p->func, dst_xmm, src_xmm);342else343sse_movlps(p->func, dst_xmm, src_xmm);344}345}346347348static void349emit_load64(struct translate_sse *p, struct x86_reg dst_gpr,350struct x86_reg dst_xmm, struct x86_reg src)351{352emit_mov64(p, dst_gpr, dst_xmm, src, src);353}354355356static void357emit_store64(struct translate_sse *p, struct x86_reg dst,358struct x86_reg src_gpr, struct x86_reg src_xmm)359{360emit_mov64(p, dst, dst, src_gpr, src_xmm);361}362363364static void365emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)366{367if (x86_target_caps(p->func) & X86_SSE2)368sse2_movdqu(p->func, dst, src);369else370sse_movups(p->func, dst, src);371}372373374/* TODO: this uses unaligned accesses liberally, which is great on Nehalem,375* but may or may not be good on older processors376* TODO: may perhaps want to use non-temporal stores here if possible377*/378static void379emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src,380unsigned size)381{382struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);383struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);384struct x86_reg dataGPR = p->tmp_EAX;385struct x86_reg dataGPR2 = p->tmp2_EDX;386387if (size < 8) {388switch (size) {389case 1:390x86_mov8(p->func, dataGPR, src);391x86_mov8(p->func, dst, dataGPR);392break;393case 2:394x86_mov16(p->func, dataGPR, src);395x86_mov16(p->func, dst, dataGPR);396break;397case 3:398x86_mov16(p->func, dataGPR, src);399x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));400x86_mov16(p->func, dst, dataGPR);401x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);402break;403case 4:404x86_mov(p->func, dataGPR, src);405x86_mov(p->func, dst, dataGPR);406break;407case 6:408x86_mov(p->func, dataGPR, src);409x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));410x86_mov(p->func, dst, dataGPR);411x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);412break;413}414}415else if (!(x86_target_caps(p->func) & X86_SSE)) {416unsigned i = 0;417assert((size & 3) == 0);418for (i = 0; i < size; i += 4) {419x86_mov(p->func, dataGPR, x86_make_disp(src, i));420x86_mov(p->func, x86_make_disp(dst, i), dataGPR);421}422}423else {424switch (size) {425case 8:426emit_load64(p, dataGPR, dataXMM, src);427emit_store64(p, dst, dataGPR, dataXMM);428break;429case 12:430emit_load64(p, dataGPR2, dataXMM, src);431x86_mov(p->func, dataGPR, x86_make_disp(src, 8));432emit_store64(p, dst, dataGPR2, dataXMM);433x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);434break;435case 16:436emit_mov128(p, dataXMM, src);437emit_mov128(p, dst, dataXMM);438break;439case 24:440emit_mov128(p, dataXMM, src);441emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));442emit_mov128(p, dst, dataXMM);443emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);444break;445case 32:446emit_mov128(p, dataXMM, src);447emit_mov128(p, dataXMM2, x86_make_disp(src, 16));448emit_mov128(p, dst, dataXMM);449emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);450break;451default:452assert(0);453}454}455}456457static boolean458translate_attr_convert(struct translate_sse *p,459const struct translate_element *a,460struct x86_reg src, struct x86_reg dst)461{462const struct util_format_description *input_desc =463util_format_description(a->input_format);464const struct util_format_description *output_desc =465util_format_description(a->output_format);466unsigned i;467boolean id_swizzle = TRUE;468unsigned swizzle[4] =469{ PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE,470PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE };471unsigned needed_chans = 0;472unsigned imms[2] = { 0, 0x3f800000 };473474if (a->output_format == PIPE_FORMAT_NONE475|| a->input_format == PIPE_FORMAT_NONE)476return FALSE;477478if (input_desc->channel[0].size & 7)479return FALSE;480481if (input_desc->colorspace != output_desc->colorspace)482return FALSE;483484for (i = 1; i < input_desc->nr_channels; ++i) {485if (memcmp486(&input_desc->channel[i], &input_desc->channel[0],487sizeof(input_desc->channel[0])))488return FALSE;489}490491for (i = 1; i < output_desc->nr_channels; ++i) {492if (memcmp493(&output_desc->channel[i], &output_desc->channel[0],494sizeof(output_desc->channel[0]))) {495return FALSE;496}497}498499for (i = 0; i < output_desc->nr_channels; ++i) {500if (output_desc->swizzle[i] < 4)501swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];502}503504if ((x86_target_caps(p->func) & X86_SSE) &&505(0 || a->output_format == PIPE_FORMAT_R32_FLOAT506|| a->output_format == PIPE_FORMAT_R32G32_FLOAT507|| a->output_format == PIPE_FORMAT_R32G32B32_FLOAT508|| a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) {509struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);510511for (i = 0; i < output_desc->nr_channels; ++i) {512if (swizzle[i] == PIPE_SWIZZLE_0513&& i >= input_desc->nr_channels)514swizzle[i] = i;515}516517for (i = 0; i < output_desc->nr_channels; ++i) {518if (swizzle[i] < 4)519needed_chans = MAX2(needed_chans, swizzle[i] + 1);520if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i)521id_swizzle = FALSE;522}523524if (needed_chans > 0) {525switch (input_desc->channel[0].type) {526case UTIL_FORMAT_TYPE_UNSIGNED:527if (!(x86_target_caps(p->func) & X86_SSE2))528return FALSE;529emit_load_sse2(p, dataXMM, src,530input_desc->channel[0].size *531input_desc->nr_channels >> 3);532533/* TODO: add support for SSE4.1 pmovzx */534switch (input_desc->channel[0].size) {535case 8:536/* TODO: this may be inefficient due to get_identity() being537* used both as a float and integer register.538*/539sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));540sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));541break;542case 16:543sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));544break;545case 32: /* we lose precision here */546sse2_psrld_imm(p->func, dataXMM, 1);547break;548default:549return FALSE;550}551sse2_cvtdq2ps(p->func, dataXMM, dataXMM);552if (input_desc->channel[0].normalized) {553struct x86_reg factor;554switch (input_desc->channel[0].size) {555case 8:556factor = get_const(p, CONST_INV_255);557break;558case 16:559factor = get_const(p, CONST_INV_65535);560break;561case 32:562factor = get_const(p, CONST_INV_2147483647);563break;564default:565assert(0);566factor.disp = 0;567factor.file = 0;568factor.idx = 0;569factor.mod = 0;570break;571}572sse_mulps(p->func, dataXMM, factor);573}574else if (input_desc->channel[0].size == 32)575/* compensate for the bit we threw away to fit u32 into s32 */576sse_addps(p->func, dataXMM, dataXMM);577break;578case UTIL_FORMAT_TYPE_SIGNED:579if (!(x86_target_caps(p->func) & X86_SSE2))580return FALSE;581emit_load_sse2(p, dataXMM, src,582input_desc->channel[0].size *583input_desc->nr_channels >> 3);584585/* TODO: add support for SSE4.1 pmovsx */586switch (input_desc->channel[0].size) {587case 8:588sse2_punpcklbw(p->func, dataXMM, dataXMM);589sse2_punpcklbw(p->func, dataXMM, dataXMM);590sse2_psrad_imm(p->func, dataXMM, 24);591break;592case 16:593sse2_punpcklwd(p->func, dataXMM, dataXMM);594sse2_psrad_imm(p->func, dataXMM, 16);595break;596case 32: /* we lose precision here */597break;598default:599return FALSE;600}601sse2_cvtdq2ps(p->func, dataXMM, dataXMM);602if (input_desc->channel[0].normalized) {603struct x86_reg factor;604switch (input_desc->channel[0].size) {605case 8:606factor = get_const(p, CONST_INV_127);607break;608case 16:609factor = get_const(p, CONST_INV_32767);610break;611case 32:612factor = get_const(p, CONST_INV_2147483647);613break;614default:615assert(0);616factor.disp = 0;617factor.file = 0;618factor.idx = 0;619factor.mod = 0;620break;621}622sse_mulps(p->func, dataXMM, factor);623}624break;625626break;627case UTIL_FORMAT_TYPE_FLOAT:628if (input_desc->channel[0].size != 32629&& input_desc->channel[0].size != 64) {630return FALSE;631}632if (swizzle[3] == PIPE_SWIZZLE_1633&& input_desc->nr_channels <= 3) {634swizzle[3] = PIPE_SWIZZLE_W;635needed_chans = CHANNELS_0001;636}637switch (input_desc->channel[0].size) {638case 32:639emit_load_float32(p, dataXMM, src, needed_chans,640input_desc->nr_channels);641break;642case 64: /* we lose precision here */643if (!(x86_target_caps(p->func) & X86_SSE2))644return FALSE;645emit_load_float64to32(p, dataXMM, src, needed_chans,646input_desc->nr_channels);647break;648default:649return FALSE;650}651break;652default:653return FALSE;654}655656if (!id_swizzle) {657sse_shufps(p->func, dataXMM, dataXMM,658SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]));659}660}661662if (output_desc->nr_channels >= 4663&& swizzle[0] < PIPE_SWIZZLE_0664&& swizzle[1] < PIPE_SWIZZLE_0665&& swizzle[2] < PIPE_SWIZZLE_0666&& swizzle[3] < PIPE_SWIZZLE_0) {667sse_movups(p->func, dst, dataXMM);668}669else {670if (output_desc->nr_channels >= 2671&& swizzle[0] < PIPE_SWIZZLE_0672&& swizzle[1] < PIPE_SWIZZLE_0) {673sse_movlps(p->func, dst, dataXMM);674}675else {676if (swizzle[0] < PIPE_SWIZZLE_0) {677sse_movss(p->func, dst, dataXMM);678}679else {680x86_mov_imm(p->func, dst,681imms[swizzle[0] - PIPE_SWIZZLE_0]);682}683684if (output_desc->nr_channels >= 2) {685if (swizzle[1] < PIPE_SWIZZLE_0) {686sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));687sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);688}689else {690x86_mov_imm(p->func, x86_make_disp(dst, 4),691imms[swizzle[1] - PIPE_SWIZZLE_0]);692}693}694}695696if (output_desc->nr_channels >= 3) {697if (output_desc->nr_channels >= 4698&& swizzle[2] < PIPE_SWIZZLE_0699&& swizzle[3] < PIPE_SWIZZLE_0) {700sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);701}702else {703if (swizzle[2] < PIPE_SWIZZLE_0) {704sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));705sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);706}707else {708x86_mov_imm(p->func, x86_make_disp(dst, 8),709imms[swizzle[2] - PIPE_SWIZZLE_0]);710}711712if (output_desc->nr_channels >= 4) {713if (swizzle[3] < PIPE_SWIZZLE_0) {714sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));715sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);716}717else {718x86_mov_imm(p->func, x86_make_disp(dst, 12),719imms[swizzle[3] - PIPE_SWIZZLE_0]);720}721}722}723}724}725return TRUE;726}727else if ((x86_target_caps(p->func) & X86_SSE2)728&& input_desc->channel[0].size == 8729&& output_desc->channel[0].size == 16730&& output_desc->channel[0].normalized ==731input_desc->channel[0].normalized &&732(0 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED733&& output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)734|| (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED735&& output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)736|| (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED737&& output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED))) {738struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);739struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);740struct x86_reg tmp = p->tmp_EAX;741unsigned imms[2] = { 0, 1 };742743for (i = 0; i < output_desc->nr_channels; ++i) {744if (swizzle[i] == PIPE_SWIZZLE_0745&& i >= input_desc->nr_channels) {746swizzle[i] = i;747}748}749750for (i = 0; i < output_desc->nr_channels; ++i) {751if (swizzle[i] < 4)752needed_chans = MAX2(needed_chans, swizzle[i] + 1);753if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i)754id_swizzle = FALSE;755}756757if (needed_chans > 0) {758emit_load_sse2(p, dataXMM, src,759input_desc->channel[0].size *760input_desc->nr_channels >> 3);761762switch (input_desc->channel[0].type) {763case UTIL_FORMAT_TYPE_UNSIGNED:764if (input_desc->channel[0].normalized) {765sse2_punpcklbw(p->func, dataXMM, dataXMM);766if (output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)767sse2_psrlw_imm(p->func, dataXMM, 1);768}769else770sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));771break;772case UTIL_FORMAT_TYPE_SIGNED:773if (input_desc->channel[0].normalized) {774sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY));775sse2_punpcklbw(p->func, tmpXMM, dataXMM);776sse2_psllw_imm(p->func, dataXMM, 9);777sse2_psrlw_imm(p->func, dataXMM, 8);778sse2_por(p->func, tmpXMM, dataXMM);779sse2_psrlw_imm(p->func, dataXMM, 7);780sse2_por(p->func, tmpXMM, dataXMM);781{782struct x86_reg t = dataXMM;783dataXMM = tmpXMM;784tmpXMM = t;785}786}787else {788sse2_punpcklbw(p->func, dataXMM, dataXMM);789sse2_psraw_imm(p->func, dataXMM, 8);790}791break;792default:793assert(0);794}795796if (output_desc->channel[0].normalized)797imms[1] =798(output_desc->channel[0].type ==799UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff;800801if (!id_swizzle)802sse2_pshuflw(p->func, dataXMM, dataXMM,803(swizzle[0] & 3) | ((swizzle[1] & 3) << 2) |804((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));805}806807if (output_desc->nr_channels >= 4808&& swizzle[0] < PIPE_SWIZZLE_0809&& swizzle[1] < PIPE_SWIZZLE_0810&& swizzle[2] < PIPE_SWIZZLE_0811&& swizzle[3] < PIPE_SWIZZLE_0) {812sse2_movq(p->func, dst, dataXMM);813}814else {815if (swizzle[0] < PIPE_SWIZZLE_0) {816if (output_desc->nr_channels >= 2817&& swizzle[1] < PIPE_SWIZZLE_0) {818sse2_movd(p->func, dst, dataXMM);819}820else {821sse2_movd(p->func, tmp, dataXMM);822x86_mov16(p->func, dst, tmp);823if (output_desc->nr_channels >= 2)824x86_mov16_imm(p->func, x86_make_disp(dst, 2),825imms[swizzle[1] - PIPE_SWIZZLE_0]);826}827}828else {829if (output_desc->nr_channels >= 2830&& swizzle[1] >= PIPE_SWIZZLE_0) {831x86_mov_imm(p->func, dst,832(imms[swizzle[1] - PIPE_SWIZZLE_0] << 16) |833imms[swizzle[0] - PIPE_SWIZZLE_0]);834}835else {836x86_mov16_imm(p->func, dst,837imms[swizzle[0] - PIPE_SWIZZLE_0]);838if (output_desc->nr_channels >= 2) {839sse2_movd(p->func, tmp, dataXMM);840x86_shr_imm(p->func, tmp, 16);841x86_mov16(p->func, x86_make_disp(dst, 2), tmp);842}843}844}845846if (output_desc->nr_channels >= 3) {847if (swizzle[2] < PIPE_SWIZZLE_0) {848if (output_desc->nr_channels >= 4849&& swizzle[3] < PIPE_SWIZZLE_0) {850sse2_psrlq_imm(p->func, dataXMM, 32);851sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);852}853else {854sse2_psrlq_imm(p->func, dataXMM, 32);855sse2_movd(p->func, tmp, dataXMM);856x86_mov16(p->func, x86_make_disp(dst, 4), tmp);857if (output_desc->nr_channels >= 4) {858x86_mov16_imm(p->func, x86_make_disp(dst, 6),859imms[swizzle[3] - PIPE_SWIZZLE_0]);860}861}862}863else {864if (output_desc->nr_channels >= 4865&& swizzle[3] >= PIPE_SWIZZLE_0) {866x86_mov_imm(p->func, x86_make_disp(dst, 4),867(imms[swizzle[3] - PIPE_SWIZZLE_0] << 16)868| imms[swizzle[2] - PIPE_SWIZZLE_0]);869}870else {871x86_mov16_imm(p->func, x86_make_disp(dst, 4),872imms[swizzle[2] - PIPE_SWIZZLE_0]);873874if (output_desc->nr_channels >= 4) {875sse2_psrlq_imm(p->func, dataXMM, 48);876sse2_movd(p->func, tmp, dataXMM);877x86_mov16(p->func, x86_make_disp(dst, 6), tmp);878}879}880}881}882}883return TRUE;884}885else if (!memcmp(&output_desc->channel[0], &input_desc->channel[0],886sizeof(output_desc->channel[0]))) {887struct x86_reg tmp = p->tmp_EAX;888unsigned i;889890if (input_desc->channel[0].size == 8 && input_desc->nr_channels == 4891&& output_desc->nr_channels == 4892&& swizzle[0] == PIPE_SWIZZLE_W893&& swizzle[1] == PIPE_SWIZZLE_Z894&& swizzle[2] == PIPE_SWIZZLE_Y895&& swizzle[3] == PIPE_SWIZZLE_X) {896/* TODO: support movbe */897x86_mov(p->func, tmp, src);898x86_bswap(p->func, tmp);899x86_mov(p->func, dst, tmp);900return TRUE;901}902903for (i = 0; i < output_desc->nr_channels; ++i) {904switch (output_desc->channel[0].size) {905case 8:906if (swizzle[i] >= PIPE_SWIZZLE_0) {907unsigned v = 0;908if (swizzle[i] == PIPE_SWIZZLE_1) {909switch (output_desc->channel[0].type) {910case UTIL_FORMAT_TYPE_UNSIGNED:911v = output_desc->channel[0].normalized ? 0xff : 1;912break;913case UTIL_FORMAT_TYPE_SIGNED:914v = output_desc->channel[0].normalized ? 0x7f : 1;915break;916default:917return FALSE;918}919}920x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);921}922else {923x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));924x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);925}926break;927case 16:928if (swizzle[i] >= PIPE_SWIZZLE_0) {929unsigned v = 0;930if (swizzle[i] == PIPE_SWIZZLE_1) {931switch (output_desc->channel[1].type) {932case UTIL_FORMAT_TYPE_UNSIGNED:933v = output_desc->channel[1].normalized ? 0xffff : 1;934break;935case UTIL_FORMAT_TYPE_SIGNED:936v = output_desc->channel[1].normalized ? 0x7fff : 1;937break;938case UTIL_FORMAT_TYPE_FLOAT:939v = 0x3c00;940break;941default:942return FALSE;943}944}945x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);946}947else if (swizzle[i] == PIPE_SWIZZLE_0) {948x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0);949}950else {951x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));952x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);953}954break;955case 32:956if (swizzle[i] >= PIPE_SWIZZLE_0) {957unsigned v = 0;958if (swizzle[i] == PIPE_SWIZZLE_1) {959switch (output_desc->channel[1].type) {960case UTIL_FORMAT_TYPE_UNSIGNED:961v = output_desc->channel[1].normalized ? 0xffffffff : 1;962break;963case UTIL_FORMAT_TYPE_SIGNED:964v = output_desc->channel[1].normalized ? 0x7fffffff : 1;965break;966case UTIL_FORMAT_TYPE_FLOAT:967v = 0x3f800000;968break;969default:970return FALSE;971}972}973x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);974}975else {976x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));977x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);978}979break;980case 64:981if (swizzle[i] >= PIPE_SWIZZLE_0) {982unsigned l = 0;983unsigned h = 0;984if (swizzle[i] == PIPE_SWIZZLE_1) {985switch (output_desc->channel[1].type) {986case UTIL_FORMAT_TYPE_UNSIGNED:987h = output_desc->channel[1].normalized ? 0xffffffff : 0;988l = output_desc->channel[1].normalized ? 0xffffffff : 1;989break;990case UTIL_FORMAT_TYPE_SIGNED:991h = output_desc->channel[1].normalized ? 0x7fffffff : 0;992l = output_desc->channel[1].normalized ? 0xffffffff : 1;993break;994case UTIL_FORMAT_TYPE_FLOAT:995h = 0x3ff00000;996l = 0;997break;998default:999return FALSE;1000}1001}1002x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);1003x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);1004}1005else {1006if (x86_target_caps(p->func) & X86_SSE) {1007struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);1008emit_load64(p, tmp, tmpXMM,1009x86_make_disp(src, swizzle[i] * 8));1010emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);1011}1012else {1013x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));1014x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);1015x86_mov(p->func, tmp,1016x86_make_disp(src, swizzle[i] * 8 + 4));1017x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);1018}1019}1020break;1021default:1022return FALSE;1023}1024}1025return TRUE;1026}1027/* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */1028else if ((x86_target_caps(p->func) & X86_SSE2) &&1029a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT &&1030(0 || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM1031|| a-> output_format == PIPE_FORMAT_R8G8B8A8_UNORM)) {1032struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);10331034/* load */1035sse_movups(p->func, dataXMM, src);10361037if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) {1038sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 1, 0, 3));1039}10401041/* scale by 255.0 */1042sse_mulps(p->func, dataXMM, get_const(p, CONST_255));10431044/* pack and emit */1045sse2_cvtps2dq(p->func, dataXMM, dataXMM);1046sse2_packssdw(p->func, dataXMM, dataXMM);1047sse2_packuswb(p->func, dataXMM, dataXMM);1048sse2_movd(p->func, dst, dataXMM);10491050return TRUE;1051}10521053return FALSE;1054}105510561057static boolean1058translate_attr(struct translate_sse *p,1059const struct translate_element *a,1060struct x86_reg src, struct x86_reg dst)1061{1062if (a->input_format == a->output_format) {1063emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));1064return TRUE;1065}10661067return translate_attr_convert(p, a, src, dst);1068}106910701071static boolean1072init_inputs(struct translate_sse *p, unsigned index_size)1073{1074unsigned i;1075struct x86_reg instance_id =1076x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));1077struct x86_reg start_instance =1078x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance));10791080for (i = 0; i < p->nr_buffer_variants; i++) {1081struct translate_buffer_variant *variant = &p->buffer_variant[i];1082struct translate_buffer *buffer = &p->buffer[variant->buffer_index];10831084if (!index_size || variant->instance_divisor) {1085struct x86_reg buf_max_index =1086x86_make_disp(p->machine_EDI, get_offset(p, &buffer->max_index));1087struct x86_reg buf_stride =1088x86_make_disp(p->machine_EDI, get_offset(p, &buffer->stride));1089struct x86_reg buf_ptr =1090x86_make_disp(p->machine_EDI, get_offset(p, &variant->ptr));1091struct x86_reg buf_base_ptr =1092x86_make_disp(p->machine_EDI, get_offset(p, &buffer->base_ptr));1093struct x86_reg elt = p->idx_ESI;1094struct x86_reg tmp_EAX = p->tmp_EAX;10951096/* Calculate pointer to first attrib:1097* base_ptr + stride * index, where index depends on instance divisor1098*/1099if (variant->instance_divisor) {1100struct x86_reg tmp_EDX = p->tmp2_EDX;11011102/* Start with instance = instance_id1103* which is true if divisor is 1.1104*/1105x86_mov(p->func, tmp_EAX, instance_id);11061107if (variant->instance_divisor != 1) {1108struct x86_reg tmp_ECX = p->src_ECX;11091110/* TODO: Add x86_shr() to rtasm and use it whenever1111* instance divisor is power of two.1112*/1113x86_xor(p->func, tmp_EDX, tmp_EDX);1114x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor);1115x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */1116}11171118/* instance = (instance_id / divisor) + start_instance1119*/1120x86_mov(p->func, tmp_EDX, start_instance);1121x86_add(p->func, tmp_EAX, tmp_EDX);11221123/* XXX we need to clamp the index here too, but to a1124* per-array max value, not the draw->pt.max_index value1125* that's being given to us via translate->set_buffer().1126*/1127}1128else {1129x86_mov(p->func, tmp_EAX, elt);11301131/* Clamp to max_index1132*/1133x86_cmp(p->func, tmp_EAX, buf_max_index);1134x86_cmovcc(p->func, tmp_EAX, buf_max_index, cc_AE);1135}11361137x86_mov(p->func, p->tmp2_EDX, buf_stride);1138x64_rexw(p->func);1139x86_imul(p->func, tmp_EAX, p->tmp2_EDX);1140x64_rexw(p->func);1141x86_add(p->func, tmp_EAX, buf_base_ptr);11421143x86_cmp(p->func, p->count_EBP, p->tmp_EAX);11441145/* In the linear case, keep the buffer pointer instead of the1146* index number.1147*/1148if (!index_size && p->nr_buffer_variants == 1) {1149x64_rexw(p->func);1150x86_mov(p->func, elt, tmp_EAX);1151}1152else {1153x64_rexw(p->func);1154x86_mov(p->func, buf_ptr, tmp_EAX);1155}1156}1157}11581159return TRUE;1160}116111621163static struct x86_reg1164get_buffer_ptr(struct translate_sse *p,1165unsigned index_size, unsigned var_idx, struct x86_reg elt)1166{1167if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {1168return x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));1169}1170if (!index_size && p->nr_buffer_variants == 1) {1171return p->idx_ESI;1172}1173else if (!index_size || p->buffer_variant[var_idx].instance_divisor) {1174struct x86_reg ptr = p->src_ECX;1175struct x86_reg buf_ptr =1176x86_make_disp(p->machine_EDI,1177get_offset(p, &p->buffer_variant[var_idx].ptr));11781179x64_rexw(p->func);1180x86_mov(p->func, ptr, buf_ptr);1181return ptr;1182}1183else {1184struct x86_reg ptr = p->src_ECX;1185const struct translate_buffer_variant *variant =1186&p->buffer_variant[var_idx];1187struct x86_reg buf_stride =1188x86_make_disp(p->machine_EDI,1189get_offset(p, &p->buffer[variant->buffer_index].stride));1190struct x86_reg buf_base_ptr =1191x86_make_disp(p->machine_EDI,1192get_offset(p, &p->buffer[variant->buffer_index].base_ptr));1193struct x86_reg buf_max_index =1194x86_make_disp(p->machine_EDI,1195get_offset(p, &p->buffer[variant->buffer_index].max_index));11961197/* Calculate pointer to current attrib:1198*/1199switch (index_size) {1200case 1:1201x86_movzx8(p->func, ptr, elt);1202break;1203case 2:1204x86_movzx16(p->func, ptr, elt);1205break;1206case 4:1207x86_mov(p->func, ptr, elt);1208break;1209}12101211/* Clamp to max_index1212*/1213x86_cmp(p->func, ptr, buf_max_index);1214x86_cmovcc(p->func, ptr, buf_max_index, cc_AE);12151216x86_mov(p->func, p->tmp2_EDX, buf_stride);1217x64_rexw(p->func);1218x86_imul(p->func, ptr, p->tmp2_EDX);1219x64_rexw(p->func);1220x86_add(p->func, ptr, buf_base_ptr);1221return ptr;1222}1223}122412251226static boolean1227incr_inputs(struct translate_sse *p, unsigned index_size)1228{1229if (!index_size && p->nr_buffer_variants == 1) {1230const unsigned buffer_index = p->buffer_variant[0].buffer_index;1231struct x86_reg stride =1232x86_make_disp(p->machine_EDI,1233get_offset(p, &p->buffer[buffer_index].stride));12341235if (p->buffer_variant[0].instance_divisor == 0) {1236x64_rexw(p->func);1237x86_add(p->func, p->idx_ESI, stride);1238sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));1239}1240}1241else if (!index_size) {1242unsigned i;12431244/* Is this worthwhile??1245*/1246for (i = 0; i < p->nr_buffer_variants; i++) {1247struct translate_buffer_variant *variant = &p->buffer_variant[i];1248struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,1249get_offset(p, &variant->ptr));1250struct x86_reg buf_stride =1251x86_make_disp(p->machine_EDI,1252get_offset(p, &p->buffer[variant->buffer_index].stride));12531254if (variant->instance_divisor == 0) {1255x86_mov(p->func, p->tmp_EAX, buf_stride);1256x64_rexw(p->func);1257x86_add(p->func, p->tmp_EAX, buf_ptr);1258if (i == 0)1259sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));1260x64_rexw(p->func);1261x86_mov(p->func, buf_ptr, p->tmp_EAX);1262}1263}1264}1265else {1266x64_rexw(p->func);1267x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));1268}12691270return TRUE;1271}127212731274/* Build run( struct translate *machine,1275* unsigned start,1276* unsigned count,1277* void *output_buffer )1278* or1279* run_elts( struct translate *machine,1280* unsigned *elts,1281* unsigned count,1282* void *output_buffer )1283*1284* Lots of hardcoding1285*1286* EAX -- pointer to current output vertex1287* ECX -- pointer to current attribute1288*1289*/1290static boolean1291build_vertex_emit(struct translate_sse *p,1292struct x86_function *func, unsigned index_size)1293{1294int fixup, label;1295unsigned j;12961297memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));1298memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));12991300p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);1301p->idx_ESI = x86_make_reg(file_REG32, reg_SI);1302p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX);1303p->machine_EDI = x86_make_reg(file_REG32, reg_DI);1304p->count_EBP = x86_make_reg(file_REG32, reg_BP);1305p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX);1306p->src_ECX = x86_make_reg(file_REG32, reg_CX);13071308p->func = func;13091310x86_init_func(p->func);13111312if (x86_target(p->func) == X86_64_WIN64_ABI) {1313/* the ABI guarantees a 16-byte aligned 32-byte "shadow space"1314* above the return address1315*/1316sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8),1317x86_make_reg(file_XMM, 6));1318sse2_movdqa(p->func,1319x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24),1320x86_make_reg(file_XMM, 7));1321}13221323x86_push(p->func, p->outbuf_EBX);1324x86_push(p->func, p->count_EBP);13251326/* on non-Win64 x86-64, these are already in the right registers */1327if (x86_target(p->func) != X86_64_STD_ABI) {1328x86_push(p->func, p->machine_EDI);1329x86_push(p->func, p->idx_ESI);13301331if (x86_target(p->func) != X86_32) {1332x64_mov64(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));1333x64_mov64(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));1334}1335else {1336x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));1337x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));1338}1339}13401341x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));13421343if (x86_target(p->func) != X86_32)1344x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));1345else1346x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));13471348/* Load instance ID.1349*/1350if (p->use_instancing) {1351x86_mov(p->func, p->tmp2_EDX, x86_fn_arg(p->func, 4));1352x86_mov(p->func,1353x86_make_disp(p->machine_EDI,1354get_offset(p, &p->start_instance)), p->tmp2_EDX);13551356x86_mov(p->func, p->tmp_EAX, x86_fn_arg(p->func, 5));1357x86_mov(p->func,1358x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),1359p->tmp_EAX);1360}13611362/* Get vertex count, compare to zero1363*/1364x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);1365x86_cmp(p->func, p->count_EBP, p->tmp_EAX);1366fixup = x86_jcc_forward(p->func, cc_E);13671368/* always load, needed or not:1369*/1370init_inputs(p, index_size);13711372/* Note address for loop jump1373*/1374label = x86_get_label(p->func);1375{1376struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);1377int last_variant = -1;1378struct x86_reg vb;13791380for (j = 0; j < p->translate.key.nr_elements; j++) {1381const struct translate_element *a = &p->translate.key.element[j];1382unsigned variant = p->element_to_buffer_variant[j];13831384/* Figure out source pointer address:1385*/1386if (variant != last_variant) {1387last_variant = variant;1388vb = get_buffer_ptr(p, index_size, variant, elt);1389}13901391if (!translate_attr(p, a,1392x86_make_disp(vb, a->input_offset),1393x86_make_disp(p->outbuf_EBX, a->output_offset)))1394return FALSE;1395}13961397/* Next output vertex:1398*/1399x64_rexw(p->func);1400x86_lea(p->func, p->outbuf_EBX,1401x86_make_disp(p->outbuf_EBX, p->translate.key.output_stride));14021403/* Incr index1404*/1405incr_inputs(p, index_size);1406}14071408/* decr count, loop if not zero1409*/1410x86_dec(p->func, p->count_EBP);1411x86_jcc(p->func, cc_NZ, label);14121413/* Exit mmx state?1414*/1415if (p->func->need_emms)1416mmx_emms(p->func);14171418/* Land forward jump here:1419*/1420x86_fixup_fwd_jump(p->func, fixup);14211422/* Pop regs and return1423*/1424if (x86_target(p->func) != X86_64_STD_ABI) {1425x86_pop(p->func, p->idx_ESI);1426x86_pop(p->func, p->machine_EDI);1427}14281429x86_pop(p->func, p->count_EBP);1430x86_pop(p->func, p->outbuf_EBX);14311432if (x86_target(p->func) == X86_64_WIN64_ABI) {1433sse2_movdqa(p->func, x86_make_reg(file_XMM, 6),1434x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));1435sse2_movdqa(p->func, x86_make_reg(file_XMM, 7),1436x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));1437}1438x86_ret(p->func);14391440return TRUE;1441}144214431444static void1445translate_sse_set_buffer(struct translate *translate,1446unsigned buf,1447const void *ptr, unsigned stride, unsigned max_index)1448{1449struct translate_sse *p = (struct translate_sse *) translate;14501451if (buf < p->nr_buffers) {1452p->buffer[buf].base_ptr = (char *) ptr;1453p->buffer[buf].stride = stride;1454p->buffer[buf].max_index = max_index;1455}14561457if (0)1458debug_printf("%s %d/%d: %p %d\n",1459__FUNCTION__, buf, p->nr_buffers, ptr, stride);1460}146114621463static void1464translate_sse_release(struct translate *translate)1465{1466struct translate_sse *p = (struct translate_sse *) translate;14671468x86_release_func(&p->elt8_func);1469x86_release_func(&p->elt16_func);1470x86_release_func(&p->elt_func);1471x86_release_func(&p->linear_func);14721473os_free_aligned(p);1474}147514761477struct translate *1478translate_sse2_create(const struct translate_key *key)1479{1480struct translate_sse *p = NULL;1481unsigned i;14821483/* this is misnamed, it actually refers to whether rtasm is enabled or not */1484if (!rtasm_cpu_has_sse())1485goto fail;14861487p = os_malloc_aligned(sizeof(struct translate_sse), 16);1488if (!p)1489goto fail;14901491memset(p, 0, sizeof(*p));1492memcpy(p->consts, consts, sizeof(consts));14931494p->translate.key = *key;1495p->translate.release = translate_sse_release;1496p->translate.set_buffer = translate_sse_set_buffer;14971498assert(key->nr_elements <= TRANSLATE_MAX_ATTRIBS);14991500for (i = 0; i < key->nr_elements; i++) {1501if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {1502unsigned j;15031504p->nr_buffers =1505MAX2(p->nr_buffers, key->element[i].input_buffer + 1);15061507if (key->element[i].instance_divisor) {1508p->use_instancing = TRUE;1509}15101511/*1512* Map vertex element to vertex buffer variant.1513*/1514for (j = 0; j < p->nr_buffer_variants; j++) {1515if (p->buffer_variant[j].buffer_index ==1516key->element[i].input_buffer1517&& p->buffer_variant[j].instance_divisor ==1518key->element[i].instance_divisor) {1519break;1520}1521}1522if (j == p->nr_buffer_variants) {1523p->buffer_variant[j].buffer_index = key->element[i].input_buffer;1524p->buffer_variant[j].instance_divisor =1525key->element[i].instance_divisor;1526p->nr_buffer_variants++;1527}1528p->element_to_buffer_variant[i] = j;1529}1530else {1531assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);15321533p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID;1534}1535}15361537if (0)1538debug_printf("nr_buffers: %d\n", p->nr_buffers);15391540if (!build_vertex_emit(p, &p->linear_func, 0))1541goto fail;15421543if (!build_vertex_emit(p, &p->elt_func, 4))1544goto fail;15451546if (!build_vertex_emit(p, &p->elt16_func, 2))1547goto fail;15481549if (!build_vertex_emit(p, &p->elt8_func, 1))1550goto fail;15511552p->translate.run = (run_func) x86_get_func(&p->linear_func);1553if (p->translate.run == NULL)1554goto fail;15551556p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);1557if (p->translate.run_elts == NULL)1558goto fail;15591560p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);1561if (p->translate.run_elts16 == NULL)1562goto fail;15631564p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);1565if (p->translate.run_elts8 == NULL)1566goto fail;15671568return &p->translate;15691570fail:1571if (p)1572translate_sse_release(&p->translate);15731574return NULL;1575}157615771578#else15791580struct translate *1581translate_sse2_create(const struct translate_key *key)1582{1583return NULL;1584}15851586#endif158715881589