/* From the PojavLauncherTeam/mesa repository (branch 21.2-virgl):
 * src/gallium/auxiliary/translate/translate_sse.c
 */
/*
 * Copyright 2003 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * VMWARE AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Keith Whitwell <[email protected]>
 */

#include "pipe/p_config.h"
#include "pipe/p_compiler.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/format/u_format.h"

#include "translate.h"


#if (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)) && !defined(EMBEDDED_DEVICE)

#include "rtasm/rtasm_cpu.h"
#include "rtasm/rtasm_x86sse.h"


#define X 0
#define Y 1
#define Z 2
#define W 3

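/* Runtime description of one vertex buffer: filled in through
 * translate_sse_set_buffer() and read by the generated code via
 * displacements from the machine pointer.
 */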
struct translate_buffer
{
   const void *base_ptr;
   uintptr_t stride;
   unsigned max_index;
};

struct translate_buffer_variant
{
   unsigned buffer_index;
   unsigned instance_divisor;
   void *ptr;                   /* updated either per vertex or per instance */
};


#define ELEMENT_BUFFER_INSTANCE_ID 1001

#define NUM_CONSTS 7

enum
{
   CONST_IDENTITY,
   CONST_INV_127,
   CONST_INV_255,
   CONST_INV_32767,
   CONST_INV_65535,
   CONST_INV_2147483647,
   CONST_255
};
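
/* Values for the constants above, kept in the same order as the enum.
 * Each translate_sse instance holds an aligned copy (p->consts) so the
 * generated code can reach the values relative to the machine pointer.
 */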
#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}
static float consts[NUM_CONSTS][4] = {
   {0, 0, 0, 1},
   C(1.0 / 127.0),
   C(1.0 / 255.0),
   C(1.0 / 32767.0),
   C(1.0 / 65535.0),
   C(1.0 / 2147483647.0),
   C(255.0)
};

#undef C

struct translate_sse
{
   struct translate translate;

   struct x86_function linear_func;
   struct x86_function elt_func;
   struct x86_function elt16_func;
   struct x86_function elt8_func;
   struct x86_function *func;

   PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];
   int8_t reg_to_const[16];
   int8_t const_to_reg[NUM_CONSTS];

   struct translate_buffer buffer[TRANSLATE_MAX_ATTRIBS];
   unsigned nr_buffers;

   /* Multiple buffer variants can map to a single buffer. */
   struct translate_buffer_variant buffer_variant[TRANSLATE_MAX_ATTRIBS];
   unsigned nr_buffer_variants;

   /* Multiple elements can map to a single buffer variant. */
   unsigned element_to_buffer_variant[TRANSLATE_MAX_ATTRIBS];

   boolean use_instancing;
   unsigned instance_id;
   unsigned start_instance;

   /* these are actually known values, but putting them in a struct
    * like this is helpful to keep them in sync across the file.
    */
   struct x86_reg tmp_EAX;
   struct x86_reg tmp2_EDX;
   struct x86_reg src_ECX;
   struct x86_reg idx_ESI;      /* either start+i or &elt[i] */
   struct x86_reg machine_EDI;
   struct x86_reg outbuf_EBX;
   struct x86_reg count_EBP;    /* decrements to zero */
};

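/* Byte offset of member b within the machine struct at a; the generated
 * code addresses all per-run state as machine_EDI plus such an offset.
 */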
static int
get_offset(const void *a, const void *b)
{
   return (const char *) b - (const char *) a;
}

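/* Return an XMM register holding constant `id`, emitting a load on first
 * use. XMM0/XMM1 are reserved for data, so constants are cached in
 * XMM2-XMM7; when all six slots are taken, XMM7 is evicted and reused.
 */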
static struct x86_reg
get_const(struct translate_sse *p, unsigned id)
{
   struct x86_reg reg;
   unsigned i;

   if (p->const_to_reg[id] >= 0)
      return x86_make_reg(file_XMM, p->const_to_reg[id]);

   for (i = 2; i < 8; ++i) {
      if (p->reg_to_const[i] < 0)
         break;
   }

   /* TODO: be smarter here */
   if (i == 8)
      --i;

   reg = x86_make_reg(file_XMM, i);

   if (p->reg_to_const[i] >= 0)
      p->const_to_reg[p->reg_to_const[i]] = -1;

   p->reg_to_const[i] = id;
   p->const_to_reg[id] = i;

   /* TODO: this should happen outside the loop, if possible */
   sse_movaps(p->func, reg,
              x86_make_disp(p->machine_EDI,
                            get_offset(p, &p->consts[id][0])));

   return reg;
}

/* load `size` bytes of data into an SSE2 register, padding with zeros */
static boolean
emit_load_sse2(struct translate_sse *p,
               struct x86_reg data, struct x86_reg src, unsigned size)
{
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
   struct x86_reg tmp = p->tmp_EAX;
   switch (size) {
   case 1:
      x86_movzx8(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 2:
      x86_movzx16(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 3:
      x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
      x86_shl_imm(p->func, tmp, 16);
      x86_mov16(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 4:
      sse2_movd(p->func, data, src);
      break;
   case 6:
      sse2_movd(p->func, data, src);
      x86_movzx16(p->func, tmp, x86_make_disp(src, 4));
      sse2_movd(p->func, tmpXMM, tmp);
      sse2_punpckldq(p->func, data, tmpXMM);
      break;
   case 8:
      sse2_movq(p->func, data, src);
      break;
   case 12:
      sse2_movq(p->func, data, src);
      sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));
      sse2_punpcklqdq(p->func, data, tmpXMM);
      break;
   case 16:
      sse2_movdqu(p->func, data, src);
      break;
   default:
      return FALSE;
   }
   return TRUE;
}

/* this value can be passed for the out_chans argument */
#define CHANNELS_0001 5


/* This function loads `chans` float values and pads the register with
 * zeros at least up to out_chans.
 *
 * If out_chans is set to CHANNELS_0001, the fourth value is padded with
 * 1 instead. Only pass this value when chans < 4; otherwise the results
 * are undefined.
 */
static void
emit_load_float32(struct translate_sse *p, struct x86_reg data,
                  struct x86_reg arg0, unsigned out_chans, unsigned chans)
{
   switch (chans) {
   case 1:
      /* a 0 0 0
       * a 0 0 1
       */
      sse_movss(p->func, data, arg0);
      if (out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
      break;
   case 2:
      /* 0 0 0 1
       * a b 0 1
       */
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      else if (out_chans > 2)
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
      sse_movlps(p->func, data, arg0);
      break;
   case 3:
      /* Have to jump through some hoops:
       *
       * c 0 0 0
       * c 0 0 1    if out_chans == CHANNELS_0001
       * 0 0 c 0/1
       * a b c 0/1
       */
      sse_movss(p->func, data, x86_make_disp(arg0, 8));
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      sse_shufps(p->func, data, data, SHUF(Y, Z, X, W));
      sse_movlps(p->func, data, arg0);
      break;
   case 4:
      sse_movups(p->func, data, arg0);
      break;
   }
}

/* this function behaves like emit_load_float32, but loads
 * 64-bit floating point numbers, converting them to 32-bit
 * ones
 */
static void
emit_load_float64to32(struct translate_sse *p, struct x86_reg data,
                      struct x86_reg arg0, unsigned out_chans, unsigned chans)
{
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
   switch (chans) {
   case 1:
      sse2_movsd(p->func, data, arg0);
      if (out_chans > 1)
         sse2_cvtpd2ps(p->func, data, data);
      else
         sse2_cvtsd2ss(p->func, data, data);
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      break;
   case 2:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      else if (out_chans > 2)
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
      break;
   case 3:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      if (out_chans > 3)
         sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
      else
         sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      if (out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
      break;
   case 4:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      break;
   }
}

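/* Copy 64 bits between operands: a single GPR move on x86-64, or an XMM
 * move on 32-bit targets, which is why callers pass both GPR and XMM
 * forms of each operand.
 */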
static void
emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr,
           struct x86_reg dst_xmm, struct x86_reg src_gpr,
           struct x86_reg src_xmm)
{
   if (x86_target(p->func) != X86_32)
      x64_mov64(p->func, dst_gpr, src_gpr);
   else {
      /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
      if (x86_target_caps(p->func) & X86_SSE2)
         sse2_movq(p->func, dst_xmm, src_xmm);
      else
         sse_movlps(p->func, dst_xmm, src_xmm);
   }
}

static void
emit_load64(struct translate_sse *p, struct x86_reg dst_gpr,
            struct x86_reg dst_xmm, struct x86_reg src)
{
   emit_mov64(p, dst_gpr, dst_xmm, src, src);
}


static void
emit_store64(struct translate_sse *p, struct x86_reg dst,
             struct x86_reg src_gpr, struct x86_reg src_xmm)
{
   emit_mov64(p, dst, dst, src_gpr, src_xmm);
}


static void
emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
{
   if (x86_target_caps(p->func) & X86_SSE2)
      sse2_movdqu(p->func, dst, src);
   else
      sse_movups(p->func, dst, src);
}

/* TODO: this uses unaligned accesses liberally, which is great on Nehalem,
 * but may or may not be good on older processors
 * TODO: may perhaps want to use non-temporal stores here if possible
 */
static void
emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src,
            unsigned size)
{
   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
   struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
   struct x86_reg dataGPR = p->tmp_EAX;
   struct x86_reg dataGPR2 = p->tmp2_EDX;

   if (size < 8) {
      switch (size) {
      case 1:
         x86_mov8(p->func, dataGPR, src);
         x86_mov8(p->func, dst, dataGPR);
         break;
      case 2:
         x86_mov16(p->func, dataGPR, src);
         x86_mov16(p->func, dst, dataGPR);
         break;
      case 3:
         x86_mov16(p->func, dataGPR, src);
         x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
         x86_mov16(p->func, dst, dataGPR);
         x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
         break;
      case 4:
         x86_mov(p->func, dataGPR, src);
         x86_mov(p->func, dst, dataGPR);
         break;
      case 6:
         x86_mov(p->func, dataGPR, src);
         x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
         x86_mov(p->func, dst, dataGPR);
         x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
         break;
      }
   }
   else if (!(x86_target_caps(p->func) & X86_SSE)) {
      unsigned i = 0;
      assert((size & 3) == 0);
      for (i = 0; i < size; i += 4) {
         x86_mov(p->func, dataGPR, x86_make_disp(src, i));
         x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
      }
   }
   else {
      switch (size) {
      case 8:
         emit_load64(p, dataGPR, dataXMM, src);
         emit_store64(p, dst, dataGPR, dataXMM);
         break;
      case 12:
         emit_load64(p, dataGPR2, dataXMM, src);
         x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
         emit_store64(p, dst, dataGPR2, dataXMM);
         x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
         break;
      case 16:
         emit_mov128(p, dataXMM, src);
         emit_mov128(p, dst, dataXMM);
         break;
      case 24:
         emit_mov128(p, dataXMM, src);
         emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
         emit_mov128(p, dst, dataXMM);
         emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
         break;
      case 32:
         emit_mov128(p, dataXMM, src);
         emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
         emit_mov128(p, dst, dataXMM);
         emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
         break;
      default:
         assert(0);
      }
   }
}

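/* Emit code converting one attribute from a->input_format at src to
 * a->output_format at dst. Handles a few fast paths: widening integer or
 * float inputs to float32 outputs, promoting 8-bit to 16-bit integer
 * channels, pure swizzles between formats with an identical channel
 * layout, and packing float RGBA to 8-bit unorm. Anything else returns
 * FALSE, which makes translate_sse2_create() fail so that the caller can
 * fall back to a generic translate implementation.
 */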
static boolean
translate_attr_convert(struct translate_sse *p,
                       const struct translate_element *a,
                       struct x86_reg src, struct x86_reg dst)
{
   const struct util_format_description *input_desc =
      util_format_description(a->input_format);
   const struct util_format_description *output_desc =
      util_format_description(a->output_format);
   unsigned i;
   boolean id_swizzle = TRUE;
   unsigned swizzle[4] =
      { PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE,
        PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE };
   unsigned needed_chans = 0;
   unsigned imms[2] = { 0, 0x3f800000 };

   if (a->output_format == PIPE_FORMAT_NONE
       || a->input_format == PIPE_FORMAT_NONE)
      return FALSE;

   if (input_desc->channel[0].size & 7)
      return FALSE;

   if (input_desc->colorspace != output_desc->colorspace)
      return FALSE;

   for (i = 1; i < input_desc->nr_channels; ++i) {
      if (memcmp(&input_desc->channel[i], &input_desc->channel[0],
                 sizeof(input_desc->channel[0])))
         return FALSE;
   }

   for (i = 1; i < output_desc->nr_channels; ++i) {
      if (memcmp(&output_desc->channel[i], &output_desc->channel[0],
                 sizeof(output_desc->channel[0]))) {
         return FALSE;
      }
   }

   for (i = 0; i < output_desc->nr_channels; ++i) {
      if (output_desc->swizzle[i] < 4)
         swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
   }

   if ((x86_target_caps(p->func) & X86_SSE) &&
       (0 || a->output_format == PIPE_FORMAT_R32_FLOAT
        || a->output_format == PIPE_FORMAT_R32G32_FLOAT
        || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
        || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);

      for (i = 0; i < output_desc->nr_channels; ++i) {
         if (swizzle[i] == PIPE_SWIZZLE_0 && i >= input_desc->nr_channels)
            swizzle[i] = i;
      }

      for (i = 0; i < output_desc->nr_channels; ++i) {
         if (swizzle[i] < 4)
            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
         if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i)
            id_swizzle = FALSE;
      }

      if (needed_chans > 0) {
         switch (input_desc->channel[0].type) {
         case UTIL_FORMAT_TYPE_UNSIGNED:
            if (!(x86_target_caps(p->func) & X86_SSE2))
               return FALSE;
            emit_load_sse2(p, dataXMM, src,
                           input_desc->channel[0].size *
                           input_desc->nr_channels >> 3);

            /* TODO: add support for SSE4.1 pmovzx */
            switch (input_desc->channel[0].size) {
            case 8:
               /* TODO: this may be inefficient due to get_identity() being
                * used both as a float and integer register.
                */
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
               break;
            case 16:
               sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
               break;
            case 32:           /* we lose precision here */
               sse2_psrld_imm(p->func, dataXMM, 1);
               break;
            default:
               return FALSE;
            }
            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
            if (input_desc->channel[0].normalized) {
               struct x86_reg factor;
               switch (input_desc->channel[0].size) {
               case 8:
                  factor = get_const(p, CONST_INV_255);
                  break;
               case 16:
                  factor = get_const(p, CONST_INV_65535);
                  break;
               case 32:
                  factor = get_const(p, CONST_INV_2147483647);
                  break;
               default:
                  assert(0);
                  factor.disp = 0;
                  factor.file = 0;
                  factor.idx = 0;
                  factor.mod = 0;
                  break;
               }
               sse_mulps(p->func, dataXMM, factor);
            }
            else if (input_desc->channel[0].size == 32)
               /* compensate for the bit we threw away to fit u32 into s32 */
               sse_addps(p->func, dataXMM, dataXMM);
            break;
         case UTIL_FORMAT_TYPE_SIGNED:
            if (!(x86_target_caps(p->func) & X86_SSE2))
               return FALSE;
            emit_load_sse2(p, dataXMM, src,
                           input_desc->channel[0].size *
                           input_desc->nr_channels >> 3);

            /* TODO: add support for SSE4.1 pmovsx */
            switch (input_desc->channel[0].size) {
            case 8:
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_psrad_imm(p->func, dataXMM, 24);
               break;
            case 16:
               sse2_punpcklwd(p->func, dataXMM, dataXMM);
               sse2_psrad_imm(p->func, dataXMM, 16);
               break;
            case 32:           /* we lose precision here */
               break;
            default:
               return FALSE;
            }
            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
            if (input_desc->channel[0].normalized) {
               struct x86_reg factor;
               switch (input_desc->channel[0].size) {
               case 8:
                  factor = get_const(p, CONST_INV_127);
                  break;
               case 16:
                  factor = get_const(p, CONST_INV_32767);
                  break;
               case 32:
                  factor = get_const(p, CONST_INV_2147483647);
                  break;
               default:
                  assert(0);
                  factor.disp = 0;
                  factor.file = 0;
                  factor.idx = 0;
                  factor.mod = 0;
                  break;
               }
               sse_mulps(p->func, dataXMM, factor);
            }
            break;
         case UTIL_FORMAT_TYPE_FLOAT:
            if (input_desc->channel[0].size != 32
                && input_desc->channel[0].size != 64) {
               return FALSE;
            }
            if (swizzle[3] == PIPE_SWIZZLE_1 && input_desc->nr_channels <= 3) {
               swizzle[3] = PIPE_SWIZZLE_W;
               needed_chans = CHANNELS_0001;
            }
            switch (input_desc->channel[0].size) {
            case 32:
               emit_load_float32(p, dataXMM, src, needed_chans,
                                 input_desc->nr_channels);
               break;
            case 64:           /* we lose precision here */
               if (!(x86_target_caps(p->func) & X86_SSE2))
                  return FALSE;
               emit_load_float64to32(p, dataXMM, src, needed_chans,
                                     input_desc->nr_channels);
               break;
            default:
               return FALSE;
            }
            break;
         default:
            return FALSE;
         }

         if (!id_swizzle) {
            sse_shufps(p->func, dataXMM, dataXMM,
                       SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]));
         }
      }

      if (output_desc->nr_channels >= 4
          && swizzle[0] < PIPE_SWIZZLE_0
          && swizzle[1] < PIPE_SWIZZLE_0
          && swizzle[2] < PIPE_SWIZZLE_0
          && swizzle[3] < PIPE_SWIZZLE_0) {
         sse_movups(p->func, dst, dataXMM);
      }
      else {
         if (output_desc->nr_channels >= 2
             && swizzle[0] < PIPE_SWIZZLE_0
             && swizzle[1] < PIPE_SWIZZLE_0) {
            sse_movlps(p->func, dst, dataXMM);
         }
         else {
            if (swizzle[0] < PIPE_SWIZZLE_0) {
               sse_movss(p->func, dst, dataXMM);
            }
            else {
               x86_mov_imm(p->func, dst,
                           imms[swizzle[0] - PIPE_SWIZZLE_0]);
            }

            if (output_desc->nr_channels >= 2) {
               if (swizzle[1] < PIPE_SWIZZLE_0) {
                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
                  sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
               }
               else {
                  x86_mov_imm(p->func, x86_make_disp(dst, 4),
                              imms[swizzle[1] - PIPE_SWIZZLE_0]);
               }
            }
         }

         if (output_desc->nr_channels >= 3) {
            if (output_desc->nr_channels >= 4
                && swizzle[2] < PIPE_SWIZZLE_0
                && swizzle[3] < PIPE_SWIZZLE_0) {
               sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
            }
            else {
               if (swizzle[2] < PIPE_SWIZZLE_0) {
                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
                  sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
               }
               else {
                  x86_mov_imm(p->func, x86_make_disp(dst, 8),
                              imms[swizzle[2] - PIPE_SWIZZLE_0]);
               }

               if (output_desc->nr_channels >= 4) {
                  if (swizzle[3] < PIPE_SWIZZLE_0) {
                     sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
                     sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
                  }
                  else {
                     x86_mov_imm(p->func, x86_make_disp(dst, 12),
                                 imms[swizzle[3] - PIPE_SWIZZLE_0]);
                  }
               }
            }
         }
      }
      return TRUE;
   }
   else if ((x86_target_caps(p->func) & X86_SSE2)
            && input_desc->channel[0].size == 8
            && output_desc->channel[0].size == 16
            && output_desc->channel[0].normalized ==
               input_desc->channel[0].normalized &&
            (0 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED
                   && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
             || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED
                 && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
             || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED
                 && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED))) {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
      struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
      struct x86_reg tmp = p->tmp_EAX;
      unsigned imms[2] = { 0, 1 };

      for (i = 0; i < output_desc->nr_channels; ++i) {
         if (swizzle[i] == PIPE_SWIZZLE_0 && i >= input_desc->nr_channels) {
            swizzle[i] = i;
         }
      }

      for (i = 0; i < output_desc->nr_channels; ++i) {
         if (swizzle[i] < 4)
            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
         if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i)
            id_swizzle = FALSE;
      }

      if (needed_chans > 0) {
         emit_load_sse2(p, dataXMM, src,
                        input_desc->channel[0].size *
                        input_desc->nr_channels >> 3);

         switch (input_desc->channel[0].type) {
         case UTIL_FORMAT_TYPE_UNSIGNED:
            if (input_desc->channel[0].normalized) {
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               if (output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
                  sse2_psrlw_imm(p->func, dataXMM, 1);
            }
            else
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
            break;
         case UTIL_FORMAT_TYPE_SIGNED:
            if (input_desc->channel[0].normalized) {
               sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY));
               sse2_punpcklbw(p->func, tmpXMM, dataXMM);
               sse2_psllw_imm(p->func, dataXMM, 9);
               sse2_psrlw_imm(p->func, dataXMM, 8);
               sse2_por(p->func, tmpXMM, dataXMM);
               sse2_psrlw_imm(p->func, dataXMM, 7);
               sse2_por(p->func, tmpXMM, dataXMM);
               {
                  struct x86_reg t = dataXMM;
                  dataXMM = tmpXMM;
                  tmpXMM = t;
               }
            }
            else {
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_psraw_imm(p->func, dataXMM, 8);
            }
            break;
         default:
            assert(0);
         }

         if (output_desc->channel[0].normalized)
            imms[1] = (output_desc->channel[0].type ==
                       UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7fff;

         if (!id_swizzle)
            sse2_pshuflw(p->func, dataXMM, dataXMM,
                         (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) |
                         ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
      }

      if (output_desc->nr_channels >= 4
          && swizzle[0] < PIPE_SWIZZLE_0
          && swizzle[1] < PIPE_SWIZZLE_0
          && swizzle[2] < PIPE_SWIZZLE_0
          && swizzle[3] < PIPE_SWIZZLE_0) {
         sse2_movq(p->func, dst, dataXMM);
      }
      else {
         if (swizzle[0] < PIPE_SWIZZLE_0) {
            if (output_desc->nr_channels >= 2
                && swizzle[1] < PIPE_SWIZZLE_0) {
               sse2_movd(p->func, dst, dataXMM);
            }
            else {
               sse2_movd(p->func, tmp, dataXMM);
               x86_mov16(p->func, dst, tmp);
               if (output_desc->nr_channels >= 2)
                  x86_mov16_imm(p->func, x86_make_disp(dst, 2),
                                imms[swizzle[1] - PIPE_SWIZZLE_0]);
            }
         }
         else {
            if (output_desc->nr_channels >= 2
                && swizzle[1] >= PIPE_SWIZZLE_0) {
               x86_mov_imm(p->func, dst,
                           (imms[swizzle[1] - PIPE_SWIZZLE_0] << 16) |
                           imms[swizzle[0] - PIPE_SWIZZLE_0]);
            }
            else {
               x86_mov16_imm(p->func, dst,
                             imms[swizzle[0] - PIPE_SWIZZLE_0]);
               if (output_desc->nr_channels >= 2) {
                  sse2_movd(p->func, tmp, dataXMM);
                  x86_shr_imm(p->func, tmp, 16);
                  x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
               }
            }
         }

         if (output_desc->nr_channels >= 3) {
            if (swizzle[2] < PIPE_SWIZZLE_0) {
               if (output_desc->nr_channels >= 4
                   && swizzle[3] < PIPE_SWIZZLE_0) {
                  sse2_psrlq_imm(p->func, dataXMM, 32);
                  sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
               }
               else {
                  sse2_psrlq_imm(p->func, dataXMM, 32);
                  sse2_movd(p->func, tmp, dataXMM);
                  x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
                  if (output_desc->nr_channels >= 4) {
                     x86_mov16_imm(p->func, x86_make_disp(dst, 6),
                                   imms[swizzle[3] - PIPE_SWIZZLE_0]);
                  }
               }
            }
            else {
               if (output_desc->nr_channels >= 4
                   && swizzle[3] >= PIPE_SWIZZLE_0) {
                  x86_mov_imm(p->func, x86_make_disp(dst, 4),
                              (imms[swizzle[3] - PIPE_SWIZZLE_0] << 16)
                              | imms[swizzle[2] - PIPE_SWIZZLE_0]);
               }
               else {
                  x86_mov16_imm(p->func, x86_make_disp(dst, 4),
                                imms[swizzle[2] - PIPE_SWIZZLE_0]);

                  if (output_desc->nr_channels >= 4) {
                     sse2_psrlq_imm(p->func, dataXMM, 48);
                     sse2_movd(p->func, tmp, dataXMM);
                     x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
                  }
               }
            }
         }
      }
      return TRUE;
   }
   else if (!memcmp(&output_desc->channel[0], &input_desc->channel[0],
                    sizeof(output_desc->channel[0]))) {
      struct x86_reg tmp = p->tmp_EAX;
      unsigned i;

      if (input_desc->channel[0].size == 8 && input_desc->nr_channels == 4
          && output_desc->nr_channels == 4
          && swizzle[0] == PIPE_SWIZZLE_W
          && swizzle[1] == PIPE_SWIZZLE_Z
          && swizzle[2] == PIPE_SWIZZLE_Y
          && swizzle[3] == PIPE_SWIZZLE_X) {
         /* TODO: support movbe */
         x86_mov(p->func, tmp, src);
         x86_bswap(p->func, tmp);
         x86_mov(p->func, dst, tmp);
         return TRUE;
      }

      for (i = 0; i < output_desc->nr_channels; ++i) {
         switch (output_desc->channel[0].size) {
         case 8:
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
               unsigned v = 0;
               if (swizzle[i] == PIPE_SWIZZLE_1) {
                  switch (output_desc->channel[0].type) {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[0].normalized ? 0xff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[0].normalized ? 0x7f : 1;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
            }
            else {
               x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
               x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
            }
            break;
         case 16:
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
               unsigned v = 0;
               if (swizzle[i] == PIPE_SWIZZLE_1) {
                  switch (output_desc->channel[1].type) {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[1].normalized ? 0xffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[1].normalized ? 0x7fff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     v = 0x3c00;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
            }
            else if (swizzle[i] == PIPE_SWIZZLE_0) {
               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0);
            }
            else {
               x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
               x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
            }
            break;
         case 32:
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
               unsigned v = 0;
               if (swizzle[i] == PIPE_SWIZZLE_1) {
                  switch (output_desc->channel[1].type) {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[1].normalized ? 0x7fffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     v = 0x3f800000;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
            }
            else {
               x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
               x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
            }
            break;
         case 64:
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
               unsigned l = 0;
               unsigned h = 0;
               if (swizzle[i] == PIPE_SWIZZLE_1) {
                  switch (output_desc->channel[1].type) {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     h = output_desc->channel[1].normalized ? 0xffffffff : 0;
                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     h = output_desc->channel[1].normalized ? 0x7fffffff : 0;
                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     h = 0x3ff00000;
                     l = 0;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
               x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
            }
            else {
               if (x86_target_caps(p->func) & X86_SSE) {
                  struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
                  emit_load64(p, tmp, tmpXMM,
                              x86_make_disp(src, swizzle[i] * 8));
                  emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
               }
               else {
                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));
                  x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
                  x86_mov(p->func, tmp,
                          x86_make_disp(src, swizzle[i] * 8 + 4));
                  x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
               }
            }
            break;
         default:
            return FALSE;
         }
      }
      return TRUE;
   }
   /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */
   else if ((x86_target_caps(p->func) & X86_SSE2) &&
            a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT &&
            (0 || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM
             || a->output_format == PIPE_FORMAT_R8G8B8A8_UNORM)) {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);

      /* load */
      sse_movups(p->func, dataXMM, src);

      if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
         sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 1, 0, 3));
      }

      /* scale by 255.0 */
      sse_mulps(p->func, dataXMM, get_const(p, CONST_255));

      /* pack and emit */
      sse2_cvtps2dq(p->func, dataXMM, dataXMM);
      sse2_packssdw(p->func, dataXMM, dataXMM);
      sse2_packuswb(p->func, dataXMM, dataXMM);
      sse2_movd(p->func, dst, dataXMM);

      return TRUE;
   }

   return FALSE;
}

static boolean
translate_attr(struct translate_sse *p,
               const struct translate_element *a,
               struct x86_reg src, struct x86_reg dst)
{
   if (a->input_format == a->output_format) {
      emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
      return TRUE;
   }

   return translate_attr_convert(p, a, src, dst);
}

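/* Emit the once-per-run setup: for linear runs and for instanced arrays,
 * compute each buffer variant's starting pointer
 * (base_ptr + stride * index) before entering the main loop.
 */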
static boolean
init_inputs(struct translate_sse *p, unsigned index_size)
{
   unsigned i;
   struct x86_reg instance_id =
      x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
   struct x86_reg start_instance =
      x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance));

   for (i = 0; i < p->nr_buffer_variants; i++) {
      struct translate_buffer_variant *variant = &p->buffer_variant[i];
      struct translate_buffer *buffer = &p->buffer[variant->buffer_index];

      if (!index_size || variant->instance_divisor) {
         struct x86_reg buf_max_index =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->max_index));
         struct x86_reg buf_stride =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->stride));
         struct x86_reg buf_ptr =
            x86_make_disp(p->machine_EDI, get_offset(p, &variant->ptr));
         struct x86_reg buf_base_ptr =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->base_ptr));
         struct x86_reg elt = p->idx_ESI;
         struct x86_reg tmp_EAX = p->tmp_EAX;

         /* Calculate pointer to first attrib:
          *   base_ptr + stride * index, where index depends on instance divisor
          */
         if (variant->instance_divisor) {
            struct x86_reg tmp_EDX = p->tmp2_EDX;

            /* Start with instance = instance_id
             * which is true if divisor is 1.
             */
            x86_mov(p->func, tmp_EAX, instance_id);

            if (variant->instance_divisor != 1) {
               struct x86_reg tmp_ECX = p->src_ECX;

               /* TODO: Add x86_shr() to rtasm and use it whenever
                * instance divisor is power of two.
                */
               x86_xor(p->func, tmp_EDX, tmp_EDX);
               x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor);
               x86_div(p->func, tmp_ECX);    /* EAX = EDX:EAX / ECX */
            }

            /* instance = (instance_id / divisor) + start_instance
             */
            x86_mov(p->func, tmp_EDX, start_instance);
            x86_add(p->func, tmp_EAX, tmp_EDX);

            /* XXX we need to clamp the index here too, but to a
             * per-array max value, not the draw->pt.max_index value
             * that's being given to us via translate->set_buffer().
             */
         }
         else {
            x86_mov(p->func, tmp_EAX, elt);

            /* Clamp to max_index
             */
            x86_cmp(p->func, tmp_EAX, buf_max_index);
            x86_cmovcc(p->func, tmp_EAX, buf_max_index, cc_AE);
         }

         x86_mov(p->func, p->tmp2_EDX, buf_stride);
         x64_rexw(p->func);
         x86_imul(p->func, tmp_EAX, p->tmp2_EDX);
         x64_rexw(p->func);
         x86_add(p->func, tmp_EAX, buf_base_ptr);

         x86_cmp(p->func, p->count_EBP, p->tmp_EAX);

         /* In the linear case, keep the buffer pointer instead of the
          * index number.
          */
         if (!index_size && p->nr_buffer_variants == 1) {
            x64_rexw(p->func);
            x86_mov(p->func, elt, tmp_EAX);
         }
         else {
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, tmp_EAX);
         }
      }
   }

   return TRUE;
}

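/* Emit code yielding the source pointer for the current vertex in the
 * given buffer variant: the instance ID slot, the precomputed pointer
 * maintained by init_inputs()/incr_inputs(), or base_ptr + stride *
 * clamped element index for indexed runs.
 */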
static struct x86_reg
get_buffer_ptr(struct translate_sse *p,
               unsigned index_size, unsigned var_idx, struct x86_reg elt)
{
   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
      return x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
   }
   if (!index_size && p->nr_buffer_variants == 1) {
      return p->idx_ESI;
   }
   else if (!index_size || p->buffer_variant[var_idx].instance_divisor) {
      struct x86_reg ptr = p->src_ECX;
      struct x86_reg buf_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer_variant[var_idx].ptr));

      x64_rexw(p->func);
      x86_mov(p->func, ptr, buf_ptr);
      return ptr;
   }
   else {
      struct x86_reg ptr = p->src_ECX;
      const struct translate_buffer_variant *variant =
         &p->buffer_variant[var_idx];
      struct x86_reg buf_stride =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].stride));
      struct x86_reg buf_base_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].base_ptr));
      struct x86_reg buf_max_index =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].max_index));

      /* Calculate pointer to current attrib:
       */
      switch (index_size) {
      case 1:
         x86_movzx8(p->func, ptr, elt);
         break;
      case 2:
         x86_movzx16(p->func, ptr, elt);
         break;
      case 4:
         x86_mov(p->func, ptr, elt);
         break;
      }

      /* Clamp to max_index
       */
      x86_cmp(p->func, ptr, buf_max_index);
      x86_cmovcc(p->func, ptr, buf_max_index, cc_AE);

      x86_mov(p->func, p->tmp2_EDX, buf_stride);
      x64_rexw(p->func);
      x86_imul(p->func, ptr, p->tmp2_EDX);
      x64_rexw(p->func);
      x86_add(p->func, ptr, buf_base_ptr);
      return ptr;
   }
}

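/* Emit the bottom-of-loop increment: advance the linear buffer
 * pointer(s) by their stride, prefetching a few vertices ahead, or step
 * the element pointer by index_size for indexed runs.
 */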
static boolean
incr_inputs(struct translate_sse *p, unsigned index_size)
{
   if (!index_size && p->nr_buffer_variants == 1) {
      const unsigned buffer_index = p->buffer_variant[0].buffer_index;
      struct x86_reg stride =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[buffer_index].stride));

      if (p->buffer_variant[0].instance_divisor == 0) {
         x64_rexw(p->func);
         x86_add(p->func, p->idx_ESI, stride);
         sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
      }
   }
   else if (!index_size) {
      unsigned i;

      /* Is this worthwhile??
       */
      for (i = 0; i < p->nr_buffer_variants; i++) {
         struct translate_buffer_variant *variant = &p->buffer_variant[i];
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
                                                get_offset(p, &variant->ptr));
         struct x86_reg buf_stride =
            x86_make_disp(p->machine_EDI,
                          get_offset(p, &p->buffer[variant->buffer_index].stride));

         if (variant->instance_divisor == 0) {
            x86_mov(p->func, p->tmp_EAX, buf_stride);
            x64_rexw(p->func);
            x86_add(p->func, p->tmp_EAX, buf_ptr);
            if (i == 0)
               sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, p->tmp_EAX);
         }
      }
   }
   else {
      x64_rexw(p->func);
      x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
   }

   return TRUE;
}

/* Build run( struct translate *machine,
 *            unsigned start,
 *            unsigned count,
 *            unsigned start_instance,
 *            unsigned instance_id,
 *            void *output_buffer )
 * or
 *    run_elts( struct translate *machine,
 *              unsigned *elts,
 *              unsigned count,
 *              unsigned start_instance,
 *              unsigned instance_id,
 *              void *output_buffer )
 *
 * Lots of hardcoding
 *
 * EBX -- pointer to current output vertex
 * ECX -- pointer to current attribute
 *
 */
static boolean
build_vertex_emit(struct translate_sse *p,
                  struct x86_function *func, unsigned index_size)
{
   int fixup, label;
   unsigned j;

   memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
   memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));

   p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
   p->idx_ESI = x86_make_reg(file_REG32, reg_SI);
   p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX);
   p->machine_EDI = x86_make_reg(file_REG32, reg_DI);
   p->count_EBP = x86_make_reg(file_REG32, reg_BP);
   p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX);
   p->src_ECX = x86_make_reg(file_REG32, reg_CX);

   p->func = func;

   x86_init_func(p->func);

   if (x86_target(p->func) == X86_64_WIN64_ABI) {
      /* the ABI guarantees a 16-byte aligned 32-byte "shadow space"
       * above the return address
       */
      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8),
                  x86_make_reg(file_XMM, 6));
      sse2_movdqa(p->func,
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24),
                  x86_make_reg(file_XMM, 7));
   }

   x86_push(p->func, p->outbuf_EBX);
   x86_push(p->func, p->count_EBP);

   /* on non-Win64 x86-64, these are already in the right registers */
   if (x86_target(p->func) != X86_64_STD_ABI) {
      x86_push(p->func, p->machine_EDI);
      x86_push(p->func, p->idx_ESI);

      if (x86_target(p->func) != X86_32) {
         x64_mov64(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
         x64_mov64(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
      }
      else {
         x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
         x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
      }
   }

   x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));

   if (x86_target(p->func) != X86_32)
      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
   else
      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));

   /* Load instance ID.
    */
   if (p->use_instancing) {
      x86_mov(p->func, p->tmp2_EDX, x86_fn_arg(p->func, 4));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI,
                            get_offset(p, &p->start_instance)), p->tmp2_EDX);

      x86_mov(p->func, p->tmp_EAX, x86_fn_arg(p->func, 5));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
              p->tmp_EAX);
   }

   /* Get vertex count, compare to zero
    */
   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
   x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
   fixup = x86_jcc_forward(p->func, cc_E);

   /* always load, needed or not:
    */
   init_inputs(p, index_size);

   /* Note address for loop jump
    */
   label = x86_get_label(p->func);
   {
      struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
      int last_variant = -1;
      struct x86_reg vb;

      for (j = 0; j < p->translate.key.nr_elements; j++) {
         const struct translate_element *a = &p->translate.key.element[j];
         unsigned variant = p->element_to_buffer_variant[j];

         /* Figure out source pointer address:
          */
         if (variant != last_variant) {
            last_variant = variant;
            vb = get_buffer_ptr(p, index_size, variant, elt);
         }

         if (!translate_attr(p, a,
                             x86_make_disp(vb, a->input_offset),
                             x86_make_disp(p->outbuf_EBX, a->output_offset)))
            return FALSE;
      }

      /* Next output vertex:
       */
      x64_rexw(p->func);
      x86_lea(p->func, p->outbuf_EBX,
              x86_make_disp(p->outbuf_EBX, p->translate.key.output_stride));

      /* Incr index
       */
      incr_inputs(p, index_size);
   }

   /* decr count, loop if not zero
    */
   x86_dec(p->func, p->count_EBP);
   x86_jcc(p->func, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->func->need_emms)
      mmx_emms(p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(p->func, fixup);

   /* Pop regs and return
    */
   if (x86_target(p->func) != X86_64_STD_ABI) {
      x86_pop(p->func, p->idx_ESI);
      x86_pop(p->func, p->machine_EDI);
   }

   x86_pop(p->func, p->count_EBP);
   x86_pop(p->func, p->outbuf_EBX);

   if (x86_target(p->func) == X86_64_WIN64_ABI) {
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 6),
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 7),
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
   }
   x86_ret(p->func);

   return TRUE;
}

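/* Plug a vertex buffer into the machine. The generated functions read
 * base_ptr/stride/max_index at run time, so buffers can change between
 * runs without recompiling.
 */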
static void
translate_sse_set_buffer(struct translate *translate,
                         unsigned buf,
                         const void *ptr, unsigned stride, unsigned max_index)
{
   struct translate_sse *p = (struct translate_sse *) translate;

   if (buf < p->nr_buffers) {
      p->buffer[buf].base_ptr = (char *) ptr;
      p->buffer[buf].stride = stride;
      p->buffer[buf].max_index = max_index;
   }

   if (0)
      debug_printf("%s %d/%d: %p %d\n",
                   __FUNCTION__, buf, p->nr_buffers, ptr, stride);
}

static void
translate_sse_release(struct translate *translate)
{
   struct translate_sse *p = (struct translate_sse *) translate;

   x86_release_func(&p->elt8_func);
   x86_release_func(&p->elt16_func);
   x86_release_func(&p->elt_func);
   x86_release_func(&p->linear_func);

   os_free_aligned(p);
}

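/* Compile the four specialized run functions (linear plus 8/16/32-bit
 * indexed) for the given key, or return NULL so the caller can fall back
 * to the generic C implementation.
 *
 * Rough usage sketch -- see translate.h for the exact signatures:
 *
 *    struct translate *t = translate_sse2_create(&key);
 *    t->set_buffer(t, 0, vertices, stride, max_index);
 *    t->run(t, start, count, start_instance, instance_id, out);
 *    t->release(t);
 */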
struct translate *
translate_sse2_create(const struct translate_key *key)
{
   struct translate_sse *p = NULL;
   unsigned i;

   /* this is misnamed, it actually refers to whether rtasm is enabled or not */
   if (!rtasm_cpu_has_sse())
      goto fail;

   p = os_malloc_aligned(sizeof(struct translate_sse), 16);
   if (!p)
      goto fail;

   memset(p, 0, sizeof(*p));
   memcpy(p->consts, consts, sizeof(consts));

   p->translate.key = *key;
   p->translate.release = translate_sse_release;
   p->translate.set_buffer = translate_sse_set_buffer;

   assert(key->nr_elements <= TRANSLATE_MAX_ATTRIBS);

   for (i = 0; i < key->nr_elements; i++) {
      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
         unsigned j;

         p->nr_buffers =
            MAX2(p->nr_buffers, key->element[i].input_buffer + 1);

         if (key->element[i].instance_divisor) {
            p->use_instancing = TRUE;
         }

         /*
          * Map vertex element to vertex buffer variant.
          */
         for (j = 0; j < p->nr_buffer_variants; j++) {
            if (p->buffer_variant[j].buffer_index ==
                key->element[i].input_buffer
                && p->buffer_variant[j].instance_divisor ==
                key->element[i].instance_divisor) {
               break;
            }
         }
         if (j == p->nr_buffer_variants) {
            p->buffer_variant[j].buffer_index = key->element[i].input_buffer;
            p->buffer_variant[j].instance_divisor =
               key->element[i].instance_divisor;
            p->nr_buffer_variants++;
         }
         p->element_to_buffer_variant[i] = j;
      }
      else {
         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);

         p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID;
      }
   }

   if (0)
      debug_printf("nr_buffers: %d\n", p->nr_buffers);

   if (!build_vertex_emit(p, &p->linear_func, 0))
      goto fail;

   if (!build_vertex_emit(p, &p->elt_func, 4))
      goto fail;

   if (!build_vertex_emit(p, &p->elt16_func, 2))
      goto fail;

   if (!build_vertex_emit(p, &p->elt8_func, 1))
      goto fail;

   p->translate.run = (run_func) x86_get_func(&p->linear_func);
   if (p->translate.run == NULL)
      goto fail;

   p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);
   if (p->translate.run_elts == NULL)
      goto fail;

   p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);
   if (p->translate.run_elts16 == NULL)
      goto fail;

   p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);
   if (p->translate.run_elts8 == NULL)
      goto fail;

   return &p->translate;

fail:
   if (p)
      translate_sse_release(&p->translate);

   return NULL;
}


#else

struct translate *
translate_sse2_create(const struct translate_key *key)
{
   return NULL;
}

#endif