CoCalc -- translate

GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/auxiliary/translate/translate_sse.c
⁴⁵⁶⁵ views
1
/*
 * Copyright 2003 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * VMWARE AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Keith Whitwell <[email protected]>
 */
27

28

29
#include "pipe/p_config.h"
30
#include "pipe/p_compiler.h"
31
#include "util/u_memory.h"
32
#include "util/u_math.h"
33
#include "util/format/u_format.h"
34

35
#include "translate.h"
36

37

38
#if (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)) && !defined(EMBEDDED_DEVICE)
39

40
#include "rtasm/rtasm_cpu.h"
41
#include "rtasm/rtasm_x86sse.h"
42

43

44
/* Component indices of a 4-wide vector; used as SHUF() selectors below. */
#define X    0
#define Y    1
#define Z    2
#define W    3
48

49

50
/* One source buffer as seen by the generated code. */
struct translate_buffer
{
   const void *base_ptr;        /* address of the buffer's data */
   uintptr_t stride;            /* NOTE(review): presumably the byte step
                                 * between consecutive entries -- confirm
                                 * against the code that fills this in */
   unsigned max_index;          /* NOTE(review): presumably the highest
                                 * valid index into the buffer -- confirm */
};
56

57
/* A (buffer, instance divisor) pairing; several variants may refer to the
 * same translate_buffer.
 */
struct translate_buffer_variant
{
   unsigned buffer_index;       /* which translate_buffer this refers to */
   unsigned instance_divisor;   /* NOTE(review): presumably 0 means
                                 * per-vertex stepping -- confirm */
   void *ptr;                   /* updated either per vertex or per instance */
};
63

64

65
/* NOTE(review): presumably a sentinel "buffer index" marking an element whose
 * data is the instance id rather than a real vertex buffer -- confirm against
 * the users of this macro further down the file.
 */
#define ELEMENT_BUFFER_INSTANCE_ID  1001
66

67
/* Number of rows in the constant table; must match the enum below. */
#define NUM_CONSTS 7

/* Row indices into the constant table. */
enum
{
   CONST_IDENTITY,              /* {0, 0, 0, 1} */
   CONST_INV_127,               /* 1/127 in every lane */
   CONST_INV_255,               /* 1/255 in every lane */
   CONST_INV_32767,             /* 1/32767 in every lane */
   CONST_INV_65535,             /* 1/65535 in every lane */
   CONST_INV_2147483647,        /* 1/2147483647 in every lane */
   CONST_255                    /* 255 in every lane */
};

/* Broadcast a scalar to all four lanes of a table row. */
#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}

/* Master copy of the 4-wide float constants, indexed by the enum above. */
static float consts[NUM_CONSTS][4] = {
   {0, 0, 0, 1},
   C(1.0 / 127.0),
   C(1.0 / 255.0),
   C(1.0 / 32767.0),
   C(1.0 / 65535.0),
   C(1.0 / 2147483647.0),
   C(255.0)
};

#undef C
92

93
struct translate_sse
94
{
95
   struct translate translate;
96

97
   struct x86_function linear_func;
98
   struct x86_function elt_func;
99
   struct x86_function elt16_func;
100
   struct x86_function elt8_func;
101
   struct x86_function *func;
102

103
     PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];
104
   int8_t reg_to_const[16];
105
   int8_t const_to_reg[NUM_CONSTS];
106

107
   struct translate_buffer buffer[TRANSLATE_MAX_ATTRIBS];
108
   unsigned nr_buffers;
109

110
   /* Multiple buffer variants can map to a single buffer. */
111
   struct translate_buffer_variant buffer_variant[TRANSLATE_MAX_ATTRIBS];
112
   unsigned nr_buffer_variants;
113

114
   /* Multiple elements can map to a single buffer variant. */
115
   unsigned element_to_buffer_variant[TRANSLATE_MAX_ATTRIBS];
116

117
   boolean use_instancing;
118
   unsigned instance_id;
119
   unsigned start_instance;
120

121
   /* these are actually known values, but putting them in a struct
122
    * like this is helpful to keep them in sync across the file.
123
    */
124
   struct x86_reg tmp_EAX;
125
   struct x86_reg tmp2_EDX;
126
   struct x86_reg src_ECX;
127
   struct x86_reg idx_ESI;      /* either start+i or &elt[i] */
128
   struct x86_reg machine_EDI;
129
   struct x86_reg outbuf_EBX;
130
   struct x86_reg count_EBP;    /* decrements to zero */
131
};
132

133

134
/**
 * Byte distance from \p a to \p b (i.e. b - a), as an int.
 *
 * Used to turn the address of a struct member into a displacement relative
 * to the struct's base.  The pointer difference is a ptrdiff_t; the
 * narrowing to int is made explicit here.
 */
static int
get_offset(const void *a, const void *b)
{
   return (int) ((const char *) b - (const char *) a);
}
139

140

141
static struct x86_reg
142
get_const(struct translate_sse *p, unsigned id)
143
{
144
   struct x86_reg reg;
145
   unsigned i;
146

147
   if (p->const_to_reg[id] >= 0)
148
      return x86_make_reg(file_XMM, p->const_to_reg[id]);
149

150
   for (i = 2; i < 8; ++i) {
151
      if (p->reg_to_const[i] < 0)
152
         break;
153
   }
154

155
   /* TODO: be smarter here */
156
   if (i == 8)
157
      --i;
158

159
   reg = x86_make_reg(file_XMM, i);
160

161
   if (p->reg_to_const[i] >= 0)
162
      p->const_to_reg[p->reg_to_const[i]] = -1;
163

164
   p->reg_to_const[i] = id;
165
   p->const_to_reg[id] = i;
166

167
   /* TODO: this should happen outside the loop, if possible */
168
   sse_movaps(p->func, reg,
169
              x86_make_disp(p->machine_EDI,
170
                            get_offset(p, &p->consts[id][0])));
171

172
   return reg;
173
}
174

175

176
/* load the data in a SSE2 register, padding with zeros */
177
static boolean
178
emit_load_sse2(struct translate_sse *p,
179
               struct x86_reg data, struct x86_reg src, unsigned size)
180
{
181
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
182
   struct x86_reg tmp = p->tmp_EAX;
183
   switch (size) {
184
   case 1:
185
      x86_movzx8(p->func, tmp, src);
186
      sse2_movd(p->func, data, tmp);
187
      break;
188
   case 2:
189
      x86_movzx16(p->func, tmp, src);
190
      sse2_movd(p->func, data, tmp);
191
      break;
192
   case 3:
193
      x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
194
      x86_shl_imm(p->func, tmp, 16);
195
      x86_mov16(p->func, tmp, src);
196
      sse2_movd(p->func, data, tmp);
197
      break;
198
   case 4:
199
      sse2_movd(p->func, data, src);
200
      break;
201
   case 6:
202
      sse2_movd(p->func, data, src);
203
      x86_movzx16(p->func, tmp, x86_make_disp(src, 4));
204
      sse2_movd(p->func, tmpXMM, tmp);
205
      sse2_punpckldq(p->func, data, tmpXMM);
206
      break;
207
   case 8:
208
      sse2_movq(p->func, data, src);
209
      break;
210
   case 12:
211
      sse2_movq(p->func, data, src);
212
      sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));
213
      sse2_punpcklqdq(p->func, data, tmpXMM);
214
      break;
215
   case 16:
216
      sse2_movdqu(p->func, data, src);
217
      break;
218
   default:
219
      return FALSE;
220
   }
221
   return TRUE;
222
}
223

224

225
/* This value can be passed for the out_chans argument of the float load
 * helpers; it requests that the fourth component be padded with 1.
 */
#define CHANNELS_0001 5
227

228

229
/* this function will load #chans float values, and will
230
 * pad the register with zeroes at least up to out_chans.
231
 *
232
 * If out_chans is set to CHANNELS_0001, then the fourth
233
 * value will be padded with 1. Only pass this value if
234
 * chans < 4 or results are undefined.
235
 */
236
static void
237
emit_load_float32(struct translate_sse *p, struct x86_reg data,
238
                  struct x86_reg arg0, unsigned out_chans, unsigned chans)
239
{
240
   switch (chans) {
241
   case 1:
242
      /* a 0 0 0
243
       * a 0 0 1
244
       */
245
      sse_movss(p->func, data, arg0);
246
      if (out_chans == CHANNELS_0001)
247
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
248
      break;
249
   case 2:
250
      /* 0 0 0 1
251
       * a b 0 1
252
       */
253
      if (out_chans == CHANNELS_0001)
254
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
255
                    SHUF(X, Y, Z, W));
256
      else if (out_chans > 2)
257
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
258
      sse_movlps(p->func, data, arg0);
259
      break;
260
   case 3:
261
      /* Have to jump through some hoops:
262
       *
263
       * c 0 0 0
264
       * c 0 0 1 if out_chans == CHANNELS_0001
265
       * 0 0 c 0/1
266
       * a b c 0/1
267
       */
268
      sse_movss(p->func, data, x86_make_disp(arg0, 8));
269
      if (out_chans == CHANNELS_0001)
270
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
271
                    SHUF(X, Y, Z, W));
272
      sse_shufps(p->func, data, data, SHUF(Y, Z, X, W));
273
      sse_movlps(p->func, data, arg0);
274
      break;
275
   case 4:
276
      sse_movups(p->func, data, arg0);
277
      break;
278
   }
279
}
280

281
/* this function behaves like emit_load_float32, but loads
282
   64-bit floating point numbers, converting them to 32-bit
283
  ones */
284
static void
285
emit_load_float64to32(struct translate_sse *p, struct x86_reg data,
286
                      struct x86_reg arg0, unsigned out_chans, unsigned chans)
287
{
288
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
289
   switch (chans) {
290
   case 1:
291
      sse2_movsd(p->func, data, arg0);
292
      if (out_chans > 1)
293
         sse2_cvtpd2ps(p->func, data, data);
294
      else
295
         sse2_cvtsd2ss(p->func, data, data);
296
      if (out_chans == CHANNELS_0001)
297
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
298
                    SHUF(X, Y, Z, W));
299
      break;
300
   case 2:
301
      sse2_movupd(p->func, data, arg0);
302
      sse2_cvtpd2ps(p->func, data, data);
303
      if (out_chans == CHANNELS_0001)
304
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
305
                    SHUF(X, Y, Z, W));
306
      else if (out_chans > 2)
307
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
308
      break;
309
   case 3:
310
      sse2_movupd(p->func, data, arg0);
311
      sse2_cvtpd2ps(p->func, data, data);
312
      sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
313
      if (out_chans > 3)
314
         sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
315
      else
316
         sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
317
      sse_movlhps(p->func, data, tmpXMM);
318
      if (out_chans == CHANNELS_0001)
319
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
320
      break;
321
   case 4:
322
      sse2_movupd(p->func, data, arg0);
323
      sse2_cvtpd2ps(p->func, data, data);
324
      sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));
325
      sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
326
      sse_movlhps(p->func, data, tmpXMM);
327
      break;
328
   }
329
}
330

331

332
static void
333
emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr,
334
           struct x86_reg dst_xmm, struct x86_reg src_gpr,
335
           struct x86_reg src_xmm)
336
{
337
   if (x86_target(p->func) != X86_32)
338
      x64_mov64(p->func, dst_gpr, src_gpr);
339
   else {
340
      /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
341
      if (x86_target_caps(p->func) & X86_SSE2)
342
         sse2_movq(p->func, dst_xmm, src_xmm);
343
      else
344
         sse_movlps(p->func, dst_xmm, src_xmm);
345
   }
346
}
347

348

349
static void
350
emit_load64(struct translate_sse *p, struct x86_reg dst_gpr,
351
            struct x86_reg dst_xmm, struct x86_reg src)
352
{
353
   emit_mov64(p, dst_gpr, dst_xmm, src, src);
354
}
355

356

357
static void
358
emit_store64(struct translate_sse *p, struct x86_reg dst,
359
             struct x86_reg src_gpr, struct x86_reg src_xmm)
360
{
361
   emit_mov64(p, dst, dst, src_gpr, src_xmm);
362
}
363

364

365
static void
366
emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
367
{
368
   if (x86_target_caps(p->func) & X86_SSE2)
369
      sse2_movdqu(p->func, dst, src);
370
   else
371
      sse_movups(p->func, dst, src);
372
}
373

374

375
/* TODO: this uses unaligned accesses liberally, which is great on Nehalem,
376
 * but may or may not be good on older processors
377
 * TODO: may perhaps want to use non-temporal stores here if possible
378
 */
379
static void
380
emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src,
381
            unsigned size)
382
{
383
   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
384
   struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
385
   struct x86_reg dataGPR = p->tmp_EAX;
386
   struct x86_reg dataGPR2 = p->tmp2_EDX;
387

388
   if (size < 8) {
389
      switch (size) {
390
      case 1:
391
         x86_mov8(p->func, dataGPR, src);
392
         x86_mov8(p->func, dst, dataGPR);
393
         break;
394
      case 2:
395
         x86_mov16(p->func, dataGPR, src);
396
         x86_mov16(p->func, dst, dataGPR);
397
         break;
398
      case 3:
399
         x86_mov16(p->func, dataGPR, src);
400
         x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
401
         x86_mov16(p->func, dst, dataGPR);
402
         x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
403
         break;
404
      case 4:
405
         x86_mov(p->func, dataGPR, src);
406
         x86_mov(p->func, dst, dataGPR);
407
         break;
408
      case 6:
409
         x86_mov(p->func, dataGPR, src);
410
         x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
411
         x86_mov(p->func, dst, dataGPR);
412
         x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
413
         break;
414
      }
415
   }
416
   else if (!(x86_target_caps(p->func) & X86_SSE)) {
417
      unsigned i = 0;
418
      assert((size & 3) == 0);
419
      for (i = 0; i < size; i += 4) {
420
         x86_mov(p->func, dataGPR, x86_make_disp(src, i));
421
         x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
422
      }
423
   }
424
   else {
425
      switch (size) {
426
      case 8:
427
         emit_load64(p, dataGPR, dataXMM, src);
428
         emit_store64(p, dst, dataGPR, dataXMM);
429
         break;
430
      case 12:
431
         emit_load64(p, dataGPR2, dataXMM, src);
432
         x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
433
         emit_store64(p, dst, dataGPR2, dataXMM);
434
         x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
435
         break;
436
      case 16:
437
         emit_mov128(p, dataXMM, src);
438
         emit_mov128(p, dst, dataXMM);
439
         break;
440
      case 24:
441
         emit_mov128(p, dataXMM, src);
442
         emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
443
         emit_mov128(p, dst, dataXMM);
444
         emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
445
         break;
446
      case 32:
447
         emit_mov128(p, dataXMM, src);
448
         emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
449
         emit_mov128(p, dst, dataXMM);
450
         emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
451
         break;
452
      default:
453
         assert(0);
454
      }
455
   }
456
}
457

458
static boolean
459
translate_attr_convert(struct translate_sse *p,
460
                       const struct translate_element *a,
461
                       struct x86_reg src, struct x86_reg dst)
462
{
463
   const struct util_format_description *input_desc =
464
      util_format_description(a->input_format);
465
   const struct util_format_description *output_desc =
466
      util_format_description(a->output_format);
467
   unsigned i;
468
   boolean id_swizzle = TRUE;
469
   unsigned swizzle[4] =
470
      { PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE,
471
        PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE };
472
   unsigned needed_chans = 0;
473
   unsigned imms[2] = { 0, 0x3f800000 };
474

475
   if (a->output_format == PIPE_FORMAT_NONE
476
       || a->input_format == PIPE_FORMAT_NONE)
477
      return FALSE;
478

479
   if (input_desc->channel[0].size & 7)
480
      return FALSE;
481

482
   if (input_desc->colorspace != output_desc->colorspace)
483
      return FALSE;
484

485
   for (i = 1; i < input_desc->nr_channels; ++i) {
486
      if (memcmp
487
          (&input_desc->channel[i], &input_desc->channel[0],
488
           sizeof(input_desc->channel[0])))
489
         return FALSE;
490
   }
491

492
   for (i = 1; i < output_desc->nr_channels; ++i) {
493
      if (memcmp
494
          (&output_desc->channel[i], &output_desc->channel[0],
495
           sizeof(output_desc->channel[0]))) {
496
         return FALSE;
497
      }
498
   }
499

500
   for (i = 0; i < output_desc->nr_channels; ++i) {
501
      if (output_desc->swizzle[i] < 4)
502
         swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
503
   }
504

505
   if ((x86_target_caps(p->func) & X86_SSE) &&
506
       (0 || a->output_format == PIPE_FORMAT_R32_FLOAT
507
        || a->output_format == PIPE_FORMAT_R32G32_FLOAT
508
        || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
509
        || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) {
510
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
511

512
      for (i = 0; i < output_desc->nr_channels; ++i) {
513
         if (swizzle[i] == PIPE_SWIZZLE_0
514
             && i >= input_desc->nr_channels)
515
            swizzle[i] = i;
516
      }
517

518
      for (i = 0; i < output_desc->nr_channels; ++i) {
519
         if (swizzle[i] < 4)
520
            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
521
         if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i)
522
            id_swizzle = FALSE;
523
      }
524

525
      if (needed_chans > 0) {
526
         switch (input_desc->channel[0].type) {
527
         case UTIL_FORMAT_TYPE_UNSIGNED:
528
            if (!(x86_target_caps(p->func) & X86_SSE2))
529
               return FALSE;
530
            emit_load_sse2(p, dataXMM, src,
531
                           input_desc->channel[0].size *
532
                           input_desc->nr_channels >> 3);
533

534
            /* TODO: add support for SSE4.1 pmovzx */
535
            switch (input_desc->channel[0].size) {
536
            case 8:
537
               /* TODO: this may be inefficient due to get_identity() being
538
                *  used both as a float and integer register.
539
                */
540
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
541
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
542
               break;
543
            case 16:
544
               sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
545
               break;
546
            case 32:           /* we lose precision here */
547
               sse2_psrld_imm(p->func, dataXMM, 1);
548
               break;
549
            default:
550
               return FALSE;
551
            }
552
            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
553
            if (input_desc->channel[0].normalized) {
554
               struct x86_reg factor;
555
               switch (input_desc->channel[0].size) {
556
               case 8:
557
                  factor = get_const(p, CONST_INV_255);
558
                  break;
559
               case 16:
560
                  factor = get_const(p, CONST_INV_65535);
561
                  break;
562
               case 32:
563
                  factor = get_const(p, CONST_INV_2147483647);
564
                  break;
565
               default:
566
                  assert(0);
567
                  factor.disp = 0;
568
                  factor.file = 0;
569
                  factor.idx = 0;
570
                  factor.mod = 0;
571
                  break;
572
               }
573
               sse_mulps(p->func, dataXMM, factor);
574
            }
575
            else if (input_desc->channel[0].size == 32)
576
               /* compensate for the bit we threw away to fit u32 into s32 */
577
               sse_addps(p->func, dataXMM, dataXMM);
578
            break;
579
         case UTIL_FORMAT_TYPE_SIGNED:
580
            if (!(x86_target_caps(p->func) & X86_SSE2))
581
               return FALSE;
582
            emit_load_sse2(p, dataXMM, src,
583
                           input_desc->channel[0].size *
584
                           input_desc->nr_channels >> 3);
585

586
            /* TODO: add support for SSE4.1 pmovsx */
587
            switch (input_desc->channel[0].size) {
588
            case 8:
589
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
590
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
591
               sse2_psrad_imm(p->func, dataXMM, 24);
592
               break;
593
            case 16:
594
               sse2_punpcklwd(p->func, dataXMM, dataXMM);
595
               sse2_psrad_imm(p->func, dataXMM, 16);
596
               break;
597
            case 32:           /* we lose precision here */
598
               break;
599
            default:
600
               return FALSE;
601
            }
602
            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
603
            if (input_desc->channel[0].normalized) {
604
               struct x86_reg factor;
605
               switch (input_desc->channel[0].size) {
606
               case 8:
607
                  factor = get_const(p, CONST_INV_127);
608
                  break;
609
               case 16:
610
                  factor = get_const(p, CONST_INV_32767);
611
                  break;
612
               case 32:
613
                  factor = get_const(p, CONST_INV_2147483647);
614
                  break;
615
               default:
616
                  assert(0);
617
                  factor.disp = 0;
618
                  factor.file = 0;
619
                  factor.idx = 0;
620
                  factor.mod = 0;
621
                  break;
622
               }
623
               sse_mulps(p->func, dataXMM, factor);
624
            }
625
            break;
626

627
            break;
628
         case UTIL_FORMAT_TYPE_FLOAT:
629
            if (input_desc->channel[0].size != 32
630
                && input_desc->channel[0].size != 64) {
631
               return FALSE;
632
            }
633
            if (swizzle[3] == PIPE_SWIZZLE_1
634
                && input_desc->nr_channels <= 3) {
635
               swizzle[3] = PIPE_SWIZZLE_W;
636
               needed_chans = CHANNELS_0001;
637
            }
638
            switch (input_desc->channel[0].size) {
639
            case 32:
640
               emit_load_float32(p, dataXMM, src, needed_chans,
641
                                 input_desc->nr_channels);
642
               break;
643
            case 64:           /* we lose precision here */
644
               if (!(x86_target_caps(p->func) & X86_SSE2))
645
                  return FALSE;
646
               emit_load_float64to32(p, dataXMM, src, needed_chans,
647
                                     input_desc->nr_channels);
648
               break;
649
            default:
650
               return FALSE;
651
            }
652
            break;
653
         default:
654
            return FALSE;
655
         }
656

657
         if (!id_swizzle) {
658
            sse_shufps(p->func, dataXMM, dataXMM,
659
                       SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]));
660
         }
661
      }
662

663
      if (output_desc->nr_channels >= 4
664
          && swizzle[0] < PIPE_SWIZZLE_0
665
          && swizzle[1] < PIPE_SWIZZLE_0
666
          && swizzle[2] < PIPE_SWIZZLE_0
667
          && swizzle[3] < PIPE_SWIZZLE_0) {
668
         sse_movups(p->func, dst, dataXMM);
669
      }
670
      else {
671
         if (output_desc->nr_channels >= 2
672
             && swizzle[0] < PIPE_SWIZZLE_0
673
             && swizzle[1] < PIPE_SWIZZLE_0) {
674
            sse_movlps(p->func, dst, dataXMM);
675
         }
676
         else {
677
            if (swizzle[0] < PIPE_SWIZZLE_0) {
678
               sse_movss(p->func, dst, dataXMM);
679
            }
680
            else {
681
               x86_mov_imm(p->func, dst,
682
                           imms[swizzle[0] - PIPE_SWIZZLE_0]);
683
            }
684

685
            if (output_desc->nr_channels >= 2) {
686
               if (swizzle[1] < PIPE_SWIZZLE_0) {
687
                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
688
                  sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
689
               }
690
               else {
691
                  x86_mov_imm(p->func, x86_make_disp(dst, 4),
692
                              imms[swizzle[1] - PIPE_SWIZZLE_0]);
693
               }
694
            }
695
         }
696

697
         if (output_desc->nr_channels >= 3) {
698
            if (output_desc->nr_channels >= 4
699
                && swizzle[2] < PIPE_SWIZZLE_0
700
                && swizzle[3] < PIPE_SWIZZLE_0) {
701
               sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
702
            }
703
            else {
704
               if (swizzle[2] < PIPE_SWIZZLE_0) {
705
                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
706
                  sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
707
               }
708
               else {
709
                  x86_mov_imm(p->func, x86_make_disp(dst, 8),
710
                              imms[swizzle[2] - PIPE_SWIZZLE_0]);
711
               }
712

713
               if (output_desc->nr_channels >= 4) {
714
                  if (swizzle[3] < PIPE_SWIZZLE_0) {
715
                     sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
716
                     sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
717
                  }
718
                  else {
719
                     x86_mov_imm(p->func, x86_make_disp(dst, 12),
720
                                 imms[swizzle[3] - PIPE_SWIZZLE_0]);
721
                  }
722
               }
723
            }
724
         }
725
      }
726
      return TRUE;
727
   }
728
   else if ((x86_target_caps(p->func) & X86_SSE2)
729
            && input_desc->channel[0].size == 8
730
            && output_desc->channel[0].size == 16
731
            && output_desc->channel[0].normalized ==
732
            input_desc->channel[0].normalized &&
733
            (0 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED
734
                   && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
735
             || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED
736
                 && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
737
             || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED
738
                 && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED))) {
739
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
740
      struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
741
      struct x86_reg tmp = p->tmp_EAX;
742
      unsigned imms[2] = { 0, 1 };
743

744
      for (i = 0; i < output_desc->nr_channels; ++i) {
745
         if (swizzle[i] == PIPE_SWIZZLE_0
746
             && i >= input_desc->nr_channels) {
747
            swizzle[i] = i;
748
         }
749
      }
750

751
      for (i = 0; i < output_desc->nr_channels; ++i) {
752
         if (swizzle[i] < 4)
753
            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
754
         if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i)
755
            id_swizzle = FALSE;
756
      }
757

758
      if (needed_chans > 0) {
759
         emit_load_sse2(p, dataXMM, src,
760
                        input_desc->channel[0].size *
761
                        input_desc->nr_channels >> 3);
762

763
         switch (input_desc->channel[0].type) {
764
         case UTIL_FORMAT_TYPE_UNSIGNED:
765
            if (input_desc->channel[0].normalized) {
766
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
767
               if (output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
768
                  sse2_psrlw_imm(p->func, dataXMM, 1);
769
            }
770
            else
771
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
772
            break;
773
         case UTIL_FORMAT_TYPE_SIGNED:
774
            if (input_desc->channel[0].normalized) {
775
               sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY));
776
               sse2_punpcklbw(p->func, tmpXMM, dataXMM);
777
               sse2_psllw_imm(p->func, dataXMM, 9);
778
               sse2_psrlw_imm(p->func, dataXMM, 8);
779
               sse2_por(p->func, tmpXMM, dataXMM);
780
               sse2_psrlw_imm(p->func, dataXMM, 7);
781
               sse2_por(p->func, tmpXMM, dataXMM);
782
               {
783
                  struct x86_reg t = dataXMM;
784
                  dataXMM = tmpXMM;
785
                  tmpXMM = t;
786
               }
787
            }
788
            else {
789
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
790
               sse2_psraw_imm(p->func, dataXMM, 8);
791
            }
792
            break;
793
         default:
794
            assert(0);
795
         }
796

797
         if (output_desc->channel[0].normalized)
798
            imms[1] =
799
               (output_desc->channel[0].type ==
800
                UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff;
801

802
         if (!id_swizzle)
803
            sse2_pshuflw(p->func, dataXMM, dataXMM,
804
                         (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) |
805
                         ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
806
      }
807

808
      if (output_desc->nr_channels >= 4
809
          && swizzle[0] < PIPE_SWIZZLE_0
810
          && swizzle[1] < PIPE_SWIZZLE_0
811
          && swizzle[2] < PIPE_SWIZZLE_0
812
          && swizzle[3] < PIPE_SWIZZLE_0) {
813
         sse2_movq(p->func, dst, dataXMM);
814
      }
815
      else {
816
         if (swizzle[0] < PIPE_SWIZZLE_0) {
817
            if (output_desc->nr_channels >= 2
818
                && swizzle[1] < PIPE_SWIZZLE_0) {
819
               sse2_movd(p->func, dst, dataXMM);
820
            }
821
            else {
822
               sse2_movd(p->func, tmp, dataXMM);
823
               x86_mov16(p->func, dst, tmp);
824
               if (output_desc->nr_channels >= 2)
825
                  x86_mov16_imm(p->func, x86_make_disp(dst, 2),
826
                                imms[swizzle[1] - PIPE_SWIZZLE_0]);
827
            }
828
         }
829
         else {
830
            if (output_desc->nr_channels >= 2
831
                && swizzle[1] >= PIPE_SWIZZLE_0) {
832
               x86_mov_imm(p->func, dst,
833
                           (imms[swizzle[1] - PIPE_SWIZZLE_0] << 16) |
834
                           imms[swizzle[0] - PIPE_SWIZZLE_0]);
835
            }
836
            else {
837
               x86_mov16_imm(p->func, dst,
838
                             imms[swizzle[0] - PIPE_SWIZZLE_0]);
839
               if (output_desc->nr_channels >= 2) {
840
                  sse2_movd(p->func, tmp, dataXMM);
841
                  x86_shr_imm(p->func, tmp, 16);
842
                  x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
843
               }
844
            }
845
         }
846

847
         if (output_desc->nr_channels >= 3) {
848
            if (swizzle[2] < PIPE_SWIZZLE_0) {
849
               if (output_desc->nr_channels >= 4
850
                   && swizzle[3] < PIPE_SWIZZLE_0) {
851
                  sse2_psrlq_imm(p->func, dataXMM, 32);
852
                  sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
853
               }
854
               else {
855
                  sse2_psrlq_imm(p->func, dataXMM, 32);
856
                  sse2_movd(p->func, tmp, dataXMM);
857
                  x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
858
                  if (output_desc->nr_channels >= 4) {
859
                     x86_mov16_imm(p->func, x86_make_disp(dst, 6),
860
                                   imms[swizzle[3] - PIPE_SWIZZLE_0]);
861
                  }
862
               }
863
            }
864
            else {
865
               if (output_desc->nr_channels >= 4
866
                   && swizzle[3] >= PIPE_SWIZZLE_0) {
867
                  x86_mov_imm(p->func, x86_make_disp(dst, 4),
868
                              (imms[swizzle[3] - PIPE_SWIZZLE_0] << 16)
869
                              | imms[swizzle[2] - PIPE_SWIZZLE_0]);
870
               }
871
               else {
872
                  x86_mov16_imm(p->func, x86_make_disp(dst, 4),
873
                                imms[swizzle[2] - PIPE_SWIZZLE_0]);
874

875
                  if (output_desc->nr_channels >= 4) {
876
                     sse2_psrlq_imm(p->func, dataXMM, 48);
877
                     sse2_movd(p->func, tmp, dataXMM);
878
                     x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
879
                  }
880
               }
881
            }
882
         }
883
      }
884
      return TRUE;
885
   }
886
   else if (!memcmp(&output_desc->channel[0], &input_desc->channel[0],
887
                    sizeof(output_desc->channel[0]))) {
888
      struct x86_reg tmp = p->tmp_EAX;
889
      unsigned i;
890

891
      if (input_desc->channel[0].size == 8 && input_desc->nr_channels == 4
892
          && output_desc->nr_channels == 4
893
          && swizzle[0] == PIPE_SWIZZLE_W
894
          && swizzle[1] == PIPE_SWIZZLE_Z
895
          && swizzle[2] == PIPE_SWIZZLE_Y
896
          && swizzle[3] == PIPE_SWIZZLE_X) {
897
         /* TODO: support movbe */
898
         x86_mov(p->func, tmp, src);
899
         x86_bswap(p->func, tmp);
900
         x86_mov(p->func, dst, tmp);
901
         return TRUE;
902
      }
903

904
      for (i = 0; i < output_desc->nr_channels; ++i) {
905
         switch (output_desc->channel[0].size) {
906
         case 8:
907
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
908
               unsigned v = 0;
909
               if (swizzle[i] == PIPE_SWIZZLE_1) {
910
                  switch (output_desc->channel[0].type) {
911
                  case UTIL_FORMAT_TYPE_UNSIGNED:
912
                     v = output_desc->channel[0].normalized ? 0xff : 1;
913
                     break;
914
                  case UTIL_FORMAT_TYPE_SIGNED:
915
                     v = output_desc->channel[0].normalized ? 0x7f : 1;
916
                     break;
917
                  default:
918
                     return FALSE;
919
                  }
920
               }
921
               x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
922
            }
923
            else {
924
               x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
925
               x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
926
            }
927
            break;
928
         case 16:
929
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
930
               unsigned v = 0;
931
               if (swizzle[i] == PIPE_SWIZZLE_1) {
932
                  switch (output_desc->channel[1].type) {
933
                  case UTIL_FORMAT_TYPE_UNSIGNED:
934
                     v = output_desc->channel[1].normalized ? 0xffff : 1;
935
                     break;
936
                  case UTIL_FORMAT_TYPE_SIGNED:
937
                     v = output_desc->channel[1].normalized ? 0x7fff : 1;
938
                     break;
939
                  case UTIL_FORMAT_TYPE_FLOAT:
940
                     v = 0x3c00;
941
                     break;
942
                  default:
943
                     return FALSE;
944
                  }
945
               }
946
               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
947
            }
948
            else if (swizzle[i] == PIPE_SWIZZLE_0) {
949
               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0);
950
            }
951
            else {
952
               x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
953
               x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
954
            }
955
            break;
956
         case 32:
957
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
958
               unsigned v = 0;
959
               if (swizzle[i] == PIPE_SWIZZLE_1) {
960
                  switch (output_desc->channel[1].type) {
961
                  case UTIL_FORMAT_TYPE_UNSIGNED:
962
                     v = output_desc->channel[1].normalized ? 0xffffffff : 1;
963
                     break;
964
                  case UTIL_FORMAT_TYPE_SIGNED:
965
                     v = output_desc->channel[1].normalized ? 0x7fffffff : 1;
966
                     break;
967
                  case UTIL_FORMAT_TYPE_FLOAT:
968
                     v = 0x3f800000;
969
                     break;
970
                  default:
971
                     return FALSE;
972
                  }
973
               }
974
               x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
975
            }
976
            else {
977
               x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
978
               x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
979
            }
980
            break;
981
         case 64:
982
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
983
               unsigned l = 0;
984
               unsigned h = 0;
985
               if (swizzle[i] == PIPE_SWIZZLE_1) {
986
                  switch (output_desc->channel[1].type) {
987
                  case UTIL_FORMAT_TYPE_UNSIGNED:
988
                     h = output_desc->channel[1].normalized ? 0xffffffff : 0;
989
                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
990
                     break;
991
                  case UTIL_FORMAT_TYPE_SIGNED:
992
                     h = output_desc->channel[1].normalized ? 0x7fffffff : 0;
993
                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
994
                     break;
995
                  case UTIL_FORMAT_TYPE_FLOAT:
996
                     h = 0x3ff00000;
997
                     l = 0;
998
                     break;
999
                  default:
1000
                     return FALSE;
1001
                  }
1002
               }
1003
               x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
1004
               x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
1005
            }
1006
            else {
1007
               if (x86_target_caps(p->func) & X86_SSE) {
1008
                  struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
1009
                  emit_load64(p, tmp, tmpXMM,
1010
                              x86_make_disp(src, swizzle[i] * 8));
1011
                  emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
1012
               }
1013
               else {
1014
                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));
1015
                  x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
1016
                  x86_mov(p->func, tmp,
1017
                          x86_make_disp(src, swizzle[i] * 8 + 4));
1018
                  x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
1019
               }
1020
            }
1021
            break;
1022
         default:
1023
            return FALSE;
1024
         }
1025
      }
1026
      return TRUE;
1027
   }
1028
   /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */
1029
   else if ((x86_target_caps(p->func) & X86_SSE2) &&
1030
            a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT &&
1031
            (0 || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM
1032
             || a-> output_format == PIPE_FORMAT_R8G8B8A8_UNORM)) {
1033
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
1034

1035
      /* load */
1036
      sse_movups(p->func, dataXMM, src);
1037

1038
      if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
1039
         sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 1, 0, 3));
1040
      }
1041

1042
      /* scale by 255.0 */
1043
      sse_mulps(p->func, dataXMM, get_const(p, CONST_255));
1044

1045
      /* pack and emit */
1046
      sse2_cvtps2dq(p->func, dataXMM, dataXMM);
1047
      sse2_packssdw(p->func, dataXMM, dataXMM);
1048
      sse2_packuswb(p->func, dataXMM, dataXMM);
1049
      sse2_movd(p->func, dst, dataXMM);
1050

1051
      return TRUE;
1052
   }
1053

1054
   return FALSE;
1055
}
1056

1057

1058
static boolean
1059
translate_attr(struct translate_sse *p,
1060
               const struct translate_element *a,
1061
               struct x86_reg src, struct x86_reg dst)
1062
{
1063
   if (a->input_format == a->output_format) {
1064
      emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
1065
      return TRUE;
1066
   }
1067

1068
   return translate_attr_convert(p, a, src, dst);
1069
}
1070

1071

1072
static boolean
1073
init_inputs(struct translate_sse *p, unsigned index_size)
1074
{
1075
   unsigned i;
1076
   struct x86_reg instance_id =
1077
      x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
1078
   struct x86_reg start_instance =
1079
      x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance));
1080

1081
   for (i = 0; i < p->nr_buffer_variants; i++) {
1082
      struct translate_buffer_variant *variant = &p->buffer_variant[i];
1083
      struct translate_buffer *buffer = &p->buffer[variant->buffer_index];
1084

1085
      if (!index_size || variant->instance_divisor) {
1086
         struct x86_reg buf_max_index =
1087
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->max_index));
1088
         struct x86_reg buf_stride =
1089
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->stride));
1090
         struct x86_reg buf_ptr =
1091
            x86_make_disp(p->machine_EDI, get_offset(p, &variant->ptr));
1092
         struct x86_reg buf_base_ptr =
1093
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->base_ptr));
1094
         struct x86_reg elt = p->idx_ESI;
1095
         struct x86_reg tmp_EAX = p->tmp_EAX;
1096

1097
         /* Calculate pointer to first attrib:
1098
          *   base_ptr + stride * index, where index depends on instance divisor
1099
          */
1100
         if (variant->instance_divisor) {
1101
            struct x86_reg tmp_EDX = p->tmp2_EDX;
1102

1103
            /* Start with instance = instance_id
1104
             * which is true if divisor is 1.
1105
             */
1106
            x86_mov(p->func, tmp_EAX, instance_id);
1107

1108
            if (variant->instance_divisor != 1) {
1109
               struct x86_reg tmp_ECX = p->src_ECX;
1110

1111
               /* TODO: Add x86_shr() to rtasm and use it whenever
1112
                *       instance divisor is power of two.
1113
                */
1114
               x86_xor(p->func, tmp_EDX, tmp_EDX);
1115
               x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor);
1116
               x86_div(p->func, tmp_ECX);       /* EAX = EDX:EAX / ECX */
1117
            }
1118

1119
            /* instance = (instance_id / divisor) + start_instance
1120
             */
1121
            x86_mov(p->func, tmp_EDX, start_instance);
1122
            x86_add(p->func, tmp_EAX, tmp_EDX);
1123

1124
            /* XXX we need to clamp the index here too, but to a
1125
             * per-array max value, not the draw->pt.max_index value
1126
             * that's being given to us via translate->set_buffer().
1127
             */
1128
         }
1129
         else {
1130
            x86_mov(p->func, tmp_EAX, elt);
1131

1132
            /* Clamp to max_index
1133
             */
1134
            x86_cmp(p->func, tmp_EAX, buf_max_index);
1135
            x86_cmovcc(p->func, tmp_EAX, buf_max_index, cc_AE);
1136
         }
1137

1138
         x86_mov(p->func, p->tmp2_EDX, buf_stride);
1139
         x64_rexw(p->func);
1140
         x86_imul(p->func, tmp_EAX, p->tmp2_EDX);
1141
         x64_rexw(p->func);
1142
         x86_add(p->func, tmp_EAX, buf_base_ptr);
1143

1144
         x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
1145

1146
         /* In the linear case, keep the buffer pointer instead of the
1147
          * index number.
1148
          */
1149
         if (!index_size && p->nr_buffer_variants == 1) {
1150
            x64_rexw(p->func);
1151
            x86_mov(p->func, elt, tmp_EAX);
1152
         }
1153
         else {
1154
            x64_rexw(p->func);
1155
            x86_mov(p->func, buf_ptr, tmp_EAX);
1156
         }
1157
      }
1158
   }
1159

1160
   return TRUE;
1161
}
1162

1163

1164
static struct x86_reg
1165
get_buffer_ptr(struct translate_sse *p,
1166
               unsigned index_size, unsigned var_idx, struct x86_reg elt)
1167
{
1168
   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
1169
      return x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
1170
   }
1171
   if (!index_size && p->nr_buffer_variants == 1) {
1172
      return p->idx_ESI;
1173
   }
1174
   else if (!index_size || p->buffer_variant[var_idx].instance_divisor) {
1175
      struct x86_reg ptr = p->src_ECX;
1176
      struct x86_reg buf_ptr =
1177
         x86_make_disp(p->machine_EDI,
1178
                       get_offset(p, &p->buffer_variant[var_idx].ptr));
1179

1180
      x64_rexw(p->func);
1181
      x86_mov(p->func, ptr, buf_ptr);
1182
      return ptr;
1183
   }
1184
   else {
1185
      struct x86_reg ptr = p->src_ECX;
1186
      const struct translate_buffer_variant *variant =
1187
         &p->buffer_variant[var_idx];
1188
      struct x86_reg buf_stride =
1189
         x86_make_disp(p->machine_EDI,
1190
                       get_offset(p, &p->buffer[variant->buffer_index].stride));
1191
      struct x86_reg buf_base_ptr =
1192
         x86_make_disp(p->machine_EDI,
1193
                  get_offset(p, &p->buffer[variant->buffer_index].base_ptr));
1194
      struct x86_reg buf_max_index =
1195
         x86_make_disp(p->machine_EDI,
1196
                  get_offset(p, &p->buffer[variant->buffer_index].max_index));
1197

1198
      /* Calculate pointer to current attrib:
1199
       */
1200
      switch (index_size) {
1201
      case 1:
1202
         x86_movzx8(p->func, ptr, elt);
1203
         break;
1204
      case 2:
1205
         x86_movzx16(p->func, ptr, elt);
1206
         break;
1207
      case 4:
1208
         x86_mov(p->func, ptr, elt);
1209
         break;
1210
      }
1211

1212
      /* Clamp to max_index
1213
       */
1214
      x86_cmp(p->func, ptr, buf_max_index);
1215
      x86_cmovcc(p->func, ptr, buf_max_index, cc_AE);
1216

1217
      x86_mov(p->func, p->tmp2_EDX, buf_stride);
1218
      x64_rexw(p->func);
1219
      x86_imul(p->func, ptr, p->tmp2_EDX);
1220
      x64_rexw(p->func);
1221
      x86_add(p->func, ptr, buf_base_ptr);
1222
      return ptr;
1223
   }
1224
}
1225

1226

1227
static boolean
1228
incr_inputs(struct translate_sse *p, unsigned index_size)
1229
{
1230
   if (!index_size && p->nr_buffer_variants == 1) {
1231
      const unsigned buffer_index = p->buffer_variant[0].buffer_index;
1232
      struct x86_reg stride =
1233
         x86_make_disp(p->machine_EDI,
1234
                       get_offset(p, &p->buffer[buffer_index].stride));
1235

1236
      if (p->buffer_variant[0].instance_divisor == 0) {
1237
         x64_rexw(p->func);
1238
         x86_add(p->func, p->idx_ESI, stride);
1239
         sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
1240
      }
1241
   }
1242
   else if (!index_size) {
1243
      unsigned i;
1244

1245
      /* Is this worthwhile??
1246
       */
1247
      for (i = 0; i < p->nr_buffer_variants; i++) {
1248
         struct translate_buffer_variant *variant = &p->buffer_variant[i];
1249
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
1250
                                                get_offset(p, &variant->ptr));
1251
      struct x86_reg buf_stride =
1252
         x86_make_disp(p->machine_EDI,
1253
                       get_offset(p, &p->buffer[variant->buffer_index].stride));
1254

1255
         if (variant->instance_divisor == 0) {
1256
            x86_mov(p->func, p->tmp_EAX, buf_stride);
1257
            x64_rexw(p->func);
1258
            x86_add(p->func, p->tmp_EAX, buf_ptr);
1259
            if (i == 0)
1260
               sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
1261
            x64_rexw(p->func);
1262
            x86_mov(p->func, buf_ptr, p->tmp_EAX);
1263
         }
1264
      }
1265
   }
1266
   else {
1267
      x64_rexw(p->func);
1268
      x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
1269
   }
1270

1271
   return TRUE;
1272
}
1273

1274

1275
/* Build run( struct translate *machine,
1276
 *            unsigned start,
1277
 *            unsigned count,
1278
 *            void *output_buffer )
1279
 * or
1280
 *  run_elts( struct translate *machine,
1281
 *            unsigned *elts,
1282
 *            unsigned count,
1283
 *            void *output_buffer )
1284
 *
1285
 *  Lots of hardcoding
1286
 *
1287
 * EAX -- pointer to current output vertex
1288
 * ECX -- pointer to current attribute 
1289
 * 
1290
 */
1291
static boolean
1292
build_vertex_emit(struct translate_sse *p,
1293
                  struct x86_function *func, unsigned index_size)
1294
{
1295
   int fixup, label;
1296
   unsigned j;
1297

1298
   memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
1299
   memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));
1300

1301
   p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
1302
   p->idx_ESI = x86_make_reg(file_REG32, reg_SI);
1303
   p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX);
1304
   p->machine_EDI = x86_make_reg(file_REG32, reg_DI);
1305
   p->count_EBP = x86_make_reg(file_REG32, reg_BP);
1306
   p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX);
1307
   p->src_ECX = x86_make_reg(file_REG32, reg_CX);
1308

1309
   p->func = func;
1310

1311
   x86_init_func(p->func);
1312

1313
   if (x86_target(p->func) == X86_64_WIN64_ABI) {
1314
      /* the ABI guarantees a 16-byte aligned 32-byte "shadow space"
1315
       * above the return address
1316
       */
1317
      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8),
1318
                  x86_make_reg(file_XMM, 6));
1319
      sse2_movdqa(p->func,
1320
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24),
1321
                  x86_make_reg(file_XMM, 7));
1322
   }
1323

1324
   x86_push(p->func, p->outbuf_EBX);
1325
   x86_push(p->func, p->count_EBP);
1326

1327
   /* on non-Win64 x86-64, these are already in the right registers */
1328
   if (x86_target(p->func) != X86_64_STD_ABI) {
1329
      x86_push(p->func, p->machine_EDI);
1330
      x86_push(p->func, p->idx_ESI);
1331

1332
      if (x86_target(p->func) != X86_32) {
1333
         x64_mov64(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
1334
         x64_mov64(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
1335
      }
1336
      else {
1337
         x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
1338
         x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
1339
      }
1340
   }
1341

1342
   x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));
1343

1344
   if (x86_target(p->func) != X86_32)
1345
      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
1346
   else
1347
      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
1348

1349
   /* Load instance ID.
1350
    */
1351
   if (p->use_instancing) {
1352
      x86_mov(p->func, p->tmp2_EDX, x86_fn_arg(p->func, 4));
1353
      x86_mov(p->func,
1354
              x86_make_disp(p->machine_EDI,
1355
                            get_offset(p, &p->start_instance)), p->tmp2_EDX);
1356

1357
      x86_mov(p->func, p->tmp_EAX, x86_fn_arg(p->func, 5));
1358
      x86_mov(p->func,
1359
              x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
1360
              p->tmp_EAX);
1361
   }
1362

1363
   /* Get vertex count, compare to zero
1364
    */
1365
   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
1366
   x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
1367
   fixup = x86_jcc_forward(p->func, cc_E);
1368

1369
   /* always load, needed or not:
1370
    */
1371
   init_inputs(p, index_size);
1372

1373
   /* Note address for loop jump
1374
    */
1375
   label = x86_get_label(p->func);
1376
   {
1377
      struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
1378
      int last_variant = -1;
1379
      struct x86_reg vb;
1380

1381
      for (j = 0; j < p->translate.key.nr_elements; j++) {
1382
         const struct translate_element *a = &p->translate.key.element[j];
1383
         unsigned variant = p->element_to_buffer_variant[j];
1384

1385
         /* Figure out source pointer address:
1386
          */
1387
         if (variant != last_variant) {
1388
            last_variant = variant;
1389
            vb = get_buffer_ptr(p, index_size, variant, elt);
1390
         }
1391

1392
         if (!translate_attr(p, a,
1393
                             x86_make_disp(vb, a->input_offset),
1394
                             x86_make_disp(p->outbuf_EBX, a->output_offset)))
1395
            return FALSE;
1396
      }
1397

1398
      /* Next output vertex:
1399
       */
1400
      x64_rexw(p->func);
1401
      x86_lea(p->func, p->outbuf_EBX,
1402
              x86_make_disp(p->outbuf_EBX, p->translate.key.output_stride));
1403

1404
      /* Incr index
1405
       */
1406
      incr_inputs(p, index_size);
1407
   }
1408

1409
   /* decr count, loop if not zero
1410
    */
1411
   x86_dec(p->func, p->count_EBP);
1412
   x86_jcc(p->func, cc_NZ, label);
1413

1414
   /* Exit mmx state?
1415
    */
1416
   if (p->func->need_emms)
1417
      mmx_emms(p->func);
1418

1419
   /* Land forward jump here:
1420
    */
1421
   x86_fixup_fwd_jump(p->func, fixup);
1422

1423
   /* Pop regs and return
1424
    */
1425
   if (x86_target(p->func) != X86_64_STD_ABI) {
1426
      x86_pop(p->func, p->idx_ESI);
1427
      x86_pop(p->func, p->machine_EDI);
1428
   }
1429

1430
   x86_pop(p->func, p->count_EBP);
1431
   x86_pop(p->func, p->outbuf_EBX);
1432

1433
   if (x86_target(p->func) == X86_64_WIN64_ABI) {
1434
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 6),
1435
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
1436
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 7),
1437
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
1438
   }
1439
   x86_ret(p->func);
1440

1441
   return TRUE;
1442
}
1443

1444

1445
static void
1446
translate_sse_set_buffer(struct translate *translate,
1447
                         unsigned buf,
1448
                         const void *ptr, unsigned stride, unsigned max_index)
1449
{
1450
   struct translate_sse *p = (struct translate_sse *) translate;
1451

1452
   if (buf < p->nr_buffers) {
1453
      p->buffer[buf].base_ptr = (char *) ptr;
1454
      p->buffer[buf].stride = stride;
1455
      p->buffer[buf].max_index = max_index;
1456
   }
1457

1458
   if (0)
1459
      debug_printf("%s %d/%d: %p %d\n",
1460
                   __FUNCTION__, buf, p->nr_buffers, ptr, stride);
1461
}
1462

1463

1464
static void
1465
translate_sse_release(struct translate *translate)
1466
{
1467
   struct translate_sse *p = (struct translate_sse *) translate;
1468

1469
   x86_release_func(&p->elt8_func);
1470
   x86_release_func(&p->elt16_func);
1471
   x86_release_func(&p->elt_func);
1472
   x86_release_func(&p->linear_func);
1473

1474
   os_free_aligned(p);
1475
}
1476

1477

1478
struct translate *
1479
translate_sse2_create(const struct translate_key *key)
1480
{
1481
   struct translate_sse *p = NULL;
1482
   unsigned i;
1483

1484
   /* this is misnamed, it actually refers to whether rtasm is enabled or not */
1485
   if (!rtasm_cpu_has_sse())
1486
      goto fail;
1487

1488
   p = os_malloc_aligned(sizeof(struct translate_sse), 16);
1489
   if (!p)
1490
      goto fail;
1491

1492
   memset(p, 0, sizeof(*p));
1493
   memcpy(p->consts, consts, sizeof(consts));
1494

1495
   p->translate.key = *key;
1496
   p->translate.release = translate_sse_release;
1497
   p->translate.set_buffer = translate_sse_set_buffer;
1498

1499
   assert(key->nr_elements <= TRANSLATE_MAX_ATTRIBS);
1500

1501
   for (i = 0; i < key->nr_elements; i++) {
1502
      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
1503
         unsigned j;
1504

1505
         p->nr_buffers =
1506
            MAX2(p->nr_buffers, key->element[i].input_buffer + 1);
1507

1508
         if (key->element[i].instance_divisor) {
1509
            p->use_instancing = TRUE;
1510
         }
1511

1512
         /*
1513
          * Map vertex element to vertex buffer variant.
1514
          */
1515
         for (j = 0; j < p->nr_buffer_variants; j++) {
1516
            if (p->buffer_variant[j].buffer_index ==
1517
                key->element[i].input_buffer
1518
                && p->buffer_variant[j].instance_divisor ==
1519
                key->element[i].instance_divisor) {
1520
               break;
1521
            }
1522
         }
1523
         if (j == p->nr_buffer_variants) {
1524
            p->buffer_variant[j].buffer_index = key->element[i].input_buffer;
1525
            p->buffer_variant[j].instance_divisor =
1526
               key->element[i].instance_divisor;
1527
            p->nr_buffer_variants++;
1528
         }
1529
         p->element_to_buffer_variant[i] = j;
1530
      }
1531
      else {
1532
         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);
1533

1534
         p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID;
1535
      }
1536
   }
1537

1538
   if (0)
1539
      debug_printf("nr_buffers: %d\n", p->nr_buffers);
1540

1541
   if (!build_vertex_emit(p, &p->linear_func, 0))
1542
      goto fail;
1543

1544
   if (!build_vertex_emit(p, &p->elt_func, 4))
1545
      goto fail;
1546

1547
   if (!build_vertex_emit(p, &p->elt16_func, 2))
1548
      goto fail;
1549

1550
   if (!build_vertex_emit(p, &p->elt8_func, 1))
1551
      goto fail;
1552

1553
   p->translate.run = (run_func) x86_get_func(&p->linear_func);
1554
   if (p->translate.run == NULL)
1555
      goto fail;
1556

1557
   p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);
1558
   if (p->translate.run_elts == NULL)
1559
      goto fail;
1560

1561
   p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);
1562
   if (p->translate.run_elts16 == NULL)
1563
      goto fail;
1564

1565
   p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);
1566
   if (p->translate.run_elts8 == NULL)
1567
      goto fail;
1568

1569
   return &p->translate;
1570

1571
 fail:
1572
   if (p)
1573
      translate_sse_release(&p->translate);
1574

1575
   return NULL;
1576
}
1577

1578

1579
#else
1580

1581
/**
 * Stub used when the rtasm SSE path is compiled out (the target is not
 * x86/x86-64, or EMBEDDED_DEVICE is defined): no translate object is ever
 * created.
 *
 * \return always NULL
 */
struct translate *
translate_sse2_create(const struct translate_key *key)
{
   (void) key;
   return NULL;
}
1586

1587
#endif
1588

1589
Product

Resources

Company