Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/r300/compiler/radeon_program_alu.c
4574 views
1
/*
 * Copyright (C) 2008 Nicolai Haehnle.
 *
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial
 * portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */
27
28
/**
 * @file
 *
 * Shareable transformations that transform "special" ALU instructions
 * into ALU instructions that are supported by hardware.
 *
 */
35
36
#include "radeon_program_alu.h"
37
38
#include "radeon_compiler.h"
39
#include "radeon_compiler_util.h"
40
41
42
static struct rc_instruction *emit1(
43
struct radeon_compiler * c, struct rc_instruction * after,
44
rc_opcode Opcode, struct rc_sub_instruction * base,
45
struct rc_dst_register DstReg, struct rc_src_register SrcReg)
46
{
47
struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
48
49
if (base) {
50
memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
51
}
52
53
fpi->U.I.Opcode = Opcode;
54
fpi->U.I.DstReg = DstReg;
55
fpi->U.I.SrcReg[0] = SrcReg;
56
return fpi;
57
}
58
59
static struct rc_instruction *emit2(
60
struct radeon_compiler * c, struct rc_instruction * after,
61
rc_opcode Opcode, struct rc_sub_instruction * base,
62
struct rc_dst_register DstReg,
63
struct rc_src_register SrcReg0, struct rc_src_register SrcReg1)
64
{
65
struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
66
67
if (base) {
68
memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
69
}
70
71
fpi->U.I.Opcode = Opcode;
72
fpi->U.I.DstReg = DstReg;
73
fpi->U.I.SrcReg[0] = SrcReg0;
74
fpi->U.I.SrcReg[1] = SrcReg1;
75
return fpi;
76
}
77
78
static struct rc_instruction *emit3(
79
struct radeon_compiler * c, struct rc_instruction * after,
80
rc_opcode Opcode, struct rc_sub_instruction * base,
81
struct rc_dst_register DstReg,
82
struct rc_src_register SrcReg0, struct rc_src_register SrcReg1,
83
struct rc_src_register SrcReg2)
84
{
85
struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
86
87
if (base) {
88
memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
89
}
90
91
fpi->U.I.Opcode = Opcode;
92
fpi->U.I.DstReg = DstReg;
93
fpi->U.I.SrcReg[0] = SrcReg0;
94
fpi->U.I.SrcReg[1] = SrcReg1;
95
fpi->U.I.SrcReg[2] = SrcReg2;
96
return fpi;
97
}
98
99
static struct rc_dst_register dstregtmpmask(int index, int mask)
100
{
101
struct rc_dst_register dst = {0, 0, 0};
102
dst.File = RC_FILE_TEMPORARY;
103
dst.Index = index;
104
dst.WriteMask = mask;
105
return dst;
106
}
107
108
static const struct rc_src_register builtin_zero = {
109
.File = RC_FILE_NONE,
110
.Index = 0,
111
.Swizzle = RC_SWIZZLE_0000
112
};
113
static const struct rc_src_register builtin_one = {
114
.File = RC_FILE_NONE,
115
.Index = 0,
116
.Swizzle = RC_SWIZZLE_1111
117
};
118
119
static const struct rc_src_register builtin_half = {
120
.File = RC_FILE_NONE,
121
.Index = 0,
122
.Swizzle = RC_SWIZZLE_HHHH
123
};
124
125
static const struct rc_src_register srcreg_undefined = {
126
.File = RC_FILE_NONE,
127
.Index = 0,
128
.Swizzle = RC_SWIZZLE_XYZW
129
};
130
131
static struct rc_src_register srcreg(int file, int index)
132
{
133
struct rc_src_register src = srcreg_undefined;
134
src.File = file;
135
src.Index = index;
136
return src;
137
}
138
139
static struct rc_src_register srcregswz(int file, int index, int swz)
140
{
141
struct rc_src_register src = srcreg_undefined;
142
src.File = file;
143
src.Index = index;
144
src.Swizzle = swz;
145
return src;
146
}
147
148
static struct rc_src_register absolute(struct rc_src_register reg)
149
{
150
struct rc_src_register newreg = reg;
151
newreg.Abs = 1;
152
newreg.Negate = RC_MASK_NONE;
153
return newreg;
154
}
155
156
static struct rc_src_register negate(struct rc_src_register reg)
157
{
158
struct rc_src_register newreg = reg;
159
newreg.Negate = newreg.Negate ^ RC_MASK_XYZW;
160
return newreg;
161
}
162
163
/**
 * Return a copy of @p reg whose swizzle is the existing swizzle combined with
 * the selectors (x, y, z, w) through combine_swizzles4().
 */
static struct rc_src_register swizzle(struct rc_src_register reg,
	rc_swizzle x, rc_swizzle y, rc_swizzle z, rc_swizzle w)
{
	/* reg is a by-value copy, so it can be edited in place. */
	reg.Swizzle = combine_swizzles4(reg.Swizzle, x, y, z, w);

	return reg;
}
170
171
static struct rc_src_register swizzle_smear(struct rc_src_register reg,
172
rc_swizzle x)
173
{
174
return swizzle(reg, x, x, x, x);
175
}
176
177
static struct rc_src_register swizzle_xxxx(struct rc_src_register reg)
178
{
179
return swizzle_smear(reg, RC_SWIZZLE_X);
180
}
181
182
static struct rc_src_register swizzle_yyyy(struct rc_src_register reg)
183
{
184
return swizzle_smear(reg, RC_SWIZZLE_Y);
185
}
186
187
static struct rc_src_register swizzle_zzzz(struct rc_src_register reg)
188
{
189
return swizzle_smear(reg, RC_SWIZZLE_Z);
190
}
191
192
static struct rc_src_register swizzle_wwww(struct rc_src_register reg)
193
{
194
return swizzle_smear(reg, RC_SWIZZLE_W);
195
}
196
197
static int is_dst_safe_to_reuse(struct rc_instruction *inst)
198
{
199
const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
200
unsigned i;
201
202
assert(info->HasDstReg);
203
204
if (inst->U.I.DstReg.File != RC_FILE_TEMPORARY)
205
return 0;
206
207
for (i = 0; i < info->NumSrcRegs; i++) {
208
if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY &&
209
inst->U.I.SrcReg[i].Index == inst->U.I.DstReg.Index)
210
return 0;
211
}
212
213
return 1;
214
}
215
216
static struct rc_dst_register try_to_reuse_dst(struct radeon_compiler *c,
217
struct rc_instruction *inst)
218
{
219
unsigned tmp;
220
221
if (is_dst_safe_to_reuse(inst))
222
tmp = inst->U.I.DstReg.Index;
223
else
224
tmp = rc_find_free_temporary(c);
225
226
return dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask);
227
}
228
229
static void transform_ABS(struct radeon_compiler* c,
230
struct rc_instruction* inst)
231
{
232
struct rc_src_register src = inst->U.I.SrcReg[0];
233
src.Abs = 1;
234
src.Negate = RC_MASK_NONE;
235
emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, src);
236
rc_remove_instruction(inst);
237
}
238
239
static void transform_CEIL(struct radeon_compiler* c,
240
struct rc_instruction* inst)
241
{
242
/* Assuming:
243
* ceil(x) = -floor(-x)
244
*
245
* After inlining floor:
246
* ceil(x) = -(-x-frac(-x))
247
*
248
* After simplification:
249
* ceil(x) = x+frac(-x)
250
*/
251
252
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
253
emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, negate(inst->U.I.SrcReg[0]));
254
emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
255
inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index));
256
rc_remove_instruction(inst);
257
}
258
259
static void transform_CLAMP(struct radeon_compiler *c,
260
struct rc_instruction *inst)
261
{
262
/* CLAMP dst, src, min, max
263
* into:
264
* MIN tmp, src, max
265
* MAX dst, tmp, min
266
*/
267
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
268
emit2(c, inst->Prev, RC_OPCODE_MIN, 0, dst,
269
inst->U.I.SrcReg[0], inst->U.I.SrcReg[2]);
270
emit2(c, inst->Prev, RC_OPCODE_MAX, &inst->U.I, inst->U.I.DstReg,
271
srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1]);
272
rc_remove_instruction(inst);
273
}
274
275
static void transform_DP2(struct radeon_compiler* c,
276
struct rc_instruction* inst)
277
{
278
struct rc_src_register src0 = inst->U.I.SrcReg[0];
279
struct rc_src_register src1 = inst->U.I.SrcReg[1];
280
src0.Negate &= ~(RC_MASK_Z | RC_MASK_W);
281
src0.Swizzle &= ~(63 << (3 * 2));
282
src0.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
283
src1.Negate &= ~(RC_MASK_Z | RC_MASK_W);
284
src1.Swizzle &= ~(63 << (3 * 2));
285
src1.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
286
emit2(c, inst->Prev, RC_OPCODE_DP3, &inst->U.I, inst->U.I.DstReg, src0, src1);
287
rc_remove_instruction(inst);
288
}
289
290
static void transform_DPH(struct radeon_compiler* c,
291
struct rc_instruction* inst)
292
{
293
struct rc_src_register src0 = inst->U.I.SrcReg[0];
294
src0.Negate &= ~RC_MASK_W;
295
src0.Swizzle &= ~(7 << (3 * 3));
296
src0.Swizzle |= RC_SWIZZLE_ONE << (3 * 3);
297
emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, inst->U.I.SrcReg[1]);
298
rc_remove_instruction(inst);
299
}
300
301
/**
302
* [1, src0.y*src1.y, src0.z, src1.w]
303
* So basically MUL with lotsa swizzling.
304
*/
305
static void transform_DST(struct radeon_compiler* c,
306
struct rc_instruction* inst)
307
{
308
emit2(c, inst->Prev, RC_OPCODE_MUL, &inst->U.I, inst->U.I.DstReg,
309
swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ONE),
310
swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_ONE, RC_SWIZZLE_W));
311
rc_remove_instruction(inst);
312
}
313
314
static void transform_FLR(struct radeon_compiler* c,
315
struct rc_instruction* inst)
316
{
317
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
318
emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, inst->U.I.SrcReg[0]);
319
emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
320
inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
321
rc_remove_instruction(inst);
322
}
323
324
static void transform_TRUNC(struct radeon_compiler* c,
325
struct rc_instruction* inst)
326
{
327
/* Definition of trunc:
328
* trunc(x) = (abs(x) - fract(abs(x))) * sgn(x)
329
*
330
* The multiplication by sgn(x) can be simplified using CMP:
331
* y * sgn(x) = (x < 0 ? -y : y)
332
*/
333
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
334
emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, absolute(inst->U.I.SrcReg[0]));
335
emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, absolute(inst->U.I.SrcReg[0]),
336
negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
337
emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg, inst->U.I.SrcReg[0],
338
negate(srcreg(RC_FILE_TEMPORARY, dst.Index)), srcreg(RC_FILE_TEMPORARY, dst.Index));
339
rc_remove_instruction(inst);
340
}
341
342
/**
343
* Definition of LIT (from ARB_fragment_program):
344
*
345
* tmp = VectorLoad(op0);
346
* if (tmp.x < 0) tmp.x = 0;
347
* if (tmp.y < 0) tmp.y = 0;
348
* if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
349
* else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
350
* result.x = 1.0;
351
* result.y = tmp.x;
352
* result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
353
* result.w = 1.0;
354
*
355
* The longest path of computation is the one leading to result.z,
356
* consisting of 5 operations. This implementation of LIT takes
357
* 5 slots, if the subsequent optimization passes are clever enough
358
* to pair instructions correctly.
359
*/
360
static void transform_LIT(struct radeon_compiler* c,
361
struct rc_instruction* inst)
362
{
363
unsigned int constant;
364
unsigned int constant_swizzle;
365
unsigned int temp;
366
struct rc_src_register srctemp;
367
368
constant = rc_constants_add_immediate_scalar(&c->Program.Constants, -127.999999, &constant_swizzle);
369
370
if (inst->U.I.DstReg.WriteMask != RC_MASK_XYZW || inst->U.I.DstReg.File != RC_FILE_TEMPORARY) {
371
struct rc_instruction * inst_mov;
372
373
inst_mov = emit1(c, inst,
374
RC_OPCODE_MOV, 0, inst->U.I.DstReg,
375
srcreg(RC_FILE_TEMPORARY, rc_find_free_temporary(c)));
376
377
inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
378
inst->U.I.DstReg.Index = inst_mov->U.I.SrcReg[0].Index;
379
inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
380
}
381
382
temp = inst->U.I.DstReg.Index;
383
srctemp = srcreg(RC_FILE_TEMPORARY, temp);
384
385
/* tmp.x = max(0.0, Src.x); */
386
/* tmp.y = max(0.0, Src.y); */
387
/* tmp.w = clamp(Src.z, -128+eps, 128-eps); */
388
emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
389
dstregtmpmask(temp, RC_MASK_XYW),
390
inst->U.I.SrcReg[0],
391
swizzle(srcreg(RC_FILE_CONSTANT, constant),
392
RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, constant_swizzle&3));
393
emit2(c, inst->Prev, RC_OPCODE_MIN, 0,
394
dstregtmpmask(temp, RC_MASK_Z),
395
swizzle_wwww(srctemp),
396
negate(srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle)));
397
398
/* tmp.w = Pow(tmp.y, tmp.w) */
399
emit1(c, inst->Prev, RC_OPCODE_LG2, 0,
400
dstregtmpmask(temp, RC_MASK_W),
401
swizzle_yyyy(srctemp));
402
emit2(c, inst->Prev, RC_OPCODE_MUL, 0,
403
dstregtmpmask(temp, RC_MASK_W),
404
swizzle_wwww(srctemp),
405
swizzle_zzzz(srctemp));
406
emit1(c, inst->Prev, RC_OPCODE_EX2, 0,
407
dstregtmpmask(temp, RC_MASK_W),
408
swizzle_wwww(srctemp));
409
410
/* tmp.z = (tmp.x > 0) ? tmp.w : 0.0 */
411
emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I,
412
dstregtmpmask(temp, RC_MASK_Z),
413
negate(swizzle_xxxx(srctemp)),
414
swizzle_wwww(srctemp),
415
builtin_zero);
416
417
/* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */
418
emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I,
419
dstregtmpmask(temp, RC_MASK_XYW),
420
swizzle(srctemp, RC_SWIZZLE_ONE, RC_SWIZZLE_X, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE));
421
422
rc_remove_instruction(inst);
423
}
424
425
static void transform_LRP(struct radeon_compiler* c,
426
struct rc_instruction* inst)
427
{
428
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
429
430
emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
431
dst,
432
inst->U.I.SrcReg[1], negate(inst->U.I.SrcReg[2]));
433
emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I,
434
inst->U.I.DstReg,
435
inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[2]);
436
437
rc_remove_instruction(inst);
438
}
439
440
static void transform_POW(struct radeon_compiler* c,
441
struct rc_instruction* inst)
442
{
443
struct rc_dst_register tempdst = try_to_reuse_dst(c, inst);
444
struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempdst.Index);
445
tempdst.WriteMask = RC_MASK_W;
446
tempsrc.Swizzle = RC_SWIZZLE_WWWW;
447
448
emit1(c, inst->Prev, RC_OPCODE_LG2, 0, tempdst, swizzle_xxxx(inst->U.I.SrcReg[0]));
449
emit2(c, inst->Prev, RC_OPCODE_MUL, 0, tempdst, tempsrc, swizzle_xxxx(inst->U.I.SrcReg[1]));
450
emit1(c, inst->Prev, RC_OPCODE_EX2, &inst->U.I, inst->U.I.DstReg, tempsrc);
451
452
rc_remove_instruction(inst);
453
}
454
455
/* dst = ROUND(src) :
456
* add = src + .5
457
* frac = FRC(add)
458
* dst = add - frac
459
*
460
* According to the GLSL spec, the implementor can decide which way to round
461
* when the fraction is .5. We round down for .5.
462
*
463
*/
464
static void transform_ROUND(struct radeon_compiler* c,
465
struct rc_instruction* inst)
466
{
467
unsigned int mask = inst->U.I.DstReg.WriteMask;
468
unsigned int frac_index, add_index;
469
struct rc_dst_register frac_dst, add_dst;
470
struct rc_src_register frac_src, add_src;
471
472
/* add = src + .5 */
473
add_index = rc_find_free_temporary(c);
474
add_dst = dstregtmpmask(add_index, mask);
475
emit2(c, inst->Prev, RC_OPCODE_ADD, 0, add_dst, inst->U.I.SrcReg[0],
476
builtin_half);
477
add_src = srcreg(RC_FILE_TEMPORARY, add_dst.Index);
478
479
480
/* frac = FRC(add) */
481
frac_index = rc_find_free_temporary(c);
482
frac_dst = dstregtmpmask(frac_index, mask);
483
emit1(c, inst->Prev, RC_OPCODE_FRC, 0, frac_dst, add_src);
484
frac_src = srcreg(RC_FILE_TEMPORARY, frac_dst.Index);
485
486
/* dst = add - frac */
487
emit2(c, inst->Prev, RC_OPCODE_ADD, 0, inst->U.I.DstReg,
488
add_src, negate(frac_src));
489
rc_remove_instruction(inst);
490
}
491
492
static void transform_RSQ(struct radeon_compiler* c,
493
struct rc_instruction* inst)
494
{
495
inst->U.I.SrcReg[0] = absolute(inst->U.I.SrcReg[0]);
496
}
497
498
static void transform_SEQ(struct radeon_compiler* c,
499
struct rc_instruction* inst)
500
{
501
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
502
503
emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
504
emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
505
negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_zero, builtin_one);
506
507
rc_remove_instruction(inst);
508
}
509
510
static void transform_SFL(struct radeon_compiler* c,
511
struct rc_instruction* inst)
512
{
513
emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, builtin_zero);
514
rc_remove_instruction(inst);
515
}
516
517
static void transform_SGE(struct radeon_compiler* c,
518
struct rc_instruction* inst)
519
{
520
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
521
522
emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
523
emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
524
srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
525
526
rc_remove_instruction(inst);
527
}
528
529
static void transform_SGT(struct radeon_compiler* c,
530
struct rc_instruction* inst)
531
{
532
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
533
534
emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
535
emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
536
srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
537
538
rc_remove_instruction(inst);
539
}
540
541
static void transform_SLE(struct radeon_compiler* c,
542
struct rc_instruction* inst)
543
{
544
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
545
546
emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
547
emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
548
srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
549
550
rc_remove_instruction(inst);
551
}
552
553
static void transform_SLT(struct radeon_compiler* c,
554
struct rc_instruction* inst)
555
{
556
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
557
558
emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
559
emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
560
srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
561
562
rc_remove_instruction(inst);
563
}
564
565
static void transform_SNE(struct radeon_compiler* c,
566
struct rc_instruction* inst)
567
{
568
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
569
570
emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
571
emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
572
negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_one, builtin_zero);
573
574
rc_remove_instruction(inst);
575
}
576
577
static void transform_SSG(struct radeon_compiler* c,
578
struct rc_instruction* inst)
579
{
580
/* result = sign(x)
581
*
582
* CMP tmp0, -x, 1, 0
583
* CMP tmp1, x, 1, 0
584
* ADD result, tmp0, -tmp1;
585
*/
586
struct rc_dst_register dst0;
587
unsigned tmp1;
588
589
/* 0 < x */
590
dst0 = try_to_reuse_dst(c, inst);
591
emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
592
dst0,
593
negate(inst->U.I.SrcReg[0]),
594
builtin_one,
595
builtin_zero);
596
597
/* x < 0 */
598
tmp1 = rc_find_free_temporary(c);
599
emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
600
dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
601
inst->U.I.SrcReg[0],
602
builtin_one,
603
builtin_zero);
604
605
/* Either both are zero, or one of them is one and the other is zero. */
606
/* result = tmp0 - tmp1 */
607
emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
608
inst->U.I.DstReg,
609
srcreg(RC_FILE_TEMPORARY, dst0.Index),
610
negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
611
612
rc_remove_instruction(inst);
613
}
614
615
static void transform_SUB(struct radeon_compiler* c,
616
struct rc_instruction* inst)
617
{
618
inst->U.I.Opcode = RC_OPCODE_ADD;
619
inst->U.I.SrcReg[1] = negate(inst->U.I.SrcReg[1]);
620
}
621
622
static void transform_SWZ(struct radeon_compiler* c,
623
struct rc_instruction* inst)
624
{
625
inst->U.I.Opcode = RC_OPCODE_MOV;
626
}
627
628
static void transform_XPD(struct radeon_compiler* c,
629
struct rc_instruction* inst)
630
{
631
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
632
633
emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dst,
634
swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
635
swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W));
636
emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I, inst->U.I.DstReg,
637
swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W),
638
swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
639
negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
640
641
rc_remove_instruction(inst);
642
}
643
644
645
/**
646
* Can be used as a transformation for @ref radeonClauseLocalTransform,
647
* no userData necessary.
648
*
649
* Eliminates the following ALU instructions:
650
* ABS, CEIL, DPH, DST, FLR, LIT, LRP, POW, SEQ, SFL, SGE, SGT, SLE, SLT, SNE, SUB, SWZ, XPD
651
* using:
652
* MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
653
*
654
* Transforms RSQ to Radeon's native RSQ by explicitly setting
655
* absolute value.
656
*
657
* @note should be applicable to R300 and R500 fragment programs.
658
*/
659
int radeonTransformALU(
660
struct radeon_compiler * c,
661
struct rc_instruction* inst,
662
void* unused)
663
{
664
switch(inst->U.I.Opcode) {
665
case RC_OPCODE_ABS: transform_ABS(c, inst); return 1;
666
case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
667
case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
668
case RC_OPCODE_DP2: transform_DP2(c, inst); return 1;
669
case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
670
case RC_OPCODE_DST: transform_DST(c, inst); return 1;
671
case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
672
case RC_OPCODE_LIT: transform_LIT(c, inst); return 1;
673
case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
674
case RC_OPCODE_POW: transform_POW(c, inst); return 1;
675
case RC_OPCODE_ROUND: transform_ROUND(c, inst); return 1;
676
case RC_OPCODE_RSQ: transform_RSQ(c, inst); return 1;
677
case RC_OPCODE_SEQ: transform_SEQ(c, inst); return 1;
678
case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
679
case RC_OPCODE_SGE: transform_SGE(c, inst); return 1;
680
case RC_OPCODE_SGT: transform_SGT(c, inst); return 1;
681
case RC_OPCODE_SLE: transform_SLE(c, inst); return 1;
682
case RC_OPCODE_SLT: transform_SLT(c, inst); return 1;
683
case RC_OPCODE_SNE: transform_SNE(c, inst); return 1;
684
case RC_OPCODE_SSG: transform_SSG(c, inst); return 1;
685
case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
686
case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
687
case RC_OPCODE_TRUNC: transform_TRUNC(c, inst); return 1;
688
case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
689
default:
690
return 0;
691
}
692
}
693
694
695
static void transform_r300_vertex_ABS(struct radeon_compiler* c,
696
struct rc_instruction* inst)
697
{
698
/* Note: r500 can take absolute values, but r300 cannot. */
699
inst->U.I.Opcode = RC_OPCODE_MAX;
700
inst->U.I.SrcReg[1] = inst->U.I.SrcReg[0];
701
inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
702
}
703
704
static void transform_r300_vertex_CMP(struct radeon_compiler* c,
705
struct rc_instruction* inst)
706
{
707
/* There is no decent CMP available, so let's rig one up.
708
* CMP is defined as dst = src0 < 0.0 ? src1 : src2
709
* The following sequence consumes zero to two temps and two extra slots
710
* (the second temp and the second slot is consumed by transform_LRP),
711
* but should be equivalent:
712
*
713
* SLT tmp0, src0, 0.0
714
* LRP dst, tmp0, src1, src2
715
*
716
* Yes, I know, I'm a mad scientist. ~ C. & M. */
717
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
718
719
/* SLT tmp0, src0, 0.0 */
720
emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
721
dst,
722
inst->U.I.SrcReg[0], builtin_zero);
723
724
/* LRP dst, tmp0, src1, src2 */
725
transform_LRP(c,
726
emit3(c, inst->Prev, RC_OPCODE_LRP, 0,
727
inst->U.I.DstReg,
728
srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1], inst->U.I.SrcReg[2]));
729
730
rc_remove_instruction(inst);
731
}
732
733
static void transform_r300_vertex_DP2(struct radeon_compiler* c,
734
struct rc_instruction* inst)
735
{
736
struct rc_instruction *next_inst = inst->Next;
737
transform_DP2(c, inst);
738
next_inst->Prev->U.I.Opcode = RC_OPCODE_DP4;
739
}
740
741
static void transform_r300_vertex_DP3(struct radeon_compiler* c,
742
struct rc_instruction* inst)
743
{
744
struct rc_src_register src0 = inst->U.I.SrcReg[0];
745
struct rc_src_register src1 = inst->U.I.SrcReg[1];
746
src0.Negate &= ~RC_MASK_W;
747
src0.Swizzle &= ~(7 << (3 * 3));
748
src0.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
749
src1.Negate &= ~RC_MASK_W;
750
src1.Swizzle &= ~(7 << (3 * 3));
751
src1.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
752
emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, src1);
753
rc_remove_instruction(inst);
754
}
755
756
static void transform_r300_vertex_fix_LIT(struct radeon_compiler* c,
757
struct rc_instruction* inst)
758
{
759
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
760
unsigned constant_swizzle;
761
int constant = rc_constants_add_immediate_scalar(&c->Program.Constants,
762
0.0000000000000000001,
763
&constant_swizzle);
764
765
/* MOV dst, src */
766
dst.WriteMask = RC_MASK_XYZW;
767
emit1(c, inst->Prev, RC_OPCODE_MOV, 0,
768
dst,
769
inst->U.I.SrcReg[0]);
770
771
/* MAX dst.y, src, 0.00...001 */
772
emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
773
dstregtmpmask(dst.Index, RC_MASK_Y),
774
srcreg(RC_FILE_TEMPORARY, dst.Index),
775
srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
776
777
inst->U.I.SrcReg[0] = srcreg(RC_FILE_TEMPORARY, dst.Index);
778
}
779
780
static void transform_r300_vertex_SEQ(struct radeon_compiler *c,
781
struct rc_instruction *inst)
782
{
783
/* x = y <==> x >= y && y >= x */
784
int tmp = rc_find_free_temporary(c);
785
786
/* x <= y */
787
emit2(c, inst->Prev, RC_OPCODE_SGE, 0,
788
dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
789
inst->U.I.SrcReg[0],
790
inst->U.I.SrcReg[1]);
791
792
/* y <= x */
793
emit2(c, inst->Prev, RC_OPCODE_SGE, 0,
794
inst->U.I.DstReg,
795
inst->U.I.SrcReg[1],
796
inst->U.I.SrcReg[0]);
797
798
/* x && y = x * y */
799
emit2(c, inst->Prev, RC_OPCODE_MUL, 0,
800
inst->U.I.DstReg,
801
srcreg(RC_FILE_TEMPORARY, tmp),
802
srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));
803
804
rc_remove_instruction(inst);
805
}
806
807
static void transform_r300_vertex_SNE(struct radeon_compiler *c,
808
struct rc_instruction *inst)
809
{
810
/* x != y <==> x < y || y < x */
811
int tmp = rc_find_free_temporary(c);
812
813
/* x < y */
814
emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
815
dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
816
inst->U.I.SrcReg[0],
817
inst->U.I.SrcReg[1]);
818
819
/* y < x */
820
emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
821
inst->U.I.DstReg,
822
inst->U.I.SrcReg[1],
823
inst->U.I.SrcReg[0]);
824
825
/* x || y = max(x, y) */
826
emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
827
inst->U.I.DstReg,
828
srcreg(RC_FILE_TEMPORARY, tmp),
829
srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));
830
831
rc_remove_instruction(inst);
832
}
833
834
static void transform_r300_vertex_SGT(struct radeon_compiler* c,
835
struct rc_instruction* inst)
836
{
837
/* x > y <==> -x < -y */
838
inst->U.I.Opcode = RC_OPCODE_SLT;
839
inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
840
inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
841
}
842
843
static void transform_r300_vertex_SLE(struct radeon_compiler* c,
844
struct rc_instruction* inst)
845
{
846
/* x <= y <==> -x >= -y */
847
inst->U.I.Opcode = RC_OPCODE_SGE;
848
inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
849
inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
850
}
851
852
static void transform_r300_vertex_SSG(struct radeon_compiler* c,
853
struct rc_instruction* inst)
854
{
855
/* result = sign(x)
856
*
857
* SLT tmp0, 0, x;
858
* SLT tmp1, x, 0;
859
* ADD result, tmp0, -tmp1;
860
*/
861
struct rc_dst_register dst0 = try_to_reuse_dst(c, inst);
862
unsigned tmp1;
863
864
/* 0 < x */
865
dst0 = try_to_reuse_dst(c, inst);
866
emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
867
dst0,
868
builtin_zero,
869
inst->U.I.SrcReg[0]);
870
871
/* x < 0 */
872
tmp1 = rc_find_free_temporary(c);
873
emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
874
dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
875
inst->U.I.SrcReg[0],
876
builtin_zero);
877
878
/* Either both are zero, or one of them is one and the other is zero. */
879
/* result = tmp0 - tmp1 */
880
emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
881
inst->U.I.DstReg,
882
srcreg(RC_FILE_TEMPORARY, dst0.Index),
883
negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
884
885
rc_remove_instruction(inst);
886
}
887
888
static void transform_vertex_TRUNC(struct radeon_compiler* c,
889
struct rc_instruction* inst)
890
{
891
struct rc_instruction *next = inst->Next;
892
893
/* next->Prev is removed after each transformation and replaced
894
* by a new instruction. */
895
transform_TRUNC(c, next->Prev);
896
transform_r300_vertex_CMP(c, next->Prev);
897
}
898
899
/**
900
* For use with rc_local_transform, this transforms non-native ALU
901
* instructions of the r300 up to r500 vertex engine.
902
*/
903
int r300_transform_vertex_alu(
904
struct radeon_compiler * c,
905
struct rc_instruction* inst,
906
void* unused)
907
{
908
switch(inst->U.I.Opcode) {
909
case RC_OPCODE_ABS: transform_r300_vertex_ABS(c, inst); return 1;
910
case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
911
case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
912
case RC_OPCODE_CMP: transform_r300_vertex_CMP(c, inst); return 1;
913
case RC_OPCODE_DP2: transform_r300_vertex_DP2(c, inst); return 1;
914
case RC_OPCODE_DP3: transform_r300_vertex_DP3(c, inst); return 1;
915
case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
916
case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
917
case RC_OPCODE_LIT: transform_r300_vertex_fix_LIT(c, inst); return 1;
918
case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
919
case RC_OPCODE_SEQ:
920
if (!c->is_r500) {
921
transform_r300_vertex_SEQ(c, inst);
922
return 1;
923
}
924
return 0;
925
case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
926
case RC_OPCODE_SGT: transform_r300_vertex_SGT(c, inst); return 1;
927
case RC_OPCODE_SLE: transform_r300_vertex_SLE(c, inst); return 1;
928
case RC_OPCODE_SNE:
929
if (!c->is_r500) {
930
transform_r300_vertex_SNE(c, inst);
931
return 1;
932
}
933
return 0;
934
case RC_OPCODE_SSG: transform_r300_vertex_SSG(c, inst); return 1;
935
case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
936
case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
937
case RC_OPCODE_TRUNC: transform_vertex_TRUNC(c, inst); return 1;
938
case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
939
default:
940
return 0;
941
}
942
}
943
944
static void sincos_constants(struct radeon_compiler* c, unsigned int *constants)
945
{
946
static const float SinCosConsts[2][4] = {
947
{
948
1.273239545, /* 4/PI */
949
-0.405284735, /* -4/(PI*PI) */
950
3.141592654, /* PI */
951
0.2225 /* weight */
952
},
953
{
954
0.75,
955
0.5,
956
0.159154943, /* 1/(2*PI) */
957
6.283185307 /* 2*PI */
958
}
959
};
960
int i;
961
962
for(i = 0; i < 2; ++i)
963
constants[i] = rc_constants_add_immediate_vec4(&c->Program.Constants, SinCosConsts[i]);
964
}
965
966
/**
967
* Approximate sin(x), where x is clamped to (-pi/2, pi/2).
968
*
969
* MUL tmp.xy, src, { 4/PI, -4/(PI^2) }
970
* MAD tmp.x, tmp.y, |src|, tmp.x
971
* MAD tmp.y, tmp.x, |tmp.x|, -tmp.x
972
* MAD dest, tmp.y, weight, tmp.x
973
*/
974
static void sin_approx(
975
struct radeon_compiler* c, struct rc_instruction * inst,
976
struct rc_dst_register dst, struct rc_src_register src, const unsigned int* constants)
977
{
978
unsigned int tempreg = rc_find_free_temporary(c);
979
980
emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(tempreg, RC_MASK_XY),
981
swizzle_xxxx(src),
982
srcreg(RC_FILE_CONSTANT, constants[0]));
983
emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_X),
984
swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
985
absolute(swizzle_xxxx(src)),
986
swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
987
emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_Y),
988
swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
989
absolute(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))),
990
negate(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))));
991
emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dst,
992
swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
993
swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[0])),
994
swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
995
}
996
997
/**
998
* Translate the trigonometric functions COS, SIN, and SCS
999
* using only the basic instructions
1000
* MOV, ADD, MUL, MAD, FRC
1001
*/
1002
int r300_transform_trig_simple(struct radeon_compiler* c,
1003
struct rc_instruction* inst,
1004
void* unused)
1005
{
1006
unsigned int constants[2];
1007
unsigned int tempreg;
1008
1009
if (inst->U.I.Opcode != RC_OPCODE_COS &&
1010
inst->U.I.Opcode != RC_OPCODE_SIN &&
1011
inst->U.I.Opcode != RC_OPCODE_SCS)
1012
return 0;
1013
1014
tempreg = rc_find_free_temporary(c);
1015
1016
sincos_constants(c, constants);
1017
1018
if (inst->U.I.Opcode == RC_OPCODE_COS) {
1019
/* MAD tmp.x, src, 1/(2*PI), 0.75 */
1020
/* FRC tmp.x, tmp.x */
1021
/* MAD tmp.z, tmp.x, 2*PI, -PI */
1022
emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
1023
swizzle_xxxx(inst->U.I.SrcReg[0]),
1024
swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
1025
swizzle_xxxx(srcreg(RC_FILE_CONSTANT, constants[1])));
1026
emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
1027
swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
1028
emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
1029
swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1030
swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
1031
negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
1032
1033
sin_approx(c, inst, inst->U.I.DstReg,
1034
swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1035
constants);
1036
} else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
1037
emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
1038
swizzle_xxxx(inst->U.I.SrcReg[0]),
1039
swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
1040
swizzle_yyyy(srcreg(RC_FILE_CONSTANT, constants[1])));
1041
emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
1042
swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
1043
emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
1044
swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1045
swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
1046
negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
1047
1048
sin_approx(c, inst, inst->U.I.DstReg,
1049
swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1050
constants);
1051
} else {
1052
struct rc_dst_register dst;
1053
1054
emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
1055
swizzle_xxxx(inst->U.I.SrcReg[0]),
1056
swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
1057
swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_W));
1058
emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_XY),
1059
srcreg(RC_FILE_TEMPORARY, tempreg));
1060
emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
1061
srcreg(RC_FILE_TEMPORARY, tempreg),
1062
swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
1063
negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
1064
1065
dst = inst->U.I.DstReg;
1066
1067
dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_X;
1068
sin_approx(c, inst, dst,
1069
swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
1070
constants);
1071
1072
dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_Y;
1073
sin_approx(c, inst, dst,
1074
swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
1075
constants);
1076
}
1077
1078
rc_remove_instruction(inst);
1079
1080
return 1;
1081
}
1082
1083
static void r300_transform_SIN_COS_SCS(struct radeon_compiler *c,
1084
struct rc_instruction *inst,
1085
unsigned srctmp)
1086
{
1087
if (inst->U.I.Opcode == RC_OPCODE_COS) {
1088
emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, inst->U.I.DstReg,
1089
srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1090
} else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
1091
emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I,
1092
inst->U.I.DstReg, srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1093
} else if (inst->U.I.Opcode == RC_OPCODE_SCS) {
1094
struct rc_dst_register moddst = inst->U.I.DstReg;
1095
1096
if (inst->U.I.DstReg.WriteMask & RC_MASK_X) {
1097
moddst.WriteMask = RC_MASK_X;
1098
emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, moddst,
1099
srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1100
}
1101
if (inst->U.I.DstReg.WriteMask & RC_MASK_Y) {
1102
moddst.WriteMask = RC_MASK_Y;
1103
emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I, moddst,
1104
srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1105
}
1106
}
1107
1108
rc_remove_instruction(inst);
1109
}
1110
1111
1112
/**
1113
* Transform the trigonometric functions COS, SIN, and SCS
1114
* to include pre-scaling by 1/(2*PI) and taking the fractional
1115
* part, so that the input to COS and SIN is always in the range [0,1).
1116
* SCS is replaced by one COS and one SIN instruction.
1117
*
1118
* @warning This transformation implicitly changes the semantics of SIN and COS!
1119
*/
1120
int radeonTransformTrigScale(struct radeon_compiler* c,
1121
struct rc_instruction* inst,
1122
void* unused)
1123
{
1124
static const float RCP_2PI = 0.15915494309189535;
1125
unsigned int temp;
1126
unsigned int constant;
1127
unsigned int constant_swizzle;
1128
1129
if (inst->U.I.Opcode != RC_OPCODE_COS &&
1130
inst->U.I.Opcode != RC_OPCODE_SIN &&
1131
inst->U.I.Opcode != RC_OPCODE_SCS)
1132
return 0;
1133
1134
temp = rc_find_free_temporary(c);
1135
constant = rc_constants_add_immediate_scalar(&c->Program.Constants, RCP_2PI, &constant_swizzle);
1136
1137
emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(temp, RC_MASK_W),
1138
swizzle_xxxx(inst->U.I.SrcReg[0]),
1139
srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
1140
emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),
1141
srcreg(RC_FILE_TEMPORARY, temp));
1142
1143
r300_transform_SIN_COS_SCS(c, inst, temp);
1144
return 1;
1145
}
1146
1147
/**
1148
* Transform the trigonometric functions COS, SIN, and SCS
1149
* so that the input to COS and SIN is always in the range [-PI, PI].
1150
* SCS is replaced by one COS and one SIN instruction.
1151
*/
1152
int r300_transform_trig_scale_vertex(struct radeon_compiler *c,
1153
struct rc_instruction *inst,
1154
void *unused)
1155
{
1156
static const float cons[4] = {0.15915494309189535, 0.5, 6.28318530717959, -3.14159265358979};
1157
unsigned int temp;
1158
unsigned int constant;
1159
1160
if (inst->U.I.Opcode != RC_OPCODE_COS &&
1161
inst->U.I.Opcode != RC_OPCODE_SIN &&
1162
inst->U.I.Opcode != RC_OPCODE_SCS)
1163
return 0;
1164
1165
/* Repeat x in the range [-PI, PI]:
1166
*
1167
* repeat(x) = frac(x / 2PI + 0.5) * 2PI - PI
1168
*/
1169
1170
temp = rc_find_free_temporary(c);
1171
constant = rc_constants_add_immediate_vec4(&c->Program.Constants, cons);
1172
1173
emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),
1174
swizzle_xxxx(inst->U.I.SrcReg[0]),
1175
srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_XXXX),
1176
srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_YYYY));
1177
emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),
1178
srcreg(RC_FILE_TEMPORARY, temp));
1179
emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),
1180
srcreg(RC_FILE_TEMPORARY, temp),
1181
srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_ZZZZ),
1182
srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_WWWW));
1183
1184
r300_transform_SIN_COS_SCS(c, inst, temp);
1185
return 1;
1186
}
1187
1188
/**
1189
* Rewrite DDX/DDY instructions to properly work with r5xx shaders.
1190
* The r5xx MDH/MDV instruction provides per-quad partial derivatives.
1191
* It takes the form A*B+C. A and C are set by setting src0. B should be -1.
1192
*
1193
* @warning This explicitly changes the form of DDX and DDY!
1194
*/
1195
1196
int radeonTransformDeriv(struct radeon_compiler* c,
1197
struct rc_instruction* inst,
1198
void* unused)
1199
{
1200
if (inst->U.I.Opcode != RC_OPCODE_DDX && inst->U.I.Opcode != RC_OPCODE_DDY)
1201
return 0;
1202
1203
inst->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_1111;
1204
inst->U.I.SrcReg[1].Negate = RC_MASK_XYZW;
1205
1206
return 1;
1207
}
1208
1209
/**
1210
* IF Temp[0].x -> IF Temp[0].x
1211
* ... -> ...
1212
* KILL -> KIL -abs(Temp[0].x)
1213
* ... -> ...
1214
* ENDIF -> ENDIF
1215
*
1216
* === OR ===
1217
*
1218
* IF Temp[0].x -\
1219
* KILL - > KIL -abs(Temp[0].x)
1220
* ENDIF -/
1221
*
1222
* === OR ===
1223
*
1224
* IF Temp[0].x -> IF Temp[0].x
1225
* ... -> ...
1226
* ELSE -> ELSE
1227
* ... -> ...
1228
* KILL -> KIL -abs(Temp[0].x)
1229
* ... -> ...
1230
* ENDIF -> ENDIF
1231
*
1232
* === OR ===
1233
*
1234
* KILL -> KIL -none.1111
1235
*
1236
* This needs to be done in its own pass, because it might modify the
1237
* instructions before and after KILL.
1238
*/
1239
void rc_transform_KILL(struct radeon_compiler * c, void *user)
1240
{
1241
struct rc_instruction * inst;
1242
for (inst = c->Program.Instructions.Next;
1243
inst != &c->Program.Instructions; inst = inst->Next) {
1244
struct rc_instruction * if_inst;
1245
unsigned in_if = 0;
1246
1247
if (inst->U.I.Opcode != RC_OPCODE_KILP)
1248
continue;
1249
1250
for (if_inst = inst->Prev; if_inst != &c->Program.Instructions;
1251
if_inst = if_inst->Prev) {
1252
1253
if (if_inst->U.I.Opcode == RC_OPCODE_IF) {
1254
in_if = 1;
1255
break;
1256
}
1257
}
1258
1259
inst->U.I.Opcode = RC_OPCODE_KIL;
1260
1261
if (!in_if) {
1262
inst->U.I.SrcReg[0] = negate(builtin_one);
1263
} else {
1264
/* This should work even if the KILP is inside the ELSE
1265
* block, because -0.0 is considered negative. */
1266
inst->U.I.SrcReg[0] =
1267
negate(absolute(if_inst->U.I.SrcReg[0]));
1268
1269
if (inst->Prev->U.I.Opcode != RC_OPCODE_IF
1270
&& inst->Next->U.I.Opcode != RC_OPCODE_ENDIF) {
1271
1272
/* Optimize the special case:
1273
* IF Temp[0].x
1274
* KILP
1275
* ENDIF
1276
*/
1277
1278
/* Remove IF */
1279
rc_remove_instruction(inst->Prev);
1280
/* Remove ENDIF */
1281
rc_remove_instruction(inst->Next);
1282
}
1283
}
1284
}
1285
}
1286
1287
int rc_force_output_alpha_to_one(struct radeon_compiler *c,
1288
struct rc_instruction *inst, void *data)
1289
{
1290
struct r300_fragment_program_compiler *fragc = (struct r300_fragment_program_compiler*)c;
1291
const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
1292
unsigned tmp;
1293
1294
if (!info->HasDstReg || inst->U.I.DstReg.File != RC_FILE_OUTPUT ||
1295
inst->U.I.DstReg.Index == fragc->OutputDepth)
1296
return 1;
1297
1298
tmp = rc_find_free_temporary(c);
1299
1300
/* Insert MOV after inst, set alpha to 1. */
1301
emit1(c, inst, RC_OPCODE_MOV, 0, inst->U.I.DstReg,
1302
srcregswz(RC_FILE_TEMPORARY, tmp, RC_SWIZZLE_XYZ1));
1303
1304
/* Re-route the destination of inst to the source of mov. */
1305
inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
1306
inst->U.I.DstReg.Index = tmp;
1307
1308
/* Move the saturate output modifier to the MOV instruction
1309
* (for better copy propagation). */
1310
inst->Next->U.I.SaturateMode = inst->U.I.SaturateMode;
1311
inst->U.I.SaturateMode = RC_SATURATE_NONE;
1312
return 1;
1313
}
1314
1315