Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/r300/compiler/r3xx_vertprog.c
4574 views
1
/*
 * Copyright 2009 Nicolai Hähnle <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE. */
22
23
#include "radeon_compiler.h"
24
25
#include <stdio.h>
26
27
#include "r300_reg.h"
28
29
#include "radeon_compiler_util.h"
30
#include "radeon_dataflow.h"
31
#include "radeon_program.h"
32
#include "radeon_program_alu.h"
33
#include "radeon_swizzle.h"
34
#include "radeon_emulate_branches.h"
35
#include "radeon_emulate_loops.h"
36
#include "radeon_remove_constants.h"
37
38
#include "util/compiler.h"
39
40
/*
 * Build a source operand that reads a constant ZERO or ONE.
 *
 * The operand reuses the register index, register class and relative
 * addressing bit of the already-setup, valid source vpi->SrcReg[x]; only the
 * swizzle differs: all four components are set to `y`, and RC_MASK_NONE is
 * passed as the negate mask.
 *
 * `vp` and `vpi` must be in scope wherever this macro is expanded.
 */
#define __CONST(x, y)						\
	(PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]),	\
			 t_swizzle(y),				\
			 t_swizzle(y),				\
			 t_swizzle(y),				\
			 t_swizzle(y),				\
			 t_src_class(vpi->SrcReg[x].File),	\
			 RC_MASK_NONE) | (vpi->SrcReg[x].RelAddr << 4))
52
53
54
static unsigned long t_dst_mask(unsigned int mask)
55
{
56
/* RC_MASK_* is equivalent to VSF_FLAG_* */
57
return mask & RC_MASK_XYZW;
58
}
59
60
/**
 * Map a compiler register file to the hardware destination register class.
 *
 * Output and address registers have dedicated classes. Everything else is
 * treated as a temporary; files other than RC_FILE_TEMPORARY are additionally
 * reported on stderr.
 */
static unsigned long t_dst_class(rc_register_file file)
{
	if (file == RC_FILE_OUTPUT)
		return PVS_DST_REG_OUT;

	if (file == RC_FILE_ADDRESS)
		return PVS_DST_REG_A0;

	/* Unknown files are reported, then handled like temporaries. */
	if (file != RC_FILE_TEMPORARY)
		fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);

	return PVS_DST_REG_TEMPORARY;
}
74
75
static unsigned long t_dst_index(struct r300_vertex_program_code *vp,
76
struct rc_dst_register *dst)
77
{
78
if (dst->File == RC_FILE_OUTPUT)
79
return vp->outputs[dst->Index];
80
81
return dst->Index;
82
}
83
84
/**
 * Map a compiler register file to the hardware source register class.
 *
 * Inputs and constants have dedicated classes. RC_FILE_NONE and
 * RC_FILE_TEMPORARY map to the temporary class; any other file is reported
 * on stderr and then also treated as a temporary.
 */
static unsigned long t_src_class(rc_register_file file)
{
	if (file == RC_FILE_INPUT)
		return PVS_SRC_REG_INPUT;

	if (file == RC_FILE_CONSTANT)
		return PVS_SRC_REG_CONSTANT;

	/* Unknown files are reported, then handled like temporaries. */
	if (file != RC_FILE_NONE && file != RC_FILE_TEMPORARY)
		fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);

	return PVS_SRC_REG_TEMPORARY;
}
99
100
static int t_src_conflict(struct rc_src_register a, struct rc_src_register b)
101
{
102
unsigned long aclass = t_src_class(a.File);
103
unsigned long bclass = t_src_class(b.File);
104
105
if (aclass != bclass)
106
return 0;
107
if (aclass == PVS_SRC_REG_TEMPORARY)
108
return 0;
109
110
if (a.RelAddr || b.RelAddr)
111
return 1;
112
if (a.Index != b.Index)
113
return 1;
114
115
return 0;
116
}
117
118
/**
 * Translate a compiler swizzle selector into the hardware encoding.
 *
 * This is an identity mapping: the Mesa RC_SWIZZLE_* values are all identical
 * to VSF_IN_COMPONENT_*.
 */
static inline unsigned long t_swizzle(unsigned int swizzle)
{
	const unsigned long hw_swizzle = swizzle;

	return hw_swizzle;
}
123
124
static unsigned long t_src_index(struct r300_vertex_program_code *vp,
125
struct rc_src_register *src)
126
{
127
if (src->File == RC_FILE_INPUT) {
128
assert(vp->inputs[src->Index] != -1);
129
return vp->inputs[src->Index];
130
} else {
131
if (src->Index < 0) {
132
fprintf(stderr,
133
"negative offsets for indirect addressing do not work.\n");
134
return 0;
135
}
136
return src->Index;
137
}
138
}
139
140
/* these two functions should probably be merged... */
141
142
static unsigned long t_src(struct r300_vertex_program_code *vp,
143
struct rc_src_register *src)
144
{
145
/* src->Negate uses the RC_MASK_ flags from program_instruction.h,
146
* which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
147
*/
148
return PVS_SRC_OPERAND(t_src_index(vp, src),
149
t_swizzle(GET_SWZ(src->Swizzle, 0)),
150
t_swizzle(GET_SWZ(src->Swizzle, 1)),
151
t_swizzle(GET_SWZ(src->Swizzle, 2)),
152
t_swizzle(GET_SWZ(src->Swizzle, 3)),
153
t_src_class(src->File),
154
src->Negate) |
155
(src->RelAddr << 4) | (src->Abs << 3);
156
}
157
158
static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
159
struct rc_src_register *src)
160
{
161
/* src->Negate uses the RC_MASK_ flags from program_instruction.h,
162
* which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
163
*/
164
unsigned int swz = rc_get_scalar_src_swz(src->Swizzle);
165
166
return PVS_SRC_OPERAND(t_src_index(vp, src),
167
t_swizzle(swz),
168
t_swizzle(swz),
169
t_swizzle(swz),
170
t_swizzle(swz),
171
t_src_class(src->File),
172
src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
173
(src->RelAddr << 4) | (src->Abs << 3);
174
}
175
176
static int valid_dst(struct r300_vertex_program_code *vp,
177
struct rc_dst_register *dst)
178
{
179
if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {
180
return 0;
181
} else if (dst->File == RC_FILE_ADDRESS) {
182
assert(dst->Index == 0);
183
}
184
185
return 1;
186
}
187
188
static void ei_vector1(struct r300_vertex_program_code *vp,
189
unsigned int hw_opcode,
190
struct rc_sub_instruction *vpi,
191
unsigned int * inst)
192
{
193
inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
194
0,
195
0,
196
t_dst_index(vp, &vpi->DstReg),
197
t_dst_mask(vpi->DstReg.WriteMask),
198
t_dst_class(vpi->DstReg.File),
199
vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
200
inst[1] = t_src(vp, &vpi->SrcReg[0]);
201
inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
202
inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
203
}
204
205
static void ei_vector2(struct r300_vertex_program_code *vp,
206
unsigned int hw_opcode,
207
struct rc_sub_instruction *vpi,
208
unsigned int * inst)
209
{
210
inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
211
0,
212
0,
213
t_dst_index(vp, &vpi->DstReg),
214
t_dst_mask(vpi->DstReg.WriteMask),
215
t_dst_class(vpi->DstReg.File),
216
vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
217
inst[1] = t_src(vp, &vpi->SrcReg[0]);
218
inst[2] = t_src(vp, &vpi->SrcReg[1]);
219
inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
220
}
221
222
static void ei_math1(struct r300_vertex_program_code *vp,
223
unsigned int hw_opcode,
224
struct rc_sub_instruction *vpi,
225
unsigned int * inst)
226
{
227
inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
228
1,
229
0,
230
t_dst_index(vp, &vpi->DstReg),
231
t_dst_mask(vpi->DstReg.WriteMask),
232
t_dst_class(vpi->DstReg.File),
233
vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
234
inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
235
inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
236
inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
237
}
238
239
static void ei_lit(struct r300_vertex_program_code *vp,
240
struct rc_sub_instruction *vpi,
241
unsigned int * inst)
242
{
243
//LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
244
245
inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
246
1,
247
0,
248
t_dst_index(vp, &vpi->DstReg),
249
t_dst_mask(vpi->DstReg.WriteMask),
250
t_dst_class(vpi->DstReg.File),
251
vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
252
/* NOTE: Users swizzling might not work. */
253
inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
254
t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
255
PVS_SRC_SELECT_FORCE_0, // Z
256
t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
257
t_src_class(vpi->SrcReg[0].File),
258
vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
259
(vpi->SrcReg[0].RelAddr << 4);
260
inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
261
t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
262
PVS_SRC_SELECT_FORCE_0, // Z
263
t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
264
t_src_class(vpi->SrcReg[0].File),
265
vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
266
(vpi->SrcReg[0].RelAddr << 4);
267
inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
268
t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
269
PVS_SRC_SELECT_FORCE_0, // Z
270
t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
271
t_src_class(vpi->SrcReg[0].File),
272
vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
273
(vpi->SrcReg[0].RelAddr << 4);
274
}
275
276
static void ei_mad(struct r300_vertex_program_code *vp,
277
struct rc_sub_instruction *vpi,
278
unsigned int * inst)
279
{
280
unsigned int i;
281
/* Remarks about hardware limitations of MAD
282
* (please preserve this comment, as this information is _NOT_
283
* in the documentation provided by AMD).
284
*
285
* As described in the documentation, MAD with three unique temporary
286
* source registers requires the use of the macro version.
287
*
288
* However (and this is not mentioned in the documentation), apparently
289
* the macro version is _NOT_ a full superset of the normal version.
290
* In particular, the macro version does not always work when relative
291
* addressing is used in the source operands.
292
*
293
* This limitation caused incorrect rendering in Sauerbraten's OpenGL
294
* assembly shader path when using medium quality animations
295
* (i.e. animations with matrix blending instead of quaternion blending).
296
*
297
* Unfortunately, I (nha) have been unable to extract a Piglit regression
298
* test for this issue - for some reason, it is possible to have vertex
299
* programs whose prefix is *exactly* the same as the prefix of the
300
* offending program in Sauerbraten up to the offending instruction
301
* without causing any trouble.
302
*
303
* Bottom line: Only use the macro version only when really necessary;
304
* according to AMD docs, this should improve performance by one clock
305
* as a nice side bonus.
306
*/
307
if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY &&
308
vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
309
vpi->SrcReg[2].File == RC_FILE_TEMPORARY &&
310
vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
311
vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
312
vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
313
inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
314
0,
315
1,
316
t_dst_index(vp, &vpi->DstReg),
317
t_dst_mask(vpi->DstReg.WriteMask),
318
t_dst_class(vpi->DstReg.File),
319
vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
320
} else {
321
inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
322
0,
323
0,
324
t_dst_index(vp, &vpi->DstReg),
325
t_dst_mask(vpi->DstReg.WriteMask),
326
t_dst_class(vpi->DstReg.File),
327
vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
328
329
/* Arguments with constant swizzles still count as a unique
330
* temporary, so we should make sure these arguments share a
331
* register index with one of the other arguments. */
332
for (i = 0; i < 3; i++) {
333
unsigned int j;
334
if (vpi->SrcReg[i].File != RC_FILE_NONE)
335
continue;
336
337
for (j = 0; j < 3; j++) {
338
if (i != j) {
339
vpi->SrcReg[i].Index =
340
vpi->SrcReg[j].Index;
341
break;
342
}
343
}
344
}
345
}
346
inst[1] = t_src(vp, &vpi->SrcReg[0]);
347
inst[2] = t_src(vp, &vpi->SrcReg[1]);
348
inst[3] = t_src(vp, &vpi->SrcReg[2]);
349
}
350
351
static void ei_pow(struct r300_vertex_program_code *vp,
352
struct rc_sub_instruction *vpi,
353
unsigned int * inst)
354
{
355
inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
356
1,
357
0,
358
t_dst_index(vp, &vpi->DstReg),
359
t_dst_mask(vpi->DstReg.WriteMask),
360
t_dst_class(vpi->DstReg.File),
361
vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
362
inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
363
inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
364
inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
365
}
366
367
static void translate_vertex_program(struct radeon_compiler *c, void *user)
368
{
369
struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
370
struct rc_instruction *rci;
371
372
unsigned loops[R500_PVS_MAX_LOOP_DEPTH];
373
unsigned loop_depth = 0;
374
375
compiler->code->pos_end = 0; /* Not supported yet */
376
compiler->code->length = 0;
377
compiler->code->num_temporaries = 0;
378
379
compiler->SetHwInputOutput(compiler);
380
381
for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
382
struct rc_sub_instruction *vpi = &rci->U.I;
383
unsigned int *inst = compiler->code->body.d + compiler->code->length;
384
const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode);
385
386
/* Skip instructions writing to non-existing destination */
387
if (!valid_dst(compiler->code, &vpi->DstReg))
388
continue;
389
390
if (info->HasDstReg) {
391
/* Neither is Saturate. */
392
if (vpi->SaturateMode != RC_SATURATE_NONE && !c->is_r500) {
393
rc_error(&compiler->Base, "Vertex program does not support the Saturate "
394
"modifier (yet).\n");
395
}
396
}
397
398
if (compiler->code->length >= c->max_alu_insts * 4) {
399
rc_error(&compiler->Base, "Vertex program has too many instructions\n");
400
return;
401
}
402
403
assert(compiler->Base.is_r500 ||
404
(vpi->Opcode != RC_OPCODE_SEQ &&
405
vpi->Opcode != RC_OPCODE_SNE));
406
407
switch (vpi->Opcode) {
408
case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
409
case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
410
case RC_OPCODE_ARR: ei_vector1(compiler->code, VE_FLT2FIX_DX_RND, vpi, inst); break;
411
case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
412
case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
413
case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
414
case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
415
case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
416
case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
417
case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
418
case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
419
case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
420
case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
421
case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
422
case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
423
case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
424
case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
425
case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
426
case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
427
case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
428
case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break;
429
case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
430
case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
431
case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
432
case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
433
case RC_OPCODE_BGNLOOP:
434
{
435
if ((!compiler->Base.is_r500
436
&& loop_depth >= R300_VS_MAX_LOOP_DEPTH)
437
|| loop_depth >= R500_PVS_MAX_LOOP_DEPTH) {
438
rc_error(&compiler->Base,
439
"Loops are nested too deep.");
440
return;
441
}
442
loops[loop_depth++] = ((compiler->code->length)/ 4) + 1;
443
break;
444
}
445
case RC_OPCODE_ENDLOOP:
446
{
447
unsigned int act_addr;
448
unsigned int last_addr;
449
unsigned int ret_addr;
450
451
ret_addr = loops[--loop_depth];
452
act_addr = ret_addr - 1;
453
last_addr = (compiler->code->length / 4) - 1;
454
455
if (loop_depth >= R300_VS_MAX_FC_OPS) {
456
rc_error(&compiler->Base,
457
"Too many flow control instructions.");
458
return;
459
}
460
if (compiler->Base.is_r500) {
461
compiler->code->fc_op_addrs.r500
462
[compiler->code->num_fc_ops].lw =
463
R500_PVS_FC_ACT_ADRS(act_addr)
464
| R500_PVS_FC_LOOP_CNT_JMP_INST(0x00ff)
465
;
466
compiler->code->fc_op_addrs.r500
467
[compiler->code->num_fc_ops].uw =
468
R500_PVS_FC_LAST_INST(last_addr)
469
| R500_PVS_FC_RTN_INST(ret_addr)
470
;
471
} else {
472
compiler->code->fc_op_addrs.r300
473
[compiler->code->num_fc_ops] =
474
R300_PVS_FC_ACT_ADRS(act_addr)
475
| R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
476
| R300_PVS_FC_LAST_INST(last_addr)
477
| R300_PVS_FC_RTN_INST(ret_addr)
478
;
479
}
480
compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
481
R300_PVS_FC_LOOP_INIT_VAL(0x0)
482
| R300_PVS_FC_LOOP_STEP_VAL(0x1)
483
;
484
compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
485
compiler->code->num_fc_ops);
486
compiler->code->num_fc_ops++;
487
488
break;
489
}
490
491
case RC_ME_PRED_SET_CLR:
492
ei_math1(compiler->code, ME_PRED_SET_CLR, vpi, inst);
493
break;
494
495
case RC_ME_PRED_SET_INV:
496
ei_math1(compiler->code, ME_PRED_SET_INV, vpi, inst);
497
break;
498
499
case RC_ME_PRED_SET_POP:
500
ei_math1(compiler->code, ME_PRED_SET_POP, vpi, inst);
501
break;
502
503
case RC_ME_PRED_SET_RESTORE:
504
ei_math1(compiler->code, ME_PRED_SET_RESTORE, vpi, inst);
505
break;
506
507
case RC_ME_PRED_SEQ:
508
ei_math1(compiler->code, ME_PRED_SET_EQ, vpi, inst);
509
break;
510
511
case RC_ME_PRED_SNEQ:
512
ei_math1(compiler->code, ME_PRED_SET_NEQ, vpi, inst);
513
break;
514
515
case RC_VE_PRED_SNEQ_PUSH:
516
ei_vector2(compiler->code, VE_PRED_SET_NEQ_PUSH,
517
vpi, inst);
518
break;
519
520
default:
521
rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name);
522
return;
523
}
524
525
if (vpi->DstReg.Pred != RC_PRED_DISABLED) {
526
inst[0] |= (PVS_DST_PRED_ENABLE_MASK
527
<< PVS_DST_PRED_ENABLE_SHIFT);
528
if (vpi->DstReg.Pred == RC_PRED_SET) {
529
inst[0] |= (PVS_DST_PRED_SENSE_MASK
530
<< PVS_DST_PRED_SENSE_SHIFT);
531
}
532
}
533
534
/* Update the number of temporaries. */
535
if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY &&
536
vpi->DstReg.Index >= compiler->code->num_temporaries)
537
compiler->code->num_temporaries = vpi->DstReg.Index + 1;
538
539
for (unsigned i = 0; i < info->NumSrcRegs; i++)
540
if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY &&
541
vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
542
compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
543
544
if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
545
rc_error(&compiler->Base, "Too many temporaries.\n");
546
return;
547
}
548
549
compiler->code->length += 4;
550
551
if (compiler->Base.Error)
552
return;
553
}
554
}
555
556
/**
 * Per-original-temporary bookkeeping used by allocate_temporary_registers().
 */
struct temporary_allocation {
	/* Set once a hardware temporary has been assigned. */
	unsigned int Allocated:1;
	/* Index of the assigned hardware temporary. */
	unsigned int HwTemp:15;
	/* Instruction after which the temporary is no longer read. */
	struct rc_instruction *LastRead;
};
561
562
static void allocate_temporary_registers(struct radeon_compiler *c, void *user)
563
{
564
struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
565
struct rc_instruction *inst;
566
struct rc_instruction *end_loop = NULL;
567
unsigned int num_orig_temps = 0;
568
char hwtemps[RC_REGISTER_MAX_INDEX];
569
struct temporary_allocation * ta;
570
unsigned int i, j;
571
572
memset(hwtemps, 0, sizeof(hwtemps));
573
574
rc_recompute_ips(c);
575
576
/* Pass 1: Count original temporaries. */
577
for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
578
const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
579
580
for (i = 0; i < opcode->NumSrcRegs; ++i) {
581
if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
582
if (inst->U.I.SrcReg[i].Index >= num_orig_temps)
583
num_orig_temps = inst->U.I.SrcReg[i].Index + 1;
584
}
585
}
586
587
if (opcode->HasDstReg) {
588
if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
589
if (inst->U.I.DstReg.Index >= num_orig_temps)
590
num_orig_temps = inst->U.I.DstReg.Index + 1;
591
}
592
}
593
}
594
595
ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool,
596
sizeof(struct temporary_allocation) * num_orig_temps);
597
memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps);
598
599
/* Pass 2: Determine original temporary lifetimes */
600
for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
601
const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
602
/* Instructions inside of loops need to use the ENDLOOP
603
* instruction as their LastRead. */
604
if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) {
605
int endloops = 1;
606
struct rc_instruction * ptr;
607
for(ptr = inst->Next;
608
ptr != &compiler->Base.Program.Instructions;
609
ptr = ptr->Next){
610
if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) {
611
endloops++;
612
} else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) {
613
endloops--;
614
if (endloops <= 0) {
615
end_loop = ptr;
616
break;
617
}
618
}
619
}
620
}
621
622
if (inst == end_loop) {
623
end_loop = NULL;
624
continue;
625
}
626
627
for (i = 0; i < opcode->NumSrcRegs; ++i) {
628
if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
629
ta[inst->U.I.SrcReg[i].Index].LastRead = end_loop ? end_loop : inst;
630
}
631
}
632
}
633
634
/* Pass 3: Register allocation */
635
for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
636
const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
637
638
for (i = 0; i < opcode->NumSrcRegs; ++i) {
639
if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
640
unsigned int orig = inst->U.I.SrcReg[i].Index;
641
inst->U.I.SrcReg[i].Index = ta[orig].HwTemp;
642
643
if (ta[orig].Allocated && inst == ta[orig].LastRead)
644
hwtemps[ta[orig].HwTemp] = 0;
645
}
646
}
647
648
if (opcode->HasDstReg) {
649
if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
650
unsigned int orig = inst->U.I.DstReg.Index;
651
652
if (!ta[orig].Allocated) {
653
for(j = 0; j < c->max_temp_regs; ++j) {
654
if (!hwtemps[j])
655
break;
656
}
657
ta[orig].Allocated = 1;
658
ta[orig].HwTemp = j;
659
hwtemps[ta[orig].HwTemp] = 1;
660
}
661
662
inst->U.I.DstReg.Index = ta[orig].HwTemp;
663
}
664
}
665
}
666
}
667
668
/**
669
* R3xx-R4xx vertex engine does not support the Absolute source operand modifier
670
* and the Saturate opcode modifier. Only Absolute is currently transformed.
671
*/
672
static int transform_nonnative_modifiers(
673
struct radeon_compiler *c,
674
struct rc_instruction *inst,
675
void* unused)
676
{
677
const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
678
unsigned i;
679
680
/* Transform ABS(a) to MAX(a, -a). */
681
for (i = 0; i < opcode->NumSrcRegs; i++) {
682
if (inst->U.I.SrcReg[i].Abs) {
683
struct rc_instruction *new_inst;
684
unsigned temp;
685
686
inst->U.I.SrcReg[i].Abs = 0;
687
688
temp = rc_find_free_temporary(c);
689
690
new_inst = rc_insert_new_instruction(c, inst->Prev);
691
new_inst->U.I.Opcode = RC_OPCODE_MAX;
692
new_inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
693
new_inst->U.I.DstReg.Index = temp;
694
new_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[i];
695
new_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[i];
696
new_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
697
698
memset(&inst->U.I.SrcReg[i], 0, sizeof(inst->U.I.SrcReg[i]));
699
inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY;
700
inst->U.I.SrcReg[i].Index = temp;
701
inst->U.I.SrcReg[i].Swizzle = RC_SWIZZLE_XYZW;
702
}
703
}
704
return 1;
705
}
706
707
/**
708
* Vertex engine cannot read two inputs or two constants at the same time.
709
* Introduce intermediate MOVs to temporary registers to account for this.
710
*/
711
static int transform_source_conflicts(
712
struct radeon_compiler *c,
713
struct rc_instruction* inst,
714
void* unused)
715
{
716
const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
717
718
if (opcode->NumSrcRegs == 3) {
719
if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2])
720
|| t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) {
721
int tmpreg = rc_find_free_temporary(c);
722
struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
723
inst_mov->U.I.Opcode = RC_OPCODE_MOV;
724
inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
725
inst_mov->U.I.DstReg.Index = tmpreg;
726
inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
727
728
reset_srcreg(&inst->U.I.SrcReg[2]);
729
inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
730
inst->U.I.SrcReg[2].Index = tmpreg;
731
}
732
}
733
734
if (opcode->NumSrcRegs >= 2) {
735
if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) {
736
int tmpreg = rc_find_free_temporary(c);
737
struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
738
inst_mov->U.I.Opcode = RC_OPCODE_MOV;
739
inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
740
inst_mov->U.I.DstReg.Index = tmpreg;
741
inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
742
743
reset_srcreg(&inst->U.I.SrcReg[1]);
744
inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
745
inst->U.I.SrcReg[1].Index = tmpreg;
746
}
747
}
748
749
return 1;
750
}
751
752
static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user)
753
{
754
struct r300_vertex_program_compiler * compiler = (struct r300_vertex_program_compiler*)c;
755
int i;
756
757
for(i = 0; i < 32; ++i) {
758
if ((compiler->RequiredOutputs & (1 << i)) &&
759
!(compiler->Base.Program.OutputsWritten & (1 << i))) {
760
struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
761
inst->U.I.Opcode = RC_OPCODE_MOV;
762
763
inst->U.I.DstReg.File = RC_FILE_OUTPUT;
764
inst->U.I.DstReg.Index = i;
765
inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
766
767
inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT;
768
inst->U.I.SrcReg[0].Index = 0;
769
inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
770
771
compiler->Base.Program.OutputsWritten |= 1 << i;
772
}
773
}
774
}
775
776
static void dataflow_outputs_mark_used(void * userdata, void * data,
777
void (*callback)(void *, unsigned int, unsigned int))
778
{
779
struct r300_vertex_program_compiler * c = userdata;
780
int i;
781
782
for(i = 0; i < 32; ++i) {
783
if (c->RequiredOutputs & (1 << i))
784
callback(data, i, RC_MASK_XYZW);
785
}
786
}
787
788
/**
 * Swizzle capability hook: every swizzle is reported as native.
 *
 * Both arguments are ignored.
 *
 * \return always 1
 */
static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
{
	(void) reg;
	(void) opcode;

	return 1;
}
795
796
static void transform_negative_addressing(struct r300_vertex_program_compiler *c,
797
struct rc_instruction *arl,
798
struct rc_instruction *end,
799
int min_offset)
800
{
801
struct rc_instruction *inst, *add;
802
unsigned const_swizzle;
803
804
/* Transform ARL/ARR */
805
add = rc_insert_new_instruction(&c->Base, arl->Prev);
806
add->U.I.Opcode = RC_OPCODE_ADD;
807
add->U.I.DstReg.File = RC_FILE_TEMPORARY;
808
add->U.I.DstReg.Index = rc_find_free_temporary(&c->Base);
809
add->U.I.DstReg.WriteMask = RC_MASK_X;
810
add->U.I.SrcReg[0] = arl->U.I.SrcReg[0];
811
add->U.I.SrcReg[1].File = RC_FILE_CONSTANT;
812
add->U.I.SrcReg[1].Index = rc_constants_add_immediate_scalar(&c->Base.Program.Constants,
813
min_offset, &const_swizzle);
814
add->U.I.SrcReg[1].Swizzle = const_swizzle;
815
816
arl->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
817
arl->U.I.SrcReg[0].Index = add->U.I.DstReg.Index;
818
arl->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XXXX;
819
820
/* Rewrite offsets up to and excluding inst. */
821
for (inst = arl->Next; inst != end; inst = inst->Next) {
822
const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
823
824
for (unsigned i = 0; i < opcode->NumSrcRegs; i++)
825
if (inst->U.I.SrcReg[i].RelAddr)
826
inst->U.I.SrcReg[i].Index -= min_offset;
827
}
828
}
829
830
static void rc_emulate_negative_addressing(struct radeon_compiler *compiler, void *user)
831
{
832
struct r300_vertex_program_compiler * c = (struct r300_vertex_program_compiler*)compiler;
833
struct rc_instruction *inst, *lastARL = NULL;
834
int min_offset = 0;
835
836
for (inst = c->Base.Program.Instructions.Next; inst != &c->Base.Program.Instructions; inst = inst->Next) {
837
const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
838
839
if (inst->U.I.Opcode == RC_OPCODE_ARL || inst->U.I.Opcode == RC_OPCODE_ARR) {
840
if (lastARL != NULL && min_offset < 0)
841
transform_negative_addressing(c, lastARL, inst, min_offset);
842
843
lastARL = inst;
844
min_offset = 0;
845
continue;
846
}
847
848
for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
849
if (inst->U.I.SrcReg[i].RelAddr &&
850
inst->U.I.SrcReg[i].Index < 0) {
851
/* ARL must precede any indirect addressing. */
852
if (!lastARL) {
853
rc_error(&c->Base, "Vertex shader: Found relative addressing without ARL/ARR.");
854
return;
855
}
856
857
if (inst->U.I.SrcReg[i].Index < min_offset)
858
min_offset = inst->U.I.SrcReg[i].Index;
859
}
860
}
861
}
862
863
if (lastARL != NULL && min_offset < 0)
864
transform_negative_addressing(c, lastARL, inst, min_offset);
865
}
866
867
struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
868
.IsNative = &swizzle_is_native,
869
.Split = 0 /* should never be called */
870
};
871
872
void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
873
{
874
int is_r500 = c->Base.is_r500;
875
int opt = !c->Base.disable_optimizations;
876
877
/* Lists of instruction transformations. */
878
struct radeon_program_transformation alu_rewrite_r500[] = {
879
{ &r300_transform_vertex_alu, 0 },
880
{ &r300_transform_trig_scale_vertex, 0 },
881
{ 0, 0 }
882
};
883
884
struct radeon_program_transformation alu_rewrite_r300[] = {
885
{ &r300_transform_vertex_alu, 0 },
886
{ &r300_transform_trig_simple, 0 },
887
{ 0, 0 }
888
};
889
890
/* Note: These passes have to be done seperately from ALU rewrite,
891
* otherwise non-native ALU instructions with source conflits
892
* or non-native modifiers will not be treated properly.
893
*/
894
struct radeon_program_transformation emulate_modifiers[] = {
895
{ &transform_nonnative_modifiers, 0 },
896
{ 0, 0 }
897
};
898
899
struct radeon_program_transformation resolve_src_conflicts[] = {
900
{ &transform_source_conflicts, 0 },
901
{ 0, 0 }
902
};
903
904
/* List of compiler passes. */
905
struct radeon_compiler_pass vs_list[] = {
906
/* NAME DUMP PREDICATE FUNCTION PARAM */
907
{"add artificial outputs", 0, 1, rc_vs_add_artificial_outputs, NULL},
908
{"emulate branches", 1, !is_r500, rc_emulate_branches, NULL},
909
{"emulate negative addressing", 1, 1, rc_emulate_negative_addressing, NULL},
910
{"native rewrite", 1, is_r500, rc_local_transform, alu_rewrite_r500},
911
{"native rewrite", 1, !is_r500, rc_local_transform, alu_rewrite_r300},
912
{"emulate modifiers", 1, !is_r500, rc_local_transform, emulate_modifiers},
913
{"deadcode", 1, opt, rc_dataflow_deadcode, dataflow_outputs_mark_used},
914
{"dataflow optimize", 1, opt, rc_optimize, NULL},
915
/* This pass must be done after optimizations. */
916
{"source conflict resolve", 1, 1, rc_local_transform, resolve_src_conflicts},
917
{"register allocation", 1, opt, allocate_temporary_registers, NULL},
918
{"dead constants", 1, 1, rc_remove_unused_constants, &c->code->constants_remap_table},
919
{"lower control flow opcodes", 1, is_r500, rc_vert_fc, NULL},
920
{"final code validation", 0, 1, rc_validate_final_shader, NULL},
921
{"machine code generation", 0, 1, translate_vertex_program, NULL},
922
{"dump machine code", 0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump, NULL},
923
{NULL, 0, 0, NULL, NULL}
924
};
925
926
c->Base.type = RC_VERTEX_PROGRAM;
927
c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
928
929
rc_run_compiler(&c->Base, vs_list);
930
931
c->code->InputsRead = c->Base.Program.InputsRead;
932
c->code->OutputsWritten = c->Base.Program.OutputsWritten;
933
rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);
934
}
935
936