Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/r600/r600_asm.c
4570 views
1
/*
2
* Copyright 2010 Jerome Glisse <[email protected]>
3
*
4
* Permission is hereby granted, free of charge, to any person obtaining a
5
* copy of this software and associated documentation files (the "Software"),
6
* to deal in the Software without restriction, including without limitation
7
* on the rights to use, copy, modify, merge, publish, distribute, sub
8
* license, and/or sell copies of the Software, and to permit persons to whom
9
* the Software is furnished to do so, subject to the following conditions:
10
*
11
* The above copyright notice and this permission notice (including the next
12
* paragraph) shall be included in all copies or substantial portions of the
13
* Software.
14
*
15
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18
* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21
* USE OR OTHER DEALINGS IN THE SOFTWARE.
22
*/
23
#include "r600_sq.h"
24
#include "r600_opcodes.h"
25
#include "r600_formats.h"
26
#include "r600_shader.h"
27
#include "r600d.h"
28
29
#include <errno.h>
30
#include "util/u_bitcast.h"
31
#include "util/u_dump.h"
32
#include "util/u_memory.h"
33
#include "util/u_math.h"
34
#include "pipe/p_shader_tokens.h"
35
36
#include "sb/sb_public.h"
37
38
#define NUM_OF_CYCLES 3
39
#define NUM_OF_COMPONENTS 4
40
41
static inline bool alu_writes(struct r600_bytecode_alu *alu)
42
{
43
return alu->dst.write || alu->is_op3;
44
}
45
46
static inline unsigned int r600_bytecode_get_num_operands(const struct r600_bytecode_alu *alu)
47
{
48
return r600_isa_alu(alu->op)->src_count;
49
}
50
51
static struct r600_bytecode_cf *r600_bytecode_cf(void)
52
{
53
struct r600_bytecode_cf *cf = CALLOC_STRUCT(r600_bytecode_cf);
54
55
if (!cf)
56
return NULL;
57
list_inithead(&cf->list);
58
list_inithead(&cf->alu);
59
list_inithead(&cf->vtx);
60
list_inithead(&cf->tex);
61
list_inithead(&cf->gds);
62
return cf;
63
}
64
65
static struct r600_bytecode_alu *r600_bytecode_alu(void)
66
{
67
struct r600_bytecode_alu *alu = CALLOC_STRUCT(r600_bytecode_alu);
68
69
if (!alu)
70
return NULL;
71
list_inithead(&alu->list);
72
return alu;
73
}
74
75
static struct r600_bytecode_vtx *r600_bytecode_vtx(void)
76
{
77
struct r600_bytecode_vtx *vtx = CALLOC_STRUCT(r600_bytecode_vtx);
78
79
if (!vtx)
80
return NULL;
81
list_inithead(&vtx->list);
82
return vtx;
83
}
84
85
static struct r600_bytecode_tex *r600_bytecode_tex(void)
86
{
87
struct r600_bytecode_tex *tex = CALLOC_STRUCT(r600_bytecode_tex);
88
89
if (!tex)
90
return NULL;
91
list_inithead(&tex->list);
92
return tex;
93
}
94
95
static struct r600_bytecode_gds *r600_bytecode_gds(void)
96
{
97
struct r600_bytecode_gds *gds = CALLOC_STRUCT(r600_bytecode_gds);
98
99
if (gds == NULL)
100
return NULL;
101
list_inithead(&gds->list);
102
return gds;
103
}
104
105
static unsigned stack_entry_size(enum radeon_family chip) {
106
/* Wavefront size:
107
* 64: R600/RV670/RV770/Cypress/R740/Barts/Turks/Caicos/
108
* Aruba/Sumo/Sumo2/redwood/juniper
109
* 32: R630/R730/R710/Palm/Cedar
110
* 16: R610/Rs780
111
*
112
* Stack row size:
113
* Wavefront Size 16 32 48 64
114
* Columns per Row (R6xx/R7xx/R8xx only) 8 8 4 4
115
* Columns per Row (R9xx+) 8 4 4 4 */
116
117
switch (chip) {
118
/* FIXME: are some chips missing here? */
119
/* wavefront size 16 */
120
case CHIP_RV610:
121
case CHIP_RS780:
122
case CHIP_RV620:
123
case CHIP_RS880:
124
/* wavefront size 32 */
125
case CHIP_RV630:
126
case CHIP_RV635:
127
case CHIP_RV730:
128
case CHIP_RV710:
129
case CHIP_PALM:
130
case CHIP_CEDAR:
131
return 8;
132
133
/* wavefront size 64 */
134
default:
135
return 4;
136
}
137
}
138
139
void r600_bytecode_init(struct r600_bytecode *bc,
140
enum chip_class chip_class,
141
enum radeon_family family,
142
bool has_compressed_msaa_texturing)
143
{
144
static unsigned next_shader_id = 0;
145
146
bc->debug_id = ++next_shader_id;
147
148
if ((chip_class == R600) &&
149
(family != CHIP_RV670 && family != CHIP_RS780 && family != CHIP_RS880)) {
150
bc->ar_handling = AR_HANDLE_RV6XX;
151
bc->r6xx_nop_after_rel_dst = 1;
152
} else {
153
bc->ar_handling = AR_HANDLE_NORMAL;
154
bc->r6xx_nop_after_rel_dst = 0;
155
}
156
157
list_inithead(&bc->cf);
158
bc->chip_class = chip_class;
159
bc->family = family;
160
bc->has_compressed_msaa_texturing = has_compressed_msaa_texturing;
161
bc->stack.entry_size = stack_entry_size(family);
162
}
163
164
int r600_bytecode_add_cf(struct r600_bytecode *bc)
165
{
166
struct r600_bytecode_cf *cf = r600_bytecode_cf();
167
168
if (!cf)
169
return -ENOMEM;
170
list_addtail(&cf->list, &bc->cf);
171
if (bc->cf_last) {
172
cf->id = bc->cf_last->id + 2;
173
if (bc->cf_last->eg_alu_extended) {
174
/* take into account extended alu size */
175
cf->id += 2;
176
bc->ndw += 2;
177
}
178
}
179
bc->cf_last = cf;
180
bc->ncf++;
181
bc->ndw += 2;
182
bc->force_add_cf = 0;
183
bc->ar_loaded = 0;
184
return 0;
185
}
186
187
int r600_bytecode_add_output(struct r600_bytecode *bc,
188
const struct r600_bytecode_output *output)
189
{
190
int r;
191
192
if (output->gpr >= bc->ngpr)
193
bc->ngpr = output->gpr + 1;
194
195
if (bc->cf_last && (bc->cf_last->op == output->op ||
196
(bc->cf_last->op == CF_OP_EXPORT &&
197
output->op == CF_OP_EXPORT_DONE)) &&
198
output->type == bc->cf_last->output.type &&
199
output->elem_size == bc->cf_last->output.elem_size &&
200
output->swizzle_x == bc->cf_last->output.swizzle_x &&
201
output->swizzle_y == bc->cf_last->output.swizzle_y &&
202
output->swizzle_z == bc->cf_last->output.swizzle_z &&
203
output->swizzle_w == bc->cf_last->output.swizzle_w &&
204
output->comp_mask == bc->cf_last->output.comp_mask &&
205
(output->burst_count + bc->cf_last->output.burst_count) <= 16) {
206
207
if ((output->gpr + output->burst_count) == bc->cf_last->output.gpr &&
208
(output->array_base + output->burst_count) == bc->cf_last->output.array_base) {
209
210
bc->cf_last->op = bc->cf_last->output.op = output->op;
211
bc->cf_last->output.gpr = output->gpr;
212
bc->cf_last->output.array_base = output->array_base;
213
bc->cf_last->output.burst_count += output->burst_count;
214
return 0;
215
216
} else if (output->gpr == (bc->cf_last->output.gpr + bc->cf_last->output.burst_count) &&
217
output->array_base == (bc->cf_last->output.array_base + bc->cf_last->output.burst_count)) {
218
219
bc->cf_last->op = bc->cf_last->output.op = output->op;
220
bc->cf_last->output.burst_count += output->burst_count;
221
return 0;
222
}
223
}
224
225
r = r600_bytecode_add_cf(bc);
226
if (r)
227
return r;
228
bc->cf_last->op = output->op;
229
memcpy(&bc->cf_last->output, output, sizeof(struct r600_bytecode_output));
230
bc->cf_last->barrier = 1;
231
return 0;
232
}
233
234
int r600_bytecode_add_pending_output(struct r600_bytecode *bc,
235
const struct r600_bytecode_output *output)
236
{
237
assert(bc->n_pending_outputs + 1 < ARRAY_SIZE(bc->pending_outputs));
238
bc->pending_outputs[bc->n_pending_outputs++] = *output;
239
240
return 0;
241
}
242
243
/* Stores the given value in the bytecode's need_wait_ack flag. */
void r600_bytecode_need_wait_ack(struct r600_bytecode *bc, boolean need_wait_ack)
{
	bc->need_wait_ack = need_wait_ack;
}
247
248
boolean r600_bytecode_get_need_wait_ack(struct r600_bytecode *bc)
249
{
250
return bc->need_wait_ack;
251
}
252
253
/* alu instructions that can ony exits once per group */
254
static int is_alu_once_inst(struct r600_bytecode_alu *alu)
255
{
256
return r600_isa_alu(alu->op)->flags & (AF_KILL | AF_PRED) || alu->is_lds_idx_op || alu->op == ALU_OP0_GROUP_BARRIER;
257
}
258
259
static int is_alu_reduction_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
260
{
261
return (r600_isa_alu(alu->op)->flags & AF_REPL) &&
262
(r600_isa_alu_slots(bc->isa->hw_class, alu->op) == AF_4V);
263
}
264
265
static int is_alu_mova_inst(struct r600_bytecode_alu *alu)
266
{
267
return r600_isa_alu(alu->op)->flags & AF_MOVA;
268
}
269
270
static int alu_uses_rel(struct r600_bytecode_alu *alu)
271
{
272
unsigned num_src = r600_bytecode_get_num_operands(alu);
273
unsigned src;
274
275
if (alu->dst.rel) {
276
return 1;
277
}
278
279
for (src = 0; src < num_src; ++src) {
280
if (alu->src[src].rel) {
281
return 1;
282
}
283
}
284
return 0;
285
}
286
287
static int is_lds_read(int sel)
288
{
289
return sel == EG_V_SQ_ALU_SRC_LDS_OQ_A_POP || sel == EG_V_SQ_ALU_SRC_LDS_OQ_B_POP;
290
}
291
292
static int alu_uses_lds(struct r600_bytecode_alu *alu)
293
{
294
unsigned num_src = r600_bytecode_get_num_operands(alu);
295
unsigned src;
296
297
for (src = 0; src < num_src; ++src) {
298
if (is_lds_read(alu->src[src].sel)) {
299
return 1;
300
}
301
}
302
return 0;
303
}
304
305
static int is_alu_64bit_inst(struct r600_bytecode_alu *alu)
306
{
307
const struct alu_op_info *op = r600_isa_alu(alu->op);
308
return (op->flags & AF_64);
309
}
310
311
static int is_alu_vec_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
312
{
313
unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op);
314
return !(slots & AF_S);
315
}
316
317
static int is_alu_trans_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
318
{
319
unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op);
320
return !(slots & AF_V);
321
}
322
323
/* alu instructions that can execute on any unit */
324
static int is_alu_any_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
325
{
326
unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op);
327
return slots == AF_VS;
328
}
329
330
static int is_nop_inst(struct r600_bytecode_alu *alu)
331
{
332
return alu->op == ALU_OP0_NOP;
333
}
334
335
static int assign_alu_units(struct r600_bytecode *bc, struct r600_bytecode_alu *alu_first,
336
struct r600_bytecode_alu *assignment[5])
337
{
338
struct r600_bytecode_alu *alu;
339
unsigned i, chan, trans;
340
int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
341
342
for (i = 0; i < max_slots; i++)
343
assignment[i] = NULL;
344
345
for (alu = alu_first; alu; alu = LIST_ENTRY(struct r600_bytecode_alu, alu->list.next, list)) {
346
chan = alu->dst.chan;
347
if (max_slots == 4)
348
trans = 0;
349
else if (is_alu_trans_unit_inst(bc, alu))
350
trans = 1;
351
else if (is_alu_vec_unit_inst(bc, alu))
352
trans = 0;
353
else if (assignment[chan])
354
trans = 1; /* Assume ALU_INST_PREFER_VECTOR. */
355
else
356
trans = 0;
357
358
if (trans) {
359
if (assignment[4]) {
360
assert(0); /* ALU.Trans has already been allocated. */
361
return -1;
362
}
363
assignment[4] = alu;
364
} else {
365
if (assignment[chan]) {
366
assert(0); /* ALU.chan has already been allocated. */
367
return -1;
368
}
369
assignment[chan] = alu;
370
}
371
372
if (alu->last)
373
break;
374
}
375
return 0;
376
}
377
378
struct alu_bank_swizzle {
379
int hw_gpr[NUM_OF_CYCLES][NUM_OF_COMPONENTS];
380
int hw_cfile_addr[4];
381
int hw_cfile_elem[4];
382
};
383
384
static const unsigned cycle_for_bank_swizzle_vec[][3] = {
385
[SQ_ALU_VEC_012] = { 0, 1, 2 },
386
[SQ_ALU_VEC_021] = { 0, 2, 1 },
387
[SQ_ALU_VEC_120] = { 1, 2, 0 },
388
[SQ_ALU_VEC_102] = { 1, 0, 2 },
389
[SQ_ALU_VEC_201] = { 2, 0, 1 },
390
[SQ_ALU_VEC_210] = { 2, 1, 0 }
391
};
392
393
static const unsigned cycle_for_bank_swizzle_scl[][3] = {
394
[SQ_ALU_SCL_210] = { 2, 1, 0 },
395
[SQ_ALU_SCL_122] = { 1, 2, 2 },
396
[SQ_ALU_SCL_212] = { 2, 1, 2 },
397
[SQ_ALU_SCL_221] = { 2, 2, 1 }
398
};
399
400
static void init_bank_swizzle(struct alu_bank_swizzle *bs)
401
{
402
int i, cycle, component;
403
/* set up gpr use */
404
for (cycle = 0; cycle < NUM_OF_CYCLES; cycle++)
405
for (component = 0; component < NUM_OF_COMPONENTS; component++)
406
bs->hw_gpr[cycle][component] = -1;
407
for (i = 0; i < 4; i++)
408
bs->hw_cfile_addr[i] = -1;
409
for (i = 0; i < 4; i++)
410
bs->hw_cfile_elem[i] = -1;
411
}
412
413
static int reserve_gpr(struct alu_bank_swizzle *bs, unsigned sel, unsigned chan, unsigned cycle)
414
{
415
if (bs->hw_gpr[cycle][chan] == -1)
416
bs->hw_gpr[cycle][chan] = sel;
417
else if (bs->hw_gpr[cycle][chan] != (int)sel) {
418
/* Another scalar operation has already used the GPR read port for the channel. */
419
return -1;
420
}
421
return 0;
422
}
423
424
static int reserve_cfile(const struct r600_bytecode *bc,
425
struct alu_bank_swizzle *bs, unsigned sel, unsigned chan)
426
{
427
int res, num_res = 4;
428
if (bc->chip_class >= R700) {
429
num_res = 2;
430
chan /= 2;
431
}
432
for (res = 0; res < num_res; ++res) {
433
if (bs->hw_cfile_addr[res] == -1) {
434
bs->hw_cfile_addr[res] = sel;
435
bs->hw_cfile_elem[res] = chan;
436
return 0;
437
} else if (bs->hw_cfile_addr[res] == sel &&
438
bs->hw_cfile_elem[res] == chan)
439
return 0; /* Read for this scalar element already reserved, nothing to do here. */
440
}
441
/* All cfile read ports are used, cannot reference vector element. */
442
return -1;
443
}
444
445
/* Returns 1 when SEL lies in the GPR selector range 0..127. */
static int is_gpr(unsigned sel)
{
	return sel < 128;
}
449
450
/* CB constants start at 512 and get translated to a kcache index when ALU
 * clauses are constructed. Note that kcache constants are handled the same
 * way as (the now gone) cfile constants; is that really required?
 *
 * Returns 1 when SEL falls in one of the constant ranges 128..191,
 * 256..511 or 512..4606, 0 otherwise. */
static int is_cfile(unsigned sel)
{
	if (sel > 127 && sel < 192)	/* Kcache after translation. */
		return 1;
	if (sel > 255 && sel < 512)
		return 1;
	return sel > 511 && sel < 4607;	/* Kcache before translation. */
}
459
460
static int is_const(int sel)
461
{
462
return is_cfile(sel) ||
463
(sel >= V_SQ_ALU_SRC_0 &&
464
sel <= V_SQ_ALU_SRC_LITERAL);
465
}
466
467
static int check_vector(const struct r600_bytecode *bc, const struct r600_bytecode_alu *alu,
468
struct alu_bank_swizzle *bs, int bank_swizzle)
469
{
470
int r, src, num_src, sel, elem, cycle;
471
472
num_src = r600_bytecode_get_num_operands(alu);
473
for (src = 0; src < num_src; src++) {
474
sel = alu->src[src].sel;
475
elem = alu->src[src].chan;
476
if (is_gpr(sel)) {
477
cycle = cycle_for_bank_swizzle_vec[bank_swizzle][src];
478
if (src == 1 && sel == alu->src[0].sel && elem == alu->src[0].chan)
479
/* Nothing to do; special-case optimization,
480
* second source uses first source’s reservation. */
481
continue;
482
else {
483
r = reserve_gpr(bs, sel, elem, cycle);
484
if (r)
485
return r;
486
}
487
} else if (is_cfile(sel)) {
488
r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem);
489
if (r)
490
return r;
491
}
492
/* No restrictions on PV, PS, literal or special constants. */
493
}
494
return 0;
495
}
496
497
static int check_scalar(const struct r600_bytecode *bc, const struct r600_bytecode_alu *alu,
498
struct alu_bank_swizzle *bs, int bank_swizzle)
499
{
500
int r, src, num_src, const_count, sel, elem, cycle;
501
502
num_src = r600_bytecode_get_num_operands(alu);
503
for (const_count = 0, src = 0; src < num_src; ++src) {
504
sel = alu->src[src].sel;
505
elem = alu->src[src].chan;
506
if (is_const(sel)) { /* Any constant, including literal and inline constants. */
507
if (const_count >= 2)
508
/* More than two references to a constant in
509
* transcendental operation. */
510
return -1;
511
else
512
const_count++;
513
}
514
if (is_cfile(sel)) {
515
r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem);
516
if (r)
517
return r;
518
}
519
}
520
for (src = 0; src < num_src; ++src) {
521
sel = alu->src[src].sel;
522
elem = alu->src[src].chan;
523
if (is_gpr(sel)) {
524
cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
525
if (cycle < const_count)
526
/* Cycle for GPR load conflicts with
527
* constant load in transcendental operation. */
528
return -1;
529
r = reserve_gpr(bs, sel, elem, cycle);
530
if (r)
531
return r;
532
}
533
/* PV PS restrictions */
534
if (const_count && (sel == 254 || sel == 255)) {
535
cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
536
if (cycle < const_count)
537
return -1;
538
}
539
}
540
return 0;
541
}
542
543
static int check_and_set_bank_swizzle(const struct r600_bytecode *bc,
544
struct r600_bytecode_alu *slots[5])
545
{
546
struct alu_bank_swizzle bs;
547
int bank_swizzle[5];
548
int i, r = 0, forced = 1;
549
boolean scalar_only = bc->chip_class == CAYMAN ? false : true;
550
int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
551
552
for (i = 0; i < max_slots; i++) {
553
if (slots[i]) {
554
if (slots[i]->bank_swizzle_force) {
555
slots[i]->bank_swizzle = slots[i]->bank_swizzle_force;
556
} else {
557
forced = 0;
558
}
559
}
560
561
if (i < 4 && slots[i])
562
scalar_only = false;
563
}
564
if (forced)
565
return 0;
566
567
/* Just check every possible combination of bank swizzle.
568
* Not very efficent, but works on the first try in most of the cases. */
569
for (i = 0; i < 4; i++)
570
if (!slots[i] || !slots[i]->bank_swizzle_force)
571
bank_swizzle[i] = SQ_ALU_VEC_012;
572
else
573
bank_swizzle[i] = slots[i]->bank_swizzle;
574
575
bank_swizzle[4] = SQ_ALU_SCL_210;
576
while(bank_swizzle[4] <= SQ_ALU_SCL_221) {
577
578
init_bank_swizzle(&bs);
579
if (scalar_only == false) {
580
for (i = 0; i < 4; i++) {
581
if (slots[i]) {
582
r = check_vector(bc, slots[i], &bs, bank_swizzle[i]);
583
if (r)
584
break;
585
}
586
}
587
} else
588
r = 0;
589
590
if (!r && max_slots == 5 && slots[4]) {
591
r = check_scalar(bc, slots[4], &bs, bank_swizzle[4]);
592
}
593
if (!r) {
594
for (i = 0; i < max_slots; i++) {
595
if (slots[i])
596
slots[i]->bank_swizzle = bank_swizzle[i];
597
}
598
return 0;
599
}
600
601
if (scalar_only) {
602
bank_swizzle[4]++;
603
} else {
604
for (i = 0; i < max_slots; i++) {
605
if (!slots[i] || !slots[i]->bank_swizzle_force) {
606
bank_swizzle[i]++;
607
if (bank_swizzle[i] <= SQ_ALU_VEC_210)
608
break;
609
else if (i < max_slots - 1)
610
bank_swizzle[i] = SQ_ALU_VEC_012;
611
else
612
return -1;
613
}
614
}
615
}
616
}
617
618
/* Couldn't find a working swizzle. */
619
return -1;
620
}
621
622
static int replace_gpr_with_pv_ps(struct r600_bytecode *bc,
623
struct r600_bytecode_alu *slots[5], struct r600_bytecode_alu *alu_prev)
624
{
625
struct r600_bytecode_alu *prev[5];
626
int gpr[5], chan[5];
627
int i, j, r, src, num_src;
628
int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
629
630
r = assign_alu_units(bc, alu_prev, prev);
631
if (r)
632
return r;
633
634
for (i = 0; i < max_slots; ++i) {
635
if (prev[i] && alu_writes(prev[i]) && !prev[i]->dst.rel) {
636
637
if (is_alu_64bit_inst(prev[i])) {
638
gpr[i] = -1;
639
continue;
640
}
641
642
gpr[i] = prev[i]->dst.sel;
643
/* cube writes more than PV.X */
644
if (is_alu_reduction_inst(bc, prev[i]))
645
chan[i] = 0;
646
else
647
chan[i] = prev[i]->dst.chan;
648
} else
649
gpr[i] = -1;
650
}
651
652
for (i = 0; i < max_slots; ++i) {
653
struct r600_bytecode_alu *alu = slots[i];
654
if (!alu)
655
continue;
656
657
if (is_alu_64bit_inst(alu))
658
continue;
659
num_src = r600_bytecode_get_num_operands(alu);
660
for (src = 0; src < num_src; ++src) {
661
if (!is_gpr(alu->src[src].sel) || alu->src[src].rel)
662
continue;
663
664
if (bc->chip_class < CAYMAN) {
665
if (alu->src[src].sel == gpr[4] &&
666
alu->src[src].chan == chan[4] &&
667
alu_prev->pred_sel == alu->pred_sel) {
668
alu->src[src].sel = V_SQ_ALU_SRC_PS;
669
alu->src[src].chan = 0;
670
continue;
671
}
672
}
673
674
for (j = 0; j < 4; ++j) {
675
if (alu->src[src].sel == gpr[j] &&
676
alu->src[src].chan == j &&
677
alu_prev->pred_sel == alu->pred_sel) {
678
alu->src[src].sel = V_SQ_ALU_SRC_PV;
679
alu->src[src].chan = chan[j];
680
break;
681
}
682
}
683
}
684
}
685
686
return 0;
687
}
688
689
void r600_bytecode_special_constants(uint32_t value, unsigned *sel)
690
{
691
switch(value) {
692
case 0:
693
*sel = V_SQ_ALU_SRC_0;
694
break;
695
case 1:
696
*sel = V_SQ_ALU_SRC_1_INT;
697
break;
698
case -1:
699
*sel = V_SQ_ALU_SRC_M_1_INT;
700
break;
701
case 0x3F800000: /* 1.0f */
702
*sel = V_SQ_ALU_SRC_1;
703
break;
704
case 0x3F000000: /* 0.5f */
705
*sel = V_SQ_ALU_SRC_0_5;
706
break;
707
default:
708
*sel = V_SQ_ALU_SRC_LITERAL;
709
break;
710
}
711
}
712
713
/* compute how many literal are needed */
714
static int r600_bytecode_alu_nliterals(struct r600_bytecode_alu *alu,
715
uint32_t literal[4], unsigned *nliteral)
716
{
717
unsigned num_src = r600_bytecode_get_num_operands(alu);
718
unsigned i, j;
719
720
for (i = 0; i < num_src; ++i) {
721
if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
722
uint32_t value = alu->src[i].value;
723
unsigned found = 0;
724
for (j = 0; j < *nliteral; ++j) {
725
if (literal[j] == value) {
726
found = 1;
727
break;
728
}
729
}
730
if (!found) {
731
if (*nliteral >= 4)
732
return -EINVAL;
733
literal[(*nliteral)++] = value;
734
}
735
}
736
}
737
return 0;
738
}
739
740
static void r600_bytecode_alu_adjust_literals(struct r600_bytecode_alu *alu,
741
uint32_t literal[4], unsigned nliteral)
742
{
743
unsigned num_src = r600_bytecode_get_num_operands(alu);
744
unsigned i, j;
745
746
for (i = 0; i < num_src; ++i) {
747
if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
748
uint32_t value = alu->src[i].value;
749
for (j = 0; j < nliteral; ++j) {
750
if (literal[j] == value) {
751
alu->src[i].chan = j;
752
break;
753
}
754
}
755
}
756
}
757
}
758
759
static int merge_inst_groups(struct r600_bytecode *bc, struct r600_bytecode_alu *slots[5],
760
struct r600_bytecode_alu *alu_prev)
761
{
762
struct r600_bytecode_alu *prev[5];
763
struct r600_bytecode_alu *result[5] = { NULL };
764
765
uint8_t interp_xz = 0;
766
767
uint32_t literal[4], prev_literal[4];
768
unsigned nliteral = 0, prev_nliteral = 0;
769
770
int i, j, r, src, num_src;
771
int num_once_inst = 0;
772
int have_mova = 0, have_rel = 0;
773
int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
774
775
r = assign_alu_units(bc, alu_prev, prev);
776
if (r)
777
return r;
778
779
for (i = 0; i < max_slots; ++i) {
780
if (prev[i]) {
781
if (prev[i]->pred_sel)
782
return 0;
783
if (is_alu_once_inst(prev[i]))
784
return 0;
785
786
if (prev[i]->op == ALU_OP2_INTERP_X)
787
interp_xz |= 1;
788
if (prev[i]->op == ALU_OP2_INTERP_Z)
789
interp_xz |= 2;
790
}
791
if (slots[i]) {
792
if (slots[i]->pred_sel)
793
return 0;
794
if (is_alu_once_inst(slots[i]))
795
return 0;
796
if (slots[i]->op == ALU_OP2_INTERP_X)
797
interp_xz |= 1;
798
if (slots[i]->op == ALU_OP2_INTERP_Z)
799
interp_xz |= 2;
800
}
801
if (interp_xz == 3)
802
return 0;
803
}
804
805
for (i = 0; i < max_slots; ++i) {
806
struct r600_bytecode_alu *alu;
807
808
if (num_once_inst > 0)
809
return 0;
810
811
/* check number of literals */
812
if (prev[i]) {
813
if (r600_bytecode_alu_nliterals(prev[i], literal, &nliteral))
814
return 0;
815
if (r600_bytecode_alu_nliterals(prev[i], prev_literal, &prev_nliteral))
816
return 0;
817
if (is_alu_mova_inst(prev[i])) {
818
if (have_rel)
819
return 0;
820
have_mova = 1;
821
}
822
823
if (alu_uses_rel(prev[i])) {
824
if (have_mova) {
825
return 0;
826
}
827
have_rel = 1;
828
}
829
if (alu_uses_lds(prev[i]))
830
return 0;
831
832
num_once_inst += is_alu_once_inst(prev[i]);
833
}
834
if (slots[i] && r600_bytecode_alu_nliterals(slots[i], literal, &nliteral))
835
return 0;
836
837
/* Let's check used slots. */
838
if (prev[i] && !slots[i]) {
839
result[i] = prev[i];
840
continue;
841
} else if (prev[i] && slots[i]) {
842
if (max_slots == 5 && result[4] == NULL && prev[4] == NULL && slots[4] == NULL) {
843
/* Trans unit is still free try to use it. */
844
if (is_alu_any_unit_inst(bc, slots[i]) && !alu_uses_lds(slots[i])) {
845
result[i] = prev[i];
846
result[4] = slots[i];
847
} else if (is_alu_any_unit_inst(bc, prev[i])) {
848
if (slots[i]->dst.sel == prev[i]->dst.sel &&
849
alu_writes(slots[i]) &&
850
alu_writes(prev[i]))
851
return 0;
852
853
result[i] = slots[i];
854
result[4] = prev[i];
855
} else
856
return 0;
857
} else
858
return 0;
859
} else if(!slots[i]) {
860
continue;
861
} else {
862
if (max_slots == 5 && slots[i] && prev[4] &&
863
slots[i]->dst.sel == prev[4]->dst.sel &&
864
slots[i]->dst.chan == prev[4]->dst.chan &&
865
alu_writes(slots[i]) &&
866
alu_writes(prev[4]))
867
return 0;
868
869
result[i] = slots[i];
870
}
871
872
alu = slots[i];
873
num_once_inst += is_alu_once_inst(alu);
874
875
/* don't reschedule NOPs */
876
if (is_nop_inst(alu))
877
return 0;
878
879
if (is_alu_mova_inst(alu)) {
880
if (have_rel) {
881
return 0;
882
}
883
have_mova = 1;
884
}
885
886
if (alu_uses_rel(alu)) {
887
if (have_mova) {
888
return 0;
889
}
890
have_rel = 1;
891
}
892
893
if (alu->op == ALU_OP0_SET_CF_IDX0 ||
894
alu->op == ALU_OP0_SET_CF_IDX1)
895
return 0; /* data hazard with MOVA */
896
897
/* Let's check source gprs */
898
num_src = r600_bytecode_get_num_operands(alu);
899
for (src = 0; src < num_src; ++src) {
900
901
/* Constants don't matter. */
902
if (!is_gpr(alu->src[src].sel))
903
continue;
904
905
for (j = 0; j < max_slots; ++j) {
906
if (!prev[j] || !alu_writes(prev[j]))
907
continue;
908
909
/* If it's relative then we can't determin which gpr is really used. */
910
if (prev[j]->dst.chan == alu->src[src].chan &&
911
(prev[j]->dst.sel == alu->src[src].sel ||
912
prev[j]->dst.rel || alu->src[src].rel))
913
return 0;
914
}
915
}
916
}
917
918
/* more than one PRED_ or KILL_ ? */
919
if (num_once_inst > 1)
920
return 0;
921
922
/* check if the result can still be swizzlet */
923
r = check_and_set_bank_swizzle(bc, result);
924
if (r)
925
return 0;
926
927
/* looks like everything worked out right, apply the changes */
928
929
/* undo adding previus literals */
930
bc->cf_last->ndw -= align(prev_nliteral, 2);
931
932
/* sort instructions */
933
for (i = 0; i < max_slots; ++i) {
934
slots[i] = result[i];
935
if (result[i]) {
936
list_del(&result[i]->list);
937
result[i]->last = 0;
938
list_addtail(&result[i]->list, &bc->cf_last->alu);
939
}
940
}
941
942
/* determine new last instruction */
943
LIST_ENTRY(struct r600_bytecode_alu, bc->cf_last->alu.prev, list)->last = 1;
944
945
/* determine new first instruction */
946
for (i = 0; i < max_slots; ++i) {
947
if (result[i]) {
948
bc->cf_last->curr_bs_head = result[i];
949
break;
950
}
951
}
952
953
bc->cf_last->prev_bs_head = bc->cf_last->prev2_bs_head;
954
bc->cf_last->prev2_bs_head = NULL;
955
956
return 0;
957
}
958
959
/* we'll keep kcache sets sorted by bank & addr */
960
static int r600_bytecode_alloc_kcache_line(struct r600_bytecode *bc,
961
struct r600_bytecode_kcache *kcache,
962
unsigned bank, unsigned line, unsigned index_mode)
963
{
964
int i, kcache_banks = bc->chip_class >= EVERGREEN ? 4 : 2;
965
966
for (i = 0; i < kcache_banks; i++) {
967
if (kcache[i].mode) {
968
int d;
969
970
if (kcache[i].bank < bank)
971
continue;
972
973
if ((kcache[i].bank == bank && kcache[i].addr > line+1) ||
974
kcache[i].bank > bank) {
975
/* try to insert new line */
976
if (kcache[kcache_banks-1].mode) {
977
/* all sets are in use */
978
return -ENOMEM;
979
}
980
981
memmove(&kcache[i+1],&kcache[i], (kcache_banks-i-1)*sizeof(struct r600_bytecode_kcache));
982
kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1;
983
kcache[i].bank = bank;
984
kcache[i].addr = line;
985
kcache[i].index_mode = index_mode;
986
return 0;
987
}
988
989
d = line - kcache[i].addr;
990
991
if (d == -1) {
992
kcache[i].addr--;
993
if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_2) {
994
/* we are prepending the line to the current set,
995
* discarding the existing second line,
996
* so we'll have to insert line+2 after it */
997
line += 2;
998
continue;
999
} else if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_1) {
1000
kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2;
1001
return 0;
1002
} else {
1003
/* V_SQ_CF_KCACHE_LOCK_LOOP_INDEX is not supported */
1004
return -ENOMEM;
1005
}
1006
} else if (d == 1) {
1007
kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2;
1008
return 0;
1009
} else if (d == 0)
1010
return 0;
1011
} else { /* free kcache set - use it */
1012
kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1;
1013
kcache[i].bank = bank;
1014
kcache[i].addr = line;
1015
kcache[i].index_mode = index_mode;
1016
return 0;
1017
}
1018
}
1019
return -ENOMEM;
1020
}
1021
1022
static int r600_bytecode_alloc_inst_kcache_lines(struct r600_bytecode *bc,
1023
struct r600_bytecode_kcache *kcache,
1024
struct r600_bytecode_alu *alu)
1025
{
1026
int i, r;
1027
1028
for (i = 0; i < 3; i++) {
1029
unsigned bank, line, sel = alu->src[i].sel, index_mode;
1030
1031
if (sel < 512)
1032
continue;
1033
1034
bank = alu->src[i].kc_bank;
1035
assert(bank < R600_MAX_HW_CONST_BUFFERS);
1036
line = (sel-512)>>4;
1037
index_mode = alu->src[i].kc_rel ? 1 : 0; // V_SQ_CF_INDEX_0 / V_SQ_CF_INDEX_NONE
1038
1039
if ((r = r600_bytecode_alloc_kcache_line(bc, kcache, bank, line, index_mode)))
1040
return r;
1041
}
1042
return 0;
1043
}
1044
1045
static int r600_bytecode_assign_kcache_banks(
1046
struct r600_bytecode_alu *alu,
1047
struct r600_bytecode_kcache * kcache)
1048
{
1049
int i, j;
1050
1051
/* Alter the src operands to refer to the kcache. */
1052
for (i = 0; i < 3; ++i) {
1053
static const unsigned int base[] = {128, 160, 256, 288};
1054
unsigned int line, sel = alu->src[i].sel, found = 0;
1055
1056
if (sel < 512)
1057
continue;
1058
1059
sel -= 512;
1060
line = sel>>4;
1061
1062
for (j = 0; j < 4 && !found; ++j) {
1063
switch (kcache[j].mode) {
1064
case V_SQ_CF_KCACHE_NOP:
1065
case V_SQ_CF_KCACHE_LOCK_LOOP_INDEX:
1066
R600_ERR("unexpected kcache line mode\n");
1067
return -ENOMEM;
1068
default:
1069
if (kcache[j].bank == alu->src[i].kc_bank &&
1070
kcache[j].addr <= line &&
1071
line < kcache[j].addr + kcache[j].mode) {
1072
alu->src[i].sel = sel - (kcache[j].addr<<4);
1073
alu->src[i].sel += base[j];
1074
found=1;
1075
}
1076
}
1077
}
1078
}
1079
return 0;
1080
}
1081
1082
static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc,
1083
struct r600_bytecode_alu *alu,
1084
unsigned type)
1085
{
1086
struct r600_bytecode_kcache kcache_sets[4];
1087
struct r600_bytecode_kcache *kcache = kcache_sets;
1088
int r;
1089
1090
memcpy(kcache, bc->cf_last->kcache, 4 * sizeof(struct r600_bytecode_kcache));
1091
1092
if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) {
1093
/* can't alloc, need to start new clause */
1094
if ((r = r600_bytecode_add_cf(bc))) {
1095
return r;
1096
}
1097
bc->cf_last->op = type;
1098
1099
/* retry with the new clause */
1100
kcache = bc->cf_last->kcache;
1101
if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) {
1102
/* can't alloc again- should never happen */
1103
return r;
1104
}
1105
} else {
1106
/* update kcache sets */
1107
memcpy(bc->cf_last->kcache, kcache, 4 * sizeof(struct r600_bytecode_kcache));
1108
}
1109
1110
/* if we actually used more than 2 kcache sets, or have relative indexing - use ALU_EXTENDED on eg+ */
1111
if (kcache[2].mode != V_SQ_CF_KCACHE_NOP ||
1112
kcache[0].index_mode || kcache[1].index_mode || kcache[2].index_mode || kcache[3].index_mode) {
1113
if (bc->chip_class < EVERGREEN)
1114
return -ENOMEM;
1115
bc->cf_last->eg_alu_extended = 1;
1116
}
1117
1118
return 0;
1119
}
1120
1121
static int insert_nop_r6xx(struct r600_bytecode *bc)
1122
{
1123
struct r600_bytecode_alu alu;
1124
int r, i;
1125
1126
for (i = 0; i < 4; i++) {
1127
memset(&alu, 0, sizeof(alu));
1128
alu.op = ALU_OP0_NOP;
1129
alu.src[0].chan = i;
1130
alu.dst.chan = i;
1131
alu.last = (i == 3);
1132
r = r600_bytecode_add_alu(bc, &alu);
1133
if (r)
1134
return r;
1135
}
1136
return 0;
1137
}
1138
1139
/* load AR register from gpr (bc->ar_reg) with MOVA_INT */
1140
static int load_ar_r6xx(struct r600_bytecode *bc)
1141
{
1142
struct r600_bytecode_alu alu;
1143
int r;
1144
1145
if (bc->ar_loaded)
1146
return 0;
1147
1148
/* hack to avoid making MOVA the last instruction in the clause */
1149
if ((bc->cf_last->ndw>>1) >= 110)
1150
bc->force_add_cf = 1;
1151
1152
memset(&alu, 0, sizeof(alu));
1153
alu.op = ALU_OP1_MOVA_GPR_INT;
1154
alu.src[0].sel = bc->ar_reg;
1155
alu.src[0].chan = bc->ar_chan;
1156
alu.last = 1;
1157
alu.index_mode = INDEX_MODE_LOOP;
1158
r = r600_bytecode_add_alu(bc, &alu);
1159
if (r)
1160
return r;
1161
1162
/* no requirement to set uses waterfall on MOVA_GPR_INT */
1163
bc->ar_loaded = 1;
1164
return 0;
1165
}
1166
1167
/* load AR register from gpr (bc->ar_reg) with MOVA_INT */
1168
static int load_ar(struct r600_bytecode *bc)
1169
{
1170
struct r600_bytecode_alu alu;
1171
int r;
1172
1173
if (bc->ar_handling)
1174
return load_ar_r6xx(bc);
1175
1176
if (bc->ar_loaded)
1177
return 0;
1178
1179
/* hack to avoid making MOVA the last instruction in the clause */
1180
if ((bc->cf_last->ndw>>1) >= 110)
1181
bc->force_add_cf = 1;
1182
1183
memset(&alu, 0, sizeof(alu));
1184
alu.op = ALU_OP1_MOVA_INT;
1185
alu.src[0].sel = bc->ar_reg;
1186
alu.src[0].chan = bc->ar_chan;
1187
alu.last = 1;
1188
r = r600_bytecode_add_alu(bc, &alu);
1189
if (r)
1190
return r;
1191
1192
bc->cf_last->r6xx_uses_waterfall = 1;
1193
bc->ar_loaded = 1;
1194
return 0;
1195
}
1196
1197
int r600_bytecode_add_alu_type(struct r600_bytecode *bc,
1198
const struct r600_bytecode_alu *alu, unsigned type)
1199
{
1200
struct r600_bytecode_alu *nalu = r600_bytecode_alu();
1201
struct r600_bytecode_alu *lalu;
1202
int i, r;
1203
1204
if (!nalu)
1205
return -ENOMEM;
1206
memcpy(nalu, alu, sizeof(struct r600_bytecode_alu));
1207
1208
if (alu->is_op3) {
1209
/* will fail later since alu does not support it. */
1210
assert(!alu->src[0].abs && !alu->src[1].abs && !alu->src[2].abs);
1211
}
1212
1213
if (bc->cf_last != NULL && bc->cf_last->op != type) {
1214
/* check if we could add it anyway */
1215
if (bc->cf_last->op == CF_OP_ALU &&
1216
type == CF_OP_ALU_PUSH_BEFORE) {
1217
LIST_FOR_EACH_ENTRY(lalu, &bc->cf_last->alu, list) {
1218
if (lalu->execute_mask) {
1219
bc->force_add_cf = 1;
1220
break;
1221
}
1222
}
1223
} else
1224
bc->force_add_cf = 1;
1225
}
1226
1227
/* cf can contains only alu or only vtx or only tex */
1228
if (bc->cf_last == NULL || bc->force_add_cf) {
1229
r = r600_bytecode_add_cf(bc);
1230
if (r) {
1231
free(nalu);
1232
return r;
1233
}
1234
}
1235
bc->cf_last->op = type;
1236
1237
/* Load index register if required */
1238
if (bc->chip_class >= EVERGREEN) {
1239
for (i = 0; i < 3; i++)
1240
if (nalu->src[i].kc_bank && nalu->src[i].kc_rel)
1241
egcm_load_index_reg(bc, 0, true);
1242
}
1243
1244
/* Check AR usage and load it if required */
1245
for (i = 0; i < 3; i++)
1246
if (nalu->src[i].rel && !bc->ar_loaded)
1247
load_ar(bc);
1248
1249
if (nalu->dst.rel && !bc->ar_loaded)
1250
load_ar(bc);
1251
1252
/* Setup the kcache for this ALU instruction. This will start a new
1253
* ALU clause if needed. */
1254
if ((r = r600_bytecode_alloc_kcache_lines(bc, nalu, type))) {
1255
free(nalu);
1256
return r;
1257
}
1258
1259
if (!bc->cf_last->curr_bs_head) {
1260
bc->cf_last->curr_bs_head = nalu;
1261
}
1262
/* number of gpr == the last gpr used in any alu */
1263
for (i = 0; i < 3; i++) {
1264
if (nalu->src[i].sel >= bc->ngpr && nalu->src[i].sel < 128) {
1265
bc->ngpr = nalu->src[i].sel + 1;
1266
}
1267
if (nalu->src[i].sel == V_SQ_ALU_SRC_LITERAL)
1268
r600_bytecode_special_constants(nalu->src[i].value,
1269
&nalu->src[i].sel);
1270
}
1271
if (nalu->dst.sel >= bc->ngpr) {
1272
bc->ngpr = nalu->dst.sel + 1;
1273
}
1274
list_addtail(&nalu->list, &bc->cf_last->alu);
1275
/* each alu use 2 dwords */
1276
bc->cf_last->ndw += 2;
1277
bc->ndw += 2;
1278
1279
/* process cur ALU instructions for bank swizzle */
1280
if (nalu->last) {
1281
uint32_t literal[4];
1282
unsigned nliteral;
1283
struct r600_bytecode_alu *slots[5];
1284
int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
1285
r = assign_alu_units(bc, bc->cf_last->curr_bs_head, slots);
1286
if (r)
1287
return r;
1288
1289
if (bc->cf_last->prev_bs_head) {
1290
r = merge_inst_groups(bc, slots, bc->cf_last->prev_bs_head);
1291
if (r)
1292
return r;
1293
}
1294
1295
if (bc->cf_last->prev_bs_head) {
1296
r = replace_gpr_with_pv_ps(bc, slots, bc->cf_last->prev_bs_head);
1297
if (r)
1298
return r;
1299
}
1300
1301
r = check_and_set_bank_swizzle(bc, slots);
1302
if (r)
1303
return r;
1304
1305
for (i = 0, nliteral = 0; i < max_slots; i++) {
1306
if (slots[i]) {
1307
r = r600_bytecode_alu_nliterals(slots[i], literal, &nliteral);
1308
if (r)
1309
return r;
1310
}
1311
}
1312
bc->cf_last->ndw += align(nliteral, 2);
1313
1314
/* at most 128 slots, one add alu can add 5 slots + 4 constants(2 slots)
1315
* worst case */
1316
if ((bc->cf_last->ndw >> 1) >= 120) {
1317
bc->force_add_cf = 1;
1318
}
1319
1320
bc->cf_last->prev2_bs_head = bc->cf_last->prev_bs_head;
1321
bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head;
1322
bc->cf_last->curr_bs_head = NULL;
1323
}
1324
1325
if (nalu->dst.rel && bc->r6xx_nop_after_rel_dst)
1326
insert_nop_r6xx(bc);
1327
1328
/* Might need to insert spill write ops after current clause */
1329
if (nalu->last && bc->n_pending_outputs) {
1330
while (bc->n_pending_outputs) {
1331
r = r600_bytecode_add_output(bc, &bc->pending_outputs[--bc->n_pending_outputs]);
1332
if (r)
1333
return r;
1334
}
1335
}
1336
1337
return 0;
1338
}
1339
1340
int r600_bytecode_add_alu(struct r600_bytecode *bc, const struct r600_bytecode_alu *alu)
1341
{
1342
return r600_bytecode_add_alu_type(bc, alu, CF_OP_ALU);
1343
}
1344
1345
static unsigned r600_bytecode_num_tex_and_vtx_instructions(const struct r600_bytecode *bc)
1346
{
1347
switch (bc->chip_class) {
1348
case R600:
1349
return 8;
1350
1351
case R700:
1352
case EVERGREEN:
1353
case CAYMAN:
1354
return 16;
1355
1356
default:
1357
R600_ERR("Unknown chip class %d.\n", bc->chip_class);
1358
return 8;
1359
}
1360
}
1361
1362
static inline boolean last_inst_was_not_vtx_fetch(struct r600_bytecode *bc)
1363
{
1364
return !((r600_isa_cf(bc->cf_last->op)->flags & CF_FETCH) &&
1365
bc->cf_last->op != CF_OP_GDS &&
1366
(bc->chip_class == CAYMAN ||
1367
bc->cf_last->op != CF_OP_TEX));
1368
}
1369
1370
static int r600_bytecode_add_vtx_internal(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx,
1371
bool use_tc)
1372
{
1373
struct r600_bytecode_vtx *nvtx = r600_bytecode_vtx();
1374
int r;
1375
1376
if (!nvtx)
1377
return -ENOMEM;
1378
memcpy(nvtx, vtx, sizeof(struct r600_bytecode_vtx));
1379
1380
/* Load index register if required */
1381
if (bc->chip_class >= EVERGREEN) {
1382
if (vtx->buffer_index_mode)
1383
egcm_load_index_reg(bc, vtx->buffer_index_mode - 1, false);
1384
}
1385
1386
/* cf can contains only alu or only vtx or only tex */
1387
if (bc->cf_last == NULL ||
1388
last_inst_was_not_vtx_fetch(bc) ||
1389
bc->force_add_cf) {
1390
r = r600_bytecode_add_cf(bc);
1391
if (r) {
1392
free(nvtx);
1393
return r;
1394
}
1395
switch (bc->chip_class) {
1396
case R600:
1397
case R700:
1398
bc->cf_last->op = CF_OP_VTX;
1399
break;
1400
case EVERGREEN:
1401
if (use_tc)
1402
bc->cf_last->op = CF_OP_TEX;
1403
else
1404
bc->cf_last->op = CF_OP_VTX;
1405
break;
1406
case CAYMAN:
1407
bc->cf_last->op = CF_OP_TEX;
1408
break;
1409
default:
1410
R600_ERR("Unknown chip class %d.\n", bc->chip_class);
1411
free(nvtx);
1412
return -EINVAL;
1413
}
1414
}
1415
list_addtail(&nvtx->list, &bc->cf_last->vtx);
1416
/* each fetch use 4 dwords */
1417
bc->cf_last->ndw += 4;
1418
bc->ndw += 4;
1419
if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc))
1420
bc->force_add_cf = 1;
1421
1422
bc->ngpr = MAX2(bc->ngpr, vtx->src_gpr + 1);
1423
bc->ngpr = MAX2(bc->ngpr, vtx->dst_gpr + 1);
1424
1425
return 0;
1426
}
1427
1428
/* Add a vertex fetch with use_tc disabled. */
int r600_bytecode_add_vtx(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx)
{
	const bool use_tc = false;

	return r600_bytecode_add_vtx_internal(bc, vtx, use_tc);
}
1432
1433
/* Add a vertex fetch with use_tc enabled. */
int r600_bytecode_add_vtx_tc(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx)
{
	const bool use_tc = true;

	return r600_bytecode_add_vtx_internal(bc, vtx, use_tc);
}
1437
1438
int r600_bytecode_add_tex(struct r600_bytecode *bc, const struct r600_bytecode_tex *tex)
1439
{
1440
struct r600_bytecode_tex *ntex = r600_bytecode_tex();
1441
int r;
1442
1443
if (!ntex)
1444
return -ENOMEM;
1445
memcpy(ntex, tex, sizeof(struct r600_bytecode_tex));
1446
1447
/* Load index register if required */
1448
if (bc->chip_class >= EVERGREEN) {
1449
if (tex->sampler_index_mode || tex->resource_index_mode)
1450
egcm_load_index_reg(bc, 1, false);
1451
}
1452
1453
/* we can't fetch data und use it as texture lookup address in the same TEX clause */
1454
if (bc->cf_last != NULL &&
1455
bc->cf_last->op == CF_OP_TEX) {
1456
struct r600_bytecode_tex *ttex;
1457
LIST_FOR_EACH_ENTRY(ttex, &bc->cf_last->tex, list) {
1458
if (ttex->dst_gpr == ntex->src_gpr &&
1459
(ttex->dst_sel_x < 4 || ttex->dst_sel_y < 4 ||
1460
ttex->dst_sel_z < 4 || ttex->dst_sel_w < 4)) {
1461
bc->force_add_cf = 1;
1462
break;
1463
}
1464
}
1465
/* slight hack to make gradients always go into same cf */
1466
if (ntex->op == FETCH_OP_SET_GRADIENTS_H)
1467
bc->force_add_cf = 1;
1468
}
1469
1470
/* cf can contains only alu or only vtx or only tex */
1471
if (bc->cf_last == NULL ||
1472
bc->cf_last->op != CF_OP_TEX ||
1473
bc->force_add_cf) {
1474
r = r600_bytecode_add_cf(bc);
1475
if (r) {
1476
free(ntex);
1477
return r;
1478
}
1479
bc->cf_last->op = CF_OP_TEX;
1480
}
1481
if (ntex->src_gpr >= bc->ngpr) {
1482
bc->ngpr = ntex->src_gpr + 1;
1483
}
1484
if (ntex->dst_gpr >= bc->ngpr) {
1485
bc->ngpr = ntex->dst_gpr + 1;
1486
}
1487
list_addtail(&ntex->list, &bc->cf_last->tex);
1488
/* each texture fetch use 4 dwords */
1489
bc->cf_last->ndw += 4;
1490
bc->ndw += 4;
1491
if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc))
1492
bc->force_add_cf = 1;
1493
return 0;
1494
}
1495
1496
int r600_bytecode_add_gds(struct r600_bytecode *bc, const struct r600_bytecode_gds *gds)
1497
{
1498
struct r600_bytecode_gds *ngds = r600_bytecode_gds();
1499
int r;
1500
1501
if (ngds == NULL)
1502
return -ENOMEM;
1503
memcpy(ngds, gds, sizeof(struct r600_bytecode_gds));
1504
1505
if (bc->chip_class >= EVERGREEN) {
1506
if (gds->uav_index_mode)
1507
egcm_load_index_reg(bc, gds->uav_index_mode - 1, false);
1508
}
1509
1510
if (bc->cf_last == NULL ||
1511
bc->cf_last->op != CF_OP_GDS ||
1512
bc->force_add_cf) {
1513
r = r600_bytecode_add_cf(bc);
1514
if (r) {
1515
free(ngds);
1516
return r;
1517
}
1518
bc->cf_last->op = CF_OP_GDS;
1519
}
1520
1521
list_addtail(&ngds->list, &bc->cf_last->gds);
1522
bc->cf_last->ndw += 4; /* each GDS uses 4 dwords */
1523
if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc))
1524
bc->force_add_cf = 1;
1525
return 0;
1526
}
1527
1528
int r600_bytecode_add_cfinst(struct r600_bytecode *bc, unsigned op)
1529
{
1530
int r;
1531
1532
/* Emit WAIT_ACK before control flow to ensure pending writes are always acked. */
1533
if (op != CF_OP_MEM_SCRATCH && bc->need_wait_ack) {
1534
bc->need_wait_ack = false;
1535
r = r600_bytecode_add_cfinst(bc, CF_OP_WAIT_ACK);
1536
}
1537
1538
r = r600_bytecode_add_cf(bc);
1539
if (r)
1540
return r;
1541
1542
bc->cf_last->cond = V_SQ_CF_COND_ACTIVE;
1543
bc->cf_last->op = op;
1544
return 0;
1545
}
1546
1547
int cm_bytecode_add_cf_end(struct r600_bytecode *bc)
1548
{
1549
return r600_bytecode_add_cfinst(bc, CF_OP_CF_END);
1550
}
1551
1552
/* common to all 3 families */
1553
static int r600_bytecode_vtx_build(struct r600_bytecode *bc, struct r600_bytecode_vtx *vtx, unsigned id)
1554
{
1555
if (r600_isa_fetch(vtx->op)->flags & FF_MEM)
1556
return r700_bytecode_fetch_mem_build(bc, vtx, id);
1557
bc->bytecode[id] = S_SQ_VTX_WORD0_VTX_INST(r600_isa_fetch_opcode(bc->isa->hw_class, vtx->op)) |
1558
S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) |
1559
S_SQ_VTX_WORD0_FETCH_TYPE(vtx->fetch_type) |
1560
S_SQ_VTX_WORD0_SRC_GPR(vtx->src_gpr) |
1561
S_SQ_VTX_WORD0_SRC_SEL_X(vtx->src_sel_x);
1562
if (bc->chip_class < CAYMAN)
1563
bc->bytecode[id] |= S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(vtx->mega_fetch_count);
1564
id++;
1565
bc->bytecode[id++] = S_SQ_VTX_WORD1_DST_SEL_X(vtx->dst_sel_x) |
1566
S_SQ_VTX_WORD1_DST_SEL_Y(vtx->dst_sel_y) |
1567
S_SQ_VTX_WORD1_DST_SEL_Z(vtx->dst_sel_z) |
1568
S_SQ_VTX_WORD1_DST_SEL_W(vtx->dst_sel_w) |
1569
S_SQ_VTX_WORD1_USE_CONST_FIELDS(vtx->use_const_fields) |
1570
S_SQ_VTX_WORD1_DATA_FORMAT(vtx->data_format) |
1571
S_SQ_VTX_WORD1_NUM_FORMAT_ALL(vtx->num_format_all) |
1572
S_SQ_VTX_WORD1_FORMAT_COMP_ALL(vtx->format_comp_all) |
1573
S_SQ_VTX_WORD1_SRF_MODE_ALL(vtx->srf_mode_all) |
1574
S_SQ_VTX_WORD1_GPR_DST_GPR(vtx->dst_gpr);
1575
bc->bytecode[id] = S_SQ_VTX_WORD2_OFFSET(vtx->offset)|
1576
S_SQ_VTX_WORD2_ENDIAN_SWAP(vtx->endian);
1577
if (bc->chip_class >= EVERGREEN)
1578
bc->bytecode[id] |= ((vtx->buffer_index_mode & 0x3) << 21); // S_SQ_VTX_WORD2_BIM(vtx->buffer_index_mode);
1579
if (bc->chip_class < CAYMAN)
1580
bc->bytecode[id] |= S_SQ_VTX_WORD2_MEGA_FETCH(1);
1581
id++;
1582
bc->bytecode[id++] = 0;
1583
return 0;
1584
}
1585
1586
/* common to all 3 families */
1587
static int r600_bytecode_tex_build(struct r600_bytecode *bc, struct r600_bytecode_tex *tex, unsigned id)
1588
{
1589
bc->bytecode[id] = S_SQ_TEX_WORD0_TEX_INST(
1590
r600_isa_fetch_opcode(bc->isa->hw_class, tex->op)) |
1591
EG_S_SQ_TEX_WORD0_INST_MOD(tex->inst_mod) |
1592
S_SQ_TEX_WORD0_RESOURCE_ID(tex->resource_id) |
1593
S_SQ_TEX_WORD0_SRC_GPR(tex->src_gpr) |
1594
S_SQ_TEX_WORD0_SRC_REL(tex->src_rel);
1595
if (bc->chip_class >= EVERGREEN)
1596
bc->bytecode[id] |= ((tex->sampler_index_mode & 0x3) << 27) | // S_SQ_TEX_WORD0_SIM(tex->sampler_index_mode);
1597
((tex->resource_index_mode & 0x3) << 25); // S_SQ_TEX_WORD0_RIM(tex->resource_index_mode)
1598
id++;
1599
bc->bytecode[id++] = S_SQ_TEX_WORD1_DST_GPR(tex->dst_gpr) |
1600
S_SQ_TEX_WORD1_DST_REL(tex->dst_rel) |
1601
S_SQ_TEX_WORD1_DST_SEL_X(tex->dst_sel_x) |
1602
S_SQ_TEX_WORD1_DST_SEL_Y(tex->dst_sel_y) |
1603
S_SQ_TEX_WORD1_DST_SEL_Z(tex->dst_sel_z) |
1604
S_SQ_TEX_WORD1_DST_SEL_W(tex->dst_sel_w) |
1605
S_SQ_TEX_WORD1_LOD_BIAS(tex->lod_bias) |
1606
S_SQ_TEX_WORD1_COORD_TYPE_X(tex->coord_type_x) |
1607
S_SQ_TEX_WORD1_COORD_TYPE_Y(tex->coord_type_y) |
1608
S_SQ_TEX_WORD1_COORD_TYPE_Z(tex->coord_type_z) |
1609
S_SQ_TEX_WORD1_COORD_TYPE_W(tex->coord_type_w);
1610
bc->bytecode[id++] = S_SQ_TEX_WORD2_OFFSET_X(tex->offset_x) |
1611
S_SQ_TEX_WORD2_OFFSET_Y(tex->offset_y) |
1612
S_SQ_TEX_WORD2_OFFSET_Z(tex->offset_z) |
1613
S_SQ_TEX_WORD2_SAMPLER_ID(tex->sampler_id) |
1614
S_SQ_TEX_WORD2_SRC_SEL_X(tex->src_sel_x) |
1615
S_SQ_TEX_WORD2_SRC_SEL_Y(tex->src_sel_y) |
1616
S_SQ_TEX_WORD2_SRC_SEL_Z(tex->src_sel_z) |
1617
S_SQ_TEX_WORD2_SRC_SEL_W(tex->src_sel_w);
1618
bc->bytecode[id++] = 0;
1619
return 0;
1620
}
1621
1622
/* r600 only, r700/eg bits in r700_asm.c */
1623
static int r600_bytecode_alu_build(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, unsigned id)
1624
{
1625
unsigned opcode = r600_isa_alu_opcode(bc->isa->hw_class, alu->op);
1626
1627
/* don't replace gpr by pv or ps for destination register */
1628
bc->bytecode[id++] = S_SQ_ALU_WORD0_SRC0_SEL(alu->src[0].sel) |
1629
S_SQ_ALU_WORD0_SRC0_REL(alu->src[0].rel) |
1630
S_SQ_ALU_WORD0_SRC0_CHAN(alu->src[0].chan) |
1631
S_SQ_ALU_WORD0_SRC0_NEG(alu->src[0].neg) |
1632
S_SQ_ALU_WORD0_SRC1_SEL(alu->src[1].sel) |
1633
S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) |
1634
S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) |
1635
S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) |
1636
S_SQ_ALU_WORD0_INDEX_MODE(alu->index_mode) |
1637
S_SQ_ALU_WORD0_PRED_SEL(alu->pred_sel) |
1638
S_SQ_ALU_WORD0_LAST(alu->last);
1639
1640
if (alu->is_op3) {
1641
assert(!alu->src[0].abs && !alu->src[1].abs && !alu->src[2].abs);
1642
bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
1643
S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
1644
S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
1645
S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
1646
S_SQ_ALU_WORD1_OP3_SRC2_SEL(alu->src[2].sel) |
1647
S_SQ_ALU_WORD1_OP3_SRC2_REL(alu->src[2].rel) |
1648
S_SQ_ALU_WORD1_OP3_SRC2_CHAN(alu->src[2].chan) |
1649
S_SQ_ALU_WORD1_OP3_SRC2_NEG(alu->src[2].neg) |
1650
S_SQ_ALU_WORD1_OP3_ALU_INST(opcode) |
1651
S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle);
1652
} else {
1653
bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
1654
S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
1655
S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
1656
S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
1657
S_SQ_ALU_WORD1_OP2_SRC0_ABS(alu->src[0].abs) |
1658
S_SQ_ALU_WORD1_OP2_SRC1_ABS(alu->src[1].abs) |
1659
S_SQ_ALU_WORD1_OP2_WRITE_MASK(alu->dst.write) |
1660
S_SQ_ALU_WORD1_OP2_OMOD(alu->omod) |
1661
S_SQ_ALU_WORD1_OP2_ALU_INST(opcode) |
1662
S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle) |
1663
S_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(alu->execute_mask) |
1664
S_SQ_ALU_WORD1_OP2_UPDATE_PRED(alu->update_pred);
1665
}
1666
return 0;
1667
}
1668
1669
static void r600_bytecode_cf_vtx_build(uint32_t *bytecode, const struct r600_bytecode_cf *cf)
1670
{
1671
*bytecode++ = S_SQ_CF_WORD0_ADDR(cf->addr >> 1);
1672
*bytecode++ = S_SQ_CF_WORD1_CF_INST(r600_isa_cf_opcode(ISA_CC_R600, cf->op)) |
1673
S_SQ_CF_WORD1_BARRIER(1) |
1674
S_SQ_CF_WORD1_COUNT((cf->ndw / 4) - 1)|
1675
S_SQ_CF_WORD1_END_OF_PROGRAM(cf->end_of_program);
1676
}
1677
1678
/* common for r600/r700 - eg in eg_asm.c */
1679
static int r600_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf)
1680
{
1681
unsigned id = cf->id;
1682
const struct cf_op_info *cfop = r600_isa_cf(cf->op);
1683
unsigned opcode = r600_isa_cf_opcode(bc->isa->hw_class, cf->op);
1684
1685
1686
if (cf->op == CF_NATIVE) {
1687
bc->bytecode[id++] = cf->isa[0];
1688
bc->bytecode[id++] = cf->isa[1];
1689
} else if (cfop->flags & CF_ALU) {
1690
bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) |
1691
S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) |
1692
S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) |
1693
S_SQ_CF_ALU_WORD0_KCACHE_BANK1(cf->kcache[1].bank);
1694
1695
bc->bytecode[id++] = S_SQ_CF_ALU_WORD1_CF_INST(opcode) |
1696
S_SQ_CF_ALU_WORD1_KCACHE_MODE1(cf->kcache[1].mode) |
1697
S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(cf->kcache[0].addr) |
1698
S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(cf->kcache[1].addr) |
1699
S_SQ_CF_ALU_WORD1_BARRIER(1) |
1700
S_SQ_CF_ALU_WORD1_USES_WATERFALL(bc->chip_class == R600 ? cf->r6xx_uses_waterfall : 0) |
1701
S_SQ_CF_ALU_WORD1_COUNT((cf->ndw / 2) - 1);
1702
} else if (cfop->flags & CF_FETCH) {
1703
if (bc->chip_class == R700)
1704
r700_bytecode_cf_vtx_build(&bc->bytecode[id], cf);
1705
else
1706
r600_bytecode_cf_vtx_build(&bc->bytecode[id], cf);
1707
} else if (cfop->flags & CF_EXP) {
1708
bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
1709
S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
1710
S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
1711
S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type) |
1712
S_SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR(cf->output.index_gpr);
1713
bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
1714
S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(cf->output.swizzle_x) |
1715
S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(cf->output.swizzle_y) |
1716
S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(cf->output.swizzle_z) |
1717
S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(cf->output.swizzle_w) |
1718
S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->barrier) |
1719
S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(opcode) |
1720
S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->end_of_program);
1721
} else if (cfop->flags & CF_MEM) {
1722
bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
1723
S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
1724
S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
1725
S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type) |
1726
S_SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR(cf->output.index_gpr);
1727
bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
1728
S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->barrier) |
1729
S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(opcode) |
1730
S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->end_of_program) |
1731
S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(cf->output.array_size) |
1732
S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(cf->output.comp_mask);
1733
} else {
1734
bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->cf_addr >> 1);
1735
bc->bytecode[id++] = S_SQ_CF_WORD1_CF_INST(opcode) |
1736
S_SQ_CF_WORD1_BARRIER(1) |
1737
S_SQ_CF_WORD1_COND(cf->cond) |
1738
S_SQ_CF_WORD1_POP_COUNT(cf->pop_count) |
1739
S_SQ_CF_WORD1_END_OF_PROGRAM(cf->end_of_program);
1740
}
1741
return 0;
1742
}
1743
1744
int r600_bytecode_build(struct r600_bytecode *bc)
1745
{
1746
struct r600_bytecode_cf *cf;
1747
struct r600_bytecode_alu *alu;
1748
struct r600_bytecode_vtx *vtx;
1749
struct r600_bytecode_tex *tex;
1750
struct r600_bytecode_gds *gds;
1751
uint32_t literal[4];
1752
unsigned nliteral;
1753
unsigned addr;
1754
int i, r;
1755
1756
if (!bc->nstack) { // If not 0, Stack_size already provided by llvm
1757
if (bc->stack.max_entries)
1758
bc->nstack = bc->stack.max_entries;
1759
else if (bc->type == PIPE_SHADER_VERTEX ||
1760
bc->type == PIPE_SHADER_TESS_EVAL ||
1761
bc->type == PIPE_SHADER_TESS_CTRL)
1762
bc->nstack = 1;
1763
}
1764
1765
/* first path compute addr of each CF block */
1766
/* addr start after all the CF instructions */
1767
addr = bc->cf_last->id + 2;
1768
LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
1769
if (r600_isa_cf(cf->op)->flags & CF_FETCH) {
1770
addr += 3;
1771
addr &= 0xFFFFFFFCUL;
1772
}
1773
cf->addr = addr;
1774
addr += cf->ndw;
1775
bc->ndw = cf->addr + cf->ndw;
1776
}
1777
free(bc->bytecode);
1778
bc->bytecode = calloc(4, bc->ndw);
1779
if (bc->bytecode == NULL)
1780
return -ENOMEM;
1781
LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
1782
const struct cf_op_info *cfop = r600_isa_cf(cf->op);
1783
addr = cf->addr;
1784
if (bc->chip_class >= EVERGREEN)
1785
r = eg_bytecode_cf_build(bc, cf);
1786
else
1787
r = r600_bytecode_cf_build(bc, cf);
1788
if (r)
1789
return r;
1790
if (cfop->flags & CF_ALU) {
1791
nliteral = 0;
1792
memset(literal, 0, sizeof(literal));
1793
LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
1794
r = r600_bytecode_alu_nliterals(alu, literal, &nliteral);
1795
if (r)
1796
return r;
1797
r600_bytecode_alu_adjust_literals(alu, literal, nliteral);
1798
r600_bytecode_assign_kcache_banks(alu, cf->kcache);
1799
1800
switch(bc->chip_class) {
1801
case R600:
1802
r = r600_bytecode_alu_build(bc, alu, addr);
1803
break;
1804
case R700:
1805
r = r700_bytecode_alu_build(bc, alu, addr);
1806
break;
1807
case EVERGREEN:
1808
case CAYMAN:
1809
r = eg_bytecode_alu_build(bc, alu, addr);
1810
break;
1811
default:
1812
R600_ERR("unknown chip class %d.\n", bc->chip_class);
1813
return -EINVAL;
1814
}
1815
if (r)
1816
return r;
1817
addr += 2;
1818
if (alu->last) {
1819
for (i = 0; i < align(nliteral, 2); ++i) {
1820
bc->bytecode[addr++] = literal[i];
1821
}
1822
nliteral = 0;
1823
memset(literal, 0, sizeof(literal));
1824
}
1825
}
1826
} else if (cf->op == CF_OP_VTX) {
1827
LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
1828
r = r600_bytecode_vtx_build(bc, vtx, addr);
1829
if (r)
1830
return r;
1831
addr += 4;
1832
}
1833
} else if (cf->op == CF_OP_GDS) {
1834
assert(bc->chip_class >= EVERGREEN);
1835
LIST_FOR_EACH_ENTRY(gds, &cf->gds, list) {
1836
r = eg_bytecode_gds_build(bc, gds, addr);
1837
if (r)
1838
return r;
1839
addr += 4;
1840
}
1841
} else if (cf->op == CF_OP_TEX) {
1842
LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
1843
assert(bc->chip_class >= EVERGREEN);
1844
r = r600_bytecode_vtx_build(bc, vtx, addr);
1845
if (r)
1846
return r;
1847
addr += 4;
1848
}
1849
LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
1850
r = r600_bytecode_tex_build(bc, tex, addr);
1851
if (r)
1852
return r;
1853
addr += 4;
1854
}
1855
}
1856
}
1857
return 0;
1858
}
1859
1860
void r600_bytecode_clear(struct r600_bytecode *bc)
1861
{
1862
struct r600_bytecode_cf *cf = NULL, *next_cf;
1863
1864
free(bc->bytecode);
1865
bc->bytecode = NULL;
1866
1867
LIST_FOR_EACH_ENTRY_SAFE(cf, next_cf, &bc->cf, list) {
1868
struct r600_bytecode_alu *alu = NULL, *next_alu;
1869
struct r600_bytecode_tex *tex = NULL, *next_tex;
1870
struct r600_bytecode_tex *vtx = NULL, *next_vtx;
1871
struct r600_bytecode_gds *gds = NULL, *next_gds;
1872
1873
LIST_FOR_EACH_ENTRY_SAFE(alu, next_alu, &cf->alu, list) {
1874
free(alu);
1875
}
1876
1877
list_inithead(&cf->alu);
1878
1879
LIST_FOR_EACH_ENTRY_SAFE(tex, next_tex, &cf->tex, list) {
1880
free(tex);
1881
}
1882
1883
list_inithead(&cf->tex);
1884
1885
LIST_FOR_EACH_ENTRY_SAFE(vtx, next_vtx, &cf->vtx, list) {
1886
free(vtx);
1887
}
1888
1889
list_inithead(&cf->vtx);
1890
1891
LIST_FOR_EACH_ENTRY_SAFE(gds, next_gds, &cf->gds, list) {
1892
free(gds);
1893
}
1894
1895
list_inithead(&cf->gds);
1896
1897
free(cf);
1898
}
1899
1900
list_inithead(&cf->list);
1901
}
1902
1903
/* Print the one-character name of swizzle selector @swz to stderr and
 * return the fprintf() result. Selector 6 and anything above 7 are
 * rejected by an assertion. */
static int print_swizzle(unsigned swz)
{
	static const char names[] = "xyzw01?_";
	char c;

	assert(swz<8 && swz != 6);
	c = names[swz];
	return fprintf(stderr, "%c", c);
}
1909
1910
/* Print register selector @sel to stderr and return the number of
 * characters written. A relative access (@rel) is wrapped in brackets and
 * gets a "+AR" (index mode 0 or 6) or "+AL" (index mode 4) suffix; relative
 * selectors below 128 with index mode >= 5 get a "G" prefix.
 * @need_brackets forces the brackets for non-relative accesses too. */
static int print_sel(unsigned sel, unsigned rel, unsigned index_mode,
		     unsigned need_brackets)
{
	const int bracketed = rel || need_brackets;
	int n = 0;

	if (rel && index_mode >= 5 && sel < 128)
		n += fprintf(stderr, "G");

	if (bracketed)
		n += fprintf(stderr, "[");

	n += fprintf(stderr, "%d", sel);

	if (rel) {
		switch (index_mode) {
		case 0:
		case 6:
			n += fprintf(stderr, "+AR");
			break;
		case 4:
			n += fprintf(stderr, "+AL");
			break;
		default:
			break;
		}
	}

	if (bracketed)
		n += fprintf(stderr, "]");

	return n;
}
1931
1932
static int print_dst(struct r600_bytecode_alu *alu)
1933
{
1934
int o = 0;
1935
unsigned sel = alu->dst.sel;
1936
char reg_char = 'R';
1937
if (sel > 128 - 4) { /* clause temporary gpr */
1938
sel -= 128 - 4;
1939
reg_char = 'T';
1940
}
1941
1942
if (alu_writes(alu)) {
1943
o += fprintf(stderr, "%c", reg_char);
1944
o += print_sel(alu->dst.sel, alu->dst.rel, alu->index_mode, 0);
1945
} else {
1946
o += fprintf(stderr, "__");
1947
}
1948
o += fprintf(stderr, ".");
1949
o += print_swizzle(alu->dst.chan);
1950
return o;
1951
}
1952
1953
static int print_src(struct r600_bytecode_alu *alu, unsigned idx)
1954
{
1955
int o = 0;
1956
struct r600_bytecode_alu_src *src = &alu->src[idx];
1957
unsigned sel = src->sel, need_sel = 1, need_chan = 1, need_brackets = 0;
1958
1959
if (src->neg)
1960
o += fprintf(stderr,"-");
1961
if (src->abs)
1962
o += fprintf(stderr,"|");
1963
1964
if (sel < 128 - 4) {
1965
o += fprintf(stderr, "R");
1966
} else if (sel < 128) {
1967
o += fprintf(stderr, "T");
1968
sel -= 128 - 4;
1969
} else if (sel < 160) {
1970
o += fprintf(stderr, "KC0");
1971
need_brackets = 1;
1972
sel -= 128;
1973
} else if (sel < 192) {
1974
o += fprintf(stderr, "KC1");
1975
need_brackets = 1;
1976
sel -= 160;
1977
} else if (sel >= 512) {
1978
o += fprintf(stderr, "C%d", src->kc_bank);
1979
need_brackets = 1;
1980
sel -= 512;
1981
} else if (sel >= 448) {
1982
o += fprintf(stderr, "Param");
1983
sel -= 448;
1984
need_chan = 0;
1985
} else if (sel >= 288) {
1986
o += fprintf(stderr, "KC3");
1987
need_brackets = 1;
1988
sel -= 288;
1989
} else if (sel >= 256) {
1990
o += fprintf(stderr, "KC2");
1991
need_brackets = 1;
1992
sel -= 256;
1993
} else {
1994
need_sel = 0;
1995
need_chan = 0;
1996
switch (sel) {
1997
case EG_V_SQ_ALU_SRC_LDS_DIRECT_A:
1998
o += fprintf(stderr, "LDS_A[0x%08X]", src->value);
1999
break;
2000
case EG_V_SQ_ALU_SRC_LDS_DIRECT_B:
2001
o += fprintf(stderr, "LDS_B[0x%08X]", src->value);
2002
break;
2003
case EG_V_SQ_ALU_SRC_LDS_OQ_A:
2004
o += fprintf(stderr, "LDS_OQ_A");
2005
need_chan = 1;
2006
break;
2007
case EG_V_SQ_ALU_SRC_LDS_OQ_B:
2008
o += fprintf(stderr, "LDS_OQ_B");
2009
need_chan = 1;
2010
break;
2011
case EG_V_SQ_ALU_SRC_LDS_OQ_A_POP:
2012
o += fprintf(stderr, "LDS_OQ_A_POP");
2013
need_chan = 1;
2014
break;
2015
case EG_V_SQ_ALU_SRC_LDS_OQ_B_POP:
2016
o += fprintf(stderr, "LDS_OQ_B_POP");
2017
need_chan = 1;
2018
break;
2019
case EG_V_SQ_ALU_SRC_TIME_LO:
2020
o += fprintf(stderr, "TIME_LO");
2021
break;
2022
case EG_V_SQ_ALU_SRC_TIME_HI:
2023
o += fprintf(stderr, "TIME_HI");
2024
break;
2025
case EG_V_SQ_ALU_SRC_SE_ID:
2026
o += fprintf(stderr, "SE_ID");
2027
break;
2028
case EG_V_SQ_ALU_SRC_SIMD_ID:
2029
o += fprintf(stderr, "SIMD_ID");
2030
break;
2031
case EG_V_SQ_ALU_SRC_HW_WAVE_ID:
2032
o += fprintf(stderr, "HW_WAVE_ID");
2033
break;
2034
case V_SQ_ALU_SRC_PS:
2035
o += fprintf(stderr, "PS");
2036
break;
2037
case V_SQ_ALU_SRC_PV:
2038
o += fprintf(stderr, "PV");
2039
need_chan = 1;
2040
break;
2041
case V_SQ_ALU_SRC_LITERAL:
2042
o += fprintf(stderr, "[0x%08X %f]", src->value, u_bitcast_u2f(src->value));
2043
break;
2044
case V_SQ_ALU_SRC_0_5:
2045
o += fprintf(stderr, "0.5");
2046
break;
2047
case V_SQ_ALU_SRC_M_1_INT:
2048
o += fprintf(stderr, "-1");
2049
break;
2050
case V_SQ_ALU_SRC_1_INT:
2051
o += fprintf(stderr, "1");
2052
break;
2053
case V_SQ_ALU_SRC_1:
2054
o += fprintf(stderr, "1.0");
2055
break;
2056
case V_SQ_ALU_SRC_0:
2057
o += fprintf(stderr, "0");
2058
break;
2059
default:
2060
o += fprintf(stderr, "??IMM_%d", sel);
2061
break;
2062
}
2063
}
2064
2065
if (need_sel)
2066
o += print_sel(sel, src->rel, alu->index_mode, need_brackets);
2067
2068
if (need_chan) {
2069
o += fprintf(stderr, ".");
2070
o += print_swizzle(src->chan);
2071
}
2072
2073
if (src->abs)
2074
o += fprintf(stderr,"|");
2075
2076
return o;
2077
}
2078
2079
/* Pad the current stderr line with spaces from column @p up to column @c.
 * Returns the number of characters written (0 when @p is already at or
 * past @c). */
static int print_indent(int p, int c)
{
	int written = 0;

	for (; p < c; p++)
		written += fprintf(stderr, " ");
	return written;
}
2086
2087
void r600_bytecode_disasm(struct r600_bytecode *bc)
2088
{
2089
const char *index_mode[] = {"CF_INDEX_NONE", "CF_INDEX_0", "CF_INDEX_1"};
2090
static int index = 0;
2091
struct r600_bytecode_cf *cf = NULL;
2092
struct r600_bytecode_alu *alu = NULL;
2093
struct r600_bytecode_vtx *vtx = NULL;
2094
struct r600_bytecode_tex *tex = NULL;
2095
struct r600_bytecode_gds *gds = NULL;
2096
2097
unsigned i, id, ngr = 0, last;
2098
uint32_t literal[4];
2099
unsigned nliteral;
2100
char chip = '6';
2101
2102
switch (bc->chip_class) {
2103
case R700:
2104
chip = '7';
2105
break;
2106
case EVERGREEN:
2107
chip = 'E';
2108
break;
2109
case CAYMAN:
2110
chip = 'C';
2111
break;
2112
case R600:
2113
default:
2114
chip = '6';
2115
break;
2116
}
2117
fprintf(stderr, "bytecode %d dw -- %d gprs -- %d nstack -------------\n",
2118
bc->ndw, bc->ngpr, bc->nstack);
2119
fprintf(stderr, "shader %d -- %c\n", index++, chip);
2120
2121
LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
2122
id = cf->id;
2123
if (cf->op == CF_NATIVE) {
2124
fprintf(stderr, "%04d %08X %08X CF_NATIVE\n", id, bc->bytecode[id],
2125
bc->bytecode[id + 1]);
2126
} else {
2127
const struct cf_op_info *cfop = r600_isa_cf(cf->op);
2128
if (cfop->flags & CF_ALU) {
2129
if (cf->eg_alu_extended) {
2130
fprintf(stderr, "%04d %08X %08X %s\n", id, bc->bytecode[id],
2131
bc->bytecode[id + 1], "ALU_EXT");
2132
id += 2;
2133
}
2134
fprintf(stderr, "%04d %08X %08X %s ", id, bc->bytecode[id],
2135
bc->bytecode[id + 1], cfop->name);
2136
fprintf(stderr, "%d @%d ", cf->ndw / 2, cf->addr);
2137
for (i = 0; i < 4; ++i) {
2138
if (cf->kcache[i].mode) {
2139
int c_start = (cf->kcache[i].addr << 4);
2140
int c_end = c_start + (cf->kcache[i].mode << 4);
2141
fprintf(stderr, "KC%d[CB%d:%d-%d%s%s] ",
2142
i, cf->kcache[i].bank, c_start, c_end,
2143
cf->kcache[i].index_mode ? " " : "",
2144
cf->kcache[i].index_mode ? index_mode[cf->kcache[i].index_mode] : "");
2145
}
2146
}
2147
fprintf(stderr, "\n");
2148
} else if (cfop->flags & CF_FETCH) {
2149
fprintf(stderr, "%04d %08X %08X %s ", id, bc->bytecode[id],
2150
bc->bytecode[id + 1], cfop->name);
2151
fprintf(stderr, "%d @%d ", cf->ndw / 4, cf->addr);
2152
if (cf->vpm)
2153
fprintf(stderr, "VPM ");
2154
if (cf->end_of_program)
2155
fprintf(stderr, "EOP ");
2156
fprintf(stderr, "\n");
2157
2158
} else if (cfop->flags & CF_EXP) {
2159
int o = 0;
2160
const char *exp_type[] = {"PIXEL", "POS ", "PARAM"};
2161
o += fprintf(stderr, "%04d %08X %08X %s ", id, bc->bytecode[id],
2162
bc->bytecode[id + 1], cfop->name);
2163
o += print_indent(o, 43);
2164
o += fprintf(stderr, "%s ", exp_type[cf->output.type]);
2165
if (cf->output.burst_count > 1) {
2166
o += fprintf(stderr, "%d-%d ", cf->output.array_base,
2167
cf->output.array_base + cf->output.burst_count - 1);
2168
2169
o += print_indent(o, 55);
2170
o += fprintf(stderr, "R%d-%d.", cf->output.gpr,
2171
cf->output.gpr + cf->output.burst_count - 1);
2172
} else {
2173
o += fprintf(stderr, "%d ", cf->output.array_base);
2174
o += print_indent(o, 55);
2175
o += fprintf(stderr, "R%d.", cf->output.gpr);
2176
}
2177
2178
o += print_swizzle(cf->output.swizzle_x);
2179
o += print_swizzle(cf->output.swizzle_y);
2180
o += print_swizzle(cf->output.swizzle_z);
2181
o += print_swizzle(cf->output.swizzle_w);
2182
2183
print_indent(o, 67);
2184
2185
fprintf(stderr, " ES:%X ", cf->output.elem_size);
2186
if (cf->mark)
2187
fprintf(stderr, "MARK ");
2188
if (!cf->barrier)
2189
fprintf(stderr, "NO_BARRIER ");
2190
if (cf->end_of_program)
2191
fprintf(stderr, "EOP ");
2192
fprintf(stderr, "\n");
2193
} else if (r600_isa_cf(cf->op)->flags & CF_MEM) {
2194
int o = 0;
2195
const char *exp_type[] = {"WRITE", "WRITE_IND", "WRITE_ACK",
2196
"WRITE_IND_ACK"};
2197
o += fprintf(stderr, "%04d %08X %08X %s ", id,
2198
bc->bytecode[id], bc->bytecode[id + 1], cfop->name);
2199
o += print_indent(o, 43);
2200
o += fprintf(stderr, "%s ", exp_type[cf->output.type]);
2201
2202
if (r600_isa_cf(cf->op)->flags & CF_RAT) {
2203
o += fprintf(stderr, "RAT%d", cf->rat.id);
2204
if (cf->rat.index_mode) {
2205
o += fprintf(stderr, "[IDX%d]", cf->rat.index_mode - 1);
2206
}
2207
o += fprintf(stderr, " INST: %d ", cf->rat.inst);
2208
}
2209
2210
if (cf->output.burst_count > 1) {
2211
o += fprintf(stderr, "%d-%d ", cf->output.array_base,
2212
cf->output.array_base + cf->output.burst_count - 1);
2213
o += print_indent(o, 55);
2214
o += fprintf(stderr, "R%d-%d.", cf->output.gpr,
2215
cf->output.gpr + cf->output.burst_count - 1);
2216
} else {
2217
o += fprintf(stderr, "%d ", cf->output.array_base);
2218
o += print_indent(o, 55);
2219
o += fprintf(stderr, "R%d.", cf->output.gpr);
2220
}
2221
for (i = 0; i < 4; ++i) {
2222
if (cf->output.comp_mask & (1 << i))
2223
o += print_swizzle(i);
2224
else
2225
o += print_swizzle(7);
2226
}
2227
2228
if (cf->output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND ||
2229
cf->output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND)
2230
o += fprintf(stderr, " R%d", cf->output.index_gpr);
2231
2232
o += print_indent(o, 67);
2233
2234
fprintf(stderr, " ES:%i ", cf->output.elem_size);
2235
if (cf->output.array_size != 0xFFF)
2236
fprintf(stderr, "AS:%i ", cf->output.array_size);
2237
if (cf->mark)
2238
fprintf(stderr, "MARK ");
2239
if (!cf->barrier)
2240
fprintf(stderr, "NO_BARRIER ");
2241
if (cf->end_of_program)
2242
fprintf(stderr, "EOP ");
2243
2244
if (cf->output.mark)
2245
fprintf(stderr, "MARK ");
2246
2247
fprintf(stderr, "\n");
2248
} else {
2249
fprintf(stderr, "%04d %08X %08X %s ", id, bc->bytecode[id],
2250
bc->bytecode[id + 1], cfop->name);
2251
fprintf(stderr, "@%d ", cf->cf_addr);
2252
if (cf->cond)
2253
fprintf(stderr, "CND:%X ", cf->cond);
2254
if (cf->pop_count)
2255
fprintf(stderr, "POP:%X ", cf->pop_count);
2256
if (cf->count && (cfop->flags & CF_EMIT))
2257
fprintf(stderr, "STREAM%d ", cf->count);
2258
if (cf->vpm)
2259
fprintf(stderr, "VPM ");
2260
if (cf->end_of_program)
2261
fprintf(stderr, "EOP ");
2262
fprintf(stderr, "\n");
2263
}
2264
}
2265
2266
id = cf->addr;
2267
nliteral = 0;
2268
last = 1;
2269
LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
2270
const char *omod_str[] = {"","*2","*4","/2"};
2271
const struct alu_op_info *aop = r600_isa_alu(alu->op);
2272
int o = 0;
2273
2274
r600_bytecode_alu_nliterals(alu, literal, &nliteral);
2275
o += fprintf(stderr, " %04d %08X %08X ", id, bc->bytecode[id], bc->bytecode[id+1]);
2276
if (last)
2277
o += fprintf(stderr, "%4d ", ++ngr);
2278
else
2279
o += fprintf(stderr, " ");
2280
o += fprintf(stderr, "%c%c %c ", alu->execute_mask ? 'M':' ',
2281
alu->update_pred ? 'P':' ',
2282
alu->pred_sel ? alu->pred_sel==2 ? '0':'1':' ');
2283
2284
o += fprintf(stderr, "%s%s%s ", aop->name,
2285
omod_str[alu->omod], alu->dst.clamp ? "_sat":"");
2286
2287
o += print_indent(o,60);
2288
o += print_dst(alu);
2289
for (i = 0; i < aop->src_count; ++i) {
2290
o += fprintf(stderr, i == 0 ? ", ": ", ");
2291
o += print_src(alu, i);
2292
}
2293
2294
if (alu->bank_swizzle) {
2295
o += print_indent(o,75);
2296
o += fprintf(stderr, " BS:%d", alu->bank_swizzle);
2297
}
2298
2299
fprintf(stderr, "\n");
2300
id += 2;
2301
2302
if (alu->last) {
2303
for (i = 0; i < nliteral; i++, id++) {
2304
float *f = (float*)(bc->bytecode + id);
2305
o = fprintf(stderr, " %04d %08X", id, bc->bytecode[id]);
2306
print_indent(o, 60);
2307
fprintf(stderr, " %f (%d)\n", *f, *(bc->bytecode + id));
2308
}
2309
id += nliteral & 1;
2310
nliteral = 0;
2311
}
2312
last = alu->last;
2313
}
2314
2315
LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
2316
int o = 0;
2317
o += fprintf(stderr, " %04d %08X %08X %08X ", id, bc->bytecode[id],
2318
bc->bytecode[id + 1], bc->bytecode[id + 2]);
2319
2320
o += fprintf(stderr, "%s ", r600_isa_fetch(tex->op)->name);
2321
2322
o += print_indent(o, 50);
2323
2324
o += fprintf(stderr, "R%d.", tex->dst_gpr);
2325
o += print_swizzle(tex->dst_sel_x);
2326
o += print_swizzle(tex->dst_sel_y);
2327
o += print_swizzle(tex->dst_sel_z);
2328
o += print_swizzle(tex->dst_sel_w);
2329
2330
o += fprintf(stderr, ", R%d.", tex->src_gpr);
2331
o += print_swizzle(tex->src_sel_x);
2332
o += print_swizzle(tex->src_sel_y);
2333
o += print_swizzle(tex->src_sel_z);
2334
o += print_swizzle(tex->src_sel_w);
2335
2336
o += fprintf(stderr, ", RID:%d", tex->resource_id);
2337
o += fprintf(stderr, ", SID:%d ", tex->sampler_id);
2338
2339
if (tex->sampler_index_mode)
2340
fprintf(stderr, "SQ_%s ", index_mode[tex->sampler_index_mode]);
2341
2342
if (tex->lod_bias)
2343
fprintf(stderr, "LB:%d ", tex->lod_bias);
2344
2345
fprintf(stderr, "CT:%c%c%c%c ",
2346
tex->coord_type_x ? 'N' : 'U',
2347
tex->coord_type_y ? 'N' : 'U',
2348
tex->coord_type_z ? 'N' : 'U',
2349
tex->coord_type_w ? 'N' : 'U');
2350
2351
if (tex->offset_x)
2352
fprintf(stderr, "OX:%d ", tex->offset_x);
2353
if (tex->offset_y)
2354
fprintf(stderr, "OY:%d ", tex->offset_y);
2355
if (tex->offset_z)
2356
fprintf(stderr, "OZ:%d ", tex->offset_z);
2357
2358
id += 4;
2359
fprintf(stderr, "\n");
2360
}
2361
2362
LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
2363
int o = 0;
2364
const char * fetch_type[] = {"VERTEX", "INSTANCE", ""};
2365
o += fprintf(stderr, " %04d %08X %08X %08X ", id, bc->bytecode[id],
2366
bc->bytecode[id + 1], bc->bytecode[id + 2]);
2367
2368
o += fprintf(stderr, "%s ", r600_isa_fetch(vtx->op)->name);
2369
2370
o += print_indent(o, 50);
2371
2372
o += fprintf(stderr, "R%d.", vtx->dst_gpr);
2373
o += print_swizzle(vtx->dst_sel_x);
2374
o += print_swizzle(vtx->dst_sel_y);
2375
o += print_swizzle(vtx->dst_sel_z);
2376
o += print_swizzle(vtx->dst_sel_w);
2377
2378
o += fprintf(stderr, ", R%d.", vtx->src_gpr);
2379
o += print_swizzle(vtx->src_sel_x);
2380
if (r600_isa_fetch(vtx->op)->flags & FF_MEM)
2381
o += print_swizzle(vtx->src_sel_y);
2382
2383
if (vtx->offset)
2384
fprintf(stderr, " +%db", vtx->offset);
2385
2386
o += print_indent(o, 55);
2387
2388
fprintf(stderr, ", RID:%d ", vtx->buffer_id);
2389
2390
fprintf(stderr, "%s ", fetch_type[vtx->fetch_type]);
2391
2392
if (bc->chip_class < CAYMAN && vtx->mega_fetch_count)
2393
fprintf(stderr, "MFC:%d ", vtx->mega_fetch_count);
2394
2395
if (bc->chip_class >= EVERGREEN && vtx->buffer_index_mode)
2396
fprintf(stderr, "SQ_%s ", index_mode[vtx->buffer_index_mode]);
2397
2398
if (r600_isa_fetch(vtx->op)->flags & FF_MEM) {
2399
if (vtx->uncached)
2400
fprintf(stderr, "UNCACHED ");
2401
if (vtx->indexed)
2402
fprintf(stderr, "INDEXED:%d ", vtx->indexed);
2403
2404
fprintf(stderr, "ELEM_SIZE:%d ", vtx->elem_size);
2405
if (vtx->burst_count)
2406
fprintf(stderr, "BURST_COUNT:%d ", vtx->burst_count);
2407
fprintf(stderr, "ARRAY_BASE:%d ", vtx->array_base);
2408
fprintf(stderr, "ARRAY_SIZE:%d ", vtx->array_size);
2409
}
2410
2411
fprintf(stderr, "UCF:%d ", vtx->use_const_fields);
2412
fprintf(stderr, "FMT(DTA:%d ", vtx->data_format);
2413
fprintf(stderr, "NUM:%d ", vtx->num_format_all);
2414
fprintf(stderr, "COMP:%d ", vtx->format_comp_all);
2415
fprintf(stderr, "MODE:%d)\n", vtx->srf_mode_all);
2416
2417
id += 4;
2418
}
2419
2420
LIST_FOR_EACH_ENTRY(gds, &cf->gds, list) {
2421
int o = 0;
2422
o += fprintf(stderr, " %04d %08X %08X %08X ", id, bc->bytecode[id],
2423
bc->bytecode[id + 1], bc->bytecode[id + 2]);
2424
2425
o += fprintf(stderr, "%s ", r600_isa_fetch(gds->op)->name);
2426
2427
if (gds->op != FETCH_OP_TF_WRITE) {
2428
o += fprintf(stderr, "R%d.", gds->dst_gpr);
2429
o += print_swizzle(gds->dst_sel_x);
2430
o += print_swizzle(gds->dst_sel_y);
2431
o += print_swizzle(gds->dst_sel_z);
2432
o += print_swizzle(gds->dst_sel_w);
2433
}
2434
2435
o += fprintf(stderr, ", R%d.", gds->src_gpr);
2436
o += print_swizzle(gds->src_sel_x);
2437
o += print_swizzle(gds->src_sel_y);
2438
o += print_swizzle(gds->src_sel_z);
2439
2440
if (gds->op != FETCH_OP_TF_WRITE) {
2441
o += fprintf(stderr, ", R%d.", gds->src_gpr2);
2442
}
2443
if (gds->alloc_consume) {
2444
o += fprintf(stderr, " UAV: %d", gds->uav_id);
2445
if (gds->uav_index_mode)
2446
o += fprintf(stderr, "[%s]", index_mode[gds->uav_index_mode]);
2447
}
2448
fprintf(stderr, "\n");
2449
id += 4;
2450
}
2451
}
2452
2453
fprintf(stderr, "--------------------------------------\n");
2454
}
2455
2456
void r600_vertex_data_type(enum pipe_format pformat,
2457
unsigned *format,
2458
unsigned *num_format, unsigned *format_comp, unsigned *endian)
2459
{
2460
const struct util_format_description *desc;
2461
unsigned i;
2462
2463
*format = 0;
2464
*num_format = 0;
2465
*format_comp = 0;
2466
*endian = ENDIAN_NONE;
2467
2468
if (pformat == PIPE_FORMAT_R11G11B10_FLOAT) {
2469
*format = FMT_10_11_11_FLOAT;
2470
*endian = r600_endian_swap(32);
2471
return;
2472
}
2473
2474
if (pformat == PIPE_FORMAT_B5G6R5_UNORM) {
2475
*format = FMT_5_6_5;
2476
*endian = r600_endian_swap(16);
2477
return;
2478
}
2479
2480
if (pformat == PIPE_FORMAT_B5G5R5A1_UNORM) {
2481
*format = FMT_1_5_5_5;
2482
*endian = r600_endian_swap(16);
2483
return;
2484
}
2485
2486
if (pformat == PIPE_FORMAT_A1B5G5R5_UNORM) {
2487
*format = FMT_5_5_5_1;
2488
return;
2489
}
2490
2491
desc = util_format_description(pformat);
2492
if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
2493
goto out_unknown;
2494
}
2495
2496
/* Find the first non-VOID channel. */
2497
for (i = 0; i < 4; i++) {
2498
if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
2499
break;
2500
}
2501
}
2502
2503
*endian = r600_endian_swap(desc->channel[i].size);
2504
2505
switch (desc->channel[i].type) {
2506
/* Half-floats, floats, ints */
2507
case UTIL_FORMAT_TYPE_FLOAT:
2508
switch (desc->channel[i].size) {
2509
case 16:
2510
switch (desc->nr_channels) {
2511
case 1:
2512
*format = FMT_16_FLOAT;
2513
break;
2514
case 2:
2515
*format = FMT_16_16_FLOAT;
2516
break;
2517
case 3:
2518
case 4:
2519
*format = FMT_16_16_16_16_FLOAT;
2520
break;
2521
}
2522
break;
2523
case 32:
2524
switch (desc->nr_channels) {
2525
case 1:
2526
*format = FMT_32_FLOAT;
2527
break;
2528
case 2:
2529
*format = FMT_32_32_FLOAT;
2530
break;
2531
case 3:
2532
*format = FMT_32_32_32_FLOAT;
2533
break;
2534
case 4:
2535
*format = FMT_32_32_32_32_FLOAT;
2536
break;
2537
}
2538
break;
2539
default:
2540
goto out_unknown;
2541
}
2542
break;
2543
/* Unsigned ints */
2544
case UTIL_FORMAT_TYPE_UNSIGNED:
2545
/* Signed ints */
2546
case UTIL_FORMAT_TYPE_SIGNED:
2547
switch (desc->channel[i].size) {
2548
case 4:
2549
switch (desc->nr_channels) {
2550
case 2:
2551
*format = FMT_4_4;
2552
break;
2553
case 4:
2554
*format = FMT_4_4_4_4;
2555
break;
2556
}
2557
break;
2558
case 8:
2559
switch (desc->nr_channels) {
2560
case 1:
2561
*format = FMT_8;
2562
break;
2563
case 2:
2564
*format = FMT_8_8;
2565
break;
2566
case 3:
2567
case 4:
2568
*format = FMT_8_8_8_8;
2569
break;
2570
}
2571
break;
2572
case 10:
2573
if (desc->nr_channels != 4)
2574
goto out_unknown;
2575
2576
*format = FMT_2_10_10_10;
2577
break;
2578
case 16:
2579
switch (desc->nr_channels) {
2580
case 1:
2581
*format = FMT_16;
2582
break;
2583
case 2:
2584
*format = FMT_16_16;
2585
break;
2586
case 3:
2587
case 4:
2588
*format = FMT_16_16_16_16;
2589
break;
2590
}
2591
break;
2592
case 32:
2593
switch (desc->nr_channels) {
2594
case 1:
2595
*format = FMT_32;
2596
break;
2597
case 2:
2598
*format = FMT_32_32;
2599
break;
2600
case 3:
2601
*format = FMT_32_32_32;
2602
break;
2603
case 4:
2604
*format = FMT_32_32_32_32;
2605
break;
2606
}
2607
break;
2608
default:
2609
goto out_unknown;
2610
}
2611
break;
2612
default:
2613
goto out_unknown;
2614
}
2615
2616
if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
2617
*format_comp = 1;
2618
}
2619
2620
*num_format = 0;
2621
if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED ||
2622
desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
2623
if (!desc->channel[i].normalized) {
2624
if (desc->channel[i].pure_integer)
2625
*num_format = 1;
2626
else
2627
*num_format = 2;
2628
}
2629
}
2630
return;
2631
out_unknown:
2632
R600_ERR("unsupported vertex format %s\n", util_format_name(pformat));
2633
}
2634
2635
void *r600_create_vertex_fetch_shader(struct pipe_context *ctx,
2636
unsigned count,
2637
const struct pipe_vertex_element *elements)
2638
{
2639
struct r600_context *rctx = (struct r600_context *)ctx;
2640
struct r600_bytecode bc;
2641
struct r600_bytecode_vtx vtx;
2642
const struct util_format_description *desc;
2643
unsigned fetch_resource_start = rctx->b.chip_class >= EVERGREEN ? 0 : 160;
2644
unsigned format, num_format, format_comp, endian;
2645
uint32_t *bytecode;
2646
int i, j, r, fs_size;
2647
struct r600_fetch_shader *shader;
2648
unsigned no_sb = rctx->screen->b.debug_flags & DBG_NO_SB ||
2649
(rctx->screen->b.debug_flags & DBG_NIR);
2650
unsigned sb_disasm = !no_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
2651
2652
assert(count < 32);
2653
2654
memset(&bc, 0, sizeof(bc));
2655
r600_bytecode_init(&bc, rctx->b.chip_class, rctx->b.family,
2656
rctx->screen->has_compressed_msaa_texturing);
2657
2658
bc.isa = rctx->isa;
2659
2660
for (i = 0; i < count; i++) {
2661
if (elements[i].instance_divisor > 1) {
2662
if (rctx->b.chip_class == CAYMAN) {
2663
for (j = 0; j < 4; j++) {
2664
struct r600_bytecode_alu alu;
2665
memset(&alu, 0, sizeof(alu));
2666
alu.op = ALU_OP2_MULHI_UINT;
2667
alu.src[0].sel = 0;
2668
alu.src[0].chan = 3;
2669
alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2670
alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1;
2671
alu.dst.sel = i + 1;
2672
alu.dst.chan = j;
2673
alu.dst.write = j == 3;
2674
alu.last = j == 3;
2675
if ((r = r600_bytecode_add_alu(&bc, &alu))) {
2676
r600_bytecode_clear(&bc);
2677
return NULL;
2678
}
2679
}
2680
} else {
2681
struct r600_bytecode_alu alu;
2682
memset(&alu, 0, sizeof(alu));
2683
alu.op = ALU_OP2_MULHI_UINT;
2684
alu.src[0].sel = 0;
2685
alu.src[0].chan = 3;
2686
alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2687
alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1;
2688
alu.dst.sel = i + 1;
2689
alu.dst.chan = 3;
2690
alu.dst.write = 1;
2691
alu.last = 1;
2692
if ((r = r600_bytecode_add_alu(&bc, &alu))) {
2693
r600_bytecode_clear(&bc);
2694
return NULL;
2695
}
2696
}
2697
}
2698
}
2699
2700
for (i = 0; i < count; i++) {
2701
r600_vertex_data_type(elements[i].src_format,
2702
&format, &num_format, &format_comp, &endian);
2703
2704
desc = util_format_description(elements[i].src_format);
2705
if (!desc) {
2706
r600_bytecode_clear(&bc);
2707
R600_ERR("unknown format %d\n", elements[i].src_format);
2708
return NULL;
2709
}
2710
2711
if (elements[i].src_offset > 65535) {
2712
r600_bytecode_clear(&bc);
2713
R600_ERR("too big src_offset: %u\n", elements[i].src_offset);
2714
return NULL;
2715
}
2716
2717
memset(&vtx, 0, sizeof(vtx));
2718
vtx.buffer_id = elements[i].vertex_buffer_index + fetch_resource_start;
2719
vtx.fetch_type = elements[i].instance_divisor ? SQ_VTX_FETCH_INSTANCE_DATA : SQ_VTX_FETCH_VERTEX_DATA;
2720
vtx.src_gpr = elements[i].instance_divisor > 1 ? i + 1 : 0;
2721
vtx.src_sel_x = elements[i].instance_divisor ? 3 : 0;
2722
vtx.mega_fetch_count = 0x1F;
2723
vtx.dst_gpr = i + 1;
2724
vtx.dst_sel_x = desc->swizzle[0];
2725
vtx.dst_sel_y = desc->swizzle[1];
2726
vtx.dst_sel_z = desc->swizzle[2];
2727
vtx.dst_sel_w = desc->swizzle[3];
2728
vtx.data_format = format;
2729
vtx.num_format_all = num_format;
2730
vtx.format_comp_all = format_comp;
2731
vtx.offset = elements[i].src_offset;
2732
vtx.endian = endian;
2733
2734
if ((r = r600_bytecode_add_vtx(&bc, &vtx))) {
2735
r600_bytecode_clear(&bc);
2736
return NULL;
2737
}
2738
}
2739
2740
r600_bytecode_add_cfinst(&bc, CF_OP_RET);
2741
2742
if ((r = r600_bytecode_build(&bc))) {
2743
r600_bytecode_clear(&bc);
2744
return NULL;
2745
}
2746
2747
if (rctx->screen->b.debug_flags & DBG_FS) {
2748
fprintf(stderr, "--------------------------------------------------------------\n");
2749
fprintf(stderr, "Vertex elements state:\n");
2750
for (i = 0; i < count; i++) {
2751
fprintf(stderr, " ");
2752
util_dump_vertex_element(stderr, elements+i);
2753
fprintf(stderr, "\n");
2754
}
2755
2756
if (!sb_disasm) {
2757
r600_bytecode_disasm(&bc);
2758
2759
fprintf(stderr, "______________________________________________________________\n");
2760
} else {
2761
r600_sb_bytecode_process(rctx, &bc, NULL, 1 /*dump*/, 0 /*optimize*/);
2762
}
2763
}
2764
2765
fs_size = bc.ndw*4;
2766
2767
/* Allocate the CSO. */
2768
shader = CALLOC_STRUCT(r600_fetch_shader);
2769
if (!shader) {
2770
r600_bytecode_clear(&bc);
2771
return NULL;
2772
}
2773
2774
u_suballocator_alloc(&rctx->allocator_fetch_shader, fs_size, 256,
2775
&shader->offset,
2776
(struct pipe_resource**)&shader->buffer);
2777
if (!shader->buffer) {
2778
r600_bytecode_clear(&bc);
2779
FREE(shader);
2780
return NULL;
2781
}
2782
2783
bytecode = r600_buffer_map_sync_with_rings
2784
(&rctx->b, shader->buffer,
2785
PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY);
2786
bytecode += shader->offset / 4;
2787
2788
if (R600_BIG_ENDIAN) {
2789
for (i = 0; i < fs_size / 4; ++i) {
2790
bytecode[i] = util_cpu_to_le32(bc.bytecode[i]);
2791
}
2792
} else {
2793
memcpy(bytecode, bc.bytecode, fs_size);
2794
}
2795
rctx->b.ws->buffer_unmap(rctx->b.ws, shader->buffer->buf);
2796
2797
r600_bytecode_clear(&bc);
2798
return shader;
2799
}
2800
2801
void r600_bytecode_alu_read(struct r600_bytecode *bc,
2802
struct r600_bytecode_alu *alu, uint32_t word0, uint32_t word1)
2803
{
2804
/* WORD0 */
2805
alu->src[0].sel = G_SQ_ALU_WORD0_SRC0_SEL(word0);
2806
alu->src[0].rel = G_SQ_ALU_WORD0_SRC0_REL(word0);
2807
alu->src[0].chan = G_SQ_ALU_WORD0_SRC0_CHAN(word0);
2808
alu->src[0].neg = G_SQ_ALU_WORD0_SRC0_NEG(word0);
2809
alu->src[1].sel = G_SQ_ALU_WORD0_SRC1_SEL(word0);
2810
alu->src[1].rel = G_SQ_ALU_WORD0_SRC1_REL(word0);
2811
alu->src[1].chan = G_SQ_ALU_WORD0_SRC1_CHAN(word0);
2812
alu->src[1].neg = G_SQ_ALU_WORD0_SRC1_NEG(word0);
2813
alu->index_mode = G_SQ_ALU_WORD0_INDEX_MODE(word0);
2814
alu->pred_sel = G_SQ_ALU_WORD0_PRED_SEL(word0);
2815
alu->last = G_SQ_ALU_WORD0_LAST(word0);
2816
2817
/* WORD1 */
2818
alu->bank_swizzle = G_SQ_ALU_WORD1_BANK_SWIZZLE(word1);
2819
if (alu->bank_swizzle)
2820
alu->bank_swizzle_force = alu->bank_swizzle;
2821
alu->dst.sel = G_SQ_ALU_WORD1_DST_GPR(word1);
2822
alu->dst.rel = G_SQ_ALU_WORD1_DST_REL(word1);
2823
alu->dst.chan = G_SQ_ALU_WORD1_DST_CHAN(word1);
2824
alu->dst.clamp = G_SQ_ALU_WORD1_CLAMP(word1);
2825
if (G_SQ_ALU_WORD1_ENCODING(word1)) /*ALU_DWORD1_OP3*/
2826
{
2827
alu->is_op3 = 1;
2828
alu->src[2].sel = G_SQ_ALU_WORD1_OP3_SRC2_SEL(word1);
2829
alu->src[2].rel = G_SQ_ALU_WORD1_OP3_SRC2_REL(word1);
2830
alu->src[2].chan = G_SQ_ALU_WORD1_OP3_SRC2_CHAN(word1);
2831
alu->src[2].neg = G_SQ_ALU_WORD1_OP3_SRC2_NEG(word1);
2832
alu->op = r600_isa_alu_by_opcode(bc->isa,
2833
G_SQ_ALU_WORD1_OP3_ALU_INST(word1), /* is_op3 = */ 1);
2834
2835
}
2836
else /*ALU_DWORD1_OP2*/
2837
{
2838
alu->src[0].abs = G_SQ_ALU_WORD1_OP2_SRC0_ABS(word1);
2839
alu->src[1].abs = G_SQ_ALU_WORD1_OP2_SRC1_ABS(word1);
2840
alu->op = r600_isa_alu_by_opcode(bc->isa,
2841
G_SQ_ALU_WORD1_OP2_ALU_INST(word1), /* is_op3 = */ 0);
2842
alu->omod = G_SQ_ALU_WORD1_OP2_OMOD(word1);
2843
alu->dst.write = G_SQ_ALU_WORD1_OP2_WRITE_MASK(word1);
2844
alu->update_pred = G_SQ_ALU_WORD1_OP2_UPDATE_PRED(word1);
2845
alu->execute_mask =
2846
G_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(word1);
2847
}
2848
}
2849
2850
#if 0
/*
 * Compiled out by the surrounding "#if 0".
 *
 * Decodes the two dwords of a CF_ALLOC_EXPORT instruction into *output using
 * the G_SQ_CF_ALLOC_EXPORT_* field getters; bc->isa is used for the opcode
 * lookup.
 */
void r600_bytecode_export_read(struct r600_bytecode *bc,
		struct r600_bytecode_output *output, uint32_t word0, uint32_t word1)
{
	/* WORD0 */
	output->type = G_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(word0);
	output->gpr = G_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(word0);
	output->array_base = G_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(word0);
	output->elem_size = G_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(word0);

	/* WORD1: opcode and control bits */
	output->op = r600_isa_cf_by_opcode(bc->isa,
			G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(word1), 0);
	output->barrier = G_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(word1);
	output->end_of_program = G_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(word1);
	output->burst_count = G_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(word1);

	/* WORD1: swizzle selects */
	output->swizzle_x = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(word1);
	output->swizzle_y = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(word1);
	output->swizzle_z = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(word1);
	output->swizzle_w = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(word1);

	/* WORD1: buffer-export view of the same dword */
	output->array_size = G_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(word1);
	output->comp_mask = G_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(word1);
}
#endif
2872
2873