GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/intel/compiler/brw_fs_builder.h
/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder. They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(backend_shader *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block. The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }
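
      /*
       * A minimal usage sketch (illustrative only; the register names are
       * hypothetical): construct a builder for a shader, position it at the
       * end of the instruction list, and emit instructions through it.
       *
       *    const fs_builder bld = fs_builder(shader, 16).at_end();
       *    const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
       *    bld.MOV(tmp, some_input);
       *    bld.ADD(some_output, tmp, brw_imm_f(1.0f));
       */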

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         fs_builder bld = *this;

         if (n <= dispatch_width() && i < dispatch_width() / n) {
            bld._group += i * n;
         } else {
            /* The requested channel group isn't a subset of the channel group
             * of this builder, which means that the resulting instructions
             * would use (potentially undefined) channel enable signals not
             * specified by the parent builder. That's only valid if the
             * instruction doesn't have per-channel semantics, in which case
             * we should clear off the default group index in order to prevent
             * emitting instructions with channel group not aligned to their
             * own execution size.
             */
            assert(force_writemask_all);
            bld._group = 0;
         }

         bld._dispatch_width = n;
         return bld;
      }
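
      /*
       * Illustrative example: starting from a SIMD16 builder, group(8, 1)
       * yields a SIMD8 builder whose instructions act on channels 8..15 of
       * the parent's group, while group(8, 0) covers channels 0..7;
       * quarter(i) below is shorthand for group(8, i).
       */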

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      quarter(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true. If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width. \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           REG_SIZE)),
                           type);
         else
            return retype(null_reg_ud(), type);
      }
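
      /*
       * Illustrative sizing example: with a SIMD16 builder,
       * vgrf(BRW_REGISTER_TYPE_F) requests 1 * 4 bytes * 16 channels =
       * 64 bytes, which DIV_ROUND_UP rounds up to two 32-byte GRFs
       * (assuming REG_SIZE == 32).
       */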

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
      }

      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0),
                                    fix_math_operand(src1)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1));
         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         /* Use the emit() methods for specific operand counts to ensure that
          * opcode-specific operand fixups occur.
          */
         if (n == 2) {
            return emit(opcode, dst, srcs[0], srcs[1]);
         } else if (n == 3) {
            return emit(opcode, dst, srcs[0], srcs[1], srcs[2]);
         } else {
            return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
         }
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         /* In some cases we can't have bytes as an operand for src1, so use
          * the same type for both operands.
          */
         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }
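
      /*
       * Illustrative use: emit_minmax(dst, a, b, BRW_CONDITIONAL_L) selects
       * the smaller of the two sources (a minimum), while BRW_CONDITIONAL_GE
       * selects the larger (a maximum).
       */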

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
          * dispatch. Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }
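
      /*
       * In other words, FIND_LIVE_CHANNEL picks the index of an enabled
       * channel and BROADCAST replicates that channel of \p src, so
       * component(dst, 0) can be read as a value that is uniform across the
       * group. An illustrative use (hypothetical names) is making a
       * divergent surface index uniform before emitting a send:
       *
       *    const fs_reg surface = bld.emit_uniformize(divergent_index);
       */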

      src_reg
      move_to_vgrf(const src_reg &src, unsigned num_components) const
      {
         src_reg *const src_comps = new src_reg[num_components];
         for (unsigned i = 0; i < num_components; i++)
            src_comps[i] = offset(src, dispatch_width(), i);

         const dst_reg dst = vgrf(src.type, num_components);
         LOAD_PAYLOAD(dst, src_comps, num_components, 0);

         delete[] src_comps;

         return src_reg(dst);
      }

      void
      emit_scan_step(enum opcode opcode, brw_conditional_mod mod,
                     const dst_reg &tmp,
                     unsigned left_offset, unsigned left_stride,
                     unsigned right_offset, unsigned right_stride) const
      {
         dst_reg left, right;
         left = horiz_stride(horiz_offset(tmp, left_offset), left_stride);
         right = horiz_stride(horiz_offset(tmp, right_offset), right_stride);
         if ((tmp.type == BRW_REGISTER_TYPE_Q ||
              tmp.type == BRW_REGISTER_TYPE_UQ) &&
             !shader->devinfo->has_64bit_int) {
            switch (opcode) {
            case BRW_OPCODE_MUL:
               /* This will get lowered by integer MUL lowering */
               set_condmod(mod, emit(opcode, right, left, right));
               break;

            case BRW_OPCODE_SEL: {
               /* In order for the comparisons to work out right, we need our
                * comparisons to be strict.
                */
               assert(mod == BRW_CONDITIONAL_L || mod == BRW_CONDITIONAL_GE);
               if (mod == BRW_CONDITIONAL_GE)
                  mod = BRW_CONDITIONAL_G;

               /* We treat the bottom 32 bits as unsigned regardless of
                * whether or not the integer as a whole is signed.
                */
               dst_reg right_low = subscript(right, BRW_REGISTER_TYPE_UD, 0);
               dst_reg left_low = subscript(left, BRW_REGISTER_TYPE_UD, 0);

               /* The upper bits get the same sign as the 64-bit type */
               brw_reg_type type32 = brw_reg_type_from_bit_size(32, tmp.type);
               dst_reg right_high = subscript(right, type32, 1);
               dst_reg left_high = subscript(left, type32, 1);

               /* Build up our comparison:
                *
                *   l_hi < r_hi || (l_hi == r_hi && l_low < r_low)
                */
               CMP(null_reg_ud(), retype(left_low, BRW_REGISTER_TYPE_UD),
                   retype(right_low, BRW_REGISTER_TYPE_UD), mod);
               set_predicate(BRW_PREDICATE_NORMAL,
                             CMP(null_reg_ud(), left_high, right_high,
                                 BRW_CONDITIONAL_EQ));
               set_predicate_inv(BRW_PREDICATE_NORMAL, true,
                                 CMP(null_reg_ud(), left_high, right_high, mod));

               /* We could use selects here or we could use predicated MOVs
                * because the destination and second source (if it were a SEL)
                * are the same.
                */
               set_predicate(BRW_PREDICATE_NORMAL, MOV(right_low, left_low));
               set_predicate(BRW_PREDICATE_NORMAL, MOV(right_high, left_high));
               break;
            }

            default:
               unreachable("Unsupported 64-bit scan op");
            }
         } else {
            set_condmod(mod, emit(opcode, right, left, right));
         }
      }

      void
      emit_scan(enum opcode opcode, const dst_reg &tmp,
                unsigned cluster_size, brw_conditional_mod mod) const
      {
         assert(dispatch_width() >= 8);

         /* The instruction splitting code isn't advanced enough to split
          * these so we need to handle that ourselves.
          */
         if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
            const unsigned half_width = dispatch_width() / 2;
            const fs_builder ubld = exec_all().group(half_width, 0);
            dst_reg left = tmp;
            dst_reg right = horiz_offset(tmp, half_width);
            ubld.emit_scan(opcode, left, cluster_size, mod);
            ubld.emit_scan(opcode, right, cluster_size, mod);
            if (cluster_size > half_width) {
               ubld.emit_scan_step(opcode, mod, tmp,
                                   half_width - 1, 0, half_width, 1);
            }
            return;
         }

         if (cluster_size > 1) {
            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
            ubld.emit_scan_step(opcode, mod, tmp, 0, 2, 1, 2);
         }

         if (cluster_size > 2) {
            if (type_sz(tmp.type) <= 4) {
               const fs_builder ubld =
                  exec_all().group(dispatch_width() / 4, 0);
               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 2, 4);
               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 3, 4);
            } else {
               /* For 64-bit types, we have to do things differently because
                * the code above would land us with destination strides that
                * the hardware can't handle. Fortunately, we'll only be
                * 8-wide in that case and it's the same number of
                * instructions.
                */
               const fs_builder ubld = exec_all().group(2, 0);
               for (unsigned i = 0; i < dispatch_width(); i += 4)
                  ubld.emit_scan_step(opcode, mod, tmp, i + 1, 0, i + 2, 1);
            }
         }

         for (unsigned i = 4;
              i < MIN2(cluster_size, dispatch_width());
              i *= 2) {
            const fs_builder ubld = exec_all().group(i, 0);
            ubld.emit_scan_step(opcode, mod, tmp, i - 1, 0, i, 1);

            if (dispatch_width() > i * 2)
               ubld.emit_scan_step(opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1);

            if (dispatch_width() > i * 4) {
               ubld.emit_scan_step(opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1);
               ubld.emit_scan_step(opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1);
            }
         }
      }
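
      /*
       * Illustrative trace: calling emit_scan on a SIMD8 builder with a
       * 32-bit tmp, cluster_size == 4 and BRW_OPCODE_ADD first combines
       * adjacent pairs (tmp[1] += tmp[0], tmp[3] += tmp[2], and so on),
       * then folds channel 1 into channels 2 and 3 and channel 5 into
       * channels 6 and 7, leaving an inclusive scan within each cluster of
       * four channels.
       */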

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0) const                 \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);                       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(ROL)
      ALU2(ROR)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          *   CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * CMPN: Behaves like CMP, but produces true if src1 is NaN.
       */
      instruction *
      CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          *   CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMPN, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gfx4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
       */
      instruction *
      CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           const src_reg &src2, brw_conditional_mod condition) const
      {
         /* CSEL only operates on floats, so we can't do integer </<=/>=/>
          * comparisons. Zero/non-zero (== and !=) comparisons almost work.
          * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
          */
         assert(src2.type == BRW_REGISTER_TYPE_F);

         return set_condmod(condition,
                            emit(BRW_OPCODE_CSEL,
                                 retype(dst, BRW_REGISTER_TYPE_F),
                                 retype(src0, BRW_REGISTER_TYPE_F),
                                 retype(src1, BRW_REGISTER_TYPE_F),
                                 src2));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->ver >= 6 && shader->devinfo->ver <= 10) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }
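
      /*
       * Either way the intended result is dst = x * (1 - a) + y * a, i.e.
       * LRP(dst, x, y, a) yields x when a is 0.0 and y when a is 1.0.
       */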

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written +=
               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
                     REG_SIZE);
         }

         return inst;
      }

      instruction *
      UNDEF(const dst_reg &dst) const
      {
         assert(dst.file == VGRF);
         instruction *inst = emit(SHADER_OPCODE_UNDEF,
                                  retype(dst, BRW_REGISTER_TYPE_UD));
         inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE;

         return inst;
      }

      backend_shader *shader;

   private:
      /**
       * Workaround for negation of UD registers. See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         switch (src.file) {
         case FIXED_GRF:
            /* FINISHME: Could handle scalar region, other stride=1 regions */
            if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
                src.width != BRW_WIDTH_8 ||
                src.hstride != BRW_HORIZONTAL_STRIDE_1)
               break;
            FALLTHROUGH;
         case ATTR:
         case VGRF:
         case UNIFORM:
         case IMM:
            return src;
         default:
            break;
         }

         dst_reg expanded = vgrf(src.type);
         MOV(expanded, src);
         return expanded;
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gfx6 math, so expand it out. We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gfx6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gfx7 relaxes most of the above restrictions, but still can't use IMM
          * operands to math.
          */
         if ((shader->devinfo->ver == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->ver == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif