Path: blob/21.2-virgl/src/intel/compiler/brw_fs_builder.h
/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(backend_shader *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }
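
      /*
       * A minimal usage sketch (illustrative only; `shader', `x', `y' and
       * `dst' are assumed to exist in the caller):
       *
       *    const fs_builder bld = fs_builder(shader, 16).at_end();
       *    const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
       *    bld.ADD(tmp, x, y);
       *    bld.MOV(dst, tmp);
       *
       * Each emit helper below returns the inserted instruction, so
       * modifiers such as a conditional mod can be set on the result.
       */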

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         fs_builder bld = *this;

         if (n <= dispatch_width() && i < dispatch_width() / n) {
            bld._group += i * n;
         } else {
            /* The requested channel group isn't a subset of the channel group
             * of this builder, which means that the resulting instructions
             * would use (potentially undefined) channel enable signals not
             * specified by the parent builder.  That's only valid if the
             * instruction doesn't have per-channel semantics, in which case
             * we should clear off the default group index in order to prevent
             * emitting instructions with channel group not aligned to their
             * own execution size.
             */
            assert(force_writemask_all);
            bld._group = 0;
         }

         bld._dispatch_width = n;
         return bld;
      }

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      quarter(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }
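
      /*
       * Illustrative example (`dst_hi' and `src_hi' are assumed names): in
       * a SIMD16 program, group(8, 1) yields a SIMD8 builder whose
       * instructions use the channel enables of channels 8..15 only:
       *
       *    const fs_builder ubld = bld.group(8, 1);
       *    ubld.MOV(dst_hi, src_hi);   // affects channels 8..15 only
       *
       * exec_all() is the escape hatch for instructions without
       * per-channel semantics, such as whole-register header moves.
       */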

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           REG_SIZE)),
                           type);
         else
            return retype(null_reg_ud(), type);
      }
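
      /*
       * Worked example (illustrative): with dispatch_width() == 16,
       * vgrf(BRW_REGISTER_TYPE_F, 2) allocates
       * DIV_ROUND_UP(2 * 4 * 16, REG_SIZE) = 128 / 32 = 4 GRFs, i.e. two
       * full 32-byte registers per logical component.
       */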

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
      }

      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0),
                                    fix_math_operand(src1)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1));
         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         /* Use the emit() methods for specific operand counts to ensure that
          * opcode-specific operand fixups occur.
          */
         if (n == 2) {
            return emit(opcode, dst, srcs[0], srcs[1]);
         } else if (n == 3) {
            return emit(opcode, dst, srcs[0], srcs[1], srcs[2]);
         } else {
            return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
         }
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         /* In some cases we can't have bytes as operand for src1, so use the
          * same type for both operands.
          */
         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send).  This uses 1 or 3 extra hw registers in 16 or 32 wide
          * dispatch.  Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }
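
      /*
       * Typical use (illustrative sketch; `surface' is an assumed divergent
       * source): produce a value that is uniform across the dispatch, e.g.
       * a surface index that must be scalar in a SEND descriptor:
       *
       *    const fs_reg usurf = bld.emit_uniformize(surface);
       */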

      src_reg
      move_to_vgrf(const src_reg &src, unsigned num_components) const
      {
         src_reg *const src_comps = new src_reg[num_components];
         for (unsigned i = 0; i < num_components; i++)
            src_comps[i] = offset(src, dispatch_width(), i);

         const dst_reg dst = vgrf(src.type, num_components);
         LOAD_PAYLOAD(dst, src_comps, num_components, 0);

         delete[] src_comps;

         return src_reg(dst);
      }

      void
      emit_scan_step(enum opcode opcode, brw_conditional_mod mod,
                     const dst_reg &tmp,
                     unsigned left_offset, unsigned left_stride,
                     unsigned right_offset, unsigned right_stride) const
      {
         dst_reg left, right;
         left = horiz_stride(horiz_offset(tmp, left_offset), left_stride);
         right = horiz_stride(horiz_offset(tmp, right_offset), right_stride);
         if ((tmp.type == BRW_REGISTER_TYPE_Q ||
              tmp.type == BRW_REGISTER_TYPE_UQ) &&
             !shader->devinfo->has_64bit_int) {
            switch (opcode) {
            case BRW_OPCODE_MUL:
               /* This will get lowered by integer MUL lowering */
               set_condmod(mod, emit(opcode, right, left, right));
               break;

            case BRW_OPCODE_SEL: {
               /* In order for the comparisons to work out right, we need our
                * comparisons to be strict.
                */
               assert(mod == BRW_CONDITIONAL_L || mod == BRW_CONDITIONAL_GE);
               if (mod == BRW_CONDITIONAL_GE)
                  mod = BRW_CONDITIONAL_G;

               /* We treat the bottom 32 bits as unsigned regardless of
                * whether or not the integer as a whole is signed.
                */
               dst_reg right_low = subscript(right, BRW_REGISTER_TYPE_UD, 0);
               dst_reg left_low = subscript(left, BRW_REGISTER_TYPE_UD, 0);

               /* The upper bits get the same sign as the 64-bit type */
               brw_reg_type type32 = brw_reg_type_from_bit_size(32, tmp.type);
               dst_reg right_high = subscript(right, type32, 1);
               dst_reg left_high = subscript(left, type32, 1);

               /* Build up our comparison:
                *
                * l_hi < r_hi || (l_hi == r_hi && l_low < r_low)
                */
               CMP(null_reg_ud(), retype(left_low, BRW_REGISTER_TYPE_UD),
                   retype(right_low, BRW_REGISTER_TYPE_UD), mod);
               set_predicate(BRW_PREDICATE_NORMAL,
                             CMP(null_reg_ud(), left_high, right_high,
                                 BRW_CONDITIONAL_EQ));
               set_predicate_inv(BRW_PREDICATE_NORMAL, true,
                                 CMP(null_reg_ud(), left_high, right_high, mod));

               /* We could use selects here or we could use predicated MOVs
                * because the destination and second source (if it were a SEL)
                * are the same.
                */
               set_predicate(BRW_PREDICATE_NORMAL, MOV(right_low, left_low));
               set_predicate(BRW_PREDICATE_NORMAL, MOV(right_high, left_high));
               break;
            }

            default:
               unreachable("Unsupported 64-bit scan op");
            }
         } else {
            set_condmod(mod, emit(opcode, right, left, right));
         }
      }

      void
      emit_scan(enum opcode opcode, const dst_reg &tmp,
                unsigned cluster_size, brw_conditional_mod mod) const
      {
         assert(dispatch_width() >= 8);

         /* The instruction splitting code isn't advanced enough to split
          * these, so we need to handle that ourselves.
          */
         if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
            const unsigned half_width = dispatch_width() / 2;
            const fs_builder ubld = exec_all().group(half_width, 0);
            dst_reg left = tmp;
            dst_reg right = horiz_offset(tmp, half_width);
            ubld.emit_scan(opcode, left, cluster_size, mod);
            ubld.emit_scan(opcode, right, cluster_size, mod);
            if (cluster_size > half_width) {
               ubld.emit_scan_step(opcode, mod, tmp,
                                   half_width - 1, 0, half_width, 1);
            }
            return;
         }

         if (cluster_size > 1) {
            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
            ubld.emit_scan_step(opcode, mod, tmp, 0, 2, 1, 2);
         }

         if (cluster_size > 2) {
            if (type_sz(tmp.type) <= 4) {
               const fs_builder ubld =
                  exec_all().group(dispatch_width() / 4, 0);
               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 2, 4);
               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 3, 4);
            } else {
               /* For 64-bit types, we have to do things differently because
                * the code above would land us with destination strides that
                * the hardware can't handle.  Fortunately, we'll only be
                * 8-wide in that case and it's the same number of
                * instructions.
                */
               const fs_builder ubld = exec_all().group(2, 0);
               for (unsigned i = 0; i < dispatch_width(); i += 4)
                  ubld.emit_scan_step(opcode, mod, tmp, i + 1, 0, i + 2, 1);
            }
         }

         for (unsigned i = 4;
              i < MIN2(cluster_size, dispatch_width());
              i *= 2) {
            const fs_builder ubld = exec_all().group(i, 0);
            ubld.emit_scan_step(opcode, mod, tmp, i - 1, 0, i, 1);

            if (dispatch_width() > i * 2)
               ubld.emit_scan_step(opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1);

            if (dispatch_width() > i * 4) {
               ubld.emit_scan_step(opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1);
               ubld.emit_scan_step(opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1);
            }
         }
      }
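
      /*
       * Illustrative trace (derived from the code above): a SIMD8 scan with
       * cluster_size == 8 on a 32-bit type emits
       *
       *    emit_scan_step(op, mod, tmp, 0, 2, 1, 2);   // 0→1, 2→3, 4→5, 6→7
       *    emit_scan_step(op, mod, tmp, 1, 4, 2, 4);   // 1→2, 5→6
       *    emit_scan_step(op, mod, tmp, 1, 4, 3, 4);   // 1→3, 5→7
       *    emit_scan_step(op, mod, tmp, 3, 0, 4, 1);   // 3→{4,5,6,7}
       *
       * where "a→b" means channel b is combined in place with channel a,
       * leaving tmp holding an inclusive scan of its original contents.
       */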

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(ROL)
      ALU2(ROR)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * CMPN: Behaves like CMP, but produces true if src1 is NaN.
       */
      instruction *
      CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMPN, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gfx4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
       */
      instruction *
      CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           const src_reg &src2, brw_conditional_mod condition) const
      {
         /* CSEL only operates on floats, so we can't do integer </<=/>=/>
          * comparisons.  Zero/non-zero (== and !=) comparisons almost work.
          * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
          */
         assert(src2.type == BRW_REGISTER_TYPE_F);

         return set_condmod(condition,
                            emit(BRW_OPCODE_CSEL,
                                 retype(dst, BRW_REGISTER_TYPE_F),
                                 retype(src0, BRW_REGISTER_TYPE_F),
                                 retype(src1, BRW_REGISTER_TYPE_F),
                                 src2));
      }
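
      /*
       * Example (illustrative; `a', `b' and `f' are assumed F-typed
       * registers):
       *
       *    bld.CSEL(dst, a, b, f, BRW_CONDITIONAL_GE);
       *
       * evaluates dst = (f >= 0.0f) ? a : b per channel.
       */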

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->ver >= 6 && shader->devinfo->ver <= 10) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written +=
               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
                     REG_SIZE);
         }

         return inst;
      }
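
      /*
       * Illustrative sketch (all names assumed): gather a message header
       * and two per-channel components into one contiguous payload register
       * range:
       *
       *    const fs_reg srcs[] = { header, comp0, comp1 };
       *    bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 1);
       */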

      instruction *
      UNDEF(const dst_reg &dst) const
      {
         assert(dst.file == VGRF);
         instruction *inst = emit(SHADER_OPCODE_UNDEF,
                                  retype(dst, BRW_REGISTER_TYPE_UD));
         inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE;

         return inst;
      }

      backend_shader *shader;

   private:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         switch (src.file) {
         case FIXED_GRF:
            /* FINISHME: Could handle scalar region, other stride=1 regions */
            if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
                src.width != BRW_WIDTH_8 ||
                src.hstride != BRW_HORIZONTAL_STRIDE_1)
               break;
            FALLTHROUGH;
         case ATTR:
         case VGRF:
         case UNIFORM:
         case IMM:
            return src;
         default:
            break;
         }

         dst_reg expanded = vgrf(src.type);
         MOV(expanded, src);
         return expanded;
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gfx6 math, so expand it out.  We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gfx6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gfx7 relaxes most of the above restrictions, but still can't use
          * IMM operands to math instructions.
          */
         if ((shader->devinfo->ver == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->ver == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif