Path: blob/21.2-virgl/src/intel/compiler/brw_fs_lower_regioning.cpp
4550 views
/*1* Copyright © 2018 Intel Corporation2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING19* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS20* IN THE SOFTWARE.21*/2223#include "brw_fs.h"24#include "brw_cfg.h"25#include "brw_fs_builder.h"2627using namespace brw;2829namespace {30/* From the SKL PRM Vol 2a, "Move":31*32* "A mov with the same source and destination type, no source modifier,33* and no saturation is a raw move. A packed byte destination region (B34* or UB type with HorzStride == 1 and ExecSize > 1) can only be written35* using raw move."36*/37bool38is_byte_raw_mov(const fs_inst *inst)39{40return type_sz(inst->dst.type) == 1 &&41inst->opcode == BRW_OPCODE_MOV &&42inst->src[0].type == inst->dst.type &&43!inst->saturate &&44!inst->src[0].negate &&45!inst->src[0].abs;46}4748/*49* Return an acceptable byte stride for the destination of an instruction50* that requires it to have some particular alignment.51*/52unsigned53required_dst_byte_stride(const fs_inst *inst)54{55if (inst->dst.is_accumulator()) {56/* If the destination is an accumulator, insist that we leave the57* stride alone. We cannot "fix" accumulator destinations by writing58* to a temporary and emitting a MOV into the original destination.59* For multiply instructions (our one use of the accumulator), the60* MUL writes the full 66 bits of the accumulator whereas the MOV we61* would emit only writes 33 bits and leaves the top 33 bits62* undefined.63*64* It's safe to just require the original stride here because the65* lowering pass will detect the mismatch in has_invalid_src_region66* and fix the sources of the multiply instead of the destination.67*/68return inst->dst.stride * type_sz(inst->dst.type);69} else if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&70!is_byte_raw_mov(inst)) {71return get_exec_type_size(inst);72} else {73/* Calculate the maximum byte stride and the minimum/maximum type74* size across all source and destination operands we are required to75* lower.76*/77unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type);78unsigned min_size = type_sz(inst->dst.type);79unsigned max_size = type_sz(inst->dst.type);8081for (unsigned i = 0; i < inst->sources; i++) {82if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {83const unsigned size = type_sz(inst->src[i].type);84max_stride = MAX2(max_stride, inst->src[i].stride * size);85min_size = MIN2(min_size, size);86max_size = MAX2(max_size, size);87}88}8990/* All operands involved in lowering need to fit in the calculated91* stride.92*/93assert(max_size <= 4 * min_size);9495/* Attempt to use the largest byte stride among all present operands,96* but never exceed a stride of 4 since that would lead to illegal97* destination regions during lowering.98*/99return MIN2(max_stride, 4 * min_size);100}101}102103/*104* Return an acceptable byte sub-register offset for the destination of an105* instruction that requires it to be aligned to the sub-register offset of106* the sources.107*/108unsigned109required_dst_byte_offset(const fs_inst *inst)110{111for (unsigned i = 0; i < inst->sources; i++) {112if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))113if (reg_offset(inst->src[i]) % REG_SIZE !=114reg_offset(inst->dst) % REG_SIZE)115return 0;116}117118return reg_offset(inst->dst) % REG_SIZE;119}120121/*122* Return whether the instruction has an unsupported channel bit layout123* specified for the i-th source region.124*/125bool126has_invalid_src_region(const intel_device_info *devinfo, const fs_inst *inst,127unsigned i)128{129if (is_unordered(inst) || inst->is_control_source(i))130return false;131132/* Empirical testing shows that Broadwell has a bug affecting half-float133* MAD instructions when any of its sources has a non-zero offset, such134* as:135*136* mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 1Q };137*138* We used to generate code like this for SIMD8 executions where we139* used to pack components Y and W of a vector at offset 16B of a SIMD140* register. The problem doesn't occur if the stride of the source is 0.141*/142if (devinfo->ver == 8 &&143inst->opcode == BRW_OPCODE_MAD &&144inst->src[i].type == BRW_REGISTER_TYPE_HF &&145reg_offset(inst->src[i]) % REG_SIZE > 0 &&146inst->src[i].stride != 0) {147return true;148}149150const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);151const unsigned src_byte_stride = inst->src[i].stride *152type_sz(inst->src[i].type);153const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;154const unsigned src_byte_offset = reg_offset(inst->src[i]) % REG_SIZE;155156return has_dst_aligned_region_restriction(devinfo, inst) &&157!is_uniform(inst->src[i]) &&158(src_byte_stride != dst_byte_stride ||159src_byte_offset != dst_byte_offset);160}161162/*163* Return whether the instruction has an unsupported channel bit layout164* specified for the destination region.165*/166bool167has_invalid_dst_region(const intel_device_info *devinfo,168const fs_inst *inst)169{170if (is_unordered(inst)) {171return false;172} else {173const brw_reg_type exec_type = get_exec_type(inst);174const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;175const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);176const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&177type_sz(inst->dst.type) < type_sz(exec_type);178179return (has_dst_aligned_region_restriction(devinfo, inst) &&180(required_dst_byte_stride(inst) != dst_byte_stride ||181required_dst_byte_offset(inst) != dst_byte_offset)) ||182(is_narrowing_conversion &&183required_dst_byte_stride(inst) != dst_byte_stride);184}185}186187/**188* Return a non-zero value if the execution type of the instruction is189* unsupported. The destination and sources matching the returned mask190* will be bit-cast to an integer type of appropriate size, lowering any191* source or destination modifiers into separate MOV instructions.192*/193unsigned194has_invalid_exec_type(const intel_device_info *devinfo, const fs_inst *inst)195{196switch (inst->opcode) {197case SHADER_OPCODE_SHUFFLE:198case SHADER_OPCODE_QUAD_SWIZZLE:199return has_dst_aligned_region_restriction(devinfo, inst) ?2000x1 : 0;201202case SHADER_OPCODE_BROADCAST:203case SHADER_OPCODE_MOV_INDIRECT:204return (((devinfo->verx10 == 70) ||205devinfo->is_cherryview || intel_device_info_is_9lp(devinfo) ||206devinfo->verx10 >= 125) && type_sz(inst->src[0].type) > 4) ||207(devinfo->verx10 >= 125 &&208brw_reg_type_is_floating_point(inst->src[0].type)) ?2090x1 : 0;210211default:212return 0;213}214}215216/*217* Return whether the instruction has unsupported source modifiers218* specified for the i-th source region.219*/220bool221has_invalid_src_modifiers(const intel_device_info *devinfo,222const fs_inst *inst, unsigned i)223{224return (!inst->can_do_source_mods(devinfo) &&225(inst->src[i].negate || inst->src[i].abs)) ||226((has_invalid_exec_type(devinfo, inst) & (1u << i)) &&227(inst->src[i].negate || inst->src[i].abs ||228inst->src[i].type != get_exec_type(inst)));229}230231/*232* Return whether the instruction has an unsupported type conversion233* specified for the destination.234*/235bool236has_invalid_conversion(const intel_device_info *devinfo, const fs_inst *inst)237{238switch (inst->opcode) {239case BRW_OPCODE_MOV:240return false;241case BRW_OPCODE_SEL:242return inst->dst.type != get_exec_type(inst);243default:244/* FIXME: We assume the opcodes not explicitly mentioned before just245* work fine with arbitrary conversions, unless they need to be246* bit-cast.247*/248return has_invalid_exec_type(devinfo, inst) &&249inst->dst.type != get_exec_type(inst);250}251}252253/**254* Return whether the instruction has unsupported destination modifiers.255*/256bool257has_invalid_dst_modifiers(const intel_device_info *devinfo, const fs_inst *inst)258{259return (has_invalid_exec_type(devinfo, inst) &&260(inst->saturate || inst->conditional_mod)) ||261has_invalid_conversion(devinfo, inst);262}263264/**265* Return whether the instruction has non-standard semantics for the266* conditional mod which don't cause the flag register to be updated with267* the comparison result.268*/269bool270has_inconsistent_cmod(const fs_inst *inst)271{272return inst->opcode == BRW_OPCODE_SEL ||273inst->opcode == BRW_OPCODE_CSEL ||274inst->opcode == BRW_OPCODE_IF ||275inst->opcode == BRW_OPCODE_WHILE;276}277278bool279lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);280}281282namespace brw {283/**284* Remove any modifiers from the \p i-th source region of the instruction,285* including negate, abs and any implicit type conversion to the execution286* type. Instead any source modifiers will be implemented as a separate287* MOV instruction prior to the original instruction.288*/289bool290lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)291{292assert(inst->components_read(i) == 1);293assert(v->devinfo->has_integer_dword_mul ||294inst->opcode != BRW_OPCODE_MUL ||295brw_reg_type_is_floating_point(get_exec_type(inst)) ||296MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4 ||297type_sz(inst->src[i].type) == get_exec_type_size(inst));298299const fs_builder ibld(v, block, inst);300const fs_reg tmp = ibld.vgrf(get_exec_type(inst));301302lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));303inst->src[i] = tmp;304305return true;306}307}308309namespace {310/**311* Remove any modifiers from the destination region of the instruction,312* including saturate, conditional mod and any implicit type conversion313* from the execution type. Instead any destination modifiers will be314* implemented as a separate MOV instruction after the original315* instruction.316*/317bool318lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)319{320const fs_builder ibld(v, block, inst);321const brw_reg_type type = get_exec_type(inst);322/* Not strictly necessary, but if possible use a temporary with the same323* channel alignment as the current destination in order to avoid324* violating the restrictions enforced later on by lower_src_region()325* and lower_dst_region(), which would introduce additional copy326* instructions into the program unnecessarily.327*/328const unsigned stride =329type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :330type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);331fs_reg tmp = ibld.vgrf(type, stride);332ibld.UNDEF(tmp);333tmp = horiz_stride(tmp, stride);334335/* Emit a MOV taking care of all the destination modifiers. */336fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);337mov->saturate = inst->saturate;338if (!has_inconsistent_cmod(inst))339mov->conditional_mod = inst->conditional_mod;340if (inst->opcode != BRW_OPCODE_SEL) {341mov->predicate = inst->predicate;342mov->predicate_inverse = inst->predicate_inverse;343}344mov->flag_subreg = inst->flag_subreg;345lower_instruction(v, block, mov);346347/* Point the original instruction at the temporary, and clean up any348* destination modifiers.349*/350assert(inst->size_written == inst->dst.component_size(inst->exec_size));351inst->dst = tmp;352inst->size_written = inst->dst.component_size(inst->exec_size);353inst->saturate = false;354if (!has_inconsistent_cmod(inst))355inst->conditional_mod = BRW_CONDITIONAL_NONE;356357assert(!inst->flags_written(v->devinfo) || !mov->predicate);358return true;359}360361/**362* Remove any non-trivial shuffling of data from the \p i-th source region363* of the instruction. Instead implement the region as a series of integer364* copies into a temporary with the same channel layout as the destination.365*/366bool367lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)368{369assert(inst->components_read(i) == 1);370const fs_builder ibld(v, block, inst);371const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /372type_sz(inst->src[i].type);373assert(stride > 0);374fs_reg tmp = ibld.vgrf(inst->src[i].type, stride);375ibld.UNDEF(tmp);376tmp = horiz_stride(tmp, stride);377378/* Emit a series of 32-bit integer copies with any source modifiers379* cleaned up (because their semantics are dependent on the type).380*/381const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),382false);383const unsigned n = type_sz(tmp.type) / type_sz(raw_type);384fs_reg raw_src = inst->src[i];385raw_src.negate = false;386raw_src.abs = false;387388for (unsigned j = 0; j < n; j++)389ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));390391/* Point the original instruction at the temporary, making sure to keep392* any source modifiers in the instruction.393*/394fs_reg lower_src = tmp;395lower_src.negate = inst->src[i].negate;396lower_src.abs = inst->src[i].abs;397inst->src[i] = lower_src;398399return true;400}401402/**403* Remove any non-trivial shuffling of data from the destination region of404* the instruction. Instead implement the region as a series of integer405* copies from a temporary with a channel layout compatible with the406* sources.407*/408bool409lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)410{411/* We cannot replace the result of an integer multiply which writes the412* accumulator because MUL+MACH pairs act on the accumulator as a 66-bit413* value whereas the MOV will act on only 32 or 33 bits of the414* accumulator.415*/416assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||417brw_reg_type_is_floating_point(inst->dst.type));418419const fs_builder ibld(v, block, inst);420const unsigned stride = required_dst_byte_stride(inst) /421type_sz(inst->dst.type);422assert(stride > 0);423fs_reg tmp = ibld.vgrf(inst->dst.type, stride);424ibld.UNDEF(tmp);425tmp = horiz_stride(tmp, stride);426427/* Emit a series of 32-bit integer copies from the temporary into the428* original destination.429*/430const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),431false);432const unsigned n = type_sz(tmp.type) / type_sz(raw_type);433434if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {435/* Note that in general we cannot simply predicate the copies on the436* same flag register as the original instruction, since it may have437* been overwritten by the instruction itself. Instead initialize438* the temporary with the previous contents of the destination439* register.440*/441for (unsigned j = 0; j < n; j++)442ibld.MOV(subscript(tmp, raw_type, j),443subscript(inst->dst, raw_type, j));444}445446for (unsigned j = 0; j < n; j++)447ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),448subscript(tmp, raw_type, j));449450/* Point the original instruction at the temporary, making sure to keep451* any destination modifiers in the instruction.452*/453assert(inst->size_written == inst->dst.component_size(inst->exec_size));454inst->dst = tmp;455inst->size_written = inst->dst.component_size(inst->exec_size);456457return true;458}459460/**461* Bit-cast sources and destination of the instruction to an appropriate462* integer type, to be used in cases where the instruction doesn't support463* some other execution type.464*/465bool466lower_exec_type(fs_visitor *v, bblock_t *block, fs_inst *inst)467{468assert(inst->dst.type == get_exec_type(inst));469const unsigned mask = has_invalid_exec_type(v->devinfo, inst);470const brw_reg_type raw_type = brw_int_type(type_sz(inst->dst.type), false);471472for (unsigned i = 0; i < inst->sources; i++) {473if (mask & (1u << i)) {474assert(inst->src[i].type == inst->dst.type);475inst->src[i].type = raw_type;476}477}478479inst->dst.type = raw_type;480481return true;482}483484/**485* Legalize the source and destination regioning controls of the specified486* instruction.487*/488bool489lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)490{491const intel_device_info *devinfo = v->devinfo;492bool progress = false;493494if (has_invalid_dst_modifiers(devinfo, inst))495progress |= lower_dst_modifiers(v, block, inst);496497if (has_invalid_dst_region(devinfo, inst))498progress |= lower_dst_region(v, block, inst);499500for (unsigned i = 0; i < inst->sources; i++) {501if (has_invalid_src_modifiers(devinfo, inst, i))502progress |= lower_src_modifiers(v, block, inst, i);503504if (has_invalid_src_region(devinfo, inst, i))505progress |= lower_src_region(v, block, inst, i);506}507508if (has_invalid_exec_type(devinfo, inst))509progress |= lower_exec_type(v, block, inst);510511return progress;512}513}514515bool516fs_visitor::lower_regioning()517{518bool progress = false;519520foreach_block_and_inst_safe(block, fs_inst, inst, cfg)521progress |= lower_instruction(this, block, inst);522523if (progress)524invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);525526return progress;527}528529530