Path: blob/21.2-virgl/src/intel/compiler/brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

#include "main/macros.h"
#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_fs_live_variables.h"
#include "brw_nir.h"
#include "brw_vec4_gs_visitor.h"
#include "brw_cfg.h"
#include "brw_dead_control_flow.h"
#include "dev/intel_debug.h"
#include "compiler/glsl_types.h"
#include "compiler/nir/nir_builder.h"
#include "program/prog_parameter.h"
#include "util/u_math.h"

using namespace brw;

static unsigned get_lowered_simd_width(const struct intel_device_info *devinfo,
                                       const fs_inst *inst);

void
fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
              const fs_reg *src, unsigned sources)
{
   memset((void*)this, 0, sizeof(*this));

   this->src = new fs_reg[MAX2(sources, 3)];
   for (unsigned i = 0; i < sources; i++)
      this->src[i] = src[i];

   this->opcode = opcode;
   this->dst = dst;
   this->sources = sources;
   this->exec_size = exec_size;
   this->base_mrf = -1;

   assert(dst.file != IMM && dst.file != UNIFORM);

   assert(this->exec_size != 0);

   this->conditional_mod = BRW_CONDITIONAL_NONE;

   /* This will be the case for almost all instructions. */
   switch (dst.file) {
   case VGRF:
   case ARF:
   case FIXED_GRF:
   case MRF:
   case ATTR:
      this->size_written = dst.component_size(exec_size);
      break;
   case BAD_FILE:
      this->size_written = 0;
      break;
   case IMM:
   case UNIFORM:
      unreachable("Invalid destination register file");
   }

   this->writes_accumulator = false;
}

fs_inst::fs_inst()
{
   init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
{
   init(opcode, exec_size, reg_undef, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
{
   init(opcode, exec_size, dst, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0)
{
   const fs_reg src[1] = { src0 };
   init(opcode, exec_size, dst, src, 1);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0, const fs_reg &src1)
{
   const fs_reg src[2] = { src0, src1 };
   init(opcode, exec_size, dst, src, 2);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
{
   const fs_reg src[3] = { src0, src1, src2 };
   init(opcode, exec_size, dst, src, 3);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
                 const fs_reg src[], unsigned sources)
{
   init(opcode, exec_width, dst, src, sources);
}

fs_inst::fs_inst(const fs_inst &that)
{
   memcpy((void*)this, &that, sizeof(that));

   this->src = new fs_reg[MAX2(that.sources, 3)];

   for (unsigned i = 0; i < that.sources; i++)
      this->src[i] = that.src[i];
}

fs_inst::~fs_inst()
{
   delete[] this->src;
}

void
fs_inst::resize_sources(uint8_t num_sources)
{
   if (this->sources != num_sources) {
      fs_reg *src = new fs_reg[MAX2(num_sources, 3)];

      for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
         src[i] = this->src[i];

      delete[] this->src;
      this->src = src;
      this->sources = num_sources;
   }
}

void
fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
                                       const fs_reg &dst,
                                       const fs_reg &surf_index,
                                       const fs_reg &varying_offset,
                                       uint32_t const_offset,
                                       uint8_t alignment)
{
   /* We have our constant surface use a pitch of 4 bytes, so our index can
    * be any component of a vector, and then we load 4 contiguous
    * components starting from that.
    *
    * We break down the const_offset to a portion added to the variable offset
    * and a portion done using fs_reg::offset, which means that if you have
    * GLSL using something like "uniform vec4 a[20]; gl_FragColor = a[i]",
    * we'll temporarily generate 4 vec4 loads from offset i * 4, and CSE can
    * later notice that those loads are all the same and eliminate the
    * redundant ones.
    */
   fs_reg vec4_offset = vgrf(glsl_type::uint_type);
   bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf));

   /* The pull load message will load a vec4 (16 bytes).  If we are loading
    * a double this means we are only loading 2 elements worth of data.
    * We also want to use a 32-bit data type for the dst of the load operation
    * so other parts of the driver don't get confused about the size of the
    * result.
    */
   fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
   fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
                            vec4_result, surf_index, vec4_offset,
                            brw_imm_ud(alignment));
   inst->size_written = 4 * vec4_result.component_size(inst->exec_size);

   shuffle_from_32bit_read(bld, dst, vec4_result,
                           (const_offset & 0xf) / type_sz(dst.type), 1);
}

/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 */
void
fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
{
   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   const fs_builder ubld = bld.annotate("send dependency resolve")
                              .quarter(0);

   ubld.MOV(ubld.null_reg_f(), fs_reg(VGRF, grf, BRW_REGISTER_TYPE_F));
}

bool
fs_inst::is_send_from_grf() const
{
   switch (opcode) {
   case SHADER_OPCODE_SEND:
   case SHADER_OPCODE_SHADER_TIME_ADD:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
   case SHADER_OPCODE_URB_WRITE_SIMD8:
   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
   case SHADER_OPCODE_URB_READ_SIMD8:
   case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
   case SHADER_OPCODE_INTERLOCK:
   case SHADER_OPCODE_MEMORY_FENCE:
   case SHADER_OPCODE_BARRIER:
      return true;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
      return src[1].file == VGRF;
   case FS_OPCODE_FB_WRITE:
   case FS_OPCODE_FB_READ:
      return src[0].file == VGRF;
   default:
      if (is_tex())
         return src[0].file == VGRF;

      return false;
   }
}

bool
fs_inst::is_control_source(unsigned arg) const
{
   switch (opcode) {
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
      return arg == 0;

   case SHADER_OPCODE_BROADCAST:
   case SHADER_OPCODE_SHUFFLE:
   case SHADER_OPCODE_QUAD_SWIZZLE:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
   case SHADER_OPCODE_GET_BUFFER_SIZE:
      return arg == 1;

   case SHADER_OPCODE_MOV_INDIRECT:
   case SHADER_OPCODE_CLUSTER_BROADCAST:
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_LZ:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_CMS_W:
   case SHADER_OPCODE_TXF_UMS:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXL_LZ:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
   case SHADER_OPCODE_SAMPLEINFO:
      return arg == 1 || arg == 2;

   case SHADER_OPCODE_SEND:
      return arg == 0 || arg == 1;

   default:
      return false;
   }
}

bool
fs_inst::is_payload(unsigned arg) const
{
   switch (opcode) {
   case FS_OPCODE_FB_WRITE:
   case FS_OPCODE_FB_READ:
   case SHADER_OPCODE_URB_WRITE_SIMD8:
   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
   case SHADER_OPCODE_URB_READ_SIMD8:
   case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
   case VEC4_OPCODE_UNTYPED_ATOMIC:
   case VEC4_OPCODE_UNTYPED_SURFACE_READ:
   case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
   case SHADER_OPCODE_SHADER_TIME_ADD:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case SHADER_OPCODE_INTERLOCK:
   case SHADER_OPCODE_MEMORY_FENCE:
   case SHADER_OPCODE_BARRIER:
      return arg == 0;

   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
      return arg == 1;

   case SHADER_OPCODE_SEND:
      return arg == 2 || arg == 3;

   default:
      if (is_tex())
         return arg == 0;
      else
         return false;
   }
}

/**
 * Returns true if this instruction's sources and destinations cannot
 * safely be the same register.
 *
 * In most cases, a register can be written over safely by the same
 * instruction that is its last use.  For a single instruction, the
 * sources are dereferenced before writing of the destination starts
 * (naturally).
 *
 * However, there are a few cases where this can be problematic:
 *
 *   - Virtual opcodes that translate to multiple instructions in the
 *     code generator: if src == dst and one instruction writes the
 *     destination before a later instruction reads the source, then
 *     src will have been clobbered.
 *
 *   - SIMD16 compressed instructions with certain regioning (see below).
 *
 * The register allocator uses this information to set up conflicts between
 * GRF sources and the destination.
 */
bool
fs_inst::has_source_and_destination_hazard() const
{
   switch (opcode) {
   case FS_OPCODE_PACK_HALF_2x16_SPLIT:
      /* Multiple partial writes to the destination */
      return true;
   case SHADER_OPCODE_SHUFFLE:
      /* This instruction returns an arbitrary channel from the source and
       * gets split into smaller instructions in the generator.  It's possible
       * that one of the instructions will read from a channel corresponding
       * to an earlier instruction.
       */
   case SHADER_OPCODE_SEL_EXEC:
      /* This is implemented as
       *
       * mov(16)      g4<1>D      0D            { align1 WE_all 1H };
       * mov(16)      g4<1>D      g5<8,8,1>D    { align1 1H }
       *
       * Because the source is only read in the second instruction, the first
       * may stomp all over it.
       */
      return true;
   case SHADER_OPCODE_QUAD_SWIZZLE:
      switch (src[1].ud) {
      case BRW_SWIZZLE_XXXX:
      case BRW_SWIZZLE_YYYY:
      case BRW_SWIZZLE_ZZZZ:
      case BRW_SWIZZLE_WWWW:
      case BRW_SWIZZLE_XXZZ:
      case BRW_SWIZZLE_YYWW:
      case BRW_SWIZZLE_XYXY:
      case BRW_SWIZZLE_ZWZW:
         /* These can be implemented as a single Align1 region on all
          * platforms, so there's never a hazard between source and
          * destination.  C.f. fs_generator::generate_quad_swizzle().
          */
         return false;
      default:
         return !is_uniform(src[0]);
      }
   default:
      /* The SIMD16 compressed instruction
       *
       *   add(16)      g4<1>F      g4<8,8,1>F   g6<8,8,1>F
       *
       * is actually decoded in hardware as:
       *
       *   add(8)       g4<1>F      g4<8,8,1>F   g6<8,8,1>F
       *   add(8)       g5<1>F      g5<8,8,1>F   g7<8,8,1>F
       *
       * Which is safe.  However, if we have uniform accesses
       * happening, we get into trouble:
       *
       *   add(8)       g4<1>F      g4<0,1,0>F   g6<8,8,1>F
       *   add(8)       g5<1>F      g4<0,1,0>F   g7<8,8,1>F
       *
       * Now our destination for the first instruction overwrote the
       * second instruction's src0, and we get garbage for those 8
       * pixels.  There's a similar issue for the pre-gfx6
       * pixel_x/pixel_y, which are registers of 16-bit values and thus
       * would get stomped by the first decode as well.
       */
      if (exec_size == 16) {
         for (int i = 0; i < sources; i++) {
            if (src[i].file == VGRF && (src[i].stride == 0 ||
                                        src[i].type == BRW_REGISTER_TYPE_UW ||
                                        src[i].type == BRW_REGISTER_TYPE_W ||
                                        src[i].type == BRW_REGISTER_TYPE_UB ||
                                        src[i].type == BRW_REGISTER_TYPE_B)) {
               return true;
            }
         }
      }
      return false;
   }
}

bool
fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
{
   if (devinfo->ver == 6 && is_math())
      return false;

   if (is_send_from_grf())
      return false;

   /* From Wa_1604601757:
    *
    * "When multiplying a DW and any lower precision integer, source modifier
    *  is not supported."
    */
   if (devinfo->ver >= 12 && (opcode == BRW_OPCODE_MUL ||
                              opcode == BRW_OPCODE_MAD)) {
      const brw_reg_type exec_type = get_exec_type(this);
      const unsigned min_type_sz = opcode == BRW_OPCODE_MAD ?
         MIN2(type_sz(src[1].type), type_sz(src[2].type)) :
         MIN2(type_sz(src[0].type), type_sz(src[1].type));

      if (brw_reg_type_is_integer(exec_type) &&
          type_sz(exec_type) >= 4 &&
          type_sz(exec_type) != min_type_sz)
         return false;
   }

   if (!backend_instruction::can_do_source_mods())
      return false;

   return true;
}

bool
fs_inst::can_do_cmod()
{
   if (!backend_instruction::can_do_cmod())
      return false;

   /* The accumulator result appears to get used for the conditional modifier
    * generation.  When negating a UD value, there is a 33rd bit generated for
    * the sign in the accumulator value, so now you can't check, for example,
    * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
    */
   for (unsigned i = 0; i < sources; i++) {
      if (type_is_unsigned_int(src[i].type) && src[i].negate)
         return false;
   }

   return true;
}

bool
fs_inst::can_change_types() const
{
   return dst.type == src[0].type &&
          !src[0].abs && !src[0].negate && !saturate &&
          (opcode == BRW_OPCODE_MOV ||
           (opcode == BRW_OPCODE_SEL &&
            dst.type == src[1].type &&
            predicate != BRW_PREDICATE_NONE &&
            !src[1].abs && !src[1].negate));
}

void
fs_reg::init()
{
   memset((void*)this, 0, sizeof(*this));
   type = BRW_REGISTER_TYPE_UD;
   stride = 1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

fs_reg::fs_reg(struct ::brw_reg reg) :
   backend_reg(reg)
{
   this->offset = 0;
   this->stride = 1;
   if (this->file == IMM &&
       (this->type != BRW_REGISTER_TYPE_V &&
        this->type != BRW_REGISTER_TYPE_UV &&
        this->type != BRW_REGISTER_TYPE_VF)) {
      this->stride = 0;
   }
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (this->backend_reg::equals(r) &&
           stride == r.stride);
}

bool
fs_reg::negative_equals(const fs_reg &r) const
{
   return (this->backend_reg::negative_equals(r) &&
           stride == r.stride);
}

bool
fs_reg::is_contiguous() const
{
   switch (file) {
   case ARF:
   case FIXED_GRF:
      return hstride == BRW_HORIZONTAL_STRIDE_1 &&
             vstride == width + hstride;
   case MRF:
   case VGRF:
   case ATTR:
      return stride == 1;
   case UNIFORM:
   case IMM:
   case BAD_FILE:
      return true;
   }

   unreachable("Invalid register file");
}

unsigned
fs_reg::component_size(unsigned width) const
{
   const unsigned stride = ((file != ARF && file != FIXED_GRF) ? this->stride :
                            hstride == 0 ? 0 :
                            1 << (hstride - 1));
   return MAX2(width * stride, 1) * type_sz(type);
}

/**
 * Create a MOV to read the timestamp register.
 */
fs_reg
fs_visitor::get_timestamp(const fs_builder &bld)
{
   assert(devinfo->ver >= 7);

   fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);

   /* We want to read the 3 fields we care about even if it's not enabled in
    * the dispatch.
    */
   bld.group(4, 0).exec_all().MOV(dst, ts);

   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   /* We want only the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    */
   shader_start_time = component(
      get_timestamp(bld.annotate("shader time start")), 0);
}

void
fs_visitor::emit_shader_time_end()
{
   /* Insert our code just before the final SEND with EOT. */
   exec_node *end = this->instructions.get_tail();
   assert(end && ((fs_inst *) end)->eot);
   const fs_builder ibld = bld.annotate("shader time end")
                              .exec_all().at(NULL, end);
   const fs_reg timestamp = get_timestamp(ibld);

   /* We only use the low 32 bits of the timestamp - see
    * emit_shader_time_begin().
    *
    * We could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   const fs_reg shader_end_time = component(timestamp, 0);

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   const fs_reg reset = component(timestamp, 2);
   set_condmod(BRW_CONDITIONAL_Z,
               ibld.AND(ibld.null_reg_ud(), reset, brw_imm_ud(1u)));
   ibld.IF(BRW_PREDICATE_NORMAL);

   fs_reg start = shader_start_time;
   start.negate = true;
   const fs_reg diff = component(fs_reg(VGRF, alloc.allocate(1),
                                        BRW_REGISTER_TYPE_UD),
                                 0);
   const fs_builder cbld = ibld.group(1, 0);
   cbld.group(1, 0).ADD(diff, start, shader_end_time);

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   cbld.ADD(diff, diff, brw_imm_ud(-2u));
   SHADER_TIME_ADD(cbld, 0, diff);
   SHADER_TIME_ADD(cbld, 1, brw_imm_ud(1u));
   ibld.emit(BRW_OPCODE_ELSE);
   SHADER_TIME_ADD(cbld, 2, brw_imm_ud(1u));
   ibld.emit(BRW_OPCODE_ENDIF);
}

void
fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
                            int shader_time_subindex,
                            fs_reg value)
{
   int index = shader_time_index * 3 + shader_time_subindex;
   struct brw_reg offset = brw_imm_d(index * BRW_SHADER_TIME_STRIDE);

   fs_reg payload;
   if (dispatch_width == 8)
      payload = vgrf(glsl_type::uvec2_type);
   else
      payload = vgrf(glsl_type::uint_type);

   bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
}

void
fs_visitor::vfail(const char *format, va_list va)
{
   char *msg;

   if (failed)
      return;

   failed = true;

   msg = ralloc_vasprintf(mem_ctx, format, va);
   msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
                         dispatch_width, stage_abbrev, msg);

   this->fail_msg = msg;

   if (unlikely(debug_enabled)) {
      fprintf(stderr, "%s", msg);
   }
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;

   va_start(va, format);
   vfail(format, va);
   va_end(va);
}

/**
 * Mark this program as impossible to compile with dispatch width greater
 * than n.
 *
 * During the SIMD8 compile (which happens first), we can detect and flag
 * things that are unsupported in SIMD16+ mode, so the compiler can skip the
 * SIMD16+ compile altogether.
 *
 * During a compile of dispatch width greater than n (if one happens anyway),
 * this just calls fail().
 */
void
fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
{
   if (dispatch_width > n) {
      fail("%s", msg);
   } else {
      max_dispatch_width = MIN2(max_dispatch_width, n);
      compiler->shader_perf_log(log_data,
                                "Shader dispatch width limited to SIMD%d: %s",
                                n, msg);
   }
}

/**
 * Returns true if the instruction has a flag that means it won't
 * update an entire destination register.
 *
 * For example, dead code elimination and live variable analysis want to know
 * when a write to a variable screens off any preceding values that were in
 * it.
 */
bool
fs_inst::is_partial_write() const
{
   return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
           (this->exec_size * type_sz(this->dst.type)) < 32 ||
           !this->dst.is_contiguous() ||
           this->dst.offset % REG_SIZE != 0);
}

unsigned
fs_inst::components_read(unsigned i) const
{
   /* Return zero if the source is not present. */
   if (src[i].file == BAD_FILE)
      return 0;

   switch (opcode) {
   case FS_OPCODE_LINTERP:
      if (i == 0)
         return 2;
      else
         return 1;

   case FS_OPCODE_PIXEL_X:
   case FS_OPCODE_PIXEL_Y:
      assert(i < 2);
      if (i == 0)
         return 2;
      else
         return 1;

   case FS_OPCODE_FB_WRITE_LOGICAL:
      assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
      /* First/second FB write color. */
      if (i < 2)
         return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
      else
         return 1;

   case SHADER_OPCODE_TEX_LOGICAL:
   case SHADER_OPCODE_TXD_LOGICAL:
   case SHADER_OPCODE_TXF_LOGICAL:
   case SHADER_OPCODE_TXL_LOGICAL:
   case SHADER_OPCODE_TXS_LOGICAL:
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
   case FS_OPCODE_TXB_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
   case SHADER_OPCODE_TXF_UMS_LOGICAL:
   case SHADER_OPCODE_TXF_MCS_LOGICAL:
   case SHADER_OPCODE_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
   case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
      assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
             src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
      /* Texture coordinates. */
      if (i == TEX_LOGICAL_SRC_COORDINATE)
         return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
      /* Texture derivatives. */
      else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
               opcode == SHADER_OPCODE_TXD_LOGICAL)
         return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
      /* Texture offset. */
      else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
         return 2;
      /* MCS */
      else if (i == TEX_LOGICAL_SRC_MCS && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
         return 2;
      else
         return 1;

   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM);
      /* Surface coordinates. */
      if (i == SURFACE_LOGICAL_SRC_ADDRESS)
         return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
      /* Surface operation source (ignored for reads). */
      else if (i == SURFACE_LOGICAL_SRC_DATA)
         return 0;
      else
         return 1;

   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
             src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
      /* Surface coordinates. */
      if (i == SURFACE_LOGICAL_SRC_ADDRESS)
         return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
      /* Surface operation source. */
      else if (i == SURFACE_LOGICAL_SRC_DATA)
         return src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
      else
         return 1;

   case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
   case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
   case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      assert(src[2].file == IMM);
      return 1;

   case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
      assert(src[2].file == IMM);
      if (i == 1) { /* data to write */
         const unsigned comps = src[2].ud / exec_size;
         assert(comps > 0);
         return comps;
      } else {
         return 1;
      }

   case SHADER_OPCODE_OWORD_BLOCK_READ_LOGICAL:
   case SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
      return 1;

   case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
      assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
      if (i == SURFACE_LOGICAL_SRC_DATA) {
         const unsigned comps = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud / exec_size;
         assert(comps > 0);
         return comps;
      } else {
         return 1;
      }

   case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      assert(src[2].file == IMM);
      return i == 1 ? src[2].ud : 1;

   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
      assert(src[2].file == IMM);
      if (i == 1) {
         /* Data source */
         const unsigned op = src[2].ud;
         switch (op) {
         case BRW_AOP_INC:
         case BRW_AOP_DEC:
         case BRW_AOP_PREDEC:
            return 0;
         case BRW_AOP_CMPWR:
            return 2;
         default:
            return 1;
         }
      } else {
         return 1;
      }

   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
      assert(src[2].file == IMM);
      if (i == 1) {
         /* Data source */
         const unsigned op = src[2].ud;
         return op == BRW_AOP_FCMPWR ? 2 : 1;
      } else {
         return 1;
      }

   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
   case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
      /* Scattered logical opcodes use the following params:
       * src[0] Surface coordinates
       * src[1] Surface operation source (ignored for reads)
       * src[2] Surface
       * src[3] IMM with always 1 dimension.
       * src[4] IMM with arg bitsize for scattered read/write 8, 16, 32
       */
      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
             src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
      return i == SURFACE_LOGICAL_SRC_DATA ? 0 : 1;

   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
   case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
             src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
      return 1;

   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
             src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
      const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
      /* Surface coordinates. */
      if (i == SURFACE_LOGICAL_SRC_ADDRESS)
         return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
      /* Surface operation source. */
      else if (i == SURFACE_LOGICAL_SRC_DATA && op == BRW_AOP_CMPWR)
         return 2;
      else if (i == SURFACE_LOGICAL_SRC_DATA &&
               (op == BRW_AOP_INC || op == BRW_AOP_DEC || op == BRW_AOP_PREDEC))
         return 0;
      else
         return 1;
   }
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
      return (i == 0 ? 2 : 1);

   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: {
      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
             src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
      const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
      /* Surface coordinates. */
      if (i == SURFACE_LOGICAL_SRC_ADDRESS)
         return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
      /* Surface operation source. */
      else if (i == SURFACE_LOGICAL_SRC_DATA && op == BRW_AOP_FCMPWR)
         return 2;
      else
         return 1;
   }

   default:
      return 1;
   }
}

unsigned
fs_inst::size_read(int arg) const
{
   switch (opcode) {
   case SHADER_OPCODE_SEND:
      if (arg == 2) {
         return mlen * REG_SIZE;
      } else if (arg == 3) {
         return ex_mlen * REG_SIZE;
      }
      break;

   case FS_OPCODE_FB_WRITE:
   case FS_OPCODE_REP_FB_WRITE:
      if (arg == 0) {
         if (base_mrf >= 0)
            return src[0].file == BAD_FILE ? 0 : 2 * REG_SIZE;
         else
            return mlen * REG_SIZE;
      }
      break;

   case FS_OPCODE_FB_READ:
   case SHADER_OPCODE_URB_WRITE_SIMD8:
   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
   case SHADER_OPCODE_URB_READ_SIMD8:
   case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      if (arg == 0)
         return mlen * REG_SIZE;
      break;

   case FS_OPCODE_SET_SAMPLE_ID:
      if (arg == 1)
         return 1;
      break;

   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
      /* The payload is actually stored in src1 */
      if (arg == 1)
         return mlen * REG_SIZE;
      break;

   case FS_OPCODE_LINTERP:
      if (arg == 1)
         return 16;
      break;

   case SHADER_OPCODE_LOAD_PAYLOAD:
      if (arg < this->header_size)
         return REG_SIZE;
      break;

   case CS_OPCODE_CS_TERMINATE:
   case SHADER_OPCODE_BARRIER:
      return REG_SIZE;

   case SHADER_OPCODE_MOV_INDIRECT:
      if (arg == 0) {
         assert(src[2].file == IMM);
         return src[2].ud;
      }
      break;

   default:
      if (is_tex() && arg == 0 && src[0].file == VGRF)
         return mlen * REG_SIZE;
      break;
   }

   switch (src[arg].file) {
   case UNIFORM:
   case IMM:
      return components_read(arg) * type_sz(src[arg].type);
   case BAD_FILE:
   case ARF:
   case FIXED_GRF:
   case VGRF:
   case ATTR:
      return components_read(arg) * src[arg].component_size(exec_size);
   case MRF:
      unreachable("MRF registers are not allowed as sources");
   }
   return 0;
}

namespace {
   unsigned
   predicate_width(brw_predicate predicate)
   {
      switch (predicate) {
      case BRW_PREDICATE_NONE:            return 1;
      case BRW_PREDICATE_NORMAL:          return 1;
      case BRW_PREDICATE_ALIGN1_ANY2H:    return 2;
      case BRW_PREDICATE_ALIGN1_ALL2H:    return 2;
      case BRW_PREDICATE_ALIGN1_ANY4H:    return 4;
      case BRW_PREDICATE_ALIGN1_ALL4H:    return 4;
      case BRW_PREDICATE_ALIGN1_ANY8H:    return 8;
      case BRW_PREDICATE_ALIGN1_ALL8H:    return 8;
      case BRW_PREDICATE_ALIGN1_ANY16H:   return 16;
      case BRW_PREDICATE_ALIGN1_ALL16H:   return 16;
      case BRW_PREDICATE_ALIGN1_ANY32H:   return 32;
      case BRW_PREDICATE_ALIGN1_ALL32H:   return 32;
      default: unreachable("Unsupported predicate");
      }
   }

   /* Return the subset of flag registers that an instruction could
    * potentially read or write based on the execution controls and flag
    * subregister number of the instruction.
    */
   unsigned
   flag_mask(const fs_inst *inst, unsigned width)
   {
      assert(util_is_power_of_two_nonzero(width));
      const unsigned start = (inst->flag_subreg * 16 + inst->group) &
                             ~(width - 1);
      const unsigned end = start + ALIGN(inst->exec_size, width);
      return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1);
   }

   unsigned
   bit_mask(unsigned n)
   {
      return (n >= CHAR_BIT * sizeof(bit_mask(n)) ? ~0u : (1u << n) - 1);
   }

   unsigned
   flag_mask(const fs_reg &r, unsigned sz)
   {
      if (r.file == ARF) {
         const unsigned start = (r.nr - BRW_ARF_FLAG) * 4 + r.subnr;
         const unsigned end = start + sz;
         return bit_mask(end) & ~bit_mask(start);
      } else {
         return 0;
      }
   }
}

unsigned
fs_inst::flags_read(const intel_device_info *devinfo) const
{
   if (predicate == BRW_PREDICATE_ALIGN1_ANYV ||
       predicate == BRW_PREDICATE_ALIGN1_ALLV) {
      /* The vertical predication modes combine corresponding bits from
       * f0.0 and f1.0 on Gfx7+, and f0.0 and f0.1 on older hardware.
       */
      const unsigned shift = devinfo->ver >= 7 ? 4 : 2;
      return flag_mask(this, 1) << shift | flag_mask(this, 1);
   } else if (predicate) {
      return flag_mask(this, predicate_width(predicate));
   } else {
      unsigned mask = 0;
      for (int i = 0; i < sources; i++) {
         mask |= flag_mask(src[i], size_read(i));
      }
      return mask;
   }
}

unsigned
fs_inst::flags_written(const intel_device_info *devinfo) const
{
   /* On Gfx4 and Gfx5, sel.l (for min) and sel.ge (for max) are implemented
    * using a separate cmpn and sel instruction.  This lowering occurs in
    * fs_visitor::lower_minmax which is called very, very late.
    */
   if ((conditional_mod && ((opcode != BRW_OPCODE_SEL || devinfo->ver <= 5) &&
                            opcode != BRW_OPCODE_CSEL &&
                            opcode != BRW_OPCODE_IF &&
                            opcode != BRW_OPCODE_WHILE)) ||
       opcode == FS_OPCODE_FB_WRITE) {
      return flag_mask(this, 1);
   } else if (opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL ||
              opcode == FS_OPCODE_LOAD_LIVE_CHANNELS) {
      return flag_mask(this, 32);
   } else {
      return flag_mask(dst, size_written);
   }
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
unsigned
fs_inst::implied_mrf_writes() const
{
   if (mlen == 0)
      return 0;

   if (base_mrf == -1)
      return 0;

   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * exec_size / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * exec_size / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
   case SHADER_OPCODE_SAMPLEINFO:
      return 1;
   case FS_OPCODE_FB_WRITE:
   case FS_OPCODE_REP_FB_WRITE:
      return src[0].file == BAD_FILE ? 0 : 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case SHADER_OPCODE_GFX4_SCRATCH_READ:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
      return mlen;
   case SHADER_OPCODE_GFX4_SCRATCH_WRITE:
      return mlen;
   default:
      unreachable("not reached");
   }
}

fs_reg
fs_visitor::vgrf(const glsl_type *const type)
{
   int reg_width = dispatch_width / 8;
   return fs_reg(VGRF,
                 alloc.allocate(glsl_count_dword_slots(type, false) * reg_width),
                 brw_type_for_base_type(type));
}

fs_reg::fs_reg(enum brw_reg_file file, int nr)
{
   init();
   this->file = file;
   this->nr = nr;
   this->type = BRW_REGISTER_TYPE_F;
   this->stride = (file == UNIFORM ? 0 : 1);
}

fs_reg::fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type)
{
   init();
   this->file = file;
   this->nr = nr;
   this->type = type;
   this->stride = (file == UNIFORM ? 0 : 1);
}

/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
 * This brings in those uniform definitions
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   this->push_constant_loc = v->push_constant_loc;
   this->pull_constant_loc = v->pull_constant_loc;
   this->uniforms = v->uniforms;
   this->subgroup_id = v->subgroup_id;
   for (unsigned i = 0; i < ARRAY_SIZE(this->group_size); i++)
      this->group_size[i] = v->group_size[i];
}

void
fs_visitor::emit_fragcoord_interpolation(fs_reg wpos)
{
   assert(stage == MESA_SHADER_FRAGMENT);

   /* gl_FragCoord.x */
   bld.MOV(wpos, this->pixel_x);
   wpos = offset(wpos, bld, 1);

   /* gl_FragCoord.y */
   bld.MOV(wpos, this->pixel_y);
   wpos = offset(wpos, bld, 1);

   /* gl_FragCoord.z */
   if (devinfo->ver >= 6) {
      bld.MOV(wpos, this->pixel_z);
   } else {
      bld.emit(FS_OPCODE_LINTERP, wpos,
               this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL],
               component(interp_reg(VARYING_SLOT_POS, 2), 0));
   }
   wpos = offset(wpos, bld, 1);

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   bld.MOV(wpos, this->wpos_w);
}

enum brw_barycentric_mode
brw_barycentric_mode(enum glsl_interp_mode mode, nir_intrinsic_op op)
{
   /* Barycentric modes don't make sense for flat inputs. */
   assert(mode != INTERP_MODE_FLAT);

   unsigned bary;
   switch (op) {
   case nir_intrinsic_load_barycentric_pixel:
   case nir_intrinsic_load_barycentric_at_offset:
      bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL;
      break;
   case nir_intrinsic_load_barycentric_centroid:
      bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
      break;
   case nir_intrinsic_load_barycentric_sample:
   case nir_intrinsic_load_barycentric_at_sample:
      bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
      break;
   default:
      unreachable("invalid intrinsic");
   }

   if (mode == INTERP_MODE_NOPERSPECTIVE)
      bary += 3;

   return (enum brw_barycentric_mode) bary;
}

/**
 * Turn one of the two CENTROID barycentric modes into PIXEL mode.
 */
static enum brw_barycentric_mode
centroid_to_pixel(enum brw_barycentric_mode bary)
{
   assert(bary == BRW_BARYCENTRIC_PERSPECTIVE_CENTROID ||
          bary == BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
   return (enum brw_barycentric_mode) ((unsigned) bary - 1);
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation()
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));

   if (devinfo->ver >= 12) {
      fs_reg g1 = fs_reg(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_W));

      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_W);
      bld.ASR(tmp, g1, brw_imm_d(15));
      bld.NOT(*reg, tmp);
   } else if (devinfo->ver >= 6) {
      /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
       * a boolean result from this (~0/true or 0/false).
       *
       * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
       * this task in only one instruction:
       *    - a negation source modifier will flip the bit; and
       *    - a W -> D type conversion will sign extend the bit into the high
       *      word of the destination.
       *
       * An ASR 15 fills the low word of the destination.
       */
      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
      g0.negate = true;

      bld.ASR(*reg, g0, brw_imm_d(15));
   } else {
      /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
       * a boolean result from this (1/true or 0/false).
       *
       * Like in the above case, since the bit is the MSB of g1.6:UD we can use
       * the negation source modifier to flip it.  Unfortunately the SHR
       * instruction only operates on UD (or D with an abs source modifier)
       * sources without negation.
       *
       * Instead, use ASR (which will give ~0/true or 0/false).
       */
      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
      g1_6.negate = true;

      bld.ASR(*reg, g1_6, brw_imm_d(31));
   }

   return reg;
}

void
fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
   assert(dst.type == BRW_REGISTER_TYPE_F);

   if (wm_prog_data->persample_dispatch) {
      /* Convert int_sample_pos to floating point */
      bld.MOV(dst, int_sample_pos);
      /* Scale to the range [0, 1] */
      bld.MUL(dst, dst, brw_imm_f(1 / 16.0f));
   }
   else {
      /* From ARB_sample_shading specification:
       * "When rendering to a non-multisample buffer, or if multisample
       *  rasterization is disabled, gl_SamplePosition will always be
       *  (0.5, 0.5)."
       */
      bld.MOV(dst, brw_imm_f(0.5f));
   }
}

fs_reg *
fs_visitor::emit_samplepos_setup()
{
   assert(devinfo->ver >= 6);

   const fs_builder abld = bld.annotate("compute sample position");
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
   fs_reg pos = *reg;
   fs_reg int_sample_x = vgrf(glsl_type::int_type);
   fs_reg int_sample_y = vgrf(glsl_type::int_type);

   /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
    * mode will be enabled.
    *
    * From the Ivy Bridge PRM, volume 2 part 1, page 344:
    *    R31.1:0   Position Offset X/Y for Slot[3:0]
    *    R31.3:2   Position Offset X/Y for Slot[7:4]
    *    .....
    *
    * The X, Y sample positions come in as bytes in thread payload. So, read
    * the positions using vstride=16, width=8, hstride=2.
    */
   const fs_reg sample_pos_reg =
      fetch_payload_reg(abld, payload.sample_pos_reg, BRW_REGISTER_TYPE_W);

   /* Compute gl_SamplePosition.x */
   abld.MOV(int_sample_x, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, 0));
   compute_sample_position(offset(pos, abld, 0), int_sample_x);

   /* Compute gl_SamplePosition.y */
   abld.MOV(int_sample_y, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, 1));
   compute_sample_position(offset(pos, abld, 1), int_sample_y);
   return reg;
}

fs_reg *
fs_visitor::emit_sampleid_setup()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   assert(devinfo->ver >= 6);

   const fs_builder abld = bld.annotate("compute sample id");
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uint_type));

   if (!key->multisample_fbo) {
      /* As per GL_ARB_sample_shading specification:
       * "When rendering to a non-multisample buffer, or if multisample
       *  rasterization is disabled, gl_SampleID will always be zero."
       */
      abld.MOV(*reg, brw_imm_d(0));
   } else if (devinfo->ver >= 8) {
      /* Sample ID comes in as 4-bit numbers in g1.0:
       *
       *    15:12 Slot 3 SampleID (only used in SIMD16)
       *     11:8 Slot 2 SampleID (only used in SIMD16)
       *      7:4 Slot 1 SampleID
       *      3:0 Slot 0 SampleID
       *
       * Each slot corresponds to four channels, so we want to replicate each
       * half-byte value to 4 channels in a row:
       *
       *    dst+0:    .7    .6    .5    .4    .3    .2    .1    .0
       *             7:4   7:4   7:4   7:4   3:0   3:0   3:0   3:0
       *
       *    dst+1:    .7    .6    .5    .4    .3    .2    .1    .0  (if SIMD16)
       *           15:12 15:12 15:12 15:12  11:8  11:8  11:8  11:8
       *
       * First, we read g1.0 with a <1,8,0>UB region, causing the first 8
       * channels to read the first byte (7:0), and the second group of 8
       * channels to read the second byte (15:8).  Then, we shift right by
       * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
       * values into place.  Finally, we AND with 0xf to keep the low nibble.
       *
       *    shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V
       *    and(16) dst<1>D tmp<8,8,1>W  0xf:W
       *
       * TODO: These payload bits exist on Gfx7 too, but they appear to always
       *       be zero, so this code fails to work.  We should find out why.
       */
      const fs_reg tmp = abld.vgrf(BRW_REGISTER_TYPE_UW);

      for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
         const fs_builder hbld = abld.group(MIN2(16, dispatch_width), i);
         hbld.SHR(offset(tmp, hbld, i),
                  stride(retype(brw_vec1_grf(1 + i, 0), BRW_REGISTER_TYPE_UB),
                         1, 8, 0),
                  brw_imm_v(0x44440000));
      }

      abld.AND(*reg, tmp, brw_imm_w(0xf));
   } else {
      const fs_reg t1 = component(abld.vgrf(BRW_REGISTER_TYPE_UD), 0);
      const fs_reg t2 = abld.vgrf(BRW_REGISTER_TYPE_UW);

      /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
       * 8x multisampling, subspan 0 will represent sample N (where N
       * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
       * 7. We can find the value of N by looking at R0.0 bits 7:6
       * ("Starting Sample Pair Index (SSPI)") and multiplying by two
       * (since samples are always delivered in pairs). That is, we
       * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
       * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
       * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
       * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
       * populating a temporary variable with the sequence (0, 1, 2, 3),
       * and then reading from it using vstride=1, width=4, hstride=0.
       * These computations hold good for 4x multisampling as well.
       *
       * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
       * the first four slots are sample 0 of subspan 0; the next four
       * are sample 1 of subspan 0; the third group is sample 0 of
       * subspan 1, and finally sample 1 of subspan 1.
       */

      /* SKL+ has an extra bit for the Starting Sample Pair Index to
       * accommodate 16x MSAA.
       */
      abld.exec_all().group(1, 0)
          .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
               brw_imm_ud(0xc0));
      abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5));

      /* This works for SIMD8-SIMD16.  It also works for SIMD32 but only if we
       * can assume 4x MSAA.  Disallow it on IVB+
       *
       * FINISHME: One day, we could come up with a way to do this that
       * actually works on gfx7.
       */
      if (devinfo->ver >= 7)
         limit_dispatch_width(16, "gl_SampleId is unsupported in SIMD32 on gfx7");
      abld.exec_all().group(8, 0).MOV(t2, brw_imm_v(0x32103210));

      /* This special instruction takes care of setting vstride=1,
       * width=4, hstride=0 of t2 during an ADD instruction.
       */
      abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
   }

   return reg;
}

fs_reg *
fs_visitor::emit_samplemaskin_setup()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
   assert(devinfo->ver >= 6);

   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));

   /* The HW doesn't provide us with expected values. */
   assert(!wm_prog_data->per_coarse_pixel_dispatch);

   fs_reg coverage_mask =
      fetch_payload_reg(bld, payload.sample_mask_in_reg, BRW_REGISTER_TYPE_D);

   if (wm_prog_data->persample_dispatch) {
      /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
       * and a mask representing which sample is being processed by the
       * current shader invocation.
       *
       * From the OES_sample_variables specification:
       * "When per-sample shading is active due to the use of a fragment input
       *  qualified by "sample" or due to the use of the gl_SampleID or
       *  gl_SamplePosition variables, only the bit for the current sample is
       *  set in gl_SampleMaskIn."
       */
      const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");

      if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
         nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();

      fs_reg one = vgrf(glsl_type::int_type);
      fs_reg enabled_mask = vgrf(glsl_type::int_type);
      abld.MOV(one, brw_imm_d(1));
      abld.SHL(enabled_mask, one, nir_system_values[SYSTEM_VALUE_SAMPLE_ID]);
      abld.AND(*reg, enabled_mask, coverage_mask);
   } else {
      /* In per-pixel mode, the coverage mask is sufficient. */
      *reg = coverage_mask;
   }
   return reg;
}

fs_reg *
fs_visitor::emit_shading_rate_setup()
{
   assert(devinfo->ver >= 11);

   const fs_builder abld = bld.annotate("compute fragment shading rate");

   fs_reg *reg = new(this->mem_ctx) fs_reg(bld.vgrf(BRW_REGISTER_TYPE_UD));

   struct brw_wm_prog_data *wm_prog_data =
      brw_wm_prog_data(bld.shader->stage_prog_data);

   /* Coarse pixel shading size fields overlap with other fields when not in
    * coarse pixel dispatch mode, so report 0 when that's not the case.
    */
   if (wm_prog_data->per_coarse_pixel_dispatch) {
      /* The shading rates provided in the shader are the actual 2D shading
       * rate while the SPIR-V built-in is the enum value that has the shading
       * rate encoded as a bitfield.  Fortunately, the bitfield value is just
       * the shading rate divided by two and shifted.
       */

      /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */
      fs_reg actual_x = fs_reg(retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UB));
      /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */
      fs_reg actual_y = byte_offset(actual_x, 1);

      fs_reg int_rate_x = bld.vgrf(BRW_REGISTER_TYPE_UD);
      fs_reg int_rate_y = bld.vgrf(BRW_REGISTER_TYPE_UD);

      abld.SHR(int_rate_y, actual_y, brw_imm_ud(1));
      abld.SHR(int_rate_x, actual_x, brw_imm_ud(1));
      abld.SHL(int_rate_x, int_rate_x, brw_imm_ud(2));
      abld.OR(*reg, int_rate_x, int_rate_y);
   } else {
      abld.MOV(*reg, brw_imm_ud(0));
   }

   return reg;
}

fs_reg
fs_visitor::resolve_source_modifiers(const fs_reg &src)
{
   if (!src.abs && !src.negate)
      return src;

   fs_reg temp = bld.vgrf(src.type);
   bld.MOV(temp, src);

   return temp;
}

void
fs_visitor::emit_gs_thread_end()
{
   assert(stage == MESA_SHADER_GEOMETRY);

   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);

   if (gs_compile->control_data_header_size_bits > 0) {
      emit_gs_control_data_bits(this->final_gs_vertex_count);
   }

   const fs_builder abld = bld.annotate("thread end");
   fs_inst *inst;

   if (gs_prog_data->static_vertex_count != -1) {
      foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
         if (prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8 ||
             prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
             prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
             prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) {
            prev->eot = true;

            /* Delete now dead instructions. */
            foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
               if (dead == prev)
                  break;
               dead->remove();
            }
            return;
         } else if (prev->is_control_flow() || prev->has_side_effects()) {
            break;
         }
      }
      fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
      abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)));
      inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, hdr);
      inst->mlen = 1;
   } else {
      fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2);
      fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
      sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
      sources[1] = this->final_gs_vertex_count;
      abld.LOAD_PAYLOAD(payload, sources, 2, 2);
      inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
      inst->mlen = 2;
   }
   inst->eot = true;
   inst->offset = 0;
}

void
fs_visitor::assign_curb_setup()
{
   unsigned uniform_push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);

   unsigned ubo_push_length = 0;
   unsigned ubo_push_start[4];
   for (int i = 0; i < 4; i++) {
      ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
      ubo_push_length += stage_prog_data->ubo_ranges[i].length;
   }

   prog_data->curb_read_length = uniform_push_length + ubo_push_length;

   uint64_t used = 0;

   if (stage == MESA_SHADER_COMPUTE &&
       brw_cs_prog_data(prog_data)->uses_inline_data) {
      /* With COMPUTE_WALKER, we can push up to one register worth of data via
       * the inline data parameter in the COMPUTE_WALKER command itself.
       *
       * TODO: Support inline data and push at the same time.
       */
      assert(devinfo->verx10 >= 125);
      assert(uniform_push_length <= 1);
   } else if (stage == MESA_SHADER_COMPUTE && devinfo->verx10 >= 125) {
      fs_builder ubld = bld.exec_all().group(8, 0).at(
         cfg->first_block(), cfg->first_block()->start());

      /* The base address for our push data is passed in as R0.0[31:6].  We
       * have to mask off the bottom 6 bits.
       */
      fs_reg base_addr = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      ubld.group(1, 0).AND(base_addr,
                           retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
                           brw_imm_ud(INTEL_MASK(31, 6)));

      fs_reg header0 = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      ubld.MOV(header0, brw_imm_ud(0));
      ubld.group(1, 0).SHR(component(header0, 2), base_addr, brw_imm_ud(4));

      /* On Gfx12-HP we load constants at the start of the program using A32
       * stateless messages.
       */
      for (unsigned i = 0; i < uniform_push_length;) {
         /* Limit ourselves to HW limit of 8 Owords (8 * 16bytes = 128 bytes
          * or 4 registers).
          */
         unsigned num_regs = MIN2(uniform_push_length - i, 4);
         assert(num_regs > 0);
         num_regs = 1 << util_logbase2(num_regs);

         fs_reg header;
         if (i == 0) {
            header = header0;
         } else {
            header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
            ubld.MOV(header, brw_imm_ud(0));
            ubld.group(1, 0).ADD(component(header, 2),
                                 component(header0, 2),
                                 brw_imm_ud(i * 2));
         }

         fs_reg srcs[4] = {
            brw_imm_ud(0), /* desc */
            brw_imm_ud(0), /* ex_desc */
            header, /* payload */
            fs_reg(), /* payload2 */
         };

         fs_reg dest = retype(brw_vec8_grf(payload.num_regs + i, 0),
                              BRW_REGISTER_TYPE_UD);

         /* This instruction has to be run SIMD16 if we're filling more than a
          * single register.
          */
         unsigned send_width = MIN2(16, num_regs * 8);

         fs_inst *send = ubld.group(send_width, 0).emit(SHADER_OPCODE_SEND,
                                                        dest, srcs, 4);
         send->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
         send->desc = brw_dp_desc(devinfo, GFX8_BTI_STATELESS_NON_COHERENT,
                                  GFX7_DATAPORT_DC_OWORD_BLOCK_READ,
                                  BRW_DATAPORT_OWORD_BLOCK_OWORDS(num_regs * 2));
         send->header_size = 1;
         send->mlen = 1;
         send->size_written = num_regs * REG_SIZE;
         send->send_is_volatile = true;

         i += num_regs;
      }

      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (unsigned int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == UNIFORM) {
            int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
            int constant_nr;
            if (inst->src[i].nr >= UBO_START) {
               /* constant_nr is in 32-bit units, the rest are in bytes */
               constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
                             inst->src[i].offset / 4;
            } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
               constant_nr = push_constant_loc[uniform_nr];
            } else {
               /* Section 5.11 of the OpenGL 4.1 spec says:
                * "Out-of-bounds reads return undefined values, which include
                *  values from other variables of the active program or zero."
                * Just return the first push constant.
                */
               constant_nr = 0;
            }

            assert(constant_nr / 8 < 64);
            used |= BITFIELD64_BIT(constant_nr / 8);

            struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);
            brw_reg.abs = inst->src[i].abs;
            brw_reg.negate = inst->src[i].negate;

            assert(inst->src[i].stride == 0);
            inst->src[i] = byte_offset(
               retype(brw_reg, inst->src[i].type),
               inst->src[i].offset % 4);
         }
      }
   }

   uint64_t want_zero = used & stage_prog_data->zero_push_reg;
   if (want_zero) {
      assert(!compiler->compact_params);
      fs_builder ubld = bld.exec_all().group(8, 0).at(
         cfg->first_block(), cfg->first_block()->start());

      /* push_reg_mask_param is in 32-bit units */
      unsigned mask_param = stage_prog_data->push_reg_mask_param;
      struct brw_reg mask = brw_vec1_grf(payload.num_regs + mask_param / 8,
                                         mask_param % 8);

      fs_reg b32;
      for (unsigned i = 0; i < 64; i++) {
         if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
            fs_reg shifted = ubld.vgrf(BRW_REGISTER_TYPE_W, 2);
            ubld.SHL(horiz_offset(shifted, 8),
                     byte_offset(retype(mask, BRW_REGISTER_TYPE_W), i / 8),
                     brw_imm_v(0x01234567));
            ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));

            fs_builder ubld16 = ubld.group(16, 0);
            b32 = ubld16.vgrf(BRW_REGISTER_TYPE_D);
            ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
         }

         if (want_zero & BITFIELD64_BIT(i)) {
            assert(i < prog_data->curb_read_length);
            struct brw_reg push_reg =
               retype(brw_vec8_grf(payload.num_regs + i, 0),
                      BRW_REGISTER_TYPE_D);

            ubld.AND(push_reg, push_reg, component(b32, i % 16));
         }
      }

      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
   }

   /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
   this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length;
}

/*
 * Build up an array of indices into the urb_setup array that
 * references the active entries of the urb_setup array.
 * Used to accelerate walking the active entries of the urb_setup array
 * on each upload.
 */
void
brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data)
{
   /* Make sure uint8_t is sufficient */
   STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
   uint8_t index = 0;
   for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
      if (wm_prog_data->urb_setup[attr] >= 0) {
         wm_prog_data->urb_setup_attribs[index++] = attr;
      }
   }
   wm_prog_data->urb_setup_attribs_count = index;
}

static void
calculate_urb_setup(const struct intel_device_info *devinfo,
                    const struct brw_wm_prog_key *key,
                    struct brw_wm_prog_data *prog_data,
                    const nir_shader *nir)
{
   memset(prog_data->urb_setup, -1,
          sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (devinfo->ver >= 6) {
      if (util_bitcount64(nir->info.inputs_read &
                          BRW_FS_VARYING_INPUT_MASK) <= 16) {
         /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
          * first 16 varying inputs, so we can put them wherever we want.
          * Just put them in order.
          *
          * This is useful because it means that (a) inputs not used by the
          * fragment shader won't take up valuable register space, and (b) we
          * won't have to recompile the fragment shader if it gets paired with
          * a different vertex (or geometry) shader.
          */
         for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
            if (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
                BITFIELD64_BIT(i)) {
               prog_data->urb_setup[i] = urb_next++;
            }
         }
      } else {
         /* We have enough input varyings that the SF/SBE pipeline stage can't
          * arbitrarily rearrange them to suit our whim; we have to put them
          * in an order that matches the output of the previous pipeline stage
          * (geometry or vertex shader).
          */

         /* Re-compute the VUE map here in the case that the one coming from
          * geometry has more than one position slot (used for Primitive
          * Replication).
          */
         struct brw_vue_map prev_stage_vue_map;
         brw_compute_vue_map(devinfo, &prev_stage_vue_map,
                             key->input_slots_valid,
                             nir->info.separate_shader, 1);

         int first_slot =
            brw_compute_first_urb_slot_required(nir->info.inputs_read,
                                                &prev_stage_vue_map);

         assert(prev_stage_vue_map.num_slots <= first_slot + 32);
         for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
              slot++) {
            int varying = prev_stage_vue_map.slot_to_varying[slot];
            if (varying != BRW_VARYING_SLOT_PAD &&
                (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
                 BITFIELD64_BIT(varying))) {
               prog_data->urb_setup[varying] = slot - first_slot;
            }
         }
         urb_next = prev_stage_vue_map.num_slots - first_slot;
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VARYING_SLOT_PSIZ)
            continue;

         if (key->input_slots_valid & BITFIELD64_BIT(i)) {
            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
So the register number must always be1928* incremented, mapped or not.1929*/1930if (_mesa_varying_slot_in_fs((gl_varying_slot) i))1931prog_data->urb_setup[i] = urb_next;1932urb_next++;1933}1934}19351936/*1937* It's a FS only attribute, and we did interpolation for this attribute1938* in SF thread. So, count it here, too.1939*1940* See compile_sf_prog() for more info.1941*/1942if (nir->info.inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))1943prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;1944}19451946prog_data->num_varying_inputs = urb_next;1947prog_data->inputs = nir->info.inputs_read;19481949brw_compute_urb_setup_index(prog_data);1950}19511952void1953fs_visitor::assign_urb_setup()1954{1955assert(stage == MESA_SHADER_FRAGMENT);1956struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);19571958int urb_start = payload.num_regs + prog_data->base.curb_read_length;19591960/* Offset all the urb_setup[] index by the actual position of the1961* setup regs, now that the location of the constants has been chosen.1962*/1963foreach_block_and_inst(block, fs_inst, inst, cfg) {1964for (int i = 0; i < inst->sources; i++) {1965if (inst->src[i].file == ATTR) {1966/* ATTR regs in the FS are in units of logical scalar inputs each1967* of which consumes half of a GRF register.1968*/1969assert(inst->src[i].offset < REG_SIZE / 2);1970const unsigned grf = urb_start + inst->src[i].nr / 2;1971const unsigned offset = (inst->src[i].nr % 2) * (REG_SIZE / 2) +1972inst->src[i].offset;1973const unsigned width = inst->src[i].stride == 0 ?19741 : MIN2(inst->exec_size, 8);1975struct brw_reg reg = stride(1976byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),1977offset),1978width * inst->src[i].stride,1979width, inst->src[i].stride);1980reg.abs = inst->src[i].abs;1981reg.negate = inst->src[i].negate;1982inst->src[i] = reg;1983}1984}1985}19861987/* Each attribute is 4 setup channels, each of which is half a reg. */1988this->first_non_payload_grf += prog_data->num_varying_inputs * 2;1989}19901991void1992fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)1993{1994for (int i = 0; i < inst->sources; i++) {1995if (inst->src[i].file == ATTR) {1996int grf = payload.num_regs +1997prog_data->curb_read_length +1998inst->src[i].nr +1999inst->src[i].offset / REG_SIZE;20002001/* As explained at brw_reg_from_fs_reg, From the Haswell PRM:2002*2003* VertStride must be used to cross GRF register boundaries. This2004* rule implies that elements within a 'Width' cannot cross GRF2005* boundaries.2006*2007* So, for registers that are large enough, we have to split the exec2008* size in two and trust the compression state to sort it out.2009*/2010unsigned total_size = inst->exec_size *2011inst->src[i].stride *2012type_sz(inst->src[i].type);20132014assert(total_size <= 2 * REG_SIZE);2015const unsigned exec_size =2016(total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;20172018unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;2019struct brw_reg reg =2020stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),2021inst->src[i].offset % REG_SIZE),2022exec_size * inst->src[i].stride,2023width, inst->src[i].stride);2024reg.abs = inst->src[i].abs;2025reg.negate = inst->src[i].negate;20262027inst->src[i] = reg;2028}2029}2030}20312032void2033fs_visitor::assign_vs_urb_setup()2034{2035struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data);20362037assert(stage == MESA_SHADER_VERTEX);20382039/* Each attribute is 4 regs. 
*/2040this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;20412042assert(vs_prog_data->base.urb_read_length <= 15);20432044/* Rewrite all ATTR file references to the hw grf that they land in. */2045foreach_block_and_inst(block, fs_inst, inst, cfg) {2046convert_attr_sources_to_hw_regs(inst);2047}2048}20492050void2051fs_visitor::assign_tcs_urb_setup()2052{2053assert(stage == MESA_SHADER_TESS_CTRL);20542055/* Rewrite all ATTR file references to HW_REGs. */2056foreach_block_and_inst(block, fs_inst, inst, cfg) {2057convert_attr_sources_to_hw_regs(inst);2058}2059}20602061void2062fs_visitor::assign_tes_urb_setup()2063{2064assert(stage == MESA_SHADER_TESS_EVAL);20652066struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);20672068first_non_payload_grf += 8 * vue_prog_data->urb_read_length;20692070/* Rewrite all ATTR file references to HW_REGs. */2071foreach_block_and_inst(block, fs_inst, inst, cfg) {2072convert_attr_sources_to_hw_regs(inst);2073}2074}20752076void2077fs_visitor::assign_gs_urb_setup()2078{2079assert(stage == MESA_SHADER_GEOMETRY);20802081struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);20822083first_non_payload_grf +=20848 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;20852086foreach_block_and_inst(block, fs_inst, inst, cfg) {2087/* Rewrite all ATTR file references to GRFs. */2088convert_attr_sources_to_hw_regs(inst);2089}2090}209120922093/**2094* Split large virtual GRFs into separate components if we can.2095*2096* This is mostly duplicated with what brw_fs_vector_splitting does,2097* but that's really conservative because it's afraid of doing2098* splitting that doesn't result in real progress after the rest of2099* the optimization phases, which would cause infinite looping in2100* optimization. We can do it once here, safely. This also has the2101* opportunity to split interpolated values, or maybe even uniforms,2102* which we don't have at the IR level.2103*2104* We want to split, because virtual GRFs are what we register2105* allocate and spill (due to contiguousness requirements for some2106* instructions), and they're what we naturally generate in the2107* codegen process, but most virtual GRFs don't actually need to be2108* contiguous sets of GRFs. If we split, we'll end up with reduced2109* live intervals and better dead code elimination and coalescing.2110*/2111void2112fs_visitor::split_virtual_grfs()2113{2114/* Compact the register file so we eliminate dead vgrfs. This2115* only defines split points for live registers, so if we have2116* too large dead registers they will hit assertions later.2117*/2118compact_virtual_grfs();21192120int num_vars = this->alloc.count;21212122/* Count the total number of registers */2123int reg_count = 0;2124int vgrf_to_reg[num_vars];2125for (int i = 0; i < num_vars; i++) {2126vgrf_to_reg[i] = reg_count;2127reg_count += alloc.sizes[i];2128}21292130/* An array of "split points". For each register slot, this indicates2131* if this slot can be separated from the previous slot. Every time an2132* instruction uses multiple elements of a register (as a source or2133* destination), we mark the used slots as inseparable. 
Then we go
    * through and split the registers into the smallest pieces we can.
    */
   bool *split_points = new bool[reg_count];
   memset(split_points, 0, reg_count * sizeof(*split_points));

   /* Mark all used registers as fully splittable */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == VGRF) {
         int reg = vgrf_to_reg[inst->dst.nr];
         for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++)
            split_points[reg + j] = true;
      }

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            int reg = vgrf_to_reg[inst->src[i].nr];
            for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++)
               split_points[reg + j] = true;
         }
      }
   }

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      /* We fix up undef instructions later */
      if (inst->opcode == SHADER_OPCODE_UNDEF) {
         /* UNDEF instructions are currently only used to undef entire
          * registers.  We need this invariant later when we split them.
          */
         assert(inst->dst.file == VGRF);
         assert(inst->dst.offset == 0);
         assert(inst->size_written == alloc.sizes[inst->dst.nr] * REG_SIZE);
         continue;
      }

      if (inst->dst.file == VGRF) {
         int reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
         for (unsigned j = 1; j < regs_written(inst); j++)
            split_points[reg + j] = false;
      }
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            int reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
            for (unsigned j = 1; j < regs_read(inst, i); j++)
               split_points[reg + j] = false;
         }
      }
   }

   int *new_virtual_grf = new int[reg_count];
   int *new_reg_offset = new int[reg_count];

   int reg = 0;
   for (int i = 0; i < num_vars; i++) {
      /* The first one should always be 0 as a quick sanity check. 
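       * (The marking loops above start at j = 1, so the first slot of a
       * VGRF is never recorded as a split point.)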
       */
      assert(split_points[reg] == false);

      /* j = 0 case */
      new_reg_offset[reg] = 0;
      reg++;
      int offset = 1;

      /* j > 0 case */
      for (unsigned j = 1; j < alloc.sizes[i]; j++) {
         /* If this is a split point, reset the offset to 0 and allocate a
          * new virtual GRF for the previous offset many registers
          */
         if (split_points[reg]) {
            assert(offset <= MAX_VGRF_SIZE);
            int grf = alloc.allocate(offset);
            for (int k = reg - offset; k < reg; k++)
               new_virtual_grf[k] = grf;
            offset = 0;
         }
         new_reg_offset[reg] = offset;
         offset++;
         reg++;
      }

      /* The last one gets the original register number */
      assert(offset <= MAX_VGRF_SIZE);
      alloc.sizes[i] = offset;
      for (int k = reg - offset; k < reg; k++)
         new_virtual_grf[k] = i;
   }
   assert(reg == reg_count);

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      if (inst->opcode == SHADER_OPCODE_UNDEF) {
         const fs_builder ibld(this, block, inst);
         assert(inst->size_written % REG_SIZE == 0);
         unsigned reg_offset = 0;
         while (reg_offset < inst->size_written / REG_SIZE) {
            reg = vgrf_to_reg[inst->dst.nr] + reg_offset;
            ibld.UNDEF(fs_reg(VGRF, new_virtual_grf[reg], inst->dst.type));
            reg_offset += alloc.sizes[new_virtual_grf[reg]];
         }
         inst->remove(block);
         continue;
      }

      if (inst->dst.file == VGRF) {
         reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
         inst->dst.nr = new_virtual_grf[reg];
         inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
                            inst->dst.offset % REG_SIZE;
         assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
      }
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
            inst->src[i].nr = new_virtual_grf[reg];
            inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
                                  inst->src[i].offset % REG_SIZE;
            assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
         }
      }
   }
   invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);

   delete[] split_points;
   delete[] new_virtual_grf;
   delete[] new_reg_offset;
}

/**
 * Remove unused virtual GRFs and compact the vgrf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
bool
fs_visitor::compact_virtual_grfs()
{
   bool progress = false;
   int *remap_table = new int[this->alloc.count];
   memset(remap_table, -1, this->alloc.count * sizeof(int));

   /* Mark which virtual GRFs are used. */
   foreach_block_and_inst(block, const fs_inst, inst, cfg) {
      if (inst->dst.file == VGRF)
         remap_table[inst->dst.nr] = 0;

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF)
            remap_table[inst->src[i].nr] = 0;
      }
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (unsigned i = 0; i < this->alloc.count; i++) {
      if (remap_table[i] == -1) {
         /* We just found an unused register. 
This means that we are
          * actually going to compact something.
          */
         progress = true;
      } else {
         remap_table[i] = new_index;
         alloc.sizes[new_index] = alloc.sizes[i];
         invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
         ++new_index;
      }
   }

   this->alloc.count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == VGRF)
         inst->dst.nr = remap_table[inst->dst.nr];

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF)
            inst->src[i].nr = remap_table[inst->src[i].nr];
      }
   }

   /* Patch all the references to delta_xy, since they're used in register
    * allocation.  If they're unused, switch them to BAD_FILE so we don't
    * think some random VGRF is delta_xy.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
      if (delta_xy[i].file == VGRF) {
         if (remap_table[delta_xy[i].nr] != -1) {
            delta_xy[i].nr = remap_table[delta_xy[i].nr];
         } else {
            delta_xy[i].file = BAD_FILE;
         }
      }
   }

   delete[] remap_table;

   return progress;
}

static int
get_subgroup_id_param_index(const intel_device_info *devinfo,
                            const brw_stage_prog_data *prog_data)
{
   if (prog_data->nr_params == 0)
      return -1;

   if (devinfo->verx10 >= 125)
      return -1;

   /* The local thread id is always the last parameter in the list */
   uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
   if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
      return prog_data->nr_params - 1;

   return -1;
}

/**
 * Struct for handling complex alignments.
 *
 * A complex alignment is stored as multiplier and an offset.  A value is
 * considered to be aligned if it is {offset} larger than a multiple of {mul}.
 * For instance, with an alignment of {8, 2}, cplx_align_apply would do the
 * following:
 *
 *    N  | cplx_align_apply({8, 2}, N)
 *  -----+-----------------------------
 *    4  | 6
 *    6  | 6
 *    8  | 14
 *   10  | 14
 *   12  | 14
 *   14  | 14
 *   16  | 22
 */
struct cplx_align {
   unsigned mul:4;
   unsigned offset:4;
};

#define CPLX_ALIGN_MAX_MUL 8

static void
cplx_align_assert_sane(struct cplx_align a)
{
   assert(a.mul > 0 && util_is_power_of_two_nonzero(a.mul));
   assert(a.offset < a.mul);
}

/**
 * Combines two alignments to produce a least multiple of sorts.
 *
 * The returned alignment is the smallest (in terms of multiplier) such that
 * anything aligned to both a and b will be aligned to the new alignment.
 * This function will assert-fail if a and b are not compatible, i.e. if the
 * offset parameters are such that no common alignment is possible.
 */
static struct cplx_align
cplx_align_combine(struct cplx_align a, struct cplx_align b)
{
   cplx_align_assert_sane(a);
   cplx_align_assert_sane(b);

   /* Assert that the alignments agree. */
   assert((a.offset & (b.mul - 1)) == (b.offset & (a.mul - 1)));

   return a.mul > b.mul ? 
a : b;2401}24022403/**2404* Apply a complex alignment2405*2406* This function will return the smallest number greater than or equal to2407* offset that is aligned to align.2408*/2409static unsigned2410cplx_align_apply(struct cplx_align align, unsigned offset)2411{2412return ALIGN(offset - align.offset, align.mul) + align.offset;2413}24142415#define UNIFORM_SLOT_SIZE 424162417struct uniform_slot_info {2418/** True if the given uniform slot is live */2419unsigned is_live:1;24202421/** True if this slot and the next slot must remain contiguous */2422unsigned contiguous:1;24232424struct cplx_align align;2425};24262427static void2428mark_uniform_slots_read(struct uniform_slot_info *slots,2429unsigned num_slots, unsigned alignment)2430{2431assert(alignment > 0 && util_is_power_of_two_nonzero(alignment));2432assert(alignment <= CPLX_ALIGN_MAX_MUL);24332434/* We can't align a slot to anything less than the slot size */2435alignment = MAX2(alignment, UNIFORM_SLOT_SIZE);24362437struct cplx_align align = {alignment, 0};2438cplx_align_assert_sane(align);24392440for (unsigned i = 0; i < num_slots; i++) {2441slots[i].is_live = true;2442if (i < num_slots - 1)2443slots[i].contiguous = true;24442445align.offset = (i * UNIFORM_SLOT_SIZE) & (align.mul - 1);2446if (slots[i].align.mul == 0) {2447slots[i].align = align;2448} else {2449slots[i].align = cplx_align_combine(slots[i].align, align);2450}2451}2452}24532454/**2455* Assign UNIFORM file registers to either push constants or pull constants.2456*2457* We allow a fragment shader to have more than the specified minimum2458* maximum number of fragment shader uniform components (64). If2459* there are too many of these, they'd fill up all of register space.2460* So, this will push some of them out to the pull constant buffer and2461* update the program to load them.2462*/2463void2464fs_visitor::assign_constant_locations()2465{2466/* Only the first compile gets to decide on locations. */2467if (push_constant_loc) {2468assert(pull_constant_loc);2469return;2470}24712472if (compiler->compact_params) {2473struct uniform_slot_info slots[uniforms + 1];2474memset(slots, 0, sizeof(slots));24752476foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {2477for (int i = 0 ; i < inst->sources; i++) {2478if (inst->src[i].file != UNIFORM)2479continue;24802481/* NIR tightly packs things so the uniform number might not be2482* aligned (if we have a double right after a float, for2483* instance). This is fine because the process of re-arranging2484* them will ensure that things are properly aligned. 
The offset2485* into that uniform, however, must be aligned.2486*2487* In Vulkan, we have explicit offsets but everything is crammed2488* into a single "variable" so inst->src[i].nr will always be 0.2489* Everything will be properly aligned relative to that one base.2490*/2491assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0);24922493unsigned u = inst->src[i].nr +2494inst->src[i].offset / UNIFORM_SLOT_SIZE;24952496if (u >= uniforms)2497continue;24982499unsigned slots_read;2500if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {2501slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE);2502} else {2503unsigned bytes_read = inst->components_read(i) *2504type_sz(inst->src[i].type);2505slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE);2506}25072508assert(u + slots_read <= uniforms);2509mark_uniform_slots_read(&slots[u], slots_read,2510type_sz(inst->src[i].type));2511}2512}25132514int subgroup_id_index = get_subgroup_id_param_index(devinfo,2515stage_prog_data);25162517/* Only allow 16 registers (128 uniform components) as push constants.2518*2519* Just demote the end of the list. We could probably do better2520* here, demoting things that are rarely used in the program first.2521*2522* If changing this value, note the limitation about total_regs in2523* brw_curbe.c.2524*/2525unsigned int max_push_components = 16 * 8;2526if (subgroup_id_index >= 0)2527max_push_components--; /* Save a slot for the thread ID */25282529/* We push small arrays, but no bigger than 16 floats. This is big2530* enough for a vec4 but hopefully not large enough to push out other2531* stuff. We should probably use a better heuristic at some point.2532*/2533const unsigned int max_chunk_size = 16;25342535unsigned int num_push_constants = 0;2536unsigned int num_pull_constants = 0;25372538push_constant_loc = ralloc_array(mem_ctx, int, uniforms);2539pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);25402541/* Default to -1 meaning no location */2542memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));2543memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));25442545int chunk_start = -1;2546struct cplx_align align;2547for (unsigned u = 0; u < uniforms; u++) {2548if (!slots[u].is_live) {2549assert(chunk_start == -1);2550continue;2551}25522553/* Skip subgroup_id_index to put it in the last push register. 
*/2554if (subgroup_id_index == (int)u)2555continue;25562557if (chunk_start == -1) {2558chunk_start = u;2559align = slots[u].align;2560} else {2561/* Offset into the chunk */2562unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE;25632564/* Shift the slot alignment down by the chunk offset so it is2565* comparable with the base chunk alignment.2566*/2567struct cplx_align slot_align = slots[u].align;2568slot_align.offset =2569(slot_align.offset - chunk_offset) & (align.mul - 1);25702571align = cplx_align_combine(align, slot_align);2572}25732574/* Sanity check the alignment */2575cplx_align_assert_sane(align);25762577if (slots[u].contiguous)2578continue;25792580/* Adjust the alignment to be in terms of slots, not bytes */2581assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0);2582assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0);2583align.mul /= UNIFORM_SLOT_SIZE;2584align.offset /= UNIFORM_SLOT_SIZE;25852586unsigned push_start_align = cplx_align_apply(align, num_push_constants);2587unsigned chunk_size = u - chunk_start + 1;2588if ((!compiler->supports_pull_constants && u < UBO_START) ||2589(chunk_size < max_chunk_size &&2590push_start_align + chunk_size <= max_push_components)) {2591/* Align up the number of push constants */2592num_push_constants = push_start_align;2593for (unsigned i = 0; i < chunk_size; i++)2594push_constant_loc[chunk_start + i] = num_push_constants++;2595} else {2596/* We need to pull this one */2597num_pull_constants = cplx_align_apply(align, num_pull_constants);2598for (unsigned i = 0; i < chunk_size; i++)2599pull_constant_loc[chunk_start + i] = num_pull_constants++;2600}26012602/* Reset the chunk and start again */2603chunk_start = -1;2604}26052606/* Add the CS local thread ID uniform at the end of the push constants */2607if (subgroup_id_index >= 0)2608push_constant_loc[subgroup_id_index] = num_push_constants++;26092610/* As the uniforms are going to be reordered, stash the old array and2611* create two new arrays for push/pull params.2612*/2613uint32_t *param = stage_prog_data->param;2614stage_prog_data->nr_params = num_push_constants;2615if (num_push_constants) {2616stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t,2617num_push_constants);2618} else {2619stage_prog_data->param = NULL;2620}2621assert(stage_prog_data->nr_pull_params == 0);2622assert(stage_prog_data->pull_param == NULL);2623if (num_pull_constants > 0) {2624stage_prog_data->nr_pull_params = num_pull_constants;2625stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t,2626num_pull_constants);2627}26282629/* Up until now, the param[] array has been indexed by reg + offset2630* of UNIFORM registers. Move pull constants into pull_param[] and2631* condense param[] to only contain the uniforms we chose to push.2632*2633* NOTE: Because we are condensing the params[] array, we know that2634* push_constant_loc[i] <= i and we can do it in one smooth loop without2635* having to make a copy.2636*/2637for (unsigned int i = 0; i < uniforms; i++) {2638uint32_t value = param[i];2639if (pull_constant_loc[i] != -1) {2640stage_prog_data->pull_param[pull_constant_loc[i]] = value;2641} else if (push_constant_loc[i] != -1) {2642stage_prog_data->param[push_constant_loc[i]] = value;2643}2644}2645ralloc_free(param);2646} else {2647/* If we don't want to compact anything, just set up dummy push/pull2648* arrays. 
All the rest of the compiler cares about are these arrays.2649*/2650push_constant_loc = ralloc_array(mem_ctx, int, uniforms);2651pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);26522653for (unsigned u = 0; u < uniforms; u++)2654push_constant_loc[u] = u;26552656memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));2657}26582659/* Now that we know how many regular uniforms we'll push, reduce the2660* UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.2661*/2662/* For gen4/5:2663* Only allow 16 registers (128 uniform components) as push constants.2664*2665* If changing this value, note the limitation about total_regs in2666* brw_curbe.c/crocus_state.c2667*/2668const unsigned max_push_length = compiler->devinfo->ver < 6 ? 16 : 64;2669unsigned push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);2670for (int i = 0; i < 4; i++) {2671struct brw_ubo_range *range = &prog_data->ubo_ranges[i];26722673if (push_length + range->length > max_push_length)2674range->length = max_push_length - push_length;26752676push_length += range->length;2677}2678assert(push_length <= max_push_length);2679}26802681bool2682fs_visitor::get_pull_locs(const fs_reg &src,2683unsigned *out_surf_index,2684unsigned *out_pull_index)2685{2686assert(src.file == UNIFORM);26872688if (src.nr >= UBO_START) {2689const struct brw_ubo_range *range =2690&prog_data->ubo_ranges[src.nr - UBO_START];26912692/* If this access is in our (reduced) range, use the push data. */2693if (src.offset / 32 < range->length)2694return false;26952696*out_surf_index = prog_data->binding_table.ubo_start + range->block;2697*out_pull_index = (32 * range->start + src.offset) / 4;26982699prog_data->has_ubo_pull = true;2700return true;2701}27022703const unsigned location = src.nr + src.offset / 4;27042705if (location < uniforms && pull_constant_loc[location] != -1) {2706/* A regular uniform push constant */2707*out_surf_index = stage_prog_data->binding_table.pull_constants_start;2708*out_pull_index = pull_constant_loc[location];27092710prog_data->has_ubo_pull = true;2711return true;2712}27132714return false;2715}27162717/**2718* Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD2719* or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.2720*/2721void2722fs_visitor::lower_constant_loads()2723{2724unsigned index, pull_index;27252726foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {2727/* Set up the annotation tracking for new generated instructions. */2728const fs_builder ibld(this, block, inst);27292730for (int i = 0; i < inst->sources; i++) {2731if (inst->src[i].file != UNIFORM)2732continue;27332734/* We'll handle this case later */2735if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)2736continue;27372738if (!get_pull_locs(inst->src[i], &index, &pull_index))2739continue;27402741assert(inst->src[i].stride == 0);27422743const unsigned block_sz = 64; /* Fetch one cacheline at a time. */2744const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);2745const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);2746const unsigned base = pull_index * 4;27472748ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,2749dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1)));27502751/* Rewrite the instruction to use the temporary VGRF. 
*/2752inst->src[i].file = VGRF;2753inst->src[i].nr = dst.nr;2754inst->src[i].offset = (base & (block_sz - 1)) +2755inst->src[i].offset % 4;2756}27572758if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&2759inst->src[0].file == UNIFORM) {27602761if (!get_pull_locs(inst->src[0], &index, &pull_index))2762continue;27632764VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,2765brw_imm_ud(index),2766inst->src[1],2767pull_index * 4, 4);2768inst->remove(block);2769}2770}2771invalidate_analysis(DEPENDENCY_INSTRUCTIONS);2772}27732774bool2775fs_visitor::opt_algebraic()2776{2777bool progress = false;27782779foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {2780switch (inst->opcode) {2781case BRW_OPCODE_MOV:2782if (!devinfo->has_64bit_float &&2783!devinfo->has_64bit_int &&2784(inst->dst.type == BRW_REGISTER_TYPE_DF ||2785inst->dst.type == BRW_REGISTER_TYPE_UQ ||2786inst->dst.type == BRW_REGISTER_TYPE_Q)) {2787assert(inst->dst.type == inst->src[0].type);2788assert(!inst->saturate);2789assert(!inst->src[0].abs);2790assert(!inst->src[0].negate);2791const brw::fs_builder ibld(this, block, inst);27922793ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),2794subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1));2795ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),2796subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0));27972798inst->remove(block);2799progress = true;2800}28012802if ((inst->conditional_mod == BRW_CONDITIONAL_Z ||2803inst->conditional_mod == BRW_CONDITIONAL_NZ) &&2804inst->dst.is_null() &&2805(inst->src[0].abs || inst->src[0].negate)) {2806inst->src[0].abs = false;2807inst->src[0].negate = false;2808progress = true;2809break;2810}28112812if (inst->src[0].file != IMM)2813break;28142815if (inst->saturate) {2816/* Full mixed-type saturates don't happen. However, we can end up2817* with things like:2818*2819* mov.sat(8) g21<1>DF -1F2820*2821* Other mixed-size-but-same-base-type cases may also be possible.2822*/2823if (inst->dst.type != inst->src[0].type &&2824inst->dst.type != BRW_REGISTER_TYPE_DF &&2825inst->src[0].type != BRW_REGISTER_TYPE_F)2826assert(!"unimplemented: saturate mixed types");28272828if (brw_saturate_immediate(inst->src[0].type,2829&inst->src[0].as_brw_reg())) {2830inst->saturate = false;2831progress = true;2832}2833}2834break;28352836case BRW_OPCODE_MUL:2837if (inst->src[1].file != IMM)2838continue;28392840/* a * 1.0 = a */2841if (inst->src[1].is_one()) {2842inst->opcode = BRW_OPCODE_MOV;2843inst->src[1] = reg_undef;2844progress = true;2845break;2846}28472848/* a * -1.0 = -a */2849if (inst->src[1].is_negative_one()) {2850inst->opcode = BRW_OPCODE_MOV;2851inst->src[0].negate = !inst->src[0].negate;2852inst->src[1] = reg_undef;2853progress = true;2854break;2855}28562857break;2858case BRW_OPCODE_ADD:2859if (inst->src[1].file != IMM)2860continue;28612862if (brw_reg_type_is_integer(inst->src[1].type) &&2863inst->src[1].is_zero()) {2864inst->opcode = BRW_OPCODE_MOV;2865inst->src[1] = reg_undef;2866progress = true;2867break;2868}28692870if (inst->src[0].file == IMM) {2871assert(inst->src[0].type == BRW_REGISTER_TYPE_F);2872inst->opcode = BRW_OPCODE_MOV;2873inst->src[0].f += inst->src[1].f;2874inst->src[1] = reg_undef;2875progress = true;2876break;2877}2878break;2879case BRW_OPCODE_OR:2880if (inst->src[0].equals(inst->src[1]) ||2881inst->src[1].is_zero()) {2882/* On Gfx8+, the OR instruction can have a source modifier that2883* performs logical not on the operand. 
Cases of 'OR r0, ~r1, 0'2884* or 'OR r0, ~r1, ~r1' should become a NOT instead of a MOV.2885*/2886if (inst->src[0].negate) {2887inst->opcode = BRW_OPCODE_NOT;2888inst->src[0].negate = false;2889} else {2890inst->opcode = BRW_OPCODE_MOV;2891}2892inst->src[1] = reg_undef;2893progress = true;2894break;2895}2896break;2897case BRW_OPCODE_CMP:2898if ((inst->conditional_mod == BRW_CONDITIONAL_Z ||2899inst->conditional_mod == BRW_CONDITIONAL_NZ) &&2900inst->src[1].is_zero() &&2901(inst->src[0].abs || inst->src[0].negate)) {2902inst->src[0].abs = false;2903inst->src[0].negate = false;2904progress = true;2905break;2906}2907break;2908case BRW_OPCODE_SEL:2909if (!devinfo->has_64bit_float &&2910!devinfo->has_64bit_int &&2911(inst->dst.type == BRW_REGISTER_TYPE_DF ||2912inst->dst.type == BRW_REGISTER_TYPE_UQ ||2913inst->dst.type == BRW_REGISTER_TYPE_Q)) {2914assert(inst->dst.type == inst->src[0].type);2915assert(!inst->saturate);2916assert(!inst->src[0].abs && !inst->src[0].negate);2917assert(!inst->src[1].abs && !inst->src[1].negate);2918const brw::fs_builder ibld(this, block, inst);29192920set_predicate(inst->predicate,2921ibld.SEL(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),2922subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),2923subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0)));2924set_predicate(inst->predicate,2925ibld.SEL(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),2926subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1),2927subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 1)));29282929inst->remove(block);2930progress = true;2931}2932if (inst->src[0].equals(inst->src[1])) {2933inst->opcode = BRW_OPCODE_MOV;2934inst->src[1] = reg_undef;2935inst->predicate = BRW_PREDICATE_NONE;2936inst->predicate_inverse = false;2937progress = true;2938} else if (inst->saturate && inst->src[1].file == IMM) {2939switch (inst->conditional_mod) {2940case BRW_CONDITIONAL_LE:2941case BRW_CONDITIONAL_L:2942switch (inst->src[1].type) {2943case BRW_REGISTER_TYPE_F:2944if (inst->src[1].f >= 1.0f) {2945inst->opcode = BRW_OPCODE_MOV;2946inst->src[1] = reg_undef;2947inst->conditional_mod = BRW_CONDITIONAL_NONE;2948progress = true;2949}2950break;2951default:2952break;2953}2954break;2955case BRW_CONDITIONAL_GE:2956case BRW_CONDITIONAL_G:2957switch (inst->src[1].type) {2958case BRW_REGISTER_TYPE_F:2959if (inst->src[1].f <= 0.0f) {2960inst->opcode = BRW_OPCODE_MOV;2961inst->src[1] = reg_undef;2962inst->conditional_mod = BRW_CONDITIONAL_NONE;2963progress = true;2964}2965break;2966default:2967break;2968}2969default:2970break;2971}2972}2973break;2974case BRW_OPCODE_MAD:2975if (inst->src[0].type != BRW_REGISTER_TYPE_F ||2976inst->src[1].type != BRW_REGISTER_TYPE_F ||2977inst->src[2].type != BRW_REGISTER_TYPE_F)2978break;2979if (inst->src[1].is_one()) {2980inst->opcode = BRW_OPCODE_ADD;2981inst->src[1] = inst->src[2];2982inst->src[2] = reg_undef;2983progress = true;2984} else if (inst->src[2].is_one()) {2985inst->opcode = BRW_OPCODE_ADD;2986inst->src[2] = reg_undef;2987progress = true;2988}2989break;2990case SHADER_OPCODE_BROADCAST:2991if (is_uniform(inst->src[0])) {2992inst->opcode = BRW_OPCODE_MOV;2993inst->sources = 1;2994inst->force_writemask_all = true;2995progress = true;2996} else if (inst->src[1].file == IMM) {2997inst->opcode = BRW_OPCODE_MOV;2998/* It's possible that the selected component will be too large and2999* overflow the register. This can happen if someone does a3000* readInvocation() from GLSL or SPIR-V and provides an OOB3001* invocationIndex. 
If this happens and we some how manage3002* to constant fold it in and get here, then component() may cause3003* us to start reading outside of the VGRF which will lead to an3004* assert later. Instead, just let it wrap around if it goes over3005* exec_size.3006*/3007const unsigned comp = inst->src[1].ud & (inst->exec_size - 1);3008inst->src[0] = component(inst->src[0], comp);3009inst->sources = 1;3010inst->force_writemask_all = true;3011progress = true;3012}3013break;30143015case SHADER_OPCODE_SHUFFLE:3016if (is_uniform(inst->src[0])) {3017inst->opcode = BRW_OPCODE_MOV;3018inst->sources = 1;3019progress = true;3020} else if (inst->src[1].file == IMM) {3021inst->opcode = BRW_OPCODE_MOV;3022inst->src[0] = component(inst->src[0],3023inst->src[1].ud);3024inst->sources = 1;3025progress = true;3026}3027break;30283029default:3030break;3031}30323033/* Swap if src[0] is immediate. */3034if (progress && inst->is_commutative()) {3035if (inst->src[0].file == IMM) {3036fs_reg tmp = inst->src[1];3037inst->src[1] = inst->src[0];3038inst->src[0] = tmp;3039}3040}3041}30423043if (progress)3044invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |3045DEPENDENCY_INSTRUCTION_DETAIL);30463047return progress;3048}30493050/**3051* Optimize sample messages that have constant zero values for the trailing3052* texture coordinates. We can just reduce the message length for these3053* instructions instead of reserving a register for it. Trailing parameters3054* that aren't sent default to zero anyway. This will cause the dead code3055* eliminator to remove the MOV instruction that would otherwise be emitted to3056* set up the zero value.3057*/3058bool3059fs_visitor::opt_zero_samples()3060{3061/* Gfx4 infers the texturing opcode based on the message length so we can't3062* change it. Gfx12.5 has restrictions on the number of coordinate3063* parameters that have to be provided for some texture types3064* (Wa_14013363432).3065*/3066if (devinfo->ver < 5 || devinfo->verx10 == 125)3067return false;30683069bool progress = false;30703071foreach_block_and_inst(block, fs_inst, inst, cfg) {3072if (!inst->is_tex())3073continue;30743075fs_inst *load_payload = (fs_inst *) inst->prev;30763077if (load_payload->is_head_sentinel() ||3078load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)3079continue;30803081/* We don't want to remove the message header or the first parameter.3082* Removing the first parameter is not allowed, see the Haswell PRM3083* volume 7, page 149:3084*3085* "Parameter 0 is required except for the sampleinfo message, which3086* has no parameter 0"3087*/3088while (inst->mlen > inst->header_size + inst->exec_size / 8 &&3089load_payload->src[(inst->mlen - inst->header_size) /3090(inst->exec_size / 8) +3091inst->header_size - 1].is_zero()) {3092inst->mlen -= inst->exec_size / 8;3093progress = true;3094}3095}30963097if (progress)3098invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);30993100return progress;3101}31023103bool3104fs_visitor::opt_register_renaming()3105{3106bool progress = false;3107int depth = 0;31083109unsigned remap[alloc.count];3110memset(remap, ~0u, sizeof(unsigned) * alloc.count);31113112foreach_block_and_inst(block, fs_inst, inst, cfg) {3113if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {3114depth++;3115} else if (inst->opcode == BRW_OPCODE_ENDIF ||3116inst->opcode == BRW_OPCODE_WHILE) {3117depth--;3118}31193120/* Rewrite instruction sources. 
*/3121for (int i = 0; i < inst->sources; i++) {3122if (inst->src[i].file == VGRF &&3123remap[inst->src[i].nr] != ~0u &&3124remap[inst->src[i].nr] != inst->src[i].nr) {3125inst->src[i].nr = remap[inst->src[i].nr];3126progress = true;3127}3128}31293130const unsigned dst = inst->dst.nr;31313132if (depth == 0 &&3133inst->dst.file == VGRF &&3134alloc.sizes[inst->dst.nr] * REG_SIZE == inst->size_written &&3135!inst->is_partial_write()) {3136if (remap[dst] == ~0u) {3137remap[dst] = dst;3138} else {3139remap[dst] = alloc.allocate(regs_written(inst));3140inst->dst.nr = remap[dst];3141progress = true;3142}3143} else if (inst->dst.file == VGRF &&3144remap[dst] != ~0u &&3145remap[dst] != dst) {3146inst->dst.nr = remap[dst];3147progress = true;3148}3149}31503151if (progress) {3152invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |3153DEPENDENCY_VARIABLES);31543155for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {3156if (delta_xy[i].file == VGRF && remap[delta_xy[i].nr] != ~0u) {3157delta_xy[i].nr = remap[delta_xy[i].nr];3158}3159}3160}31613162return progress;3163}31643165/**3166* Remove redundant or useless halts.3167*3168* For example, we can eliminate halts in the following sequence:3169*3170* halt (redundant with the next halt)3171* halt (useless; jumps to the next instruction)3172* halt-target3173*/3174bool3175fs_visitor::opt_redundant_halt()3176{3177bool progress = false;31783179unsigned halt_count = 0;3180fs_inst *halt_target = NULL;3181bblock_t *halt_target_block = NULL;3182foreach_block_and_inst(block, fs_inst, inst, cfg) {3183if (inst->opcode == BRW_OPCODE_HALT)3184halt_count++;31853186if (inst->opcode == SHADER_OPCODE_HALT_TARGET) {3187halt_target = inst;3188halt_target_block = block;3189break;3190}3191}31923193if (!halt_target) {3194assert(halt_count == 0);3195return false;3196}31973198/* Delete any HALTs immediately before the halt target. */3199for (fs_inst *prev = (fs_inst *) halt_target->prev;3200!prev->is_head_sentinel() && prev->opcode == BRW_OPCODE_HALT;3201prev = (fs_inst *) halt_target->prev) {3202prev->remove(halt_target_block);3203halt_count--;3204progress = true;3205}32063207if (halt_count == 0) {3208halt_target->remove(halt_target_block);3209progress = true;3210}32113212if (progress)3213invalidate_analysis(DEPENDENCY_INSTRUCTIONS);32143215return progress;3216}32173218/**3219* Compute a bitmask with GRF granularity with a bit set for each GRF starting3220* from \p r.offset which overlaps the region starting at \p s.offset and3221* spanning \p ds bytes.3222*/3223static inline unsigned3224mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned ds)3225{3226const int rel_offset = reg_offset(s) - reg_offset(r);3227const int shift = rel_offset / REG_SIZE;3228const unsigned n = DIV_ROUND_UP(rel_offset % REG_SIZE + ds, REG_SIZE);3229assert(reg_space(r) == reg_space(s) &&3230shift >= 0 && shift < int(8 * sizeof(unsigned)));3231return ((1 << n) - 1) << shift;3232}32333234bool3235fs_visitor::compute_to_mrf()3236{3237bool progress = false;3238int next_ip = 0;32393240/* No MRFs on Gen >= 7. 
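    * (The Message Register File was removed from the hardware starting with
    * that generation; send payloads are built in GRFs instead, so there is
    * nothing for compute-to-MRF to do.)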
*/3241if (devinfo->ver >= 7)3242return false;32433244const fs_live_variables &live = live_analysis.require();32453246foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {3247int ip = next_ip;3248next_ip++;32493250if (inst->opcode != BRW_OPCODE_MOV ||3251inst->is_partial_write() ||3252inst->dst.file != MRF || inst->src[0].file != VGRF ||3253inst->dst.type != inst->src[0].type ||3254inst->src[0].abs || inst->src[0].negate ||3255!inst->src[0].is_contiguous() ||3256inst->src[0].offset % REG_SIZE != 0)3257continue;32583259/* Can't compute-to-MRF this GRF if someone else was going to3260* read it later.3261*/3262if (live.vgrf_end[inst->src[0].nr] > ip)3263continue;32643265/* Found a move of a GRF to a MRF. Let's see if we can go rewrite the3266* things that computed the value of all GRFs of the source region. The3267* regs_left bitset keeps track of the registers we haven't yet found a3268* generating instruction for.3269*/3270unsigned regs_left = (1 << regs_read(inst, 0)) - 1;32713272foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {3273if (regions_overlap(scan_inst->dst, scan_inst->size_written,3274inst->src[0], inst->size_read(0))) {3275/* Found the last thing to write our reg we want to turn3276* into a compute-to-MRF.3277*/32783279/* If this one instruction didn't populate all the3280* channels, bail. We might be able to rewrite everything3281* that writes that reg, but it would require smarter3282* tracking.3283*/3284if (scan_inst->is_partial_write())3285break;32863287/* Handling things not fully contained in the source of the copy3288* would need us to understand coalescing out more than one MOV at3289* a time.3290*/3291if (!region_contained_in(scan_inst->dst, scan_inst->size_written,3292inst->src[0], inst->size_read(0)))3293break;32943295/* SEND instructions can't have MRF as a destination. */3296if (scan_inst->mlen)3297break;32983299if (devinfo->ver == 6) {3300/* gfx6 math instructions must have the destination be3301* GRF, so no compute-to-MRF for them.3302*/3303if (scan_inst->is_math()) {3304break;3305}3306}33073308/* Clear the bits for any registers this instruction overwrites. */3309regs_left &= ~mask_relative_to(3310inst->src[0], scan_inst->dst, scan_inst->size_written);3311if (!regs_left)3312break;3313}33143315/* We don't handle control flow here. Most computation of3316* values that end up in MRFs are shortly before the MRF3317* write anyway.3318*/3319if (block->start() == scan_inst)3320break;33213322/* You can't read from an MRF, so if someone else reads our3323* MRF's source GRF that we wanted to rewrite, that stops us.3324*/3325bool interfered = false;3326for (int i = 0; i < scan_inst->sources; i++) {3327if (regions_overlap(scan_inst->src[i], scan_inst->size_read(i),3328inst->src[0], inst->size_read(0))) {3329interfered = true;3330}3331}3332if (interfered)3333break;33343335if (regions_overlap(scan_inst->dst, scan_inst->size_written,3336inst->dst, inst->size_written)) {3337/* If somebody else writes our MRF here, we can't3338* compute-to-MRF before that.3339*/3340break;3341}33423343if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 &&3344regions_overlap(fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE,3345inst->dst, inst->size_written)) {3346/* Found a SEND instruction, which means that there are3347* live values in MRFs from base_mrf to base_mrf +3348* scan_inst->mlen - 1. 
Don't go pushing our MRF write up
             * above it.
             */
            break;
         }
      }

      if (regs_left)
         continue;

      /* Found all generating instructions of our MRF's source value, so it
       * should be safe to rewrite them to point to the MRF directly.
       */
      regs_left = (1 << regs_read(inst, 0)) - 1;

      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
                             inst->src[0], inst->size_read(0))) {
            /* Clear the bits for any registers this instruction overwrites. */
            regs_left &= ~mask_relative_to(
               inst->src[0], scan_inst->dst, scan_inst->size_written);

            const unsigned rel_offset = reg_offset(scan_inst->dst) -
                                        reg_offset(inst->src[0]);

            if (inst->dst.nr & BRW_MRF_COMPR4) {
               /* Apply the same address transformation done by the hardware
                * for COMPR4 MRF writes.
                */
               assert(rel_offset < 2 * REG_SIZE);
               scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4;

               /* Clear the COMPR4 bit if the generating instruction is not
                * compressed.
                */
               if (scan_inst->size_written < 2 * REG_SIZE)
                  scan_inst->dst.nr &= ~BRW_MRF_COMPR4;

            } else {
               /* Calculate the MRF number the result of this instruction is
                * ultimately written to.
                */
               scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE;
            }

            scan_inst->dst.file = MRF;
            scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE;
            scan_inst->saturate |= inst->saturate;
            if (!regs_left)
               break;
         }
      }

      assert(!regs_left);
      inst->remove(block);
      progress = true;
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

/**
 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
 * flow. 
We could probably do better here with some form of divergence3415* analysis.3416*/3417bool3418fs_visitor::eliminate_find_live_channel()3419{3420bool progress = false;3421unsigned depth = 0;34223423if (!brw_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) {3424/* The optimization below assumes that channel zero is live on thread3425* dispatch, which may not be the case if the fixed function dispatches3426* threads sparsely.3427*/3428return false;3429}34303431foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {3432switch (inst->opcode) {3433case BRW_OPCODE_IF:3434case BRW_OPCODE_DO:3435depth++;3436break;34373438case BRW_OPCODE_ENDIF:3439case BRW_OPCODE_WHILE:3440depth--;3441break;34423443case BRW_OPCODE_HALT:3444/* This can potentially make control flow non-uniform until the end3445* of the program.3446*/3447return progress;34483449case SHADER_OPCODE_FIND_LIVE_CHANNEL:3450if (depth == 0) {3451inst->opcode = BRW_OPCODE_MOV;3452inst->src[0] = brw_imm_ud(0u);3453inst->sources = 1;3454inst->force_writemask_all = true;3455progress = true;3456}3457break;34583459default:3460break;3461}3462}34633464if (progress)3465invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);34663467return progress;3468}34693470/**3471* Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE3472* instructions to FS_OPCODE_REP_FB_WRITE.3473*/3474void3475fs_visitor::emit_repclear_shader()3476{3477brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;3478int base_mrf = 0;3479int color_mrf = base_mrf + 2;3480fs_inst *mov;34813482if (uniforms > 0) {3483mov = bld.exec_all().group(4, 0)3484.MOV(brw_message_reg(color_mrf),3485fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));3486} else {3487struct brw_reg reg =3488brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_UD,3489BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,3490BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);34913492mov = bld.exec_all().group(4, 0)3493.MOV(brw_uvec_mrf(4, color_mrf, 0), fs_reg(reg));3494}34953496fs_inst *write = NULL;3497if (key->nr_color_regions == 1) {3498write = bld.emit(FS_OPCODE_REP_FB_WRITE);3499write->saturate = key->clamp_fragment_color;3500write->base_mrf = color_mrf;3501write->target = 0;3502write->header_size = 0;3503write->mlen = 1;3504} else {3505assume(key->nr_color_regions > 0);35063507struct brw_reg header =3508retype(brw_message_reg(base_mrf), BRW_REGISTER_TYPE_UD);3509bld.exec_all().group(16, 0)3510.MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));35113512for (int i = 0; i < key->nr_color_regions; ++i) {3513if (i > 0) {3514bld.exec_all().group(1, 0)3515.MOV(component(header, 2), brw_imm_ud(i));3516}35173518write = bld.emit(FS_OPCODE_REP_FB_WRITE);3519write->saturate = key->clamp_fragment_color;3520write->base_mrf = base_mrf;3521write->target = i;3522write->header_size = 2;3523write->mlen = 3;3524}3525}3526write->eot = true;3527write->last_rt = true;35283529calculate_cfg();35303531assign_constant_locations();3532assign_curb_setup();35333534/* Now that we have the uniform assigned, go ahead and force it to a vec4. */3535if (uniforms > 0) {3536assert(mov->src[0].file == FIXED_GRF);3537mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);3538}35393540lower_scoreboard();3541}35423543/**3544* Walks through basic blocks, looking for repeated MRF writes and3545* removing the later ones.3546*/3547bool3548fs_visitor::remove_duplicate_mrf_writes()3549{3550fs_inst *last_mrf_move[BRW_MAX_MRF(devinfo->ver)];3551bool progress = false;35523553/* Need to update the MRF tracking for compressed instructions. 
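    * (A compressed SIMD16 MOV to an MRF spans two adjacent MRF registers,
    * which the one-entry-per-MRF last_mrf_move[] tracking below doesn't
    * account for, so bail for now.)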
*/3554if (dispatch_width >= 16)3555return false;35563557memset(last_mrf_move, 0, sizeof(last_mrf_move));35583559foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {3560if (inst->is_control_flow()) {3561memset(last_mrf_move, 0, sizeof(last_mrf_move));3562}35633564if (inst->opcode == BRW_OPCODE_MOV &&3565inst->dst.file == MRF) {3566fs_inst *prev_inst = last_mrf_move[inst->dst.nr];3567if (prev_inst && prev_inst->opcode == BRW_OPCODE_MOV &&3568inst->dst.equals(prev_inst->dst) &&3569inst->src[0].equals(prev_inst->src[0]) &&3570inst->saturate == prev_inst->saturate &&3571inst->predicate == prev_inst->predicate &&3572inst->conditional_mod == prev_inst->conditional_mod &&3573inst->exec_size == prev_inst->exec_size) {3574inst->remove(block);3575progress = true;3576continue;3577}3578}35793580/* Clear out the last-write records for MRFs that were overwritten. */3581if (inst->dst.file == MRF) {3582last_mrf_move[inst->dst.nr] = NULL;3583}35843585if (inst->mlen > 0 && inst->base_mrf != -1) {3586/* Found a SEND instruction, which will include two or fewer3587* implied MRF writes. We could do better here.3588*/3589for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {3590last_mrf_move[inst->base_mrf + i] = NULL;3591}3592}35933594/* Clear out any MRF move records whose sources got overwritten. */3595for (unsigned i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {3596if (last_mrf_move[i] &&3597regions_overlap(inst->dst, inst->size_written,3598last_mrf_move[i]->src[0],3599last_mrf_move[i]->size_read(0))) {3600last_mrf_move[i] = NULL;3601}3602}36033604if (inst->opcode == BRW_OPCODE_MOV &&3605inst->dst.file == MRF &&3606inst->src[0].file != ARF &&3607!inst->is_partial_write()) {3608last_mrf_move[inst->dst.nr] = inst;3609}3610}36113612if (progress)3613invalidate_analysis(DEPENDENCY_INSTRUCTIONS);36143615return progress;3616}36173618/**3619* Rounding modes for conversion instructions are included for each3620* conversion, but right now it is a state. So once it is set,3621* we don't need to call it again for subsequent calls.3622*3623* This is useful for vector/matrices conversions, as setting the3624* mode once is enough for the full vector/matrix3625*/3626bool3627fs_visitor::remove_extra_rounding_modes()3628{3629bool progress = false;3630unsigned execution_mode = this->nir->info.float_controls_execution_mode;36313632brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED;3633if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |3634FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |3635FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &3636execution_mode)3637base_mode = BRW_RND_MODE_RTNE;3638if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |3639FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |3640FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &3641execution_mode)3642base_mode = BRW_RND_MODE_RTZ;36433644foreach_block (block, cfg) {3645brw_rnd_mode prev_mode = base_mode;36463647foreach_inst_in_block_safe (fs_inst, inst, block) {3648if (inst->opcode == SHADER_OPCODE_RND_MODE) {3649assert(inst->src[0].file == BRW_IMMEDIATE_VALUE);3650const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;3651if (mode == prev_mode) {3652inst->remove(block);3653progress = true;3654} else {3655prev_mode = mode;3656}3657}3658}3659}36603661if (progress)3662invalidate_analysis(DEPENDENCY_INSTRUCTIONS);36633664return progress;3665}36663667static void3668clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)3669{3670/* Clear the flag for registers that actually got read (as expected). 
*/3671for (int i = 0; i < inst->sources; i++) {3672int grf;3673if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) {3674grf = inst->src[i].nr;3675} else {3676continue;3677}36783679if (grf >= first_grf &&3680grf < first_grf + grf_len) {3681deps[grf - first_grf] = false;3682if (inst->exec_size == 16)3683deps[grf - first_grf + 1] = false;3684}3685}3686}36873688/**3689* Implements this workaround for the original 965:3690*3691* "[DevBW, DevCL] Implementation Restrictions: As the hardware does not3692* check for post destination dependencies on this instruction, software3693* must ensure that there is no destination hazard for the case of ‘write3694* followed by a posted write’ shown in the following example.3695*3696* 1. mov r3 03697* 2. send r3.xy <rest of send instruction>3698* 3. mov r2 r33699*3700* Due to no post-destination dependency check on the ‘send’, the above3701* code sequence could have two instructions (1 and 2) in flight at the3702* same time that both consider ‘r3’ as the target of their final writes.3703*/3704void3705fs_visitor::insert_gfx4_pre_send_dependency_workarounds(bblock_t *block,3706fs_inst *inst)3707{3708int write_len = regs_written(inst);3709int first_write_grf = inst->dst.nr;3710bool needs_dep[BRW_MAX_MRF(devinfo->ver)];3711assert(write_len < (int)sizeof(needs_dep) - 1);37123713memset(needs_dep, false, sizeof(needs_dep));3714memset(needs_dep, true, write_len);37153716clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);37173718/* Walk backwards looking for writes to registers we're writing which3719* aren't read since being written. If we hit the start of the program,3720* we assume that there are no outstanding dependencies on entry to the3721* program.3722*/3723foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {3724/* If we hit control flow, assume that there *are* outstanding3725* dependencies, and force their cleanup before our instruction.3726*/3727if (block->start() == scan_inst && block->num != 0) {3728for (int i = 0; i < write_len; i++) {3729if (needs_dep[i])3730DEP_RESOLVE_MOV(fs_builder(this, block, inst),3731first_write_grf + i);3732}3733return;3734}37353736/* We insert our reads as late as possible on the assumption that any3737* instruction but a MOV that might have left us an outstanding3738* dependency has more latency than a MOV.3739*/3740if (scan_inst->dst.file == VGRF) {3741for (unsigned i = 0; i < regs_written(scan_inst); i++) {3742int reg = scan_inst->dst.nr + i;37433744if (reg >= first_write_grf &&3745reg < first_write_grf + write_len &&3746needs_dep[reg - first_write_grf]) {3747DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);3748needs_dep[reg - first_write_grf] = false;3749if (scan_inst->exec_size == 16)3750needs_dep[reg - first_write_grf + 1] = false;3751}3752}3753}37543755/* Clear the flag for registers that actually got read (as expected). 
*/3756clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);37573758/* Continue the loop only if we haven't resolved all the dependencies */3759int i;3760for (i = 0; i < write_len; i++) {3761if (needs_dep[i])3762break;3763}3764if (i == write_len)3765return;3766}3767}37683769/**3770* Implements this workaround for the original 965:3771*3772* "[DevBW, DevCL] Errata: A destination register from a send can not be3773* used as a destination register until after it has been sourced by an3774* instruction with a different destination register.3775*/3776void3777fs_visitor::insert_gfx4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)3778{3779int write_len = regs_written(inst);3780unsigned first_write_grf = inst->dst.nr;3781bool needs_dep[BRW_MAX_MRF(devinfo->ver)];3782assert(write_len < (int)sizeof(needs_dep) - 1);37833784memset(needs_dep, false, sizeof(needs_dep));3785memset(needs_dep, true, write_len);3786/* Walk forwards looking for writes to registers we're writing which aren't3787* read before being written.3788*/3789foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst) {3790/* If we hit control flow, force resolve all remaining dependencies. */3791if (block->end() == scan_inst && block->num != cfg->num_blocks - 1) {3792for (int i = 0; i < write_len; i++) {3793if (needs_dep[i])3794DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),3795first_write_grf + i);3796}3797return;3798}37993800/* Clear the flag for registers that actually got read (as expected). */3801clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);38023803/* We insert our reads as late as possible since they're reading the3804* result of a SEND, which has massive latency.3805*/3806if (scan_inst->dst.file == VGRF &&3807scan_inst->dst.nr >= first_write_grf &&3808scan_inst->dst.nr < first_write_grf + write_len &&3809needs_dep[scan_inst->dst.nr - first_write_grf]) {3810DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),3811scan_inst->dst.nr);3812needs_dep[scan_inst->dst.nr - first_write_grf] = false;3813}38143815/* Continue the loop only if we haven't resolved all the dependencies */3816int i;3817for (i = 0; i < write_len; i++) {3818if (needs_dep[i])3819break;3820}3821if (i == write_len)3822return;3823}3824}38253826void3827fs_visitor::insert_gfx4_send_dependency_workarounds()3828{3829if (devinfo->ver != 4 || devinfo->is_g4x)3830return;38313832bool progress = false;38333834foreach_block_and_inst(block, fs_inst, inst, cfg) {3835if (inst->mlen != 0 && inst->dst.file == VGRF) {3836insert_gfx4_pre_send_dependency_workarounds(block, inst);3837insert_gfx4_post_send_dependency_workarounds(block, inst);3838progress = true;3839}3840}38413842if (progress)3843invalidate_analysis(DEPENDENCY_INSTRUCTIONS);3844}38453846/**3847* Turns the generic expression-style uniform pull constant load instruction3848* into a hardware-specific series of instructions for loading a pull3849* constant.3850*3851* The expression style allows the CSE pass before this to optimize out3852* repeated loads from the same offset, and gives the pre-register-allocation3853* scheduling full flexibility, while the conversion to native instructions3854* allows the post-register-allocation scheduler the best information3855* possible.3856*3857* Note that execution masking for setting up pull constant loads is special:3858* the channels that need to be written are unrelated to the current execution3859* mask, since a later instruction will use one of the result channels as a3860* source operand for all 8 or 16 of its 
channels.3861*/3862void3863fs_visitor::lower_uniform_pull_constant_loads()3864{3865foreach_block_and_inst (block, fs_inst, inst, cfg) {3866if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)3867continue;38683869const fs_reg& surface = inst->src[0];3870const fs_reg& offset_B = inst->src[1];3871assert(offset_B.file == IMM);38723873if (devinfo->has_lsc) {3874const fs_builder ubld =3875fs_builder(this, block, inst).group(8, 0).exec_all();38763877const fs_reg payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);3878ubld.MOV(payload, offset_B);38793880inst->sfid = GFX12_SFID_UGM;3881inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,38821 /* simd_size */,3883LSC_ADDR_SURFTYPE_BTI,3884LSC_ADDR_SIZE_A32,38851 /* num_coordinates */,3886LSC_DATA_SIZE_D32,3887inst->size_written / 4,3888true /* transpose */,3889LSC_CACHE_LOAD_L1STATE_L3MOCS,3890true /* has_dest */);38913892fs_reg ex_desc;3893if (surface.file == IMM) {3894ex_desc = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud));3895} else {3896/* We only need the first component for the payload so we can use3897* one of the other components for the extended descriptor3898*/3899ex_desc = component(payload, 1);3900ubld.group(1, 0).SHL(ex_desc, surface, brw_imm_ud(24));3901}39023903/* Update the original instruction. */3904inst->opcode = SHADER_OPCODE_SEND;3905inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);3906inst->ex_mlen = 0;3907inst->header_size = 0;3908inst->send_has_side_effects = false;3909inst->send_is_volatile = true;3910inst->exec_size = 1;39113912/* Finally, the payload */3913inst->resize_sources(3);3914inst->src[0] = brw_imm_ud(0); /* desc */3915inst->src[1] = ex_desc;3916inst->src[2] = payload;39173918invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);3919} else if (devinfo->ver >= 7) {3920const fs_builder ubld = fs_builder(this, block, inst).exec_all();3921const fs_reg payload = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD);39223923ubld.group(8, 0).MOV(payload,3924retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));3925ubld.group(1, 0).MOV(component(payload, 2),3926brw_imm_ud(offset_B.ud / 16));39273928inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7;3929inst->src[1] = payload;3930inst->header_size = 1;3931inst->mlen = 1;39323933invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);3934} else {3935/* Before register allocation, we didn't tell the scheduler about the3936* MRF we use. We know it's safe to use this MRF because nothing3937* else does except for register spill/unspill, which generates and3938* uses its MRF within a single IR instruction.3939*/3940inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1;3941inst->mlen = 1;3942}3943}3944}39453946bool3947fs_visitor::lower_load_payload()3948{3949bool progress = false;39503951foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {3952if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)3953continue;39543955assert(inst->dst.file == MRF || inst->dst.file == VGRF);3956assert(inst->saturate == false);3957fs_reg dst = inst->dst;39583959/* Get rid of COMPR4. 
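* (Aside on lower_uniform_pull_constant_loads() just above, illustrative
* only: the gfx7 path converts the byte offset into 16-byte units before
* writing it into dword 2 of its single header register, so a load from
* byte offset 48 stores 48 / 16 = 3; the LSC path instead issues one
* transposed D32 load of inst->size_written / 4 dwords.)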
We'll add it back in if we need it */3960if (dst.file == MRF)3961dst.nr = dst.nr & ~BRW_MRF_COMPR4;39623963const fs_builder ibld(this, block, inst);3964const fs_builder ubld = ibld.exec_all();39653966for (uint8_t i = 0; i < inst->header_size;) {3967/* Number of header GRFs to initialize at once with a single MOV3968* instruction.3969*/3970const unsigned n =3971(i + 1 < inst->header_size && inst->src[i].stride == 1 &&3972inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ?39732 : 1;39743975if (inst->src[i].file != BAD_FILE)3976ubld.group(8 * n, 0).MOV(retype(dst, BRW_REGISTER_TYPE_UD),3977retype(inst->src[i], BRW_REGISTER_TYPE_UD));39783979dst = byte_offset(dst, n * REG_SIZE);3980i += n;3981}39823983if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) &&3984inst->exec_size > 8) {3985/* In this case, the payload portion of the LOAD_PAYLOAD isn't3986* a straightforward copy. Instead, the result of the3987* LOAD_PAYLOAD is treated as interleaved and the first four3988* non-header sources are unpacked as:3989*3990* m + 0: r03991* m + 1: g03992* m + 2: b03993* m + 3: a03994* m + 4: r13995* m + 5: g13996* m + 6: b13997* m + 7: a13998*3999* This is used for gen <= 5 fb writes.4000*/4001assert(inst->exec_size == 16);4002assert(inst->header_size + 4 <= inst->sources);4003for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {4004if (inst->src[i].file != BAD_FILE) {4005if (devinfo->has_compr4) {4006fs_reg compr4_dst = retype(dst, inst->src[i].type);4007compr4_dst.nr |= BRW_MRF_COMPR4;4008ibld.MOV(compr4_dst, inst->src[i]);4009} else {4010/* Platform doesn't have COMPR4. We have to fake it */4011fs_reg mov_dst = retype(dst, inst->src[i].type);4012ibld.quarter(0).MOV(mov_dst, quarter(inst->src[i], 0));4013mov_dst.nr += 4;4014ibld.quarter(1).MOV(mov_dst, quarter(inst->src[i], 1));4015}4016}40174018dst.nr++;4019}40204021/* The loop above only ever incremented us through the first set4022* of 4 registers. However, thanks to the magic of COMPR4, we4023* actually wrote to the first 8 registers, so we need to take4024* that into account now.4025*/4026dst.nr += 4;40274028/* The COMPR4 code took care of the first 4 sources. We'll let4029* the regular path handle any remaining sources. Yes, we are4030* modifying the instruction but we're about to delete it so4031* this really doesn't hurt anything.4032*/4033inst->header_size += 4;4034}40354036for (uint8_t i = inst->header_size; i < inst->sources; i++) {4037if (inst->src[i].file != BAD_FILE) {4038dst.type = inst->src[i].type;4039ibld.MOV(dst, inst->src[i]);4040} else {4041dst.type = BRW_REGISTER_TYPE_UD;4042}4043dst = offset(dst, ibld, 1);4044}40454046inst->remove(block);4047progress = true;4048}40494050if (progress)4051invalidate_analysis(DEPENDENCY_INSTRUCTIONS);40524053return progress;4054}40554056void4057fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block)4058{4059const fs_builder ibld(this, block, inst);40604061const bool ud = (inst->src[1].type == BRW_REGISTER_TYPE_UD);4062if (inst->src[1].file == IMM &&4063(( ud && inst->src[1].ud <= UINT16_MAX) ||4064(!ud && inst->src[1].d <= INT16_MAX && inst->src[1].d >= INT16_MIN))) {4065/* The MUL instruction isn't commutative. 
On Gen <= 6, only the low4066* 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of4067* src1 are used.4068*4069* If multiplying by an immediate value that fits in 16-bits, do a4070* single MUL instruction with that value in the proper location.4071*/4072if (devinfo->ver < 7) {4073fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8), inst->dst.type);4074ibld.MOV(imm, inst->src[1]);4075ibld.MUL(inst->dst, imm, inst->src[0]);4076} else {4077ibld.MUL(inst->dst, inst->src[0],4078ud ? brw_imm_uw(inst->src[1].ud)4079: brw_imm_w(inst->src[1].d));4080}4081} else {4082/* Gen < 8 (and some Gfx8+ low-power parts like Cherryview) cannot4083* do 32-bit integer multiplication in one instruction, but instead4084* must do a sequence (which actually calculates a 64-bit result):4085*4086* mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D4087* mach(8) null g3<8,8,1>D g4<8,8,1>D4088* mov(8) g2<1>D acc0<8,8,1>D4089*4090* But on Gen > 6, the ability to use second accumulator register4091* (acc1) for non-float data types was removed, preventing a simple4092* implementation in SIMD16. A 16-channel result can be calculated by4093* executing the three instructions twice in SIMD8, once with quarter4094* control of 1Q for the first eight channels and again with 2Q for4095* the second eight channels.4096*4097* Which accumulator register is implicitly accessed (by AccWrEnable4098* for instance) is determined by the quarter control. Unfortunately4099* Ivybridge (and presumably Baytrail) has a hardware bug in which an4100* implicit accumulator access by an instruction with 2Q will access4101* acc1 regardless of whether the data type is usable in acc1.4102*4103* Specifically, the 2Q mach(8) writes acc1 which does not exist for4104* integer data types.4105*4106* Since we only want the low 32-bits of the result, we can do two4107* 32-bit x 16-bit multiplies (like the mul and mach are doing), and4108* adjust the high result and add them (like the mach is doing):4109*4110* mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW4111* mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW4112* shl(8) g9<1>D g8<8,8,1>D 16D4113* add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D4114*4115* We avoid the shl instruction by realizing that we only want to add4116* the low 16-bits of the "high" result to the high 16-bits of the4117* "low" result and using proper regioning on the add:4118*4119* mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW4120* mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW4121* add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW4122*4123* Since it does not use the (single) accumulator register, we can4124* schedule multi-component multiplications much better.4125*/41264127bool needs_mov = false;4128fs_reg orig_dst = inst->dst;41294130/* Get a new VGRF for the "low" 32x16-bit multiplication result if4131* reusing the original destination is impossible due to hardware4132* restrictions, source/destination overlap, or it being the null4133* register.4134*/4135fs_reg low = inst->dst;4136if (orig_dst.is_null() || orig_dst.file == MRF ||4137regions_overlap(inst->dst, inst->size_written,4138inst->src[0], inst->size_read(0)) ||4139regions_overlap(inst->dst, inst->size_written,4140inst->src[1], inst->size_read(1)) ||4141inst->dst.stride >= 4) {4142needs_mov = true;4143low = fs_reg(VGRF, alloc.allocate(regs_written(inst)),4144inst->dst.type);4145}41464147/* Get a new VGRF but keep the same stride as inst->dst */4148fs_reg high(VGRF, alloc.allocate(regs_written(inst)), inst->dst.type);4149high.stride = inst->dst.stride;4150high.offset = inst->dst.offset % REG_SIZE;41514152if (devinfo->ver >= 7) 
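/* A plain C model of the low-32-bit trick described above (illustrative
 * only, not driver code; uint32_t wraparound arithmetic assumed):
 *
 *    static uint32_t mul32_low(uint32_t a, uint32_t b)
 *    {
 *       uint32_t lo = a * (b & 0xffff);   // mul g7: src0 x low 16 bits of src1
 *       uint32_t hi = a * (b >> 16);      // mul g8: src0 x high 16 bits of src1
 *       return lo + (hi << 16);           // the regioned add supplies the shift
 *    }
 */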
{4153/* From Wa_1604601757:4154*4155* "When multiplying a DW and any lower precision integer, source modifier4156* is not supported."4157*4158* An unsupported negate modifier on src[1] would ordinarily be4159* lowered by the subsequent lower_regioning pass. In this case that4160* pass would spawn another dword multiply. Instead, lower the4161* modifier first.4162*/4163const bool source_mods_unsupported = (devinfo->ver >= 12);41644165if (inst->src[1].abs || (inst->src[1].negate &&4166source_mods_unsupported))4167lower_src_modifiers(this, block, inst, 1);41684169if (inst->src[1].file == IMM) {4170ibld.MUL(low, inst->src[0],4171brw_imm_uw(inst->src[1].ud & 0xffff));4172ibld.MUL(high, inst->src[0],4173brw_imm_uw(inst->src[1].ud >> 16));4174} else {4175ibld.MUL(low, inst->src[0],4176subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0));4177ibld.MUL(high, inst->src[0],4178subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 1));4179}4180} else {4181if (inst->src[0].abs)4182lower_src_modifiers(this, block, inst, 0);41834184ibld.MUL(low, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 0),4185inst->src[1]);4186ibld.MUL(high, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 1),4187inst->src[1]);4188}41894190ibld.ADD(subscript(low, BRW_REGISTER_TYPE_UW, 1),4191subscript(low, BRW_REGISTER_TYPE_UW, 1),4192subscript(high, BRW_REGISTER_TYPE_UW, 0));41934194if (needs_mov || inst->conditional_mod)4195set_condmod(inst->conditional_mod, ibld.MOV(orig_dst, low));4196}4197}41984199void4200fs_visitor::lower_mul_qword_inst(fs_inst *inst, bblock_t *block)4201{4202const fs_builder ibld(this, block, inst);42034204/* Considering two 64-bit integers ab and cd where each letter ab4205* corresponds to 32 bits, we get a 128-bit result WXYZ. We * cd4206* only need to provide the YZ part of the result. -------4207* BD4208* Only BD needs to be 64 bits. For AD and BC we only care + AD4209* about the lower 32 bits (since they are part of the upper + BC4210* 32 bits of our result). AC is not needed since it starts + AC4211* on the 65th bit of the result. -------4212* WXYZ4213*/4214unsigned int q_regs = regs_written(inst);4215unsigned int d_regs = (q_regs + 1) / 2;42164217fs_reg bd(VGRF, alloc.allocate(q_regs), BRW_REGISTER_TYPE_UQ);4218fs_reg ad(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);4219fs_reg bc(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);42204221/* Here we need the full 64 bit result for 32b * 32b. 
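*
* A self-contained C sketch of this decomposition (illustrative only, not
* driver code):
*
*    static uint64_t mul64_low(uint64_t x, uint64_t y)
*    {
*       uint64_t a = x >> 32, b = x & 0xffffffffu;   // x = ab
*       uint64_t c = y >> 32, d = y & 0xffffffffu;   // y = cd
*       uint64_t bd = b * d;                         // full 64-bit BD
*       uint64_t ad = (a * d) & 0xffffffffu;         // low 32 bits of AD
*       uint64_t bc = (b * c) & 0xffffffffu;         // low 32 bits of BC
*       return bd + ((ad + bc) << 32);               // AC starts at bit 64
*    }
*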
*/4222if (devinfo->has_integer_dword_mul) {4223ibld.MUL(bd, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),4224subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));4225} else {4226fs_reg bd_high(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);4227fs_reg bd_low(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);4228fs_reg acc = retype(brw_acc_reg(inst->exec_size), BRW_REGISTER_TYPE_UD);42294230fs_inst *mul = ibld.MUL(acc,4231subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),4232subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0));4233mul->writes_accumulator = true;42344235ibld.MACH(bd_high, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),4236subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));4237ibld.MOV(bd_low, acc);42384239ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 0), bd_low);4240ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 1), bd_high);4241}42424243ibld.MUL(ad, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1),4244subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));4245ibld.MUL(bc, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),4246subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 1));42474248ibld.ADD(ad, ad, bc);4249ibld.ADD(subscript(bd, BRW_REGISTER_TYPE_UD, 1),4250subscript(bd, BRW_REGISTER_TYPE_UD, 1), ad);42514252if (devinfo->has_64bit_int) {4253ibld.MOV(inst->dst, bd);4254} else {4255ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),4256subscript(bd, BRW_REGISTER_TYPE_UD, 0));4257ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),4258subscript(bd, BRW_REGISTER_TYPE_UD, 1));4259}4260}42614262void4263fs_visitor::lower_mulh_inst(fs_inst *inst, bblock_t *block)4264{4265const fs_builder ibld(this, block, inst);42664267/* According to the BDW+ BSpec page for the "Multiply Accumulate4268* High" instruction:4269*4270* "An added preliminary mov is required for source modification on4271* src1:4272* mov (8) r3.0<1>:d -r3<8;8,1>:d4273* mul (8) acc0:d r2.0<8;8,1>:d r3.0<16;8,2>:uw4274* mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d"4275*/4276if (devinfo->ver >= 8 && (inst->src[1].negate || inst->src[1].abs))4277lower_src_modifiers(this, block, inst, 1);42784279/* Should have been lowered to 8-wide. */4280assert(inst->exec_size <= get_lowered_simd_width(devinfo, inst));4281const fs_reg acc = retype(brw_acc_reg(inst->exec_size), inst->dst.type);4282fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);4283fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);42844285if (devinfo->ver >= 8) {4286/* Until Gfx8, integer multiplies read 32-bits from one source,4287* and 16-bits from the other, and relying on the MACH instruction4288* to generate the high bits of the result.4289*4290* On Gfx8, the multiply instruction does a full 32x32-bit4291* multiply, but in order to do a 64-bit multiply we can simulate4292* the previous behavior and then use a MACH instruction.4293*/4294assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||4295mul->src[1].type == BRW_REGISTER_TYPE_UD);4296mul->src[1].type = BRW_REGISTER_TYPE_UW;4297mul->src[1].stride *= 2;42984299if (mul->src[1].file == IMM) {4300mul->src[1] = brw_imm_uw(mul->src[1].ud);4301}4302} else if (devinfo->verx10 == 70 &&4303inst->group > 0) {4304/* Among other things the quarter control bits influence which4305* accumulator register is used by the hardware for instructions4306* that access the accumulator implicitly (e.g. MACH). 
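*
* (Aside, illustrative only: the value the mul/mach pair above produces is
* just the high half of the widened product, i.e. in C terms
*
*    static uint32_t mulh32(uint32_t a, uint32_t b)
*    {
*       return (uint32_t)(((uint64_t)a * b) >> 32);
*    }
*
* for the unsigned case; the signed case widens to int64_t instead.)
*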
A4307* second-half instruction would normally map to acc1, which4308* doesn't exist on Gfx7 and up (the hardware does emulate it for4309* floating-point instructions *only* by taking advantage of the4310* extra precision of acc0 not normally used for floating point4311* arithmetic).4312*4313* HSW and up are careful enough not to try to access an4314* accumulator register that doesn't exist, but on earlier Gfx74315* hardware we need to make sure that the quarter control bits are4316* zero to avoid non-deterministic behaviour and emit an extra MOV4317* to get the result masked correctly according to the current4318* channel enables.4319*/4320mach->group = 0;4321mach->force_writemask_all = true;4322mach->dst = ibld.vgrf(inst->dst.type);4323ibld.MOV(inst->dst, mach->dst);4324}4325}43264327bool4328fs_visitor::lower_integer_multiplication()4329{4330bool progress = false;43314332foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {4333if (inst->opcode == BRW_OPCODE_MUL) {4334/* If the instruction is already in a form that does not need lowering,4335* return early.4336*/4337if (devinfo->ver >= 7) {4338if (type_sz(inst->src[1].type) < 4 && type_sz(inst->src[0].type) <= 4)4339continue;4340} else {4341if (type_sz(inst->src[0].type) < 4 && type_sz(inst->src[1].type) <= 4)4342continue;4343}43444345if ((inst->dst.type == BRW_REGISTER_TYPE_Q ||4346inst->dst.type == BRW_REGISTER_TYPE_UQ) &&4347(inst->src[0].type == BRW_REGISTER_TYPE_Q ||4348inst->src[0].type == BRW_REGISTER_TYPE_UQ) &&4349(inst->src[1].type == BRW_REGISTER_TYPE_Q ||4350inst->src[1].type == BRW_REGISTER_TYPE_UQ)) {4351lower_mul_qword_inst(inst, block);4352inst->remove(block);4353progress = true;4354} else if (!inst->dst.is_accumulator() &&4355(inst->dst.type == BRW_REGISTER_TYPE_D ||4356inst->dst.type == BRW_REGISTER_TYPE_UD) &&4357(!devinfo->has_integer_dword_mul ||4358devinfo->verx10 >= 125)) {4359lower_mul_dword_inst(inst, block);4360inst->remove(block);4361progress = true;4362}4363} else if (inst->opcode == SHADER_OPCODE_MULH) {4364lower_mulh_inst(inst, block);4365inst->remove(block);4366progress = true;4367}43684369}43704371if (progress)4372invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);43734374return progress;4375}43764377bool4378fs_visitor::lower_minmax()4379{4380assert(devinfo->ver < 6);43814382bool progress = false;43834384foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {4385const fs_builder ibld(this, block, inst);43864387if (inst->opcode == BRW_OPCODE_SEL &&4388inst->predicate == BRW_PREDICATE_NONE) {4389/* If src1 is an immediate value that is not NaN, then it can't be4390* NaN. In that case, emit CMP because it is much better for cmod4391* propagation. Likewise if src1 is not float. 
Gfx4 and Gfx5 don't4392* support HF or DF, so it is not necessary to check for those.4393*/4394if (inst->src[1].type != BRW_REGISTER_TYPE_F ||4395(inst->src[1].file == IMM && !isnan(inst->src[1].f))) {4396ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],4397inst->conditional_mod);4398} else {4399ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1],4400inst->conditional_mod);4401}4402inst->predicate = BRW_PREDICATE_NORMAL;4403inst->conditional_mod = BRW_CONDITIONAL_NONE;44044405progress = true;4406}4407}44084409if (progress)4410invalidate_analysis(DEPENDENCY_INSTRUCTIONS);44114412return progress;4413}44144415bool4416fs_visitor::lower_sub_sat()4417{4418bool progress = false;44194420foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {4421const fs_builder ibld(this, block, inst);44224423if (inst->opcode == SHADER_OPCODE_USUB_SAT ||4424inst->opcode == SHADER_OPCODE_ISUB_SAT) {4425/* The fundamental problem is the hardware performs source negation4426* at the bit width of the source. If the source is 0x80000000D, the4427* negation is 0x80000000D. As a result, subtractSaturate(0,4428* 0x80000000) will produce 0x80000000 instead of 0x7fffffff. There4429* are at least three ways to resolve this:4430*4431* 1. Use the accumulator for the negated source. The accumulator is4432* 33 bits, so our source 0x80000000 is sign-extended to4433* 0x1800000000. The negation of which is 0x080000000. This4434* doesn't help for 64-bit integers (which are already bigger than4435* 33 bits). There are also only 8 accumulators, so SIMD16 or4436* SIMD32 instructions would have to be split into multiple SIMD84437* instructions.4438*4439* 2. Use slightly different math. For any n-bit value x, we know (x4440* >> 1) != -(x >> 1). We can use this fact to only do4441* subtractions involving (x >> 1). subtractSaturate(a, b) ==4442* subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)).4443*4444* 3. For unsigned sources, it is sufficient to replace the4445* subtractSaturate with (a > b) ? a - b : 0.4446*4447* It may also be possible to use the SUBB instruction. This4448* implicitly writes the accumulator, so it could only be used in the4449* same situations as #1 above. It is further limited by only4450* allowing UD sources.4451*/4452if (inst->exec_size == 8 && inst->src[0].type != BRW_REGISTER_TYPE_Q &&4453inst->src[0].type != BRW_REGISTER_TYPE_UQ) {4454fs_reg acc(ARF, BRW_ARF_ACCUMULATOR, inst->src[1].type);44554456ibld.MOV(acc, inst->src[1]);4457fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]);4458add->saturate = true;4459add->src[0].negate = true;4460} else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) {4461/* tmp = src1 >> 1;4462* dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp));4463*/4464fs_reg tmp1 = ibld.vgrf(inst->src[0].type);4465fs_reg tmp2 = ibld.vgrf(inst->src[0].type);4466fs_reg tmp3 = ibld.vgrf(inst->src[0].type);4467fs_inst *add;44684469ibld.SHR(tmp1, inst->src[1], brw_imm_d(1));44704471add = ibld.ADD(tmp2, inst->src[1], tmp1);4472add->src[1].negate = true;44734474add = ibld.ADD(tmp3, inst->src[0], tmp1);4475add->src[1].negate = true;4476add->saturate = true;44774478add = ibld.ADD(inst->dst, tmp3, tmp2);4479add->src[1].negate = true;4480add->saturate = true;4481} else {4482/* a > b ? 
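*
* (Aside, illustrative only -- a plain C model of the signed path above,
* using a hypothetical 64-bit helper for the saturating add:
*
*    static int32_t add_sat32(int64_t x, int64_t y)
*    {
*       int64_t s = x + y;
*       return s > INT32_MAX ? INT32_MAX :
*              s < INT32_MIN ? INT32_MIN : (int32_t)s;
*    }
*
*    static int32_t isub_sat32(int32_t a, int32_t b)
*    {
*       int32_t tmp = b >> 1;
*       return add_sat32(add_sat32(a, -(int64_t)tmp),
*                        -((int64_t)b - tmp));
*    }
*
* Neither -tmp nor -(b - tmp) can overflow in 32 bits, which is the point
* of the b >> 1 split.)
*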
a - b : 0 */4483ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],4484BRW_CONDITIONAL_G);44854486fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]);4487add->src[1].negate = !add->src[1].negate;44884489ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0))4490->predicate = BRW_PREDICATE_NORMAL;4491}44924493inst->remove(block);4494progress = true;4495}4496}44974498if (progress)4499invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);45004501return progress;4502}45034504/**4505* Get the mask of SIMD channels enabled during dispatch and not yet disabled4506* by discard. Due to the layout of the sample mask in the fragment shader4507* thread payload, \p bld is required to have a dispatch_width() not greater4508* than 16 for fragment shaders.4509*/4510static fs_reg4511sample_mask_reg(const fs_builder &bld)4512{4513const fs_visitor *v = static_cast<const fs_visitor *>(bld.shader);45144515if (v->stage != MESA_SHADER_FRAGMENT) {4516return brw_imm_ud(0xffffffff);4517} else if (brw_wm_prog_data(v->stage_prog_data)->uses_kill) {4518assert(bld.dispatch_width() <= 16);4519return brw_flag_subreg(sample_mask_flag_subreg(v) + bld.group() / 16);4520} else {4521assert(v->devinfo->ver >= 6 && bld.dispatch_width() <= 16);4522return retype(brw_vec1_grf((bld.group() >= 16 ? 2 : 1), 7),4523BRW_REGISTER_TYPE_UW);4524}4525}45264527static void4528setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,4529fs_reg *dst, fs_reg color, unsigned components)4530{4531if (key->clamp_fragment_color) {4532fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);4533assert(color.type == BRW_REGISTER_TYPE_F);45344535for (unsigned i = 0; i < components; i++)4536set_saturate(true,4537bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));45384539color = tmp;4540}45414542for (unsigned i = 0; i < components; i++)4543dst[i] = offset(color, bld, i);4544}45454546uint32_t4547brw_fb_write_msg_control(const fs_inst *inst,4548const struct brw_wm_prog_data *prog_data)4549{4550uint32_t mctl;45514552if (inst->opcode == FS_OPCODE_REP_FB_WRITE) {4553assert(inst->group == 0 && inst->exec_size == 16);4554mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;4555} else if (prog_data->dual_src_blend) {4556assert(inst->exec_size == 8);45574558if (inst->group % 16 == 0)4559mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;4560else if (inst->group % 16 == 8)4561mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;4562else4563unreachable("Invalid dual-source FB write instruction group");4564} else {4565assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));45664567if (inst->exec_size == 16)4568mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;4569else if (inst->exec_size == 8)4570mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;4571else4572unreachable("Invalid FB write execution size");4573}45744575return mctl;4576}45774578static void4579lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,4580const struct brw_wm_prog_data *prog_data,4581const brw_wm_prog_key *key,4582const fs_visitor::thread_payload &payload)4583{4584assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);4585const intel_device_info *devinfo = bld.shader->devinfo;4586const fs_reg &color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];4587const fs_reg &color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];4588const fs_reg &src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];4589const fs_reg &src_depth = 
inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];4590const fs_reg &dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];4591const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];4592fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];4593const unsigned components =4594inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;45954596assert(inst->target != 0 || src0_alpha.file == BAD_FILE);45974598/* We can potentially have a message length of up to 15, so we have to set4599* base_mrf to either 0 or 1 in order to fit in m0..m15.4600*/4601fs_reg sources[15];4602int header_size = 2, payload_header_size;4603unsigned length = 0;46044605if (devinfo->ver < 6) {4606/* TODO: Support SIMD32 on gfx4-5 */4607assert(bld.group() < 16);46084609/* For gfx4-5, we always have a header consisting of g0 and g1. We have4610* an implied MOV from g0,g1 to the start of the message. The MOV from4611* g0 is handled by the hardware and the MOV from g1 is provided by the4612* generator. This is required because, on gfx4-5, the generator may4613* generate two write messages with different message lengths in order4614* to handle AA data properly.4615*4616* Also, since the pixel mask goes in the g0 portion of the message and4617* since render target writes are the last thing in the shader, we write4618* the pixel mask directly into g0 and it will get copied as part of the4619* implied write.4620*/4621if (prog_data->uses_kill) {4622bld.exec_all().group(1, 0)4623.MOV(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW),4624sample_mask_reg(bld));4625}46264627assert(length == 0);4628length = 2;4629} else if ((devinfo->verx10 <= 70 &&4630prog_data->uses_kill) ||4631(devinfo->ver < 11 &&4632(color1.file != BAD_FILE || key->nr_color_regions > 1))) {4633/* From the Sandy Bridge PRM, volume 4, page 198:4634*4635* "Dispatched Pixel Enables. One bit per pixel indicating4636* which pixels were originally enabled when the thread was4637* dispatched. This field is only required for the end-of-4638* thread message and on all dual-source messages."4639*/4640const fs_builder ubld = bld.exec_all().group(8, 0);46414642fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);4643if (bld.group() < 16) {4644/* The header starts off as g0 and g1 for the first half */4645ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),4646BRW_REGISTER_TYPE_UD));4647} else {4648/* The header starts off as g0 and g2 for the second half */4649assert(bld.group() < 32);4650const fs_reg header_sources[2] = {4651retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),4652retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD),4653};4654ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);46554656/* Gfx12 will require additional fix-ups if we ever hit this path. */4657assert(devinfo->ver < 12);4658}46594660uint32_t g00_bits = 0;46614662/* Set "Source0 Alpha Present to RenderTarget" bit in message4663* header.4664*/4665if (src0_alpha.file != BAD_FILE)4666g00_bits |= 1 << 11;46674668/* Set computes stencil to render target */4669if (prog_data->computed_stencil)4670g00_bits |= 1 << 14;46714672if (g00_bits) {4673/* OR extra bits into g0.0 */4674ubld.group(1, 0).OR(component(header, 0),4675retype(brw_vec1_grf(0, 0),4676BRW_REGISTER_TYPE_UD),4677brw_imm_ud(g00_bits));4678}46794680/* Set the render target index for choosing BLEND_STATE. 
*/4681if (inst->target > 0) {4682ubld.group(1, 0).MOV(component(header, 2), brw_imm_ud(inst->target));4683}46844685if (prog_data->uses_kill) {4686ubld.group(1, 0).MOV(retype(component(header, 15),4687BRW_REGISTER_TYPE_UW),4688sample_mask_reg(bld));4689}46904691assert(length == 0);4692sources[0] = header;4693sources[1] = horiz_offset(header, 8);4694length = 2;4695}4696assert(length == 0 || length == 2);4697header_size = length;46984699if (payload.aa_dest_stencil_reg[0]) {4700assert(inst->group < 16);4701sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));4702bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")4703.MOV(sources[length],4704fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg[0], 0)));4705length++;4706}47074708if (src0_alpha.file != BAD_FILE) {4709for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) {4710const fs_builder &ubld = bld.exec_all().group(8, i)4711.annotate("FB write src0 alpha");4712const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_F);4713ubld.MOV(tmp, horiz_offset(src0_alpha, i * 8));4714setup_color_payload(ubld, key, &sources[length], tmp, 1);4715length++;4716}4717}47184719if (sample_mask.file != BAD_FILE) {4720sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1),4721BRW_REGISTER_TYPE_UD);47224723/* Hand over gl_SampleMask. Only the lower 16 bits of each channel are4724* relevant. Since it's unsigned single words one vgrf is always4725* 16-wide, but only the lower or higher 8 channels will be used by the4726* hardware when doing a SIMD8 write depending on whether we have4727* selected the subspans for the first or second half respectively.4728*/4729assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);4730sample_mask.type = BRW_REGISTER_TYPE_UW;4731sample_mask.stride *= 2;47324733bld.exec_all().annotate("FB write oMask")4734.MOV(horiz_offset(retype(sources[length], BRW_REGISTER_TYPE_UW),4735inst->group % 16),4736sample_mask);4737length++;4738}47394740payload_header_size = length;47414742setup_color_payload(bld, key, &sources[length], color0, components);4743length += 4;47444745if (color1.file != BAD_FILE) {4746setup_color_payload(bld, key, &sources[length], color1, components);4747length += 4;4748}47494750if (src_depth.file != BAD_FILE) {4751sources[length] = src_depth;4752length++;4753}47544755if (dst_depth.file != BAD_FILE) {4756sources[length] = dst_depth;4757length++;4758}47594760if (src_stencil.file != BAD_FILE) {4761assert(devinfo->ver >= 9);4762assert(bld.dispatch_width() == 8);47634764/* XXX: src_stencil is only available on gfx9+. dst_depth is never4765* available on gfx9+. 
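*
* (Aside on the oMask hand-off a few lines above, illustrative only:
* retyping the 32-bit sample-mask channels to UW and doubling the stride
* picks out the low word of every dword, e.g.
*
*    g10<8,8,1>UD    = m0       m1       ...  m7
*    g10.0<16,8,2>UW = lo16(m0) lo16(m1) ...  lo16(m7)
*
* which is exactly the "only the lower 16 bits are relevant" part.)
*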
As such it's impossible to have both enabled at the4766* same time and therefore length cannot overrun the array.4767*/4768assert(length < 15);47694770sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD);4771bld.exec_all().annotate("FB write OS")4772.MOV(retype(sources[length], BRW_REGISTER_TYPE_UB),4773subscript(src_stencil, BRW_REGISTER_TYPE_UB, 0));4774length++;4775}47764777fs_inst *load;4778if (devinfo->ver >= 7) {4779/* Send from the GRF */4780fs_reg payload = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F);4781load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);4782payload.nr = bld.shader->alloc.allocate(regs_written(load));4783load->dst = payload;47844785uint32_t msg_ctl = brw_fb_write_msg_control(inst, prog_data);47864787inst->desc =4788(inst->group / 16) << 11 | /* rt slot group */4789brw_fb_write_desc(devinfo, inst->target, msg_ctl, inst->last_rt,4790prog_data->per_coarse_pixel_dispatch);47914792uint32_t ex_desc = 0;4793if (devinfo->ver >= 11) {4794/* Set the "Render Target Index" and "Src0 Alpha Present" fields4795* in the extended message descriptor, in lieu of using a header.4796*/4797ex_desc = inst->target << 12 | (src0_alpha.file != BAD_FILE) << 15;47984799if (key->nr_color_regions == 0)4800ex_desc |= 1 << 20; /* Null Render Target */4801}4802inst->ex_desc = ex_desc;48034804inst->opcode = SHADER_OPCODE_SEND;4805inst->resize_sources(3);4806inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;4807inst->src[0] = brw_imm_ud(0);4808inst->src[1] = brw_imm_ud(0);4809inst->src[2] = payload;4810inst->mlen = regs_written(load);4811inst->ex_mlen = 0;4812inst->header_size = header_size;4813inst->check_tdr = true;4814inst->send_has_side_effects = true;4815} else {4816/* Send from the MRF */4817load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),4818sources, length, payload_header_size);48194820/* On pre-SNB, we have to interlace the color values. 
LOAD_PAYLOAD4821* will do this for us if we just give it a COMPR4 destination.4822*/4823if (devinfo->ver < 6 && bld.dispatch_width() == 16)4824load->dst.nr |= BRW_MRF_COMPR4;48254826if (devinfo->ver < 6) {4827/* Set up src[0] for the implied MOV from grf0-1 */4828inst->resize_sources(1);4829inst->src[0] = brw_vec8_grf(0, 0);4830} else {4831inst->resize_sources(0);4832}4833inst->base_mrf = 1;4834inst->opcode = FS_OPCODE_FB_WRITE;4835inst->mlen = regs_written(load);4836inst->header_size = header_size;4837}4838}48394840static void4841lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst)4842{4843const intel_device_info *devinfo = bld.shader->devinfo;4844const fs_builder &ubld = bld.exec_all().group(8, 0);4845const unsigned length = 2;4846const fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, length);48474848if (bld.group() < 16) {4849ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),4850BRW_REGISTER_TYPE_UD));4851} else {4852assert(bld.group() < 32);4853const fs_reg header_sources[] = {4854retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),4855retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD)4856};4857ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0);48584859if (devinfo->ver >= 12) {4860/* On Gfx12 the Viewport and Render Target Array Index fields (AKA4861* Poly 0 Info) are provided in r1.1 instead of r0.0, and the render4862* target message header format was updated accordingly -- However4863* the updated format only works for the lower 16 channels in a4864* SIMD32 thread, since the higher 16 channels want the subspan data4865* from r2 instead of r1, so we need to copy over the contents of4866* r1.1 in order to fix things up.4867*/4868ubld.group(1, 0).MOV(component(header, 9),4869retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_UD));4870}4871}48724873inst->resize_sources(1);4874inst->src[0] = header;4875inst->opcode = FS_OPCODE_FB_READ;4876inst->mlen = length;4877inst->header_size = length;4878}48794880static void4881lower_sampler_logical_send_gfx4(const fs_builder &bld, fs_inst *inst, opcode op,4882const fs_reg &coordinate,4883const fs_reg &shadow_c,4884const fs_reg &lod, const fs_reg &lod2,4885const fs_reg &surface,4886const fs_reg &sampler,4887unsigned coord_components,4888unsigned grad_components)4889{4890const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||4891op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);4892fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);4893fs_reg msg_end = msg_begin;48944895/* g0 header. */4896msg_end = offset(msg_end, bld.group(8, 0), 1);48974898for (unsigned i = 0; i < coord_components; i++)4899bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),4900offset(coordinate, bld, i));49014902msg_end = offset(msg_end, bld, coord_components);49034904/* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD84905* require all three components to be present and zero if they are unused.4906*/4907if (coord_components > 0 &&4908(has_lod || shadow_c.file != BAD_FILE ||4909(op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {4910assert(coord_components <= 3);4911for (unsigned i = 0; i < 3 - coord_components; i++)4912bld.MOV(offset(msg_end, bld, i), brw_imm_f(0.0f));49134914msg_end = offset(msg_end, bld, 3 - coord_components);4915}49164917if (op == SHADER_OPCODE_TXD) {4918/* TXD unsupported in SIMD16 mode. 
*/4919assert(bld.dispatch_width() == 8);49204921/* the slots for u and v are always present, but r is optional */4922if (coord_components < 2)4923msg_end = offset(msg_end, bld, 2 - coord_components);49244925/* P = u, v, r4926* dPdx = dudx, dvdx, drdx4927* dPdy = dudy, dvdy, drdy4928*4929* 1-arg: Does not exist.4930*4931* 2-arg: dudx dvdx dudy dvdy4932* dPdx.x dPdx.y dPdy.x dPdy.y4933* m4 m5 m6 m74934*4935* 3-arg: dudx dvdx drdx dudy dvdy drdy4936* dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z4937* m5 m6 m7 m8 m9 m104938*/4939for (unsigned i = 0; i < grad_components; i++)4940bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));49414942msg_end = offset(msg_end, bld, MAX2(grad_components, 2));49434944for (unsigned i = 0; i < grad_components; i++)4945bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));49464947msg_end = offset(msg_end, bld, MAX2(grad_components, 2));4948}49494950if (has_lod) {4951/* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *Without*4952* shadow comparator (including RESINFO) it's unsupported in SIMD8 mode.4953*/4954assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :4955bld.dispatch_width() == 16);49564957const brw_reg_type type =4958(op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?4959BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);4960bld.MOV(retype(msg_end, type), lod);4961msg_end = offset(msg_end, bld, 1);4962}49634964if (shadow_c.file != BAD_FILE) {4965if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {4966/* There's no plain shadow compare message, so we use shadow4967* compare with a bias of 0.0.4968*/4969bld.MOV(msg_end, brw_imm_f(0.0f));4970msg_end = offset(msg_end, bld, 1);4971}49724973bld.MOV(msg_end, shadow_c);4974msg_end = offset(msg_end, bld, 1);4975}49764977inst->opcode = op;4978inst->src[0] = reg_undef;4979inst->src[1] = surface;4980inst->src[2] = sampler;4981inst->resize_sources(3);4982inst->base_mrf = msg_begin.nr;4983inst->mlen = msg_end.nr - msg_begin.nr;4984inst->header_size = 1;4985}49864987static void4988lower_sampler_logical_send_gfx5(const fs_builder &bld, fs_inst *inst, opcode op,4989const fs_reg &coordinate,4990const fs_reg &shadow_c,4991const fs_reg &lod, const fs_reg &lod2,4992const fs_reg &sample_index,4993const fs_reg &surface,4994const fs_reg &sampler,4995unsigned coord_components,4996unsigned grad_components)4997{4998fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);4999fs_reg msg_coords = message;5000unsigned header_size = 0;50015002if (inst->offset != 0) {5003/* The offsets set up by the visitor are in the m1 header, so we can't5004* go headerless.5005*/5006header_size = 1;5007message.nr--;5008}50095010for (unsigned i = 0; i < coord_components; i++)5011bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type),5012offset(coordinate, bld, i));50135014fs_reg msg_end = offset(msg_coords, bld, coord_components);5015fs_reg msg_lod = offset(msg_coords, bld, 4);50165017if (shadow_c.file != BAD_FILE) {5018fs_reg msg_shadow = msg_lod;5019bld.MOV(msg_shadow, shadow_c);5020msg_lod = offset(msg_shadow, bld, 1);5021msg_end = msg_lod;5022}50235024switch (op) {5025case SHADER_OPCODE_TXL:5026case FS_OPCODE_TXB:5027bld.MOV(msg_lod, lod);5028msg_end = offset(msg_lod, bld, 1);5029break;5030case SHADER_OPCODE_TXD:5031/**5032* P = u, v, r5033* dPdx = dudx, dvdx, drdx5034* dPdy = dudy, dvdy, drdy5035*5036* Load up these values:5037* - dudx dudy dvdx dvdy drdx drdy5038* - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z5039*/5040msg_end = msg_lod;5041for (unsigned i = 0; i < grad_components; i++) {5042bld.MOV(msg_end, offset(lod, bld, 
i));5043msg_end = offset(msg_end, bld, 1);50445045bld.MOV(msg_end, offset(lod2, bld, i));5046msg_end = offset(msg_end, bld, 1);5047}5048break;5049case SHADER_OPCODE_TXS:5050msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);5051bld.MOV(msg_lod, lod);5052msg_end = offset(msg_lod, bld, 1);5053break;5054case SHADER_OPCODE_TXF:5055msg_lod = offset(msg_coords, bld, 3);5056bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);5057msg_end = offset(msg_lod, bld, 1);5058break;5059case SHADER_OPCODE_TXF_CMS:5060msg_lod = offset(msg_coords, bld, 3);5061/* lod */5062bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));5063/* sample index */5064bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);5065msg_end = offset(msg_lod, bld, 2);5066break;5067default:5068break;5069}50705071inst->opcode = op;5072inst->src[0] = reg_undef;5073inst->src[1] = surface;5074inst->src[2] = sampler;5075inst->resize_sources(3);5076inst->base_mrf = message.nr;5077inst->mlen = msg_end.nr - message.nr;5078inst->header_size = header_size;50795080/* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */5081assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);5082}50835084static bool5085is_high_sampler(const struct intel_device_info *devinfo, const fs_reg &sampler)5086{5087if (devinfo->verx10 <= 70)5088return false;50895090return sampler.file != IMM || sampler.ud >= 16;5091}50925093static unsigned5094sampler_msg_type(const intel_device_info *devinfo,5095opcode opcode, bool shadow_compare)5096{5097assert(devinfo->ver >= 5);5098switch (opcode) {5099case SHADER_OPCODE_TEX:5100return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE :5101GFX5_SAMPLER_MESSAGE_SAMPLE;5102case FS_OPCODE_TXB:5103return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE :5104GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS;5105case SHADER_OPCODE_TXL:5106return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE :5107GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;5108case SHADER_OPCODE_TXL_LZ:5109return shadow_compare ? GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ :5110GFX9_SAMPLER_MESSAGE_SAMPLE_LZ;5111case SHADER_OPCODE_TXS:5112case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:5113return GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;5114case SHADER_OPCODE_TXD:5115assert(!shadow_compare || devinfo->verx10 >= 75);5116return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE :5117GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;5118case SHADER_OPCODE_TXF:5119return GFX5_SAMPLER_MESSAGE_SAMPLE_LD;5120case SHADER_OPCODE_TXF_LZ:5121assert(devinfo->ver >= 9);5122return GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;5123case SHADER_OPCODE_TXF_CMS_W:5124assert(devinfo->ver >= 9);5125return GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;5126case SHADER_OPCODE_TXF_CMS:5127return devinfo->ver >= 7 ? GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DMS :5128GFX5_SAMPLER_MESSAGE_SAMPLE_LD;5129case SHADER_OPCODE_TXF_UMS:5130assert(devinfo->ver >= 7);5131return GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;5132case SHADER_OPCODE_TXF_MCS:5133assert(devinfo->ver >= 7);5134return GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;5135case SHADER_OPCODE_LOD:5136return GFX5_SAMPLER_MESSAGE_LOD;5137case SHADER_OPCODE_TG4:5138assert(devinfo->ver >= 7);5139return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C :5140GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;5141break;5142case SHADER_OPCODE_TG4_OFFSET:5143assert(devinfo->ver >= 7);5144return shadow_compare ? 
GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C :5145GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;5146case SHADER_OPCODE_SAMPLEINFO:5147return GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;5148default:5149unreachable("not reached");5150}5151}51525153static void5154lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,5155const fs_reg &coordinate,5156const fs_reg &shadow_c,5157fs_reg lod, const fs_reg &lod2,5158const fs_reg &min_lod,5159const fs_reg &sample_index,5160const fs_reg &mcs,5161const fs_reg &surface,5162const fs_reg &sampler,5163const fs_reg &surface_handle,5164const fs_reg &sampler_handle,5165const fs_reg &tg4_offset,5166unsigned coord_components,5167unsigned grad_components)5168{5169const intel_device_info *devinfo = bld.shader->devinfo;5170const brw_stage_prog_data *prog_data = bld.shader->stage_prog_data;5171unsigned reg_width = bld.dispatch_width() / 8;5172unsigned header_size = 0, length = 0;5173fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];5174for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)5175sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);51765177/* We must have exactly one of surface/sampler and surface/sampler_handle */5178assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));5179assert((sampler.file == BAD_FILE) != (sampler_handle.file == BAD_FILE));51805181if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||5182inst->offset != 0 || inst->eot ||5183op == SHADER_OPCODE_SAMPLEINFO ||5184sampler_handle.file != BAD_FILE ||5185is_high_sampler(devinfo, sampler)) {5186/* For general texture offsets (no txf workaround), we need a header to5187* put them in.5188*5189* TG4 needs to place its channel select in the header, for interaction5190* with ARB_texture_swizzle. The sampler index is only 4-bits, so for5191* larger sampler numbers we need to offset the Sampler State Pointer in5192* the header.5193*/5194fs_reg header = retype(sources[0], BRW_REGISTER_TYPE_UD);5195header_size = 1;5196length++;51975198/* If we're requesting fewer than four channels worth of response,5199* and we have an explicit header, we need to set up the sampler5200* writemask. It's reversed from normal: 1 means "don't write".5201*/5202if (!inst->eot && regs_written(inst) != 4 * reg_width) {5203assert(regs_written(inst) % reg_width == 0);5204unsigned mask = ~((1 << (regs_written(inst) / reg_width)) - 1) & 0xf;5205inst->offset |= mask << 12;5206}52075208/* Build the actual header */5209const fs_builder ubld = bld.exec_all().group(8, 0);5210const fs_builder ubld1 = ubld.group(1, 0);5211ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));5212if (inst->offset) {5213ubld1.MOV(component(header, 2), brw_imm_ud(inst->offset));5214} else if (bld.shader->stage != MESA_SHADER_VERTEX &&5215bld.shader->stage != MESA_SHADER_FRAGMENT) {5216/* The vertex and fragment stages have g0.2 set to 0, so5217* header0.2 is 0 when g0 is copied. Other stages may not, so we5218* must set it to 0 to avoid setting undesirable bits in the5219* message.5220*/5221ubld1.MOV(component(header, 2), brw_imm_ud(0));5222}52235224if (sampler_handle.file != BAD_FILE) {5225/* Bindless sampler handles aren't relative to the sampler state5226* pointer passed into the shader through SAMPLER_STATE_POINTERS_*.5227* Instead, it's an absolute pointer relative to dynamic state base5228* address.5229*5230* Sampler states are 16 bytes each and the pointer we give here has5231* to be 32-byte aligned. 
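*
* (Aside on the response-length writemask set up above, illustrative only:
* the mask is reversed, "1 means don't write", so a two-channel response
* gives
*
*    regs_written / reg_width = 2
*    mask = ~((1 << 2) - 1) & 0xf = 0b1100
*
* i.e. channels 2 and 3 are suppressed; the mask ends up in bits 15:12 of
* the header dword written from inst->offset.)
*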
In order to avoid more indirect messages5232* than required, we assume that all bindless sampler states are5233* 32-byte aligned. This sacrifices a bit of general state base5234* address space but means we can do something more efficient in the5235* shader.5236*/5237ubld1.MOV(component(header, 3), sampler_handle);5238} else if (is_high_sampler(devinfo, sampler)) {5239fs_reg sampler_state_ptr =5240retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD);52415242/* Gfx11+ sampler message headers include bits in 4:0 which conflict5243* with the ones included in g0.3 bits 4:0. Mask them out.5244*/5245if (devinfo->ver >= 11) {5246sampler_state_ptr = ubld1.vgrf(BRW_REGISTER_TYPE_UD);5247ubld1.AND(sampler_state_ptr,5248retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),5249brw_imm_ud(INTEL_MASK(31, 5)));5250}52515252if (sampler.file == BRW_IMMEDIATE_VALUE) {5253assert(sampler.ud >= 16);5254const int sampler_state_size = 16; /* 16 bytes */52555256ubld1.ADD(component(header, 3), sampler_state_ptr,5257brw_imm_ud(16 * (sampler.ud / 16) * sampler_state_size));5258} else {5259fs_reg tmp = ubld1.vgrf(BRW_REGISTER_TYPE_UD);5260ubld1.AND(tmp, sampler, brw_imm_ud(0x0f0));5261ubld1.SHL(tmp, tmp, brw_imm_ud(4));5262ubld1.ADD(component(header, 3), sampler_state_ptr, tmp);5263}5264} else if (devinfo->ver >= 11) {5265/* Gfx11+ sampler message headers include bits in 4:0 which conflict5266* with the ones included in g0.3 bits 4:0. Mask them out.5267*/5268ubld1.AND(component(header, 3),5269retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),5270brw_imm_ud(INTEL_MASK(31, 5)));5271}5272}52735274if (shadow_c.file != BAD_FILE) {5275bld.MOV(sources[length], shadow_c);5276length++;5277}52785279bool coordinate_done = false;52805281/* Set up the LOD info */5282switch (op) {5283case FS_OPCODE_TXB:5284case SHADER_OPCODE_TXL:5285if (devinfo->ver >= 9 && op == SHADER_OPCODE_TXL && lod.is_zero()) {5286op = SHADER_OPCODE_TXL_LZ;5287break;5288}5289bld.MOV(sources[length], lod);5290length++;5291break;5292case SHADER_OPCODE_TXD:5293/* TXD should have been lowered in SIMD16 mode. 
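*
* (Aside on the high-sampler fix-up above, illustrative only: the sampler
* field of the descriptor is only 4 bits and each SAMPLER_STATE is 16
* bytes, so for e.g. sampler index 18 the code adds
*
*    16 * (18 / 16) * 16 = 256 bytes
*
* to the sampler state pointer in header dword 3 and later uses
* 18 % 16 = 2 as the descriptor's sampler field.)
*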
*/5294assert(bld.dispatch_width() == 8);52955296/* Load dPdx and the coordinate together:5297* [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z5298*/5299for (unsigned i = 0; i < coord_components; i++) {5300bld.MOV(sources[length++], offset(coordinate, bld, i));53015302/* For cube map array, the coordinate is (u,v,r,ai) but there are5303* only derivatives for (u, v, r).5304*/5305if (i < grad_components) {5306bld.MOV(sources[length++], offset(lod, bld, i));5307bld.MOV(sources[length++], offset(lod2, bld, i));5308}5309}53105311coordinate_done = true;5312break;5313case SHADER_OPCODE_TXS:5314bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);5315length++;5316break;5317case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:5318/* We need an LOD; just use 0 */5319bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), brw_imm_ud(0));5320length++;5321break;5322case SHADER_OPCODE_TXF:5323/* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.5324* On Gfx9 they are u, v, lod, r5325*/5326bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D), coordinate);53275328if (devinfo->ver >= 9) {5329if (coord_components >= 2) {5330bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D),5331offset(coordinate, bld, 1));5332} else {5333sources[length] = brw_imm_d(0);5334}5335length++;5336}53375338if (devinfo->ver >= 9 && lod.is_zero()) {5339op = SHADER_OPCODE_TXF_LZ;5340} else {5341bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);5342length++;5343}53445345for (unsigned i = devinfo->ver >= 9 ? 2 : 1; i < coord_components; i++)5346bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),5347offset(coordinate, bld, i));53485349coordinate_done = true;5350break;53515352case SHADER_OPCODE_TXF_CMS:5353case SHADER_OPCODE_TXF_CMS_W:5354case SHADER_OPCODE_TXF_UMS:5355case SHADER_OPCODE_TXF_MCS:5356if (op == SHADER_OPCODE_TXF_UMS ||5357op == SHADER_OPCODE_TXF_CMS ||5358op == SHADER_OPCODE_TXF_CMS_W) {5359bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);5360length++;5361}53625363if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) {5364/* Data from the multisample control surface. 
*/5365bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);5366length++;53675368/* On Gfx9+ we'll use ld2dms_w instead which has two registers for5369* the MCS data.5370*/5371if (op == SHADER_OPCODE_TXF_CMS_W) {5372bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD),5373mcs.file == IMM ?5374mcs :5375offset(mcs, bld, 1));5376length++;5377}5378}53795380/* There is no offsetting for this message; just copy in the integer5381* texture coordinates.5382*/5383for (unsigned i = 0; i < coord_components; i++)5384bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),5385offset(coordinate, bld, i));53865387coordinate_done = true;5388break;5389case SHADER_OPCODE_TG4_OFFSET:5390/* More crazy intermixing */5391for (unsigned i = 0; i < 2; i++) /* u, v */5392bld.MOV(sources[length++], offset(coordinate, bld, i));53935394for (unsigned i = 0; i < 2; i++) /* offu, offv */5395bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),5396offset(tg4_offset, bld, i));53975398if (coord_components == 3) /* r if present */5399bld.MOV(sources[length++], offset(coordinate, bld, 2));54005401coordinate_done = true;5402break;5403default:5404break;5405}54065407/* Set up the coordinate (except for cases where it was done above) */5408if (!coordinate_done) {5409for (unsigned i = 0; i < coord_components; i++)5410bld.MOV(sources[length++], offset(coordinate, bld, i));5411}54125413if (min_lod.file != BAD_FILE) {5414/* Account for all of the missing coordinate sources */5415length += 4 - coord_components;5416if (op == SHADER_OPCODE_TXD)5417length += (3 - grad_components) * 2;54185419bld.MOV(sources[length++], min_lod);5420}54215422unsigned mlen;5423if (reg_width == 2)5424mlen = length * reg_width - header_size;5425else5426mlen = length * reg_width;54275428const fs_reg src_payload = fs_reg(VGRF, bld.shader->alloc.allocate(mlen),5429BRW_REGISTER_TYPE_F);5430bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);54315432/* Generate the SEND. */5433inst->opcode = SHADER_OPCODE_SEND;5434inst->mlen = mlen;5435inst->header_size = header_size;54365437const unsigned msg_type =5438sampler_msg_type(devinfo, op, inst->shadow_compare);5439const unsigned simd_mode =5440inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :5441BRW_SAMPLER_SIMD_MODE_SIMD16;54425443uint32_t base_binding_table_index;5444switch (op) {5445case SHADER_OPCODE_TG4:5446case SHADER_OPCODE_TG4_OFFSET:5447base_binding_table_index = prog_data->binding_table.gather_texture_start;5448break;5449case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:5450base_binding_table_index = prog_data->binding_table.image_start;5451break;5452default:5453base_binding_table_index = prog_data->binding_table.texture_start;5454break;5455}54565457inst->sfid = BRW_SFID_SAMPLER;5458if (surface.file == IMM &&5459(sampler.file == IMM || sampler_handle.file != BAD_FILE)) {5460inst->desc = brw_sampler_desc(devinfo,5461surface.ud + base_binding_table_index,5462sampler.file == IMM ? sampler.ud % 16 : 0,5463msg_type,5464simd_mode,54650 /* return_format unused on gfx7+ */);5466inst->src[0] = brw_imm_ud(0);5467inst->src[1] = brw_imm_ud(0);5468} else if (surface_handle.file != BAD_FILE) {5469/* Bindless surface */5470assert(devinfo->ver >= 9);5471inst->desc = brw_sampler_desc(devinfo,5472GFX9_BTI_BINDLESS,5473sampler.file == IMM ? 
sampler.ud % 16 : 0,5474msg_type,5475simd_mode,54760 /* return_format unused on gfx7+ */);54775478/* For bindless samplers, the entire address is included in the message5479* header so we can leave the portion in the message descriptor 0.5480*/5481if (sampler_handle.file != BAD_FILE || sampler.file == IMM) {5482inst->src[0] = brw_imm_ud(0);5483} else {5484const fs_builder ubld = bld.group(1, 0).exec_all();5485fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);5486ubld.SHL(desc, sampler, brw_imm_ud(8));5487inst->src[0] = desc;5488}54895490/* We assume that the driver provided the handle in the top 20 bits so5491* we can use the surface handle directly as the extended descriptor.5492*/5493inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);5494} else {5495/* Immediate portion of the descriptor */5496inst->desc = brw_sampler_desc(devinfo,54970, /* surface */54980, /* sampler */5499msg_type,5500simd_mode,55010 /* return_format unused on gfx7+ */);5502const fs_builder ubld = bld.group(1, 0).exec_all();5503fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);5504if (surface.equals(sampler)) {5505/* This case is common in GL */5506ubld.MUL(desc, surface, brw_imm_ud(0x101));5507} else {5508if (sampler_handle.file != BAD_FILE) {5509ubld.MOV(desc, surface);5510} else if (sampler.file == IMM) {5511ubld.OR(desc, surface, brw_imm_ud(sampler.ud << 8));5512} else {5513ubld.SHL(desc, sampler, brw_imm_ud(8));5514ubld.OR(desc, desc, surface);5515}5516}5517if (base_binding_table_index)5518ubld.ADD(desc, desc, brw_imm_ud(base_binding_table_index));5519ubld.AND(desc, desc, brw_imm_ud(0xfff));55205521inst->src[0] = component(desc, 0);5522inst->src[1] = brw_imm_ud(0); /* ex_desc */5523}55245525inst->ex_desc = 0;55265527inst->src[2] = src_payload;5528inst->resize_sources(3);55295530if (inst->eot) {5531/* EOT sampler messages don't make sense to split because it would5532* involve ending half of the thread early.5533*/5534assert(inst->group == 0);5535/* We need to use SENDC for EOT sampler messages */5536inst->check_tdr = true;5537inst->send_has_side_effects = true;5538}55395540/* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. 
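*
* (Aside on the dynamic descriptor set-up above, illustrative only: the
* binding table index occupies bits 7:0 and the sampler index bits 11:8,
* so the goal is surface | (sampler << 8); when surface == sampler a
* single
*
*    mul(1) desc  surface  0x101
*
* produces it, e.g. 5 * 0x101 = 0x505 = (5 << 8) | 5.)
*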
*/5541assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);5542}55435544static void5545lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)5546{5547const intel_device_info *devinfo = bld.shader->devinfo;5548const fs_reg &coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];5549const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];5550const fs_reg &lod = inst->src[TEX_LOGICAL_SRC_LOD];5551const fs_reg &lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];5552const fs_reg &min_lod = inst->src[TEX_LOGICAL_SRC_MIN_LOD];5553const fs_reg &sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];5554const fs_reg &mcs = inst->src[TEX_LOGICAL_SRC_MCS];5555const fs_reg &surface = inst->src[TEX_LOGICAL_SRC_SURFACE];5556const fs_reg &sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];5557const fs_reg &surface_handle = inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];5558const fs_reg &sampler_handle = inst->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];5559const fs_reg &tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];5560assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);5561const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;5562assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);5563const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;55645565if (devinfo->ver >= 7) {5566lower_sampler_logical_send_gfx7(bld, inst, op, coordinate,5567shadow_c, lod, lod2, min_lod,5568sample_index,5569mcs, surface, sampler,5570surface_handle, sampler_handle,5571tg4_offset,5572coord_components, grad_components);5573} else if (devinfo->ver >= 5) {5574lower_sampler_logical_send_gfx5(bld, inst, op, coordinate,5575shadow_c, lod, lod2, sample_index,5576surface, sampler,5577coord_components, grad_components);5578} else {5579lower_sampler_logical_send_gfx4(bld, inst, op, coordinate,5580shadow_c, lod, lod2,5581surface, sampler,5582coord_components, grad_components);5583}5584}55855586/**5587* Predicate the specified instruction on the sample mask.5588*/5589static void5590emit_predicate_on_sample_mask(const fs_builder &bld, fs_inst *inst)5591{5592assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&5593bld.group() == inst->group &&5594bld.dispatch_width() == inst->exec_size);55955596const fs_visitor *v = static_cast<const fs_visitor *>(bld.shader);5597const fs_reg sample_mask = sample_mask_reg(bld);5598const unsigned subreg = sample_mask_flag_subreg(v);55995600if (brw_wm_prog_data(v->stage_prog_data)->uses_kill) {5601assert(sample_mask.file == ARF &&5602sample_mask.nr == brw_flag_subreg(subreg).nr &&5603sample_mask.subnr == brw_flag_subreg(5604subreg + inst->group / 16).subnr);5605} else {5606bld.group(1, 0).exec_all()5607.MOV(brw_flag_subreg(subreg + inst->group / 16), sample_mask);5608}56095610if (inst->predicate) {5611assert(inst->predicate == BRW_PREDICATE_NORMAL);5612assert(!inst->predicate_inverse);5613assert(inst->flag_subreg == 0);5614/* Combine the sample mask with the existing predicate by using a5615* vertical predication mode.5616*/5617inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;5618} else {5619inst->flag_subreg = subreg;5620inst->predicate = BRW_PREDICATE_NORMAL;5621inst->predicate_inverse = false;5622}5623}56245625static void5626setup_surface_descriptors(const fs_builder &bld, fs_inst *inst, uint32_t desc,5627const fs_reg &surface, const fs_reg &surface_handle)5628{5629const ASSERTED intel_device_info *devinfo = bld.shader->devinfo;56305631/* We must have exactly one of surface and surface_handle */5632assert((surface.file == BAD_FILE) != 
(surface_handle.file == BAD_FILE));56335634if (surface.file == IMM) {5635inst->desc = desc | (surface.ud & 0xff);5636inst->src[0] = brw_imm_ud(0);5637inst->src[1] = brw_imm_ud(0); /* ex_desc */5638} else if (surface_handle.file != BAD_FILE) {5639/* Bindless surface */5640assert(devinfo->ver >= 9);5641inst->desc = desc | GFX9_BTI_BINDLESS;5642inst->src[0] = brw_imm_ud(0);56435644/* We assume that the driver provided the handle in the top 20 bits so5645* we can use the surface handle directly as the extended descriptor.5646*/5647inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);5648} else {5649inst->desc = desc;5650const fs_builder ubld = bld.exec_all().group(1, 0);5651fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);5652ubld.AND(tmp, surface, brw_imm_ud(0xff));5653inst->src[0] = component(tmp, 0);5654inst->src[1] = brw_imm_ud(0); /* ex_desc */5655}5656}56575658static void5659lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)5660{5661const intel_device_info *devinfo = bld.shader->devinfo;56625663/* Get the logical send arguments. */5664const fs_reg &addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];5665const fs_reg &src = inst->src[SURFACE_LOGICAL_SRC_DATA];5666const fs_reg &surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];5667const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];5668const UNUSED fs_reg &dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];5669const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];5670const fs_reg &allow_sample_mask =5671inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];5672assert(arg.file == IMM);5673assert(allow_sample_mask.file == IMM);56745675/* Calculate the total number of components of the payload. */5676const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);5677const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);56785679const bool is_typed_access =5680inst->opcode == SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL ||5681inst->opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL ||5682inst->opcode == SHADER_OPCODE_TYPED_ATOMIC_LOGICAL;56835684const bool is_surface_access = is_typed_access ||5685inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL ||5686inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL ||5687inst->opcode == SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL;56885689const bool is_stateless =5690surface.file == IMM && (surface.ud == BRW_BTI_STATELESS ||5691surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);56925693const bool has_side_effects = inst->has_side_effects();56945695fs_reg sample_mask = allow_sample_mask.ud ? sample_mask_reg(bld) :5696fs_reg(brw_imm_d(0xffff));56975698/* From the BDW PRM Volume 7, page 147:5699*5700* "For the Data Cache Data Port*, the header must be present for the5701* following message types: [...] Typed read/write/atomics"5702*5703* Earlier generations have a similar wording. Because of this restriction5704* we don't attempt to implement sample masks via predication for such5705* messages prior to Gfx9, since we have to provide a header anyway. 
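 * (When a header is present the sample mask is simply written into the last dword of the header, as done below.)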
On5706* Gfx11+ the header has been removed so we can only use predication.5707*5708* For all stateless A32 messages, we also need a header5709*/5710fs_reg header;5711if ((devinfo->ver < 9 && is_typed_access) || is_stateless) {5712fs_builder ubld = bld.exec_all().group(8, 0);5713header = ubld.vgrf(BRW_REGISTER_TYPE_UD);5714if (is_stateless) {5715assert(!is_surface_access);5716ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header);5717} else {5718ubld.MOV(header, brw_imm_d(0));5719if (is_surface_access)5720ubld.group(1, 0).MOV(component(header, 7), sample_mask);5721}5722}5723const unsigned header_sz = header.file != BAD_FILE ? 1 : 0;57245725fs_reg payload, payload2;5726unsigned mlen, ex_mlen = 0;5727if (devinfo->ver >= 9 &&5728(src.file == BAD_FILE || header.file == BAD_FILE)) {5729/* We have split sends on gfx9 and above */5730if (header.file == BAD_FILE) {5731payload = bld.move_to_vgrf(addr, addr_sz);5732payload2 = bld.move_to_vgrf(src, src_sz);5733mlen = addr_sz * (inst->exec_size / 8);5734ex_mlen = src_sz * (inst->exec_size / 8);5735} else {5736assert(src.file == BAD_FILE);5737payload = header;5738payload2 = bld.move_to_vgrf(addr, addr_sz);5739mlen = header_sz;5740ex_mlen = addr_sz * (inst->exec_size / 8);5741}5742} else {5743/* Allocate space for the payload. */5744const unsigned sz = header_sz + addr_sz + src_sz;5745payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);5746fs_reg *const components = new fs_reg[sz];5747unsigned n = 0;57485749/* Construct the payload. */5750if (header.file != BAD_FILE)5751components[n++] = header;57525753for (unsigned i = 0; i < addr_sz; i++)5754components[n++] = offset(addr, bld, i);57555756for (unsigned i = 0; i < src_sz; i++)5757components[n++] = offset(src, bld, i);57585759bld.LOAD_PAYLOAD(payload, components, sz, header_sz);5760mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;57615762delete[] components;5763}57645765/* Predicate the instruction on the sample mask if no header is5766* provided.5767*/5768if ((header.file == BAD_FILE || !is_surface_access) &&5769sample_mask.file != BAD_FILE && sample_mask.file != IMM)5770emit_predicate_on_sample_mask(bld, inst);57715772uint32_t sfid;5773switch (inst->opcode) {5774case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:5775case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:5776/* Byte scattered opcodes go through the normal data cache */5777sfid = GFX7_SFID_DATAPORT_DATA_CACHE;5778break;57795780case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:5781case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:5782sfid = devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :5783devinfo->ver >= 6 ? 
GFX6_SFID_DATAPORT_RENDER_CACHE :5784BRW_DATAPORT_READ_TARGET_RENDER_CACHE;5785break;57865787case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:5788case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:5789case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:5790case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:5791/* Untyped Surface messages go through the data cache but the SFID value5792* changed on Haswell.5793*/5794sfid = (devinfo->verx10 >= 75 ?5795HSW_SFID_DATAPORT_DATA_CACHE_1 :5796GFX7_SFID_DATAPORT_DATA_CACHE);5797break;57985799case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:5800case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:5801case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:5802/* Typed surface messages go through the render cache on IVB and the5803* data cache on HSW+.5804*/5805sfid = (devinfo->verx10 >= 75 ?5806HSW_SFID_DATAPORT_DATA_CACHE_1 :5807GFX6_SFID_DATAPORT_RENDER_CACHE);5808break;58095810default:5811unreachable("Unsupported surface opcode");5812}58135814uint32_t desc;5815switch (inst->opcode) {5816case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:5817desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,5818arg.ud, /* num_channels */5819false /* write */);5820break;58215822case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:5823desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,5824arg.ud, /* num_channels */5825true /* write */);5826break;58275828case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:5829desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,5830arg.ud, /* bit_size */5831false /* write */);5832break;58335834case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:5835desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,5836arg.ud, /* bit_size */5837true /* write */);5838break;58395840case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:5841assert(arg.ud == 32); /* bit_size */5842desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,5843false /* write */);5844break;58455846case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:5847assert(arg.ud == 32); /* bit_size */5848desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,5849true /* write */);5850break;58515852case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:5853desc = brw_dp_untyped_atomic_desc(devinfo, inst->exec_size,5854arg.ud, /* atomic_op */5855!inst->dst.is_null());5856break;58575858case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:5859desc = brw_dp_untyped_atomic_float_desc(devinfo, inst->exec_size,5860arg.ud, /* atomic_op */5861!inst->dst.is_null());5862break;58635864case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:5865desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,5866arg.ud, /* num_channels */5867false /* write */);5868break;58695870case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:5871desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,5872arg.ud, /* num_channels */5873true /* write */);5874break;58755876case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:5877desc = brw_dp_typed_atomic_desc(devinfo, inst->exec_size, inst->group,5878arg.ud, /* atomic_op */5879!inst->dst.is_null());5880break;58815882default:5883unreachable("Unknown surface logical instruction");5884}58855886/* Update the original instruction. 
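 * Turn it into a SEND to the data port with the SFID, descriptor and payloads computed above.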
 */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->ex_mlen = ex_mlen;
   inst->header_size = header_sz;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

   /* Set up SFID and descriptors */
   inst->sfid = sfid;
   setup_surface_descriptors(bld, inst, desc, surface, surface_handle);

   /* Finally, the payload */
   inst->src[2] = payload;
   inst->src[3] = payload2;

   inst->resize_sources(4);
}

static enum lsc_opcode
brw_atomic_op_to_lsc_atomic_op(unsigned op)
{
   switch(op) {
   case BRW_AOP_AND:
      return LSC_OP_ATOMIC_AND;
   case BRW_AOP_OR:
      return LSC_OP_ATOMIC_OR;
   case BRW_AOP_XOR:
      return LSC_OP_ATOMIC_XOR;
   case BRW_AOP_MOV:
      return LSC_OP_ATOMIC_STORE;
   case BRW_AOP_INC:
      return LSC_OP_ATOMIC_INC;
   case BRW_AOP_DEC:
      return LSC_OP_ATOMIC_DEC;
   case BRW_AOP_ADD:
      return LSC_OP_ATOMIC_ADD;
   case BRW_AOP_SUB:
      return LSC_OP_ATOMIC_SUB;
   case BRW_AOP_IMAX:
      return LSC_OP_ATOMIC_MAX;
   case BRW_AOP_IMIN:
      return LSC_OP_ATOMIC_MIN;
   case BRW_AOP_UMAX:
      return LSC_OP_ATOMIC_UMAX;
   case BRW_AOP_UMIN:
      return LSC_OP_ATOMIC_UMIN;
   case BRW_AOP_CMPWR:
      return LSC_OP_ATOMIC_CMPXCHG;
   default:
      assert(false);
      unreachable("invalid atomic opcode");
   }
}

static enum lsc_opcode
brw_atomic_op_to_lsc_fatomic_op(uint32_t aop)
{
   switch(aop) {
   case BRW_AOP_FMAX:
      return LSC_OP_ATOMIC_FMAX;
   case BRW_AOP_FMIN:
      return LSC_OP_ATOMIC_FMIN;
   case BRW_AOP_FCMPWR:
      return LSC_OP_ATOMIC_FCMPXCHG;
   default:
      unreachable("Unsupported float atomic opcode");
   }
}

static enum lsc_data_size
lsc_bits_to_data_size(unsigned bit_size)
{
   switch (bit_size / 8) {
   case 1: return LSC_DATA_SIZE_D8U32;
   case 2: return LSC_DATA_SIZE_D16U32;
   case 4: return LSC_DATA_SIZE_D32;
   case 8: return LSC_DATA_SIZE_D64;
   default:
      unreachable("Unsupported data size.");
   }
}

static void
lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->has_lsc);

   /* Get the logical send arguments. */
   const fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
   const fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
   const fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
   const fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
   const UNUSED fs_reg &dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
   const fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
   const fs_reg allow_sample_mask =
      inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];
   assert(arg.file == IMM);
   assert(allow_sample_mask.file == IMM);

   /* Calculate the total number of components of the payload. */
   const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
   const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);

   const bool has_side_effects = inst->has_side_effects();

   unsigned ex_mlen = 0;
   fs_reg payload, payload2;
   payload = bld.move_to_vgrf(addr, addr_sz);
   if (src.file != BAD_FILE) {
      payload2 = bld.move_to_vgrf(src, src_sz);
      ex_mlen = src_sz * (inst->exec_size / 8);
   }

   /* Predicate the instruction on the sample mask if needed */
   fs_reg sample_mask = allow_sample_mask.ud ?
sample_mask_reg(bld) :6003fs_reg(brw_imm_d(0xffff));6004if (sample_mask.file != BAD_FILE && sample_mask.file != IMM)6005emit_predicate_on_sample_mask(bld, inst);60066007if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)6008inst->sfid = GFX12_SFID_SLM;6009else6010inst->sfid = GFX12_SFID_UGM;60116012/* We must have exactly one of surface and surface_handle */6013assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));60146015enum lsc_addr_surface_type surf_type;6016if (surface_handle.file != BAD_FILE)6017surf_type = LSC_ADDR_SURFTYPE_BSS;6018else if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)6019surf_type = LSC_ADDR_SURFTYPE_FLAT;6020else6021surf_type = LSC_ADDR_SURFTYPE_BTI;60226023switch (inst->opcode) {6024case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:6025inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,6026surf_type, LSC_ADDR_SIZE_A32,60271 /* num_coordinates */,6028LSC_DATA_SIZE_D32, arg.ud /* num_channels */,6029false /* transpose */,6030LSC_CACHE_LOAD_L1STATE_L3MOCS,6031true /* has_dest */);6032break;6033case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:6034inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size,6035surf_type, LSC_ADDR_SIZE_A32,60361 /* num_coordinates */,6037LSC_DATA_SIZE_D32, arg.ud /* num_channels */,6038false /* transpose */,6039LSC_CACHE_STORE_L1STATE_L3MOCS,6040false /* has_dest */);6041break;6042case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:6043case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: {6044/* Bspec: Atomic instruction -> Cache section:6045*6046* Atomic messages are always forced to "un-cacheable" in the L16047* cache.6048*/6049enum lsc_opcode opcode =6050inst->opcode == SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL ?6051brw_atomic_op_to_lsc_fatomic_op(arg.ud) :6052brw_atomic_op_to_lsc_atomic_op(arg.ud);6053inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size,6054surf_type, LSC_ADDR_SIZE_A32,60551 /* num_coordinates */,6056LSC_DATA_SIZE_D32, 1 /* num_channels */,6057false /* transpose */,6058LSC_CACHE_STORE_L1UC_L3WB,6059!inst->dst.is_null());6060break;6061}6062case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:6063inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,6064surf_type, LSC_ADDR_SIZE_A32,60651 /* num_coordinates */,6066lsc_bits_to_data_size(arg.ud),60671 /* num_channels */,6068false /* transpose */,6069LSC_CACHE_LOAD_L1STATE_L3MOCS,6070true /* has_dest */);6071break;6072case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:6073inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size,6074surf_type, LSC_ADDR_SIZE_A32,60751 /* num_coordinates */,6076lsc_bits_to_data_size(arg.ud),60771 /* num_channels */,6078false /* transpose */,6079LSC_CACHE_STORE_L1STATE_L3MOCS,6080false /* has_dest */);6081break;6082default:6083unreachable("Unknown surface logical instruction");6084}60856086inst->src[0] = brw_imm_ud(0);60876088/* Set up extended descriptors */6089switch (surf_type) {6090case LSC_ADDR_SURFTYPE_FLAT:6091inst->src[1] = brw_imm_ud(0);6092break;6093case LSC_ADDR_SURFTYPE_BSS:6094/* We assume that the driver provided the handle in the top 20 bits so6095* we can use the surface handle directly as the extended descriptor.6096*/6097inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);6098break;6099case LSC_ADDR_SURFTYPE_BTI:6100if (surface.file == IMM) {6101inst->src[1] = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud));6102} else {6103const fs_builder ubld = bld.exec_all().group(1, 0);6104fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);6105ubld.SHL(tmp, surface, 
                  brw_imm_ud(24));
         inst->src[1] = component(tmp, 0);
      }
      break;
   default:
      unreachable("Unknown surface type");
   }

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
   inst->ex_mlen = ex_mlen;
   inst->header_size = 0;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

   /* Finally, the payload */
   inst->src[2] = payload;
   inst->src[3] = payload2;

   inst->resize_sources(4);
}

static void
lower_surface_block_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->ver >= 9);

   /* Get the logical send arguments. */
   const fs_reg &addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
   const fs_reg &src = inst->src[SURFACE_LOGICAL_SRC_DATA];
   const fs_reg &surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
   const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
   const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
   assert(arg.file == IMM);
   assert(inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == BAD_FILE);
   assert(inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK].file == BAD_FILE);

   const bool is_stateless =
      surface.file == IMM && (surface.ud == BRW_BTI_STATELESS ||
                              surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);

   const bool has_side_effects = inst->has_side_effects();

   const bool align_16B =
      inst->opcode != SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL;

   const bool write = inst->opcode == SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL;

   /* The address is stored in the header.  See MH_A32_GO and MH_BTS_GO. */
   fs_builder ubld = bld.exec_all().group(8, 0);
   fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);

   if (is_stateless)
      ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header);
   else
      ubld.MOV(header, brw_imm_d(0));

   /* Address in OWord units when aligned to OWords.
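 * For the unaligned variant the byte address is passed through unchanged.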
*/6165if (align_16B)6166ubld.group(1, 0).SHR(component(header, 2), addr, brw_imm_ud(4));6167else6168ubld.group(1, 0).MOV(component(header, 2), addr);61696170fs_reg data;6171unsigned ex_mlen = 0;6172if (write) {6173const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);6174data = retype(bld.move_to_vgrf(src, src_sz), BRW_REGISTER_TYPE_UD);6175ex_mlen = src_sz * type_sz(src.type) * inst->exec_size / REG_SIZE;6176}61776178inst->opcode = SHADER_OPCODE_SEND;6179inst->mlen = 1;6180inst->ex_mlen = ex_mlen;6181inst->header_size = 1;6182inst->send_has_side_effects = has_side_effects;6183inst->send_is_volatile = !has_side_effects;61846185inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;61866187const uint32_t desc = brw_dp_oword_block_rw_desc(devinfo, align_16B,6188arg.ud, write);6189setup_surface_descriptors(bld, inst, desc, surface, surface_handle);61906191inst->src[2] = header;6192inst->src[3] = data;61936194inst->resize_sources(4);6195}61966197static fs_reg6198emit_a64_oword_block_header(const fs_builder &bld, const fs_reg &addr)6199{6200const fs_builder ubld = bld.exec_all().group(8, 0);6201fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);6202ubld.MOV(header, brw_imm_ud(0));62036204/* Use a 2-wide MOV to fill out the address */6205assert(type_sz(addr.type) == 8 && addr.stride == 0);6206fs_reg addr_vec2 = addr;6207addr_vec2.type = BRW_REGISTER_TYPE_UD;6208addr_vec2.stride = 1;6209ubld.group(2, 0).MOV(header, addr_vec2);62106211return header;6212}62136214static void6215lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst)6216{6217const intel_device_info *devinfo = bld.shader->devinfo;62186219/* Get the logical send arguments. */6220const fs_reg &addr = inst->src[0];6221const fs_reg &src = inst->src[1];6222const unsigned src_sz = type_sz(src.type);62236224const unsigned src_comps = inst->components_read(1);6225assert(inst->src[2].file == IMM);6226const unsigned arg = inst->src[2].ud;6227const bool has_side_effects = inst->has_side_effects();62286229/* If the surface message has side effects and we're a fragment shader, we6230* have to predicate with the sample mask to avoid helper invocations.6231*/6232if (has_side_effects && bld.shader->stage == MESA_SHADER_FRAGMENT)6233emit_predicate_on_sample_mask(bld, inst);62346235fs_reg payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD);6236fs_reg payload2 = retype(bld.move_to_vgrf(src, src_comps),6237BRW_REGISTER_TYPE_UD);6238unsigned ex_mlen = src_comps * src_sz * inst->exec_size / REG_SIZE;62396240switch (inst->opcode) {6241case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:6242inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,6243LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,62441 /* num_coordinates */,6245LSC_DATA_SIZE_D32, arg /* num_channels */,6246false /* transpose */,6247LSC_CACHE_LOAD_L1STATE_L3MOCS,6248true /* has_dest */);6249break;6250case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:6251inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size,6252LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,62531 /* num_coordinates */,6254LSC_DATA_SIZE_D32, arg /* num_channels */,6255false /* transpose */,6256LSC_CACHE_STORE_L1STATE_L3MOCS,6257false /* has_dest */);6258break;6259case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:6260inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,6261LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,62621 /* num_coordinates */,6263lsc_bits_to_data_size(arg),62641 /* num_channels */,6265false /* transpose */,6266LSC_CACHE_STORE_L1STATE_L3MOCS,6267true /* has_dest 
*/);6268break;6269case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:6270inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size,6271LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,62721 /* num_coordinates */,6273lsc_bits_to_data_size(arg),62741 /* num_channels */,6275false /* transpose */,6276LSC_CACHE_STORE_L1STATE_L3MOCS,6277false /* has_dest */);6278break;6279case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:6280case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:6281case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL: {6282case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:6283case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:6284/* Bspec: Atomic instruction -> Cache section:6285*6286* Atomic messages are always forced to "un-cacheable" in the L16287* cache.6288*/6289enum lsc_opcode opcode =6290(inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL ||6291inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL ||6292inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL) ?6293brw_atomic_op_to_lsc_atomic_op(arg) :6294brw_atomic_op_to_lsc_fatomic_op(arg);6295inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size,6296LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,62971 /* num_coordinates */,6298lsc_bits_to_data_size(src_sz * 8),62991 /* num_channels */,6300false /* transpose */,6301LSC_CACHE_STORE_L1UC_L3WB,6302!inst->dst.is_null());6303break;6304}6305default:6306unreachable("Unknown A64 logical instruction");6307}63086309/* Update the original instruction. */6310inst->opcode = SHADER_OPCODE_SEND;6311inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);6312inst->ex_mlen = ex_mlen;6313inst->header_size = 0;6314inst->send_has_side_effects = has_side_effects;6315inst->send_is_volatile = !has_side_effects;63166317/* Set up SFID and descriptors */6318inst->sfid = GFX12_SFID_UGM;6319inst->resize_sources(4);6320inst->src[0] = brw_imm_ud(0); /* desc */6321inst->src[1] = brw_imm_ud(0); /* ex_desc */6322inst->src[2] = payload;6323inst->src[3] = payload2;6324}63256326static void6327lower_a64_logical_send(const fs_builder &bld, fs_inst *inst)6328{6329const intel_device_info *devinfo = bld.shader->devinfo;63306331const fs_reg &addr = inst->src[0];6332const fs_reg &src = inst->src[1];6333const unsigned src_comps = inst->components_read(1);6334assert(inst->src[2].file == IMM);6335const unsigned arg = inst->src[2].ud;6336const bool has_side_effects = inst->has_side_effects();63376338/* If the surface message has side effects and we're a fragment shader, we6339* have to predicate with the sample mask to avoid helper invocations.6340*/6341if (has_side_effects && bld.shader->stage == MESA_SHADER_FRAGMENT)6342emit_predicate_on_sample_mask(bld, inst);63436344fs_reg payload, payload2;6345unsigned mlen, ex_mlen = 0, header_size = 0;6346if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL ||6347inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL ||6348inst->opcode == SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL) {6349assert(devinfo->ver >= 9);63506351/* OWORD messages only take a scalar address in a header */6352mlen = 1;6353header_size = 1;6354payload = emit_a64_oword_block_header(bld, addr);63556356if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL) {6357ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE;6358payload2 = retype(bld.move_to_vgrf(src, src_comps),6359BRW_REGISTER_TYPE_UD);6360}6361} else if (devinfo->ver >= 9) {6362/* On Skylake and above, we have SENDS */6363mlen = 2 * (inst->exec_size / 8);6364ex_mlen = src_comps 
* type_sz(src.type) * inst->exec_size / REG_SIZE;6365payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD);6366payload2 = retype(bld.move_to_vgrf(src, src_comps),6367BRW_REGISTER_TYPE_UD);6368} else {6369/* Add two because the address is 64-bit */6370const unsigned dwords = 2 + src_comps;6371mlen = dwords * (inst->exec_size / 8);63726373fs_reg sources[5];63746375sources[0] = addr;63766377for (unsigned i = 0; i < src_comps; i++)6378sources[1 + i] = offset(src, bld, i);63796380payload = bld.vgrf(BRW_REGISTER_TYPE_UD, dwords);6381bld.LOAD_PAYLOAD(payload, sources, 1 + src_comps, 0);6382}63836384uint32_t desc;6385switch (inst->opcode) {6386case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:6387desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,6388arg, /* num_channels */6389false /* write */);6390break;63916392case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:6393desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,6394arg, /* num_channels */6395true /* write */);6396break;63976398case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:6399desc = brw_dp_a64_oword_block_rw_desc(devinfo,6400true, /* align_16B */6401arg, /* num_dwords */6402false /* write */);6403break;64046405case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:6406desc = brw_dp_a64_oword_block_rw_desc(devinfo,6407false, /* align_16B */6408arg, /* num_dwords */6409false /* write */);6410break;64116412case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:6413desc = brw_dp_a64_oword_block_rw_desc(devinfo,6414true, /* align_16B */6415arg, /* num_dwords */6416true /* write */);6417break;64186419case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:6420desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,6421arg, /* bit_size */6422false /* write */);6423break;64246425case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:6426desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,6427arg, /* bit_size */6428true /* write */);6429break;64306431case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:6432desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 32,6433arg, /* atomic_op */6434!inst->dst.is_null());6435break;64366437case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:6438desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 16,6439arg, /* atomic_op */6440!inst->dst.is_null());6441break;64426443case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:6444desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 64,6445arg, /* atomic_op */6446!inst->dst.is_null());6447break;64486449case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:6450desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size,645116, /* bit_size */6452arg, /* atomic_op */6453!inst->dst.is_null());6454break;64556456case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:6457desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size,645832, /* bit_size */6459arg, /* atomic_op */6460!inst->dst.is_null());6461break;64626463default:6464unreachable("Unknown A64 logical instruction");6465}64666467/* Update the original instruction. 
*/6468inst->opcode = SHADER_OPCODE_SEND;6469inst->mlen = mlen;6470inst->ex_mlen = ex_mlen;6471inst->header_size = header_size;6472inst->send_has_side_effects = has_side_effects;6473inst->send_is_volatile = !has_side_effects;64746475/* Set up SFID and descriptors */6476inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;6477inst->desc = desc;6478inst->resize_sources(4);6479inst->src[0] = brw_imm_ud(0); /* desc */6480inst->src[1] = brw_imm_ud(0); /* ex_desc */6481inst->src[2] = payload;6482inst->src[3] = payload2;6483}64846485static void6486lower_lsc_varying_pull_constant_logical_send(const fs_builder &bld,6487fs_inst *inst)6488{6489const intel_device_info *devinfo = bld.shader->devinfo;6490ASSERTED const brw_compiler *compiler = bld.shader->compiler;64916492fs_reg index = inst->src[0];64936494/* We are switching the instruction from an ALU-like instruction to a6495* send-from-grf instruction. Since sends can't handle strides or6496* source modifiers, we have to make a copy of the offset source.6497*/6498fs_reg ubo_offset = bld.move_to_vgrf(inst->src[1], 1);64996500assert(inst->src[2].file == BRW_IMMEDIATE_VALUE);6501unsigned alignment = inst->src[2].ud;65026503inst->opcode = SHADER_OPCODE_SEND;6504inst->sfid = GFX12_SFID_UGM;6505inst->resize_sources(3);6506inst->src[0] = brw_imm_ud(0);65076508if (index.file == IMM) {6509inst->src[1] = brw_imm_ud(lsc_bti_ex_desc(devinfo, index.ud));6510} else {6511const fs_builder ubld = bld.exec_all().group(1, 0);6512fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);6513ubld.SHL(tmp, index, brw_imm_ud(24));6514inst->src[1] = component(tmp, 0);6515}65166517assert(!compiler->indirect_ubos_use_sampler);65186519inst->src[2] = ubo_offset; /* payload */6520if (alignment >= 4) {6521inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,6522LSC_ADDR_SURFTYPE_BTI, LSC_ADDR_SIZE_A32,65231 /* num_coordinates */,6524LSC_DATA_SIZE_D32,65254 /* num_channels */,6526false /* transpose */,6527LSC_CACHE_LOAD_L1STATE_L3MOCS,6528true /* has_dest */);6529inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);6530} else {6531inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,6532LSC_ADDR_SURFTYPE_BTI, LSC_ADDR_SIZE_A32,65331 /* num_coordinates */,6534LSC_DATA_SIZE_D32,65351 /* num_channels */,6536false /* transpose */,6537LSC_CACHE_LOAD_L1STATE_L3MOCS,6538true /* has_dest */);6539inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);6540/* The byte scattered messages can only read one dword at a time so6541* we have to duplicate the message 4 times to read the full vec4.6542* Hopefully, dead code will clean up the mess if some of them aren't6543* needed.6544*/6545assert(inst->size_written == 16 * inst->exec_size);6546inst->size_written /= 4;6547for (unsigned c = 1; c < 4; c++) {6548/* Emit a copy of the instruction because we're about to modify6549* it. 
Because this loop starts at 1, we will emit copies for the6550* first 3 and the final one will be the modified instruction.6551*/6552bld.emit(*inst);65536554/* Offset the source */6555inst->src[2] = bld.vgrf(BRW_REGISTER_TYPE_UD);6556bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));65576558/* Offset the destination */6559inst->dst = offset(inst->dst, bld, 1);6560}6561}6562}65636564static void6565lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst)6566{6567const intel_device_info *devinfo = bld.shader->devinfo;6568const brw_compiler *compiler = bld.shader->compiler;65696570if (devinfo->ver >= 7) {6571fs_reg index = inst->src[0];6572/* We are switching the instruction from an ALU-like instruction to a6573* send-from-grf instruction. Since sends can't handle strides or6574* source modifiers, we have to make a copy of the offset source.6575*/6576fs_reg ubo_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);6577bld.MOV(ubo_offset, inst->src[1]);65786579assert(inst->src[2].file == BRW_IMMEDIATE_VALUE);6580unsigned alignment = inst->src[2].ud;65816582inst->opcode = SHADER_OPCODE_SEND;6583inst->mlen = inst->exec_size / 8;6584inst->resize_sources(3);65856586if (index.file == IMM) {6587inst->desc = index.ud & 0xff;6588inst->src[0] = brw_imm_ud(0);6589} else {6590inst->desc = 0;6591const fs_builder ubld = bld.exec_all().group(1, 0);6592fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);6593ubld.AND(tmp, index, brw_imm_ud(0xff));6594inst->src[0] = component(tmp, 0);6595}6596inst->src[1] = brw_imm_ud(0); /* ex_desc */6597inst->src[2] = ubo_offset; /* payload */65986599if (compiler->indirect_ubos_use_sampler) {6600const unsigned simd_mode =6601inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :6602BRW_SAMPLER_SIMD_MODE_SIMD16;66036604inst->sfid = BRW_SFID_SAMPLER;6605inst->desc |= brw_sampler_desc(devinfo, 0, 0,6606GFX5_SAMPLER_MESSAGE_SAMPLE_LD,6607simd_mode, 0);6608} else if (alignment >= 4) {6609inst->sfid = (devinfo->verx10 >= 75 ?6610HSW_SFID_DATAPORT_DATA_CACHE_1 :6611GFX7_SFID_DATAPORT_DATA_CACHE);6612inst->desc |= brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,66134, /* num_channels */6614false /* write */);6615} else {6616inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;6617inst->desc |= brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,661832, /* bit_size */6619false /* write */);6620/* The byte scattered messages can only read one dword at a time so6621* we have to duplicate the message 4 times to read the full vec4.6622* Hopefully, dead code will clean up the mess if some of them aren't6623* needed.6624*/6625assert(inst->size_written == 16 * inst->exec_size);6626inst->size_written /= 4;6627for (unsigned c = 1; c < 4; c++) {6628/* Emit a copy of the instruction because we're about to modify6629* it. 
Because this loop starts at 1, we will emit copies for the6630* first 3 and the final one will be the modified instruction.6631*/6632bld.emit(*inst);66336634/* Offset the source */6635inst->src[2] = bld.vgrf(BRW_REGISTER_TYPE_UD);6636bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));66376638/* Offset the destination */6639inst->dst = offset(inst->dst, bld, 1);6640}6641}6642} else {6643const fs_reg payload(MRF, FIRST_PULL_LOAD_MRF(devinfo->ver),6644BRW_REGISTER_TYPE_UD);66456646bld.MOV(byte_offset(payload, REG_SIZE), inst->src[1]);66476648inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4;6649inst->resize_sources(1);6650inst->base_mrf = payload.nr;6651inst->header_size = 1;6652inst->mlen = 1 + inst->exec_size / 8;6653}6654}66556656static void6657lower_math_logical_send(const fs_builder &bld, fs_inst *inst)6658{6659assert(bld.shader->devinfo->ver < 6);66606661inst->base_mrf = 2;6662inst->mlen = inst->sources * inst->exec_size / 8;66636664if (inst->sources > 1) {6665/* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.136666* "Message Payload":6667*6668* "Operand0[7]. For the INT DIV functions, this operand is the6669* denominator."6670* ...6671* "Operand1[7]. For the INT DIV functions, this operand is the6672* numerator."6673*/6674const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;6675const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];6676const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];66776678inst->resize_sources(1);6679inst->src[0] = src0;66806681assert(inst->exec_size == 8);6682bld.MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type), src1);6683}6684}66856686static void6687lower_btd_logical_send(const fs_builder &bld, fs_inst *inst)6688{6689const intel_device_info *devinfo = bld.shader->devinfo;6690fs_reg global_addr = inst->src[0];6691const fs_reg &btd_record = inst->src[1];66926693const unsigned mlen = 2;6694const fs_builder ubld = bld.exec_all().group(8, 0);6695fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);66966697ubld.MOV(header, brw_imm_ud(0));6698switch (inst->opcode) {6699case SHADER_OPCODE_BTD_SPAWN_LOGICAL:6700assert(type_sz(global_addr.type) == 8 && global_addr.stride == 0);6701global_addr.type = BRW_REGISTER_TYPE_UD;6702global_addr.stride = 1;6703ubld.group(2, 0).MOV(header, global_addr);6704break;67056706case SHADER_OPCODE_BTD_RETIRE_LOGICAL:6707/* The bottom bit is the Stack ID release bit */6708ubld.group(1, 0).MOV(header, brw_imm_ud(1));6709break;67106711default:6712unreachable("Invalid BTD message");6713}67146715/* Stack IDs are always in R1 regardless of whether we're coming from a6716* bindless shader or a regular compute shader.6717*/6718fs_reg stack_ids =6719retype(byte_offset(header, REG_SIZE), BRW_REGISTER_TYPE_UW);6720bld.MOV(stack_ids, retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW));67216722unsigned ex_mlen = 0;6723fs_reg payload;6724if (inst->opcode == SHADER_OPCODE_BTD_SPAWN_LOGICAL) {6725ex_mlen = 2 * (inst->exec_size / 8);6726payload = bld.move_to_vgrf(btd_record, 1);6727} else {6728assert(inst->opcode == SHADER_OPCODE_BTD_RETIRE_LOGICAL);6729/* All these messages take a BTD and things complain if we don't provide6730* one for RETIRE. However, it shouldn't ever actually get used so fill6731* it with zero.6732*/6733ex_mlen = 2 * (inst->exec_size / 8);6734payload = bld.move_to_vgrf(brw_imm_uq(0), 1);6735}67366737/* Update the original instruction. 
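 * Turn it into a SEND to the bindless thread dispatch shared function.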
 */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->ex_mlen = ex_mlen;
   inst->header_size = 0; /* HW docs require has_header = false */
   inst->send_has_side_effects = true;
   inst->send_is_volatile = false;

   /* Set up SFID and descriptors */
   inst->sfid = GEN_RT_SFID_BINDLESS_THREAD_DISPATCH;
   inst->desc = brw_btd_spawn_desc(devinfo, inst->exec_size,
                                   GEN_RT_BTD_MESSAGE_SPAWN);
   inst->resize_sources(4);
   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = header;
   inst->src[3] = payload;
}

static void
lower_trace_ray_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const fs_reg &bvh_level = inst->src[0];
   assert(inst->src[1].file == BRW_IMMEDIATE_VALUE);
   const uint32_t trace_ray_control = inst->src[1].ud;

   const unsigned mlen = 1;
   const fs_builder ubld = bld.exec_all().group(8, 0);
   fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
   ubld.MOV(header, brw_imm_ud(0));
   ubld.group(2, 0).MOV(header,
                        retype(brw_vec2_grf(2, 0), BRW_REGISTER_TYPE_UD));
   /* TODO: Bit 128 is ray_query */

   const unsigned ex_mlen = inst->exec_size / 8;
   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD);
   const uint32_t trc_bits = SET_BITS(trace_ray_control, 9, 8);
   if (bvh_level.file == BRW_IMMEDIATE_VALUE) {
      bld.MOV(payload, brw_imm_ud(trc_bits | (bvh_level.ud & 0x7)));
   } else {
      bld.AND(payload, bvh_level, brw_imm_ud(0x7));
      if (trc_bits != 0)
         bld.OR(payload, payload, brw_imm_ud(trc_bits));
   }
   bld.AND(subscript(payload, BRW_REGISTER_TYPE_UW, 1),
           retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW),
           brw_imm_uw(0x7ff));

   /* Update the original instruction.
*/6787inst->opcode = SHADER_OPCODE_SEND;6788inst->mlen = mlen;6789inst->ex_mlen = ex_mlen;6790inst->header_size = 0; /* HW docs require has_header = false */6791inst->send_has_side_effects = true;6792inst->send_is_volatile = false;67936794/* Set up SFID and descriptors */6795inst->sfid = GEN_RT_SFID_RAY_TRACE_ACCELERATOR;6796inst->desc = brw_rt_trace_ray_desc(devinfo, inst->exec_size);6797inst->resize_sources(4);6798inst->src[0] = brw_imm_ud(0); /* desc */6799inst->src[1] = brw_imm_ud(0); /* ex_desc */6800inst->src[2] = header;6801inst->src[3] = payload;6802}68036804bool6805fs_visitor::lower_logical_sends()6806{6807bool progress = false;68086809foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {6810const fs_builder ibld(this, block, inst);68116812switch (inst->opcode) {6813case FS_OPCODE_FB_WRITE_LOGICAL:6814assert(stage == MESA_SHADER_FRAGMENT);6815lower_fb_write_logical_send(ibld, inst,6816brw_wm_prog_data(prog_data),6817(const brw_wm_prog_key *)key,6818payload);6819break;68206821case FS_OPCODE_FB_READ_LOGICAL:6822lower_fb_read_logical_send(ibld, inst);6823break;68246825case SHADER_OPCODE_TEX_LOGICAL:6826lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);6827break;68286829case SHADER_OPCODE_TXD_LOGICAL:6830lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);6831break;68326833case SHADER_OPCODE_TXF_LOGICAL:6834lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);6835break;68366837case SHADER_OPCODE_TXL_LOGICAL:6838lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);6839break;68406841case SHADER_OPCODE_TXS_LOGICAL:6842lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);6843break;68446845case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:6846lower_sampler_logical_send(ibld, inst,6847SHADER_OPCODE_IMAGE_SIZE_LOGICAL);6848break;68496850case FS_OPCODE_TXB_LOGICAL:6851lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);6852break;68536854case SHADER_OPCODE_TXF_CMS_LOGICAL:6855lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);6856break;68576858case SHADER_OPCODE_TXF_CMS_W_LOGICAL:6859lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W);6860break;68616862case SHADER_OPCODE_TXF_UMS_LOGICAL:6863lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);6864break;68656866case SHADER_OPCODE_TXF_MCS_LOGICAL:6867lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);6868break;68696870case SHADER_OPCODE_LOD_LOGICAL:6871lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);6872break;68736874case SHADER_OPCODE_TG4_LOGICAL:6875lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);6876break;68776878case SHADER_OPCODE_TG4_OFFSET_LOGICAL:6879lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);6880break;68816882case SHADER_OPCODE_SAMPLEINFO_LOGICAL:6883lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_SAMPLEINFO);6884break;68856886case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:6887case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:6888case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:6889case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:6890case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:6891case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:6892if (devinfo->has_lsc) {6893lower_lsc_surface_logical_send(ibld, inst);6894break;6895}6896case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:6897case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:6898case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:6899case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:6900case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:6901lower_surface_logical_send(ibld, inst);6902break;69036904case 
SHADER_OPCODE_OWORD_BLOCK_READ_LOGICAL:6905case SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:6906case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:6907lower_surface_block_logical_send(ibld, inst);6908break;69096910case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:6911case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:6912case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:6913case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:6914case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:6915case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:6916case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:6917case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:6918case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:6919if (devinfo->has_lsc) {6920lower_lsc_a64_logical_send(ibld, inst);6921break;6922}6923case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:6924case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:6925case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:6926lower_a64_logical_send(ibld, inst);6927break;69286929case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:6930if (devinfo->has_lsc && !compiler->indirect_ubos_use_sampler)6931lower_lsc_varying_pull_constant_logical_send(ibld, inst);6932else6933lower_varying_pull_constant_logical_send(ibld, inst);6934break;69356936case SHADER_OPCODE_RCP:6937case SHADER_OPCODE_RSQ:6938case SHADER_OPCODE_SQRT:6939case SHADER_OPCODE_EXP2:6940case SHADER_OPCODE_LOG2:6941case SHADER_OPCODE_SIN:6942case SHADER_OPCODE_COS:6943case SHADER_OPCODE_POW:6944case SHADER_OPCODE_INT_QUOTIENT:6945case SHADER_OPCODE_INT_REMAINDER:6946/* The math opcodes are overloaded for the send-like and6947* expression-like instructions which seems kind of icky. Gfx6+ has6948* a native (but rather quirky) MATH instruction so we don't need to6949* do anything here. On Gfx4-5 we'll have to lower the Gfx6-like6950* logical instructions (which we can easily recognize because they6951* have mlen = 0) into send-like virtual instructions.6952*/6953if (devinfo->ver < 6 && inst->mlen == 0) {6954lower_math_logical_send(ibld, inst);6955break;69566957} else {6958continue;6959}69606961case SHADER_OPCODE_BTD_SPAWN_LOGICAL:6962case SHADER_OPCODE_BTD_RETIRE_LOGICAL:6963lower_btd_logical_send(ibld, inst);6964break;69656966case RT_OPCODE_TRACE_RAY_LOGICAL:6967lower_trace_ray_logical_send(ibld, inst);6968break;69696970default:6971continue;6972}69736974progress = true;6975}69766977if (progress)6978invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);69796980return progress;6981}69826983static bool6984is_mixed_float_with_fp32_dst(const fs_inst *inst)6985{6986/* This opcode sometimes uses :W type on the source even if the operand is6987* a :HF, because in gfx7 there is no support for :HF, and thus it uses :W.6988*/6989if (inst->opcode == BRW_OPCODE_F16TO32)6990return true;69916992if (inst->dst.type != BRW_REGISTER_TYPE_F)6993return false;69946995for (int i = 0; i < inst->sources; i++) {6996if (inst->src[i].type == BRW_REGISTER_TYPE_HF)6997return true;6998}69997000return false;7001}70027003static bool7004is_mixed_float_with_packed_fp16_dst(const fs_inst *inst)7005{7006/* This opcode sometimes uses :W type on the destination even if the7007* destination is a :HF, because in gfx7 there is no support for :HF, and7008* thus it uses :W.7009*/7010if (inst->opcode == BRW_OPCODE_F32TO16 &&7011inst->dst.stride == 1)7012return true;70137014if (inst->dst.type != BRW_REGISTER_TYPE_HF ||7015inst->dst.stride != 1)7016return false;70177018for (int i = 0; i < inst->sources; i++) {7019if (inst->src[i].type == 
BRW_REGISTER_TYPE_F)7020return true;7021}70227023return false;7024}70257026/**7027* Get the closest allowed SIMD width for instruction \p inst accounting for7028* some common regioning and execution control restrictions that apply to FPU7029* instructions. These restrictions don't necessarily have any relevance to7030* instructions not executed by the FPU pipeline like extended math, control7031* flow or send message instructions.7032*7033* For virtual opcodes it's really up to the instruction -- In some cases7034* (e.g. where a virtual instruction unrolls into a simple sequence of FPU7035* instructions) it may simplify virtual instruction lowering if we can7036* enforce FPU-like regioning restrictions already on the virtual instruction,7037* in other cases (e.g. virtual send-like instructions) this may be7038* excessively restrictive.7039*/7040static unsigned7041get_fpu_lowered_simd_width(const struct intel_device_info *devinfo,7042const fs_inst *inst)7043{7044/* Maximum execution size representable in the instruction controls. */7045unsigned max_width = MIN2(32, inst->exec_size);70467047/* According to the PRMs:7048* "A. In Direct Addressing mode, a source cannot span more than 27049* adjacent GRF registers.7050* B. A destination cannot span more than 2 adjacent GRF registers."7051*7052* Look for the source or destination with the largest register region7053* which is the one that is going to limit the overall execution size of7054* the instruction due to this rule.7055*/7056unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);70577058for (unsigned i = 0; i < inst->sources; i++)7059reg_count = MAX2(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));70607061/* Calculate the maximum execution size of the instruction based on the7062* factor by which it goes over the hardware limit of 2 GRFs.7063*/7064if (reg_count > 2)7065max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, 2));70667067/* According to the IVB PRMs:7068* "When destination spans two registers, the source MUST span two7069* registers. The exception to the above rule:7070*7071* - When source is scalar, the source registers are not incremented.7072* - When source is packed integer Word and destination is packed7073* integer DWord, the source register is not incremented but the7074* source sub register is incremented."7075*7076* The hardware specs from Gfx4 to Gfx7.5 mention similar regioning7077* restrictions. The code below intentionally doesn't check whether the7078* destination type is integer because empirically the hardware doesn't7079* seem to care what the actual type is as long as it's dword-aligned.7080*/7081if (devinfo->ver < 8) {7082for (unsigned i = 0; i < inst->sources; i++) {7083/* IVB implements DF scalars as <0;2,1> regions. */7084const bool is_scalar_exception = is_uniform(inst->src[i]) &&7085(devinfo->is_haswell || type_sz(inst->src[i].type) != 8);7086const bool is_packed_word_exception =7087type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 &&7088type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1;70897090/* We check size_read(i) against size_written instead of REG_SIZE7091* because we want to properly handle SIMD32. 
In SIMD32, you can end7092* up with writes to 4 registers and a source that reads 2 registers7093* and we may still need to lower all the way to SIMD8 in that case.7094*/7095if (inst->size_written > REG_SIZE &&7096inst->size_read(i) != 0 &&7097inst->size_read(i) < inst->size_written &&7098!is_scalar_exception && !is_packed_word_exception) {7099const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);7100max_width = MIN2(max_width, inst->exec_size / reg_count);7101}7102}7103}71047105if (devinfo->ver < 6) {7106/* From the G45 PRM, Volume 4 Page 361:7107*7108* "Operand Alignment Rule: With the exceptions listed below, a7109* source/destination operand in general should be aligned to even7110* 256-bit physical register with a region size equal to two 256-bit7111* physical registers."7112*7113* Normally we enforce this by allocating virtual registers to the7114* even-aligned class. But we need to handle payload registers.7115*/7116for (unsigned i = 0; i < inst->sources; i++) {7117if (inst->src[i].file == FIXED_GRF && (inst->src[i].nr & 1) &&7118inst->size_read(i) > REG_SIZE) {7119max_width = MIN2(max_width, 8);7120}7121}7122}71237124/* From the IVB PRMs:7125* "When an instruction is SIMD32, the low 16 bits of the execution mask7126* are applied for both halves of the SIMD32 instruction. If different7127* execution mask channels are required, split the instruction into two7128* SIMD16 instructions."7129*7130* There is similar text in the HSW PRMs. Gfx4-6 don't even implement7131* 32-wide control flow support in hardware and will behave similarly.7132*/7133if (devinfo->ver < 8 && !inst->force_writemask_all)7134max_width = MIN2(max_width, 16);71357136/* From the IVB PRMs (applies to HSW too):7137* "Instructions with condition modifiers must not use SIMD32."7138*7139* From the BDW PRMs (applies to later hardware too):7140* "Ternary instruction with condition modifiers must not use SIMD32."7141*/7142if (inst->conditional_mod && (devinfo->ver < 8 || inst->is_3src(devinfo)))7143max_width = MIN2(max_width, 16);71447145/* From the IVB PRMs (applies to other devices that don't have the7146* intel_device_info::supports_simd16_3src flag set):7147* "In Align16 access mode, SIMD16 is not allowed for DW operations and7148* SIMD8 is not allowed for DF operations."7149*/7150if (inst->is_3src(devinfo) && !devinfo->supports_simd16_3src)7151max_width = MIN2(max_width, inst->exec_size / reg_count);71527153/* Pre-Gfx8 EUs are hardwired to use the QtrCtrl+1 (where QtrCtrl is7154* the 8-bit quarter of the execution mask signals specified in the7155* instruction control fields) for the second compressed half of any7156* single-precision instruction (for double-precision instructions7157* it's hardwired to use NibCtrl+1, at least on HSW), which means that7158* the EU will apply the wrong execution controls for the second7159* sequential GRF write if the number of channels per GRF is not exactly7160* eight in single-precision mode (or four in double-float mode).7161*7162* In this situation we calculate the maximum size of the split7163* instructions so they only ever write to a single register.7164*/7165if (devinfo->ver < 8 && inst->size_written > REG_SIZE &&7166!inst->force_writemask_all) {7167const unsigned channels_per_grf = inst->exec_size /7168DIV_ROUND_UP(inst->size_written, REG_SIZE);7169const unsigned exec_type_size = get_exec_type_size(inst);7170assert(exec_type_size);71717172/* The hardware shifts exactly 8 channels per compressed half of the7173* instruction in single-precision mode and exactly 4 
in double-precision.7174*/7175if (channels_per_grf != (exec_type_size == 8 ? 4 : 8))7176max_width = MIN2(max_width, channels_per_grf);71777178/* Lower all non-force_writemask_all DF instructions to SIMD4 on IVB/BYT7179* because HW applies the same channel enable signals to both halves of7180* the compressed instruction which will be just wrong under7181* non-uniform control flow.7182*/7183if (devinfo->verx10 == 70 &&7184(exec_type_size == 8 || type_sz(inst->dst.type) == 8))7185max_width = MIN2(max_width, 4);7186}71877188/* From the SKL PRM, Special Restrictions for Handling Mixed Mode7189* Float Operations:7190*7191* "No SIMD16 in mixed mode when destination is f32. Instruction7192* execution size must be no more than 8."7193*7194* FIXME: the simulator doesn't seem to complain if we don't do this and7195* empirical testing with existing CTS tests show that they pass just fine7196* without implementing this, however, since our interpretation of the PRM7197* is that conversion MOVs between HF and F are still mixed-float7198* instructions (and therefore subject to this restriction) we decided to7199* split them to be safe. Might be useful to do additional investigation to7200* lift the restriction if we can ensure that it is safe though, since these7201* conversions are common when half-float types are involved since many7202* instructions do not support HF types and conversions from/to F are7203* required.7204*/7205if (is_mixed_float_with_fp32_dst(inst))7206max_width = MIN2(max_width, 8);72077208/* From the SKL PRM, Special Restrictions for Handling Mixed Mode7209* Float Operations:7210*7211* "No SIMD16 in mixed mode when destination is packed f16 for both7212* Align1 and Align16."7213*/7214if (is_mixed_float_with_packed_fp16_dst(inst))7215max_width = MIN2(max_width, 8);72167217/* Only power-of-two execution sizes are representable in the instruction7218* control fields.7219*/7220return 1 << util_logbase2(max_width);7221}72227223/**7224* Get the maximum allowed SIMD width for instruction \p inst accounting for7225* various payload size restrictions that apply to sampler message7226* instructions.7227*7228* This is only intended to provide a maximum theoretical bound for the7229* execution size of the message based on the number of argument components7230* alone, which in most cases will determine whether the SIMD8 or SIMD167231* variant of the message can be used, though some messages may have7232* additional restrictions not accounted for here (e.g. pre-ILK hardware uses7233* the message length to determine the exact SIMD width and argument count,7234* which makes a number of sampler message combinations impossible to7235* represent).7236*/7237static unsigned7238get_sampler_lowered_simd_width(const struct intel_device_info *devinfo,7239const fs_inst *inst)7240{7241/* If we have a min_lod parameter on anything other than a simple sample7242* message, it will push it over 5 arguments and we have to fall back to7243* SIMD8.7244*/7245if (inst->opcode != SHADER_OPCODE_TEX &&7246inst->components_read(TEX_LOGICAL_SRC_MIN_LOD))7247return 8;72487249/* Calculate the number of coordinate components that have to be present7250* assuming that additional arguments follow the texel coordinates in the7251* message payload. 
On IVB+ there is no need for padding, on ILK-SNB we7252* need to pad to four or three components depending on the message,7253* pre-ILK we need to pad to at most three components.7254*/7255const unsigned req_coord_components =7256(devinfo->ver >= 7 ||7257!inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 :7258(devinfo->ver >= 5 && inst->opcode != SHADER_OPCODE_TXF_LOGICAL &&7259inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 :72603;72617262/* On Gfx9+ the LOD argument is for free if we're able to use the LZ7263* variant of the TXL or TXF message.7264*/7265const bool implicit_lod = devinfo->ver >= 9 &&7266(inst->opcode == SHADER_OPCODE_TXL ||7267inst->opcode == SHADER_OPCODE_TXF) &&7268inst->src[TEX_LOGICAL_SRC_LOD].is_zero();72697270/* Calculate the total number of argument components that need to be passed7271* to the sampler unit.7272*/7273const unsigned num_payload_components =7274MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE),7275req_coord_components) +7276inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +7277(implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) +7278inst->components_read(TEX_LOGICAL_SRC_LOD2) +7279inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +7280(inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ?7281inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +7282inst->components_read(TEX_LOGICAL_SRC_MCS);72837284/* SIMD16 messages with more than five arguments exceed the maximum message7285* size supported by the sampler, regardless of whether a header is7286* provided or not.7287*/7288return MIN2(inst->exec_size,7289num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);7290}72917292/**7293* Get the closest native SIMD width supported by the hardware for instruction7294* \p inst. The instruction will be left untouched by7295* fs_visitor::lower_simd_width() if the returned value is equal to the7296* original execution size.7297*/7298static unsigned7299get_lowered_simd_width(const struct intel_device_info *devinfo,7300const fs_inst *inst)7301{7302switch (inst->opcode) {7303case BRW_OPCODE_MOV:7304case BRW_OPCODE_SEL:7305case BRW_OPCODE_NOT:7306case BRW_OPCODE_AND:7307case BRW_OPCODE_OR:7308case BRW_OPCODE_XOR:7309case BRW_OPCODE_SHR:7310case BRW_OPCODE_SHL:7311case BRW_OPCODE_ASR:7312case BRW_OPCODE_ROR:7313case BRW_OPCODE_ROL:7314case BRW_OPCODE_CMPN:7315case BRW_OPCODE_CSEL:7316case BRW_OPCODE_F32TO16:7317case BRW_OPCODE_F16TO32:7318case BRW_OPCODE_BFREV:7319case BRW_OPCODE_BFE:7320case BRW_OPCODE_ADD:7321case BRW_OPCODE_MUL:7322case BRW_OPCODE_AVG:7323case BRW_OPCODE_FRC:7324case BRW_OPCODE_RNDU:7325case BRW_OPCODE_RNDD:7326case BRW_OPCODE_RNDE:7327case BRW_OPCODE_RNDZ:7328case BRW_OPCODE_LZD:7329case BRW_OPCODE_FBH:7330case BRW_OPCODE_FBL:7331case BRW_OPCODE_CBIT:7332case BRW_OPCODE_SAD2:7333case BRW_OPCODE_MAD:7334case BRW_OPCODE_LRP:7335case FS_OPCODE_PACK:7336case SHADER_OPCODE_SEL_EXEC:7337case SHADER_OPCODE_CLUSTER_BROADCAST:7338case SHADER_OPCODE_MOV_RELOC_IMM:7339return get_fpu_lowered_simd_width(devinfo, inst);73407341case BRW_OPCODE_CMP: {7342/* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says that7343* when the destination is a GRF the dependency-clear bit on the flag7344* register is cleared early.7345*7346* Suggested workarounds are to disable coissuing CMP instructions7347* or to split CMP(16) instructions into two CMP(8) instructions.7348*7349* We choose to split into CMP(8) instructions since disabling7350* coissuing would affect CMP instructions not otherwise affected by7351* the 
errata.7352*/7353const unsigned max_width = (devinfo->verx10 == 70 &&7354!inst->dst.is_null() ? 8 : ~0);7355return MIN2(max_width, get_fpu_lowered_simd_width(devinfo, inst));7356}7357case BRW_OPCODE_BFI1:7358case BRW_OPCODE_BFI2:7359/* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we7360* should7361* "Force BFI instructions to be executed always in SIMD8."7362*/7363return MIN2(devinfo->is_haswell ? 8 : ~0u,7364get_fpu_lowered_simd_width(devinfo, inst));73657366case BRW_OPCODE_IF:7367assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16);7368return inst->exec_size;73697370case SHADER_OPCODE_RCP:7371case SHADER_OPCODE_RSQ:7372case SHADER_OPCODE_SQRT:7373case SHADER_OPCODE_EXP2:7374case SHADER_OPCODE_LOG2:7375case SHADER_OPCODE_SIN:7376case SHADER_OPCODE_COS: {7377/* Unary extended math instructions are limited to SIMD8 on Gfx4 and7378* Gfx6. Extended Math Function is limited to SIMD8 with half-float.7379*/7380if (devinfo->ver == 6 || (devinfo->ver == 4 && !devinfo->is_g4x))7381return MIN2(8, inst->exec_size);7382if (inst->dst.type == BRW_REGISTER_TYPE_HF)7383return MIN2(8, inst->exec_size);7384return MIN2(16, inst->exec_size);7385}73867387case SHADER_OPCODE_POW: {7388/* SIMD16 is only allowed on Gfx7+. Extended Math Function is limited7389* to SIMD8 with half-float7390*/7391if (devinfo->ver < 7)7392return MIN2(8, inst->exec_size);7393if (inst->dst.type == BRW_REGISTER_TYPE_HF)7394return MIN2(8, inst->exec_size);7395return MIN2(16, inst->exec_size);7396}73977398case SHADER_OPCODE_USUB_SAT:7399case SHADER_OPCODE_ISUB_SAT:7400return get_fpu_lowered_simd_width(devinfo, inst);74017402case SHADER_OPCODE_INT_QUOTIENT:7403case SHADER_OPCODE_INT_REMAINDER:7404/* Integer division is limited to SIMD8 on all generations. */7405return MIN2(8, inst->exec_size);74067407case FS_OPCODE_LINTERP:7408case SHADER_OPCODE_GET_BUFFER_SIZE:7409case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:7410case FS_OPCODE_PACK_HALF_2x16_SPLIT:7411case FS_OPCODE_INTERPOLATE_AT_SAMPLE:7412case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:7413case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:7414return MIN2(16, inst->exec_size);74157416case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:7417/* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch7418* message used to implement varying pull constant loads, so expand it7419* to SIMD16. An alternative with longer message payload length but7420* shorter return payload would be to use the SIMD8 sampler message that7421* takes (header, u, v, r) as parameters instead of (header, u).7422*/7423return (devinfo->ver == 4 ? 16 : MIN2(16, inst->exec_size));74247425case FS_OPCODE_DDX_COARSE:7426case FS_OPCODE_DDX_FINE:7427case FS_OPCODE_DDY_COARSE:7428case FS_OPCODE_DDY_FINE:7429/* The implementation of this virtual opcode may require emitting7430* compressed Align16 instructions, which are severely limited on some7431* generations.7432*7433* From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register7434* Region Restrictions):7435*7436* "In Align16 access mode, SIMD16 is not allowed for DW operations7437* and SIMD8 is not allowed for DF operations."7438*7439* In this context, "DW operations" means "operations acting on 32-bit7440* values", so it includes operations on floats.7441*7442* Gfx4 has a similar restriction. From the i965 PRM, section 11.5.37443* (Instruction Compression -> Rules and Restrictions):7444*7445* "A compressed instruction must be in Align1 access mode. 
Align167446* mode instructions cannot be compressed."7447*7448* Similar text exists in the g45 PRM.7449*7450* Empirically, compressed align16 instructions using odd register7451* numbers don't appear to work on Sandybridge either.7452*/7453return (devinfo->ver == 4 || devinfo->ver == 6 ||7454(devinfo->verx10 == 70) ?7455MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size));74567457case SHADER_OPCODE_MULH:7458/* MULH is lowered to the MUL/MACH sequence using the accumulator, which7459* is 8-wide on Gfx7+.7460*/7461return (devinfo->ver >= 7 ? 8 :7462get_fpu_lowered_simd_width(devinfo, inst));74637464case FS_OPCODE_FB_WRITE_LOGICAL:7465/* Gfx6 doesn't support SIMD16 depth writes but we cannot handle them7466* here.7467*/7468assert(devinfo->ver != 6 ||7469inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE ||7470inst->exec_size == 8);7471/* Dual-source FB writes are unsupported in SIMD16 mode. */7472return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?74738 : MIN2(16, inst->exec_size));74747475case FS_OPCODE_FB_READ_LOGICAL:7476return MIN2(16, inst->exec_size);74777478case SHADER_OPCODE_TEX_LOGICAL:7479case SHADER_OPCODE_TXF_CMS_LOGICAL:7480case SHADER_OPCODE_TXF_UMS_LOGICAL:7481case SHADER_OPCODE_TXF_MCS_LOGICAL:7482case SHADER_OPCODE_LOD_LOGICAL:7483case SHADER_OPCODE_TG4_LOGICAL:7484case SHADER_OPCODE_SAMPLEINFO_LOGICAL:7485case SHADER_OPCODE_TXF_CMS_W_LOGICAL:7486case SHADER_OPCODE_TG4_OFFSET_LOGICAL:7487return get_sampler_lowered_simd_width(devinfo, inst);74887489case SHADER_OPCODE_TXD_LOGICAL:7490/* TXD is unsupported in SIMD16 mode. */7491return 8;74927493case SHADER_OPCODE_TXL_LOGICAL:7494case FS_OPCODE_TXB_LOGICAL:7495/* Only one execution size is representable pre-ILK depending on whether7496* the shadow reference argument is present.7497*/7498if (devinfo->ver == 4)7499return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8;7500else7501return get_sampler_lowered_simd_width(devinfo, inst);75027503case SHADER_OPCODE_TXF_LOGICAL:7504case SHADER_OPCODE_TXS_LOGICAL:7505/* Gfx4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD7506* messages. Use SIMD16 instead.7507*/7508if (devinfo->ver == 4)7509return 16;7510else7511return get_sampler_lowered_simd_width(devinfo, inst);75127513case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:7514case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:7515case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:7516return 8;75177518case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:7519case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:7520case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:7521case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:7522case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:7523case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:7524case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:7525case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:7526return MIN2(16, inst->exec_size);75277528case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:7529case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:7530case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:7531case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:7532return devinfo->ver <= 8 ? 
8 : MIN2(16, inst->exec_size);75337534case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:7535case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:7536case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:7537assert(inst->exec_size <= 16);7538return inst->exec_size;75397540case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:7541case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:7542case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:7543case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:7544case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:7545return 8;75467547case SHADER_OPCODE_URB_READ_SIMD8:7548case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:7549case SHADER_OPCODE_URB_WRITE_SIMD8:7550case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:7551case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:7552case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:7553return MIN2(8, inst->exec_size);75547555case SHADER_OPCODE_QUAD_SWIZZLE: {7556const unsigned swiz = inst->src[1].ud;7557return (is_uniform(inst->src[0]) ?7558get_fpu_lowered_simd_width(devinfo, inst) :7559devinfo->ver < 11 && type_sz(inst->src[0].type) == 4 ? 8 :7560swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ? 4 :7561get_fpu_lowered_simd_width(devinfo, inst));7562}7563case SHADER_OPCODE_MOV_INDIRECT: {7564/* From IVB and HSW PRMs:7565*7566* "2.When the destination requires two registers and the sources are7567* indirect, the sources must use 1x1 regioning mode.7568*7569* In case of DF instructions in HSW/IVB, the exec_size is limited by7570* the EU decompression logic not handling VxH indirect addressing7571* correctly.7572*/7573const unsigned max_size = (devinfo->ver >= 8 ? 2 : 1) * REG_SIZE;7574/* Prior to Broadwell, we only have 8 address subregisters. */7575return MIN3(devinfo->ver >= 8 ? 16 : 8,7576max_size / (inst->dst.stride * type_sz(inst->dst.type)),7577inst->exec_size);7578}75797580case SHADER_OPCODE_LOAD_PAYLOAD: {7581const unsigned reg_count =7582DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);75837584if (reg_count > 2) {7585/* Only LOAD_PAYLOAD instructions with per-channel destination region7586* can be easily lowered (which excludes headers and heterogeneous7587* types).7588*/7589assert(!inst->header_size);7590for (unsigned i = 0; i < inst->sources; i++)7591assert(type_sz(inst->dst.type) == type_sz(inst->src[i].type) ||7592inst->src[i].file == BAD_FILE);75937594return inst->exec_size / DIV_ROUND_UP(reg_count, 2);7595} else {7596return inst->exec_size;7597}7598}7599default:7600return inst->exec_size;7601}7602}76037604/**7605* Return true if splitting out the group of channels of instruction \p inst7606* given by lbld.group() requires allocating a temporary for the i-th source7607* of the lowered instruction.7608*/7609static inline bool7610needs_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i)7611{7612return !(is_periodic(inst->src[i], lbld.dispatch_width()) ||7613(inst->components_read(i) == 1 &&7614lbld.dispatch_width() <= inst->exec_size)) ||7615(inst->flags_written(lbld.shader->devinfo) &7616flag_mask(inst->src[i], type_sz(inst->src[i].type)));7617}76187619/**7620* Extract the data that would be consumed by the channel group given by7621* lbld.group() from the i-th source region of instruction \p inst and return7622* it as result in packed form.7623*/7624static fs_reg7625emit_unzip(const fs_builder &lbld, fs_inst *inst, unsigned i)7626{7627assert(lbld.group() >= inst->group);76287629/* Specified channel group from the source region. 
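Roughly speaking, horiz_offset() just advances the region by the given number of channels, so when e.g. a SIMD16 instruction starting at group 0 is split into two SIMD8 halves, the half built with lbld.group() == 8 reads source i starting at channel 8.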
*/7630const fs_reg src = horiz_offset(inst->src[i], lbld.group() - inst->group);76317632if (needs_src_copy(lbld, inst, i)) {7633/* Builder of the right width to perform the copy avoiding uninitialized7634* data if the lowered execution size is greater than the original7635* execution size of the instruction.7636*/7637const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),7638inst->exec_size), 0);7639const fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i));76407641for (unsigned k = 0; k < inst->components_read(i); ++k)7642cbld.MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k));76437644return tmp;76457646} else if (is_periodic(inst->src[i], lbld.dispatch_width())) {7647/* The source is invariant for all dispatch_width-wide groups of the7648* original region.7649*/7650return inst->src[i];76517652} else {7653/* We can just point the lowered instruction at the right channel group7654* from the original region.7655*/7656return src;7657}7658}76597660/**7661* Return true if splitting out the group of channels of instruction \p inst7662* given by lbld.group() requires allocating a temporary for the destination7663* of the lowered instruction and copying the data back to the original7664* destination region.7665*/7666static inline bool7667needs_dst_copy(const fs_builder &lbld, const fs_inst *inst)7668{7669/* If the instruction writes more than one component we'll have to shuffle7670* the results of multiple lowered instructions in order to make sure that7671* they end up arranged correctly in the original destination region.7672*/7673if (inst->size_written > inst->dst.component_size(inst->exec_size))7674return true;76757676/* If the lowered execution size is larger than the original the result of7677* the instruction won't fit in the original destination, so we'll have to7678* allocate a temporary in any case.7679*/7680if (lbld.dispatch_width() > inst->exec_size)7681return true;76827683for (unsigned i = 0; i < inst->sources; i++) {7684/* If we already made a copy of the source for other reasons there won't7685* be any overlap with the destination.7686*/7687if (needs_src_copy(lbld, inst, i))7688continue;76897690/* In order to keep the logic simple we emit a copy whenever the7691* destination region doesn't exactly match an overlapping source, which7692* may point at the source and destination not being aligned group by7693* group which could cause one of the lowered instructions to overwrite7694* the data read from the same source by other lowered instructions.7695*/7696if (regions_overlap(inst->dst, inst->size_written,7697inst->src[i], inst->size_read(i)) &&7698!inst->dst.equals(inst->src[i]))7699return true;7700}77017702return false;7703}77047705/**7706* Insert data from a packed temporary into the channel group given by7707* lbld.group() of the destination region of instruction \p inst and return7708* the temporary as result. Any copy instructions that are required for7709* unzipping the previous value (in the case of partial writes) will be7710* inserted using \p lbld_before and any copy instructions required for7711* zipping up the destination of \p inst will be inserted using \p lbld_after.7712*/7713static fs_reg7714emit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after,7715fs_inst *inst)7716{7717assert(lbld_before.dispatch_width() == lbld_after.dispatch_width());7718assert(lbld_before.group() == lbld_after.group());7719assert(lbld_after.group() >= inst->group);77207721/* Specified channel group from the destination region. 
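As in emit_unzip() this is just the original destination advanced to the channel offset of the group being emitted; dst_size below is the number of logical components the instruction writes, i.e. the total bytes written divided by the size of one exec_size-wide component.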
*/7722const fs_reg dst = horiz_offset(inst->dst, lbld_after.group() - inst->group);7723const unsigned dst_size = inst->size_written /7724inst->dst.component_size(inst->exec_size);77257726if (needs_dst_copy(lbld_after, inst)) {7727const fs_reg tmp = lbld_after.vgrf(inst->dst.type, dst_size);77287729if (inst->predicate) {7730/* Handle predication by copying the original contents of7731* the destination into the temporary before emitting the7732* lowered instruction.7733*/7734const fs_builder gbld_before =7735lbld_before.group(MIN2(lbld_before.dispatch_width(),7736inst->exec_size), 0);7737for (unsigned k = 0; k < dst_size; ++k) {7738gbld_before.MOV(offset(tmp, lbld_before, k),7739offset(dst, inst->exec_size, k));7740}7741}77427743const fs_builder gbld_after =7744lbld_after.group(MIN2(lbld_after.dispatch_width(),7745inst->exec_size), 0);7746for (unsigned k = 0; k < dst_size; ++k) {7747/* Use a builder of the right width to perform the copy avoiding7748* uninitialized data if the lowered execution size is greater than7749* the original execution size of the instruction.7750*/7751gbld_after.MOV(offset(dst, inst->exec_size, k),7752offset(tmp, lbld_after, k));7753}77547755return tmp;77567757} else {7758/* No need to allocate a temporary for the lowered instruction, just7759* take the right group of channels from the original region.7760*/7761return dst;7762}7763}77647765bool7766fs_visitor::lower_simd_width()7767{7768bool progress = false;77697770foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {7771const unsigned lower_width = get_lowered_simd_width(devinfo, inst);77727773if (lower_width != inst->exec_size) {7774/* Builder matching the original instruction. We may also need to7775* emit an instruction of width larger than the original, set the7776* execution size of the builder to the highest of both for now so7777* we're sure that both cases can be handled.7778*/7779const unsigned max_width = MAX2(inst->exec_size, lower_width);7780const fs_builder ibld = bld.at(block, inst)7781.exec_all(inst->force_writemask_all)7782.group(max_width, inst->group / max_width);77837784/* Split the copies in chunks of the execution width of either the7785* original or the lowered instruction, whichever is lower.7786*/7787const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);7788const unsigned dst_size = inst->size_written /7789inst->dst.component_size(inst->exec_size);77907791assert(!inst->writes_accumulator && !inst->mlen);77927793/* Inserting the zip, unzip, and duplicated instructions in all of7794* the right spots is somewhat tricky. All of the unzip and any7795* instructions from the zip which unzip the destination prior to7796* writing need to happen before all of the per-group instructions7797* and the zip instructions need to happen after. In order to sort7798* this all out, we insert the unzip instructions before \p inst,7799* insert the per-group instructions after \p inst (i.e. before7800* inst->next), and insert the zip instructions before the7801* instruction after \p inst. Since we are inserting instructions7802* after \p inst, inst->next is a moving target and we need to save7803* it off here so that we insert the zip instructions in the right7804* place.7805*7806* Since we're inserting split instructions after after_inst, the7807* instructions will end up in the reverse order that we insert them.7808* However, certain render target writes require that the low group7809* instructions come before the high group. From the Ivy Bridge PRM7810* Vol. 4, Pt. 
1, Section 3.9.11:7811*7812* "If multiple SIMD8 Dual Source messages are delivered by the7813* pixel shader thread, each SIMD8_DUALSRC_LO message must be7814* issued before the SIMD8_DUALSRC_HI message with the same Slot7815* Group Select setting."7816*7817* And, from Section 3.9.11.1 of the same PRM:7818*7819* "When SIMD32 or SIMD16 PS threads send render target writes7820* with multiple SIMD8 and SIMD16 messages, the following must7821* hold:7822*7823* All the slots (as described above) must have a corresponding7824* render target write irrespective of the slot's validity. A slot7825* is considered valid when at least one sample is enabled. For7826* example, a SIMD16 PS thread must send two SIMD8 render target7827* writes to cover all the slots.7828*7829* PS thread must send SIMD render target write messages with7830* increasing slot numbers. For example, SIMD16 thread has7831* Slot[15:0] and if two SIMD8 render target writes are used, the7832* first SIMD8 render target write must send Slot[7:0] and the7833* next one must send Slot[15:8]."7834*7835* In order to make low group instructions come before high group7836* instructions (this is required for some render target writes), we7837* split from the highest group to lowest.7838*/7839exec_node *const after_inst = inst->next;7840for (int i = n - 1; i >= 0; i--) {7841/* Emit a copy of the original instruction with the lowered width.7842* If the EOT flag was set throw it away except for the last7843* instruction to avoid killing the thread prematurely.7844*/7845fs_inst split_inst = *inst;7846split_inst.exec_size = lower_width;7847split_inst.eot = inst->eot && i == int(n - 1);78487849/* Select the correct channel enables for the i-th group, then7850* transform the sources and destination and emit the lowered7851* instruction.7852*/7853const fs_builder lbld = ibld.group(lower_width, i);78547855for (unsigned j = 0; j < inst->sources; j++)7856split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j);78577858split_inst.dst = emit_zip(lbld.at(block, inst),7859lbld.at(block, after_inst), inst);7860split_inst.size_written =7861split_inst.dst.component_size(lower_width) * dst_size;78627863lbld.at(block, inst->next).emit(split_inst);7864}78657866inst->remove(block);7867progress = true;7868}7869}78707871if (progress)7872invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);78737874return progress;7875}78767877/**7878* Transform barycentric vectors into the interleaved form expected by the PLN7879* instruction and returned by the Gfx7+ PI shared function.7880*7881* For channels 0-15 in SIMD16 mode they are expected to be laid out as7882* follows in the register file:7883*7884* rN+0: X[0-7]7885* rN+1: Y[0-7]7886* rN+2: X[8-15]7887* rN+3: Y[8-15]7888*7889* There is no need to handle SIMD32 here -- This is expected to be run after7890* SIMD lowering, since SIMD lowering relies on vectors having the standard7891* component layout.7892*/7893bool7894fs_visitor::lower_barycentrics()7895{7896const bool has_interleaved_layout = devinfo->has_pln || devinfo->ver >= 7;7897bool progress = false;78987899if (stage != MESA_SHADER_FRAGMENT || !has_interleaved_layout)7900return false;79017902foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {7903if (inst->exec_size < 16)7904continue;79057906const fs_builder ibld(this, block, inst);7907const fs_builder ubld = ibld.exec_all().group(8, 0);79087909switch (inst->opcode) {7910case FS_OPCODE_LINTERP : {7911assert(inst->exec_size == 16);7912const fs_reg tmp = ibld.vgrf(inst->src[0].type, 2);7913fs_reg 
srcs[4];79147915for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++)7916srcs[i] = horiz_offset(offset(inst->src[0], ibld, i % 2),79178 * (i / 2));79187919ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs));79207921inst->src[0] = tmp;7922progress = true;7923break;7924}7925case FS_OPCODE_INTERPOLATE_AT_SAMPLE:7926case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:7927case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: {7928assert(inst->exec_size == 16);7929const fs_reg tmp = ibld.vgrf(inst->dst.type, 2);79307931for (unsigned i = 0; i < 2; i++) {7932for (unsigned g = 0; g < inst->exec_size / 8; g++) {7933fs_inst *mov = ibld.at(block, inst->next).group(8, g)7934.MOV(horiz_offset(offset(inst->dst, ibld, i),79358 * g),7936offset(tmp, ubld, 2 * g + i));7937mov->predicate = inst->predicate;7938mov->predicate_inverse = inst->predicate_inverse;7939mov->flag_subreg = inst->flag_subreg;7940}7941}79427943inst->dst = tmp;7944progress = true;7945break;7946}7947default:7948break;7949}7950}79517952if (progress)7953invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);79547955return progress;7956}79577958/**7959* Lower a derivative instruction as the floating-point difference of two7960* swizzles of the source, specified as \p swz0 and \p swz1.7961*/7962static bool7963lower_derivative(fs_visitor *v, bblock_t *block, fs_inst *inst,7964unsigned swz0, unsigned swz1)7965{7966const fs_builder ibld(v, block, inst);7967const fs_reg tmp0 = ibld.vgrf(inst->src[0].type);7968const fs_reg tmp1 = ibld.vgrf(inst->src[0].type);79697970ibld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp0, inst->src[0], brw_imm_ud(swz0));7971ibld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp1, inst->src[0], brw_imm_ud(swz1));79727973inst->resize_sources(2);7974inst->src[0] = negate(tmp0);7975inst->src[1] = tmp1;7976inst->opcode = BRW_OPCODE_ADD;79777978return true;7979}79807981/**7982* Lower derivative instructions on platforms where codegen cannot implement7983* them efficiently (i.e. 
XeHP).7984*/7985bool7986fs_visitor::lower_derivatives()7987{7988bool progress = false;79897990if (devinfo->verx10 < 125)7991return false;79927993foreach_block_and_inst(block, fs_inst, inst, cfg) {7994if (inst->opcode == FS_OPCODE_DDX_COARSE)7995progress |= lower_derivative(this, block, inst,7996BRW_SWIZZLE_XXXX, BRW_SWIZZLE_YYYY);79977998else if (inst->opcode == FS_OPCODE_DDX_FINE)7999progress |= lower_derivative(this, block, inst,8000BRW_SWIZZLE_XXZZ, BRW_SWIZZLE_YYWW);80018002else if (inst->opcode == FS_OPCODE_DDY_COARSE)8003progress |= lower_derivative(this, block, inst,8004BRW_SWIZZLE_XXXX, BRW_SWIZZLE_ZZZZ);80058006else if (inst->opcode == FS_OPCODE_DDY_FINE)8007progress |= lower_derivative(this, block, inst,8008BRW_SWIZZLE_XYXY, BRW_SWIZZLE_ZWZW);8009}80108011if (progress)8012invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);80138014return progress;8015}80168017void8018fs_visitor::dump_instructions() const8019{8020dump_instructions(NULL);8021}80228023void8024fs_visitor::dump_instructions(const char *name) const8025{8026FILE *file = stderr;8027if (name && geteuid() != 0) {8028file = fopen(name, "w");8029if (!file)8030file = stderr;8031}80328033if (cfg) {8034const register_pressure &rp = regpressure_analysis.require();8035unsigned ip = 0, max_pressure = 0;8036foreach_block_and_inst(block, backend_instruction, inst, cfg) {8037max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);8038fprintf(file, "{%3d} %4d: ", rp.regs_live_at_ip[ip], ip);8039dump_instruction(inst, file);8040ip++;8041}8042fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);8043} else {8044int ip = 0;8045foreach_in_list(backend_instruction, inst, &instructions) {8046fprintf(file, "%4d: ", ip++);8047dump_instruction(inst, file);8048}8049}80508051if (file != stderr) {8052fclose(file);8053}8054}80558056void8057fs_visitor::dump_instruction(const backend_instruction *be_inst) const8058{8059dump_instruction(be_inst, stderr);8060}80618062void8063fs_visitor::dump_instruction(const backend_instruction *be_inst, FILE *file) const8064{8065const fs_inst *inst = (const fs_inst *)be_inst;80668067if (inst->predicate) {8068fprintf(file, "(%cf%d.%d) ",8069inst->predicate_inverse ? 
'-' : '+',8070inst->flag_subreg / 2,8071inst->flag_subreg % 2);8072}80738074fprintf(file, "%s", brw_instruction_name(devinfo, inst->opcode));8075if (inst->saturate)8076fprintf(file, ".sat");8077if (inst->conditional_mod) {8078fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);8079if (!inst->predicate &&8080(devinfo->ver < 5 || (inst->opcode != BRW_OPCODE_SEL &&8081inst->opcode != BRW_OPCODE_CSEL &&8082inst->opcode != BRW_OPCODE_IF &&8083inst->opcode != BRW_OPCODE_WHILE))) {8084fprintf(file, ".f%d.%d", inst->flag_subreg / 2,8085inst->flag_subreg % 2);8086}8087}8088fprintf(file, "(%d) ", inst->exec_size);80898090if (inst->mlen) {8091fprintf(file, "(mlen: %d) ", inst->mlen);8092}80938094if (inst->ex_mlen) {8095fprintf(file, "(ex_mlen: %d) ", inst->ex_mlen);8096}80978098if (inst->eot) {8099fprintf(file, "(EOT) ");8100}81018102switch (inst->dst.file) {8103case VGRF:8104fprintf(file, "vgrf%d", inst->dst.nr);8105break;8106case FIXED_GRF:8107fprintf(file, "g%d", inst->dst.nr);8108break;8109case MRF:8110fprintf(file, "m%d", inst->dst.nr);8111break;8112case BAD_FILE:8113fprintf(file, "(null)");8114break;8115case UNIFORM:8116fprintf(file, "***u%d***", inst->dst.nr);8117break;8118case ATTR:8119fprintf(file, "***attr%d***", inst->dst.nr);8120break;8121case ARF:8122switch (inst->dst.nr) {8123case BRW_ARF_NULL:8124fprintf(file, "null");8125break;8126case BRW_ARF_ADDRESS:8127fprintf(file, "a0.%d", inst->dst.subnr);8128break;8129case BRW_ARF_ACCUMULATOR:8130fprintf(file, "acc%d", inst->dst.subnr);8131break;8132case BRW_ARF_FLAG:8133fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);8134break;8135default:8136fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);8137break;8138}8139break;8140case IMM:8141unreachable("not reached");8142}81438144if (inst->dst.offset ||8145(inst->dst.file == VGRF &&8146alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {8147const unsigned reg_size = (inst->dst.file == UNIFORM ? 
4 : REG_SIZE);8148fprintf(file, "+%d.%d", inst->dst.offset / reg_size,8149inst->dst.offset % reg_size);8150}81518152if (inst->dst.stride != 1)8153fprintf(file, "<%u>", inst->dst.stride);8154fprintf(file, ":%s, ", brw_reg_type_to_letters(inst->dst.type));81558156for (int i = 0; i < inst->sources; i++) {8157if (inst->src[i].negate)8158fprintf(file, "-");8159if (inst->src[i].abs)8160fprintf(file, "|");8161switch (inst->src[i].file) {8162case VGRF:8163fprintf(file, "vgrf%d", inst->src[i].nr);8164break;8165case FIXED_GRF:8166fprintf(file, "g%d", inst->src[i].nr);8167break;8168case MRF:8169fprintf(file, "***m%d***", inst->src[i].nr);8170break;8171case ATTR:8172fprintf(file, "attr%d", inst->src[i].nr);8173break;8174case UNIFORM:8175fprintf(file, "u%d", inst->src[i].nr);8176break;8177case BAD_FILE:8178fprintf(file, "(null)");8179break;8180case IMM:8181switch (inst->src[i].type) {8182case BRW_REGISTER_TYPE_HF:8183fprintf(file, "%-ghf", _mesa_half_to_float(inst->src[i].ud & 0xffff));8184break;8185case BRW_REGISTER_TYPE_F:8186fprintf(file, "%-gf", inst->src[i].f);8187break;8188case BRW_REGISTER_TYPE_DF:8189fprintf(file, "%fdf", inst->src[i].df);8190break;8191case BRW_REGISTER_TYPE_W:8192case BRW_REGISTER_TYPE_D:8193fprintf(file, "%dd", inst->src[i].d);8194break;8195case BRW_REGISTER_TYPE_UW:8196case BRW_REGISTER_TYPE_UD:8197fprintf(file, "%uu", inst->src[i].ud);8198break;8199case BRW_REGISTER_TYPE_Q:8200fprintf(file, "%" PRId64 "q", inst->src[i].d64);8201break;8202case BRW_REGISTER_TYPE_UQ:8203fprintf(file, "%" PRIu64 "uq", inst->src[i].u64);8204break;8205case BRW_REGISTER_TYPE_VF:8206fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",8207brw_vf_to_float((inst->src[i].ud >> 0) & 0xff),8208brw_vf_to_float((inst->src[i].ud >> 8) & 0xff),8209brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),8210brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));8211break;8212case BRW_REGISTER_TYPE_V:8213case BRW_REGISTER_TYPE_UV:8214fprintf(file, "%08x%s", inst->src[i].ud,8215inst->src[i].type == BRW_REGISTER_TYPE_V ? "V" : "UV");8216break;8217default:8218fprintf(file, "???");8219break;8220}8221break;8222case ARF:8223switch (inst->src[i].nr) {8224case BRW_ARF_NULL:8225fprintf(file, "null");8226break;8227case BRW_ARF_ADDRESS:8228fprintf(file, "a0.%d", inst->src[i].subnr);8229break;8230case BRW_ARF_ACCUMULATOR:8231fprintf(file, "acc%d", inst->src[i].subnr);8232break;8233case BRW_ARF_FLAG:8234fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);8235break;8236default:8237fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);8238break;8239}8240break;8241}82428243if (inst->src[i].offset ||8244(inst->src[i].file == VGRF &&8245alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {8246const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE);8247fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,8248inst->src[i].offset % reg_size);8249}82508251if (inst->src[i].abs)8252fprintf(file, "|");82538254if (inst->src[i].file != IMM) {8255unsigned stride;8256if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {8257unsigned hstride = inst->src[i].hstride;8258stride = (hstride == 0 ? 
0 : (1 << (hstride - 1)));8259} else {8260stride = inst->src[i].stride;8261}8262if (stride != 1)8263fprintf(file, "<%u>", stride);82648265fprintf(file, ":%s", brw_reg_type_to_letters(inst->src[i].type));8266}82678268if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)8269fprintf(file, ", ");8270}82718272fprintf(file, " ");82738274if (inst->force_writemask_all)8275fprintf(file, "NoMask ");82768277if (inst->exec_size != dispatch_width)8278fprintf(file, "group%d ", inst->group);82798280fprintf(file, "\n");8281}82828283void8284fs_visitor::setup_fs_payload_gfx6()8285{8286assert(stage == MESA_SHADER_FRAGMENT);8287struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);8288const unsigned payload_width = MIN2(16, dispatch_width);8289assert(dispatch_width % payload_width == 0);8290assert(devinfo->ver >= 6);82918292/* R0: PS thread payload header. */8293payload.num_regs++;82948295for (unsigned j = 0; j < dispatch_width / payload_width; j++) {8296/* R1: masks, pixel X/Y coordinates. */8297payload.subspan_coord_reg[j] = payload.num_regs++;8298}82998300for (unsigned j = 0; j < dispatch_width / payload_width; j++) {8301/* R3-26: barycentric interpolation coordinates. These appear in the8302* same order that they appear in the brw_barycentric_mode enum. Each8303* set of coordinates occupies 2 registers if dispatch width == 8 and 48304* registers if dispatch width == 16. Coordinates only appear if they8305* were enabled using the "Barycentric Interpolation Mode" bits in8306* WM_STATE.8307*/8308for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {8309if (prog_data->barycentric_interp_modes & (1 << i)) {8310payload.barycentric_coord_reg[i][j] = payload.num_regs;8311payload.num_regs += payload_width / 4;8312}8313}83148315/* R27-28: interpolated depth if uses source depth */8316if (prog_data->uses_src_depth) {8317payload.source_depth_reg[j] = payload.num_regs;8318payload.num_regs += payload_width / 8;8319}83208321/* R29-30: interpolated W set if GFX6_WM_USES_SOURCE_W. */8322if (prog_data->uses_src_w) {8323payload.source_w_reg[j] = payload.num_regs;8324payload.num_regs += payload_width / 8;8325}83268327/* R31: MSAA position offsets. 
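Note that unlike the entries above this one always occupies a single register, hence the plain num_regs++ below rather than a payload_width-dependent increment.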
*/8328if (prog_data->uses_pos_offset) {8329payload.sample_pos_reg[j] = payload.num_regs;8330payload.num_regs++;8331}83328333/* R32-33: MSAA input coverage mask */8334if (prog_data->uses_sample_mask) {8335assert(devinfo->ver >= 7);8336payload.sample_mask_in_reg[j] = payload.num_regs;8337payload.num_regs += payload_width / 8;8338}83398340/* R66: Source Depth and/or W Attribute Vertex Deltas */8341if (prog_data->uses_depth_w_coefficients) {8342payload.depth_w_coef_reg[j] = payload.num_regs;8343payload.num_regs++;8344}8345}83468347if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {8348source_depth_to_render_target = true;8349}8350}83518352void8353fs_visitor::setup_vs_payload()8354{8355/* R0: thread header, R1: urb handles */8356payload.num_regs = 2;8357}83588359void8360fs_visitor::setup_gs_payload()8361{8362assert(stage == MESA_SHADER_GEOMETRY);83638364struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);8365struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);83668367/* R0: thread header, R1: output URB handles */8368payload.num_regs = 2;83698370if (gs_prog_data->include_primitive_id) {8371/* R2: Primitive ID 0..7 */8372payload.num_regs++;8373}83748375/* Always enable VUE handles so we can safely use pull model if needed.8376*8377* The push model for a GS uses a ton of register space even for trivial8378* scenarios with just a few inputs, so just make things easier and a bit8379* safer by always having pull model available.8380*/8381gs_prog_data->base.include_vue_handles = true;83828383/* R3..RN: ICP Handles for each incoming vertex (when using pull model) */8384payload.num_regs += nir->info.gs.vertices_in;83858386/* Use a maximum of 24 registers for push-model inputs. */8387const unsigned max_push_components = 24;83888389/* If pushing our inputs would take too many registers, reduce the URB read8390* length (which is in HWords, or 8 registers), and resort to pulling.8391*8392* Note that the GS reads <URB Read Length> HWords for every vertex - so we8393* have to multiply by VerticesIn to obtain the total storage requirement.8394*/8395if (8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in >8396max_push_components) {8397vue_prog_data->urb_read_length =8398ROUND_DOWN_TO(max_push_components / nir->info.gs.vertices_in, 8) / 8;8399}8400}84018402void8403fs_visitor::setup_cs_payload()8404{8405assert(devinfo->ver >= 7);8406/* TODO: Fill out uses_btd_stack_ids automatically */8407payload.num_regs = 1 + brw_cs_prog_data(prog_data)->uses_btd_stack_ids;8408}84098410brw::register_pressure::register_pressure(const fs_visitor *v)8411{8412const fs_live_variables &live = v->live_analysis.require();8413const unsigned num_instructions = v->cfg->num_blocks ?8414v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;84158416regs_live_at_ip = new unsigned[num_instructions]();84178418for (unsigned reg = 0; reg < v->alloc.count; reg++) {8419for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)8420regs_live_at_ip[ip] += v->alloc.sizes[reg];8421}8422}84238424brw::register_pressure::~register_pressure()8425{8426delete[] regs_live_at_ip;8427}84288429void8430fs_visitor::invalidate_analysis(brw::analysis_dependency_class c)8431{8432backend_shader::invalidate_analysis(c);8433live_analysis.invalidate(c);8434regpressure_analysis.invalidate(c);8435}84368437void8438fs_visitor::optimize()8439{8440/* Start by validating the shader we currently have. 
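validate() only checks IR consistency and doesn't modify anything; the OPT() macro defined below re-runs it after every pass so that a broken transformation is caught as close as possible to its source.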
*/8441validate();84428443/* bld is the common builder object pointing at the end of the program we8444* used to translate it into i965 IR. For the optimization and lowering8445* passes coming next, any code added after the end of the program without8446* having explicitly called fs_builder::at() clearly points at a mistake.8447* Ideally optimization passes wouldn't be part of the visitor so they8448* wouldn't have access to bld at all, but they do, so just in case some8449* pass forgets to ask for a location explicitly set it to NULL here to8450* make it trip. The dispatch width is initialized to a bogus value to8451* make sure that optimizations set the execution controls explicitly to8452* match the code they are manipulating instead of relying on the defaults.8453*/8454bld = fs_builder(this, 64);84558456assign_constant_locations();8457lower_constant_loads();84588459validate();84608461split_virtual_grfs();8462validate();84638464#define OPT(pass, args...) ({ \8465pass_num++; \8466bool this_progress = pass(args); \8467\8468if ((INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \8469char filename[64]; \8470snprintf(filename, 64, "%s%d-%s-%02d-%02d-" #pass, \8471stage_abbrev, dispatch_width, nir->info.name, iteration, pass_num); \8472\8473backend_shader::dump_instructions(filename); \8474} \8475\8476validate(); \8477\8478progress = progress || this_progress; \8479this_progress; \8480})84818482if (INTEL_DEBUG & DEBUG_OPTIMIZER) {8483char filename[64];8484snprintf(filename, 64, "%s%d-%s-00-00-start",8485stage_abbrev, dispatch_width, nir->info.name);84868487backend_shader::dump_instructions(filename);8488}84898490bool progress = false;8491int iteration = 0;8492int pass_num = 0;84938494/* Before anything else, eliminate dead code. The results of some NIR8495* instructions may effectively be calculated twice. Once when the8496* instruction is encountered, and again when the user of that result is8497* encountered. Wipe those away before algebraic optimizations and8498* especially copy propagation can mix things up.8499*/8500OPT(dead_code_eliminate);85018502OPT(remove_extra_rounding_modes);85038504do {8505progress = false;8506pass_num = 0;8507iteration++;85088509OPT(remove_duplicate_mrf_writes);85108511OPT(opt_algebraic);8512OPT(opt_cse);8513OPT(opt_copy_propagation);8514OPT(opt_predicated_break, this);8515OPT(opt_cmod_propagation);8516OPT(dead_code_eliminate);8517OPT(opt_peephole_sel);8518OPT(dead_control_flow_eliminate, this);8519OPT(opt_register_renaming);8520OPT(opt_saturate_propagation);8521OPT(register_coalesce);8522OPT(compute_to_mrf);8523OPT(eliminate_find_live_channel);85248525OPT(compact_virtual_grfs);8526} while (progress);85278528progress = false;8529pass_num = 0;85308531if (OPT(lower_pack)) {8532OPT(register_coalesce);8533OPT(dead_code_eliminate);8534}85358536OPT(lower_simd_width);8537OPT(lower_barycentrics);8538OPT(lower_logical_sends);85398540/* After logical SEND lowering. */8541OPT(fixup_nomask_control_flow);85428543if (progress) {8544OPT(opt_copy_propagation);8545/* Only run after logical send lowering because it's easier to implement8546* in terms of physical sends.8547*/8548if (OPT(opt_zero_samples))8549OPT(opt_copy_propagation);8550/* Run after logical send lowering to give it a chance to CSE the8551* LOAD_PAYLOAD instructions created to construct the payloads of8552* e.g. 
texturing messages in cases where it wasn't possible to CSE the8553* whole logical instruction.8554*/8555OPT(opt_cse);8556OPT(register_coalesce);8557OPT(compute_to_mrf);8558OPT(dead_code_eliminate);8559OPT(remove_duplicate_mrf_writes);8560OPT(opt_peephole_sel);8561}85628563OPT(opt_redundant_halt);85648565if (OPT(lower_load_payload)) {8566split_virtual_grfs();85678568/* Lower 64 bit MOVs generated by payload lowering. */8569if (!devinfo->has_64bit_float && !devinfo->has_64bit_int)8570OPT(opt_algebraic);85718572OPT(register_coalesce);8573OPT(lower_simd_width);8574OPT(compute_to_mrf);8575OPT(dead_code_eliminate);8576}85778578OPT(opt_combine_constants);8579if (OPT(lower_integer_multiplication)) {8580/* If lower_integer_multiplication made progress, it may have produced8581* some 32x32-bit MULs in the process of lowering 64-bit MULs. Run it8582* one more time to clean those up if they exist.8583*/8584OPT(lower_integer_multiplication);8585}8586OPT(lower_sub_sat);85878588if (devinfo->ver <= 5 && OPT(lower_minmax)) {8589OPT(opt_cmod_propagation);8590OPT(opt_cse);8591OPT(opt_copy_propagation);8592OPT(dead_code_eliminate);8593}85948595progress = false;8596OPT(lower_derivatives);8597OPT(lower_regioning);8598if (progress) {8599OPT(opt_copy_propagation);8600OPT(dead_code_eliminate);8601OPT(lower_simd_width);8602}86038604OPT(fixup_sends_duplicate_payload);86058606lower_uniform_pull_constant_loads();86078608validate();8609}86108611/**8612* From the Skylake PRM Vol. 2a docs for sends:8613*8614* "It is required that the second block of GRFs does not overlap with the8615* first block."8616*8617* There are plenty of cases where we may accidentally violate this due to8618* having, for instance, both sources be the constant 0. This little pass8619* just adds a new vgrf for the second payload and copies it over.8620*/8621bool8622fs_visitor::fixup_sends_duplicate_payload()8623{8624bool progress = false;86258626foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {8627if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&8628regions_overlap(inst->src[2], inst->mlen * REG_SIZE,8629inst->src[3], inst->ex_mlen * REG_SIZE)) {8630fs_reg tmp = fs_reg(VGRF, alloc.allocate(inst->ex_mlen),8631BRW_REGISTER_TYPE_UD);8632/* Sadly, we've lost all notion of channels and bit sizes at this8633* point. Just WE_all it.8634*/8635const fs_builder ibld = bld.at(block, inst).exec_all().group(16, 0);8636fs_reg copy_src = retype(inst->src[3], BRW_REGISTER_TYPE_UD);8637fs_reg copy_dst = tmp;8638for (unsigned i = 0; i < inst->ex_mlen; i += 2) {8639if (inst->ex_mlen == i + 1) {8640/* Only one register left; do SIMD8 */8641ibld.group(8, 0).MOV(copy_dst, copy_src);8642} else {8643ibld.MOV(copy_dst, copy_src);8644}8645copy_src = offset(copy_src, ibld, 1);8646copy_dst = offset(copy_dst, ibld, 1);8647}8648inst->src[3] = tmp;8649progress = true;8650}8651}86528653if (progress)8654invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);86558656return progress;8657}86588659/**8660* Three source instruction must have a GRF/MRF destination register.8661* ARF NULL is not allowed. 
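Dead code elimination can reintroduce null destinations on instructions it cannot remove outright, which is why allocate_registers() has to redo this fixup after its late dead-code pass.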
Fix that up by allocating a temporary GRF.8662*/8663void8664fs_visitor::fixup_3src_null_dest()8665{8666bool progress = false;86678668foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {8669if (inst->is_3src(devinfo) && inst->dst.is_null()) {8670inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),8671inst->dst.type);8672progress = true;8673}8674}86758676if (progress)8677invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |8678DEPENDENCY_VARIABLES);8679}86808681/**8682* Find the first instruction in the program that might start a region of8683* divergent control flow due to a HALT jump. There is no8684* find_halt_control_flow_region_end(), the region of divergence extends until8685* the only SHADER_OPCODE_HALT_TARGET in the program.8686*/8687static const fs_inst *8688find_halt_control_flow_region_start(const fs_visitor *v)8689{8690foreach_block_and_inst(block, fs_inst, inst, v->cfg) {8691if (inst->opcode == BRW_OPCODE_HALT ||8692inst->opcode == SHADER_OPCODE_HALT_TARGET)8693return inst;8694}86958696return NULL;8697}86988699/**8700* Work around the Gfx12 hardware bug filed as Wa_1407528679. EU fusion8701* can cause a BB to be executed with all channels disabled, which will lead8702* to the execution of any NoMask instructions in it, even though any8703* execution-masked instructions will be correctly shot down. This may break8704* assumptions of some NoMask SEND messages whose descriptor depends on data8705* generated by live invocations of the shader.8706*8707* This avoids the problem by predicating certain instructions on an ANY8708* horizontal predicate that makes sure that their execution is omitted when8709* all channels of the program are disabled.8710*/8711bool8712fs_visitor::fixup_nomask_control_flow()8713{8714if (devinfo->ver != 12)8715return false;87168717const brw_predicate pred = dispatch_width > 16 ? BRW_PREDICATE_ALIGN1_ANY32H :8718dispatch_width > 8 ? 
BRW_PREDICATE_ALIGN1_ANY16H :8719BRW_PREDICATE_ALIGN1_ANY8H;8720const fs_inst *halt_start = find_halt_control_flow_region_start(this);8721unsigned depth = 0;8722bool progress = false;87238724const fs_live_variables &live_vars = live_analysis.require();87258726/* Scan the program backwards in order to be able to easily determine8727* whether the flag register is live at any point.8728*/8729foreach_block_reverse_safe(block, cfg) {8730BITSET_WORD flag_liveout = live_vars.block_data[block->num]8731.flag_liveout[0];8732STATIC_ASSERT(ARRAY_SIZE(live_vars.block_data[0].flag_liveout) == 1);87338734foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {8735if (!inst->predicate && inst->exec_size >= 8)8736flag_liveout &= ~inst->flags_written(devinfo);87378738switch (inst->opcode) {8739case BRW_OPCODE_DO:8740case BRW_OPCODE_IF:8741/* Note that this doesn't handle BRW_OPCODE_HALT since only8742* the first one in the program closes the region of divergent8743* control flow due to any HALT instructions -- Instead this is8744* handled with the halt_start check below.8745*/8746depth--;8747break;87488749case BRW_OPCODE_WHILE:8750case BRW_OPCODE_ENDIF:8751case SHADER_OPCODE_HALT_TARGET:8752depth++;8753break;87548755default:8756/* Note that the vast majority of NoMask SEND instructions in the8757* program are harmless while executed in a block with all8758* channels disabled, since any instructions with side effects we8759* could hit here should be execution-masked.8760*8761* The main concern is NoMask SEND instructions where the message8762* descriptor or header depends on data generated by live8763* invocations of the shader (RESINFO and8764* FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD with a dynamically8765* computed surface index seem to be the only examples right now8766* where this could easily lead to GPU hangs). 
Unfortunately we8767* have no straightforward way to detect that currently, so just8768* predicate any NoMask SEND instructions we find under control8769* flow.8770*8771* If this proves to have a measurable performance impact it can8772* be easily extended with a whitelist of messages we know we can8773* safely omit the predication for.8774*/8775if (depth && inst->force_writemask_all &&8776is_send(inst) && !inst->predicate) {8777/* We need to load the execution mask into the flag register by8778* using a builder with channel group matching the whole shader8779* (rather than the default which is derived from the original8780* instruction), in order to avoid getting a right-shifted8781* value.8782*/8783const fs_builder ubld = fs_builder(this, block, inst)8784.exec_all().group(dispatch_width, 0);8785const fs_reg flag = retype(brw_flag_reg(0, 0),8786BRW_REGISTER_TYPE_UD);87878788/* Due to the lack of flag register allocation we need to save8789* and restore the flag register if it's live.8790*/8791const bool save_flag = flag_liveout &8792flag_mask(flag, dispatch_width / 8);8793const fs_reg tmp = ubld.group(1, 0).vgrf(flag.type);87948795if (save_flag)8796ubld.group(1, 0).MOV(tmp, flag);87978798ubld.emit(FS_OPCODE_LOAD_LIVE_CHANNELS);87998800set_predicate(pred, inst);8801inst->flag_subreg = 0;88028803if (save_flag)8804ubld.group(1, 0).at(block, inst->next).MOV(flag, tmp);88058806progress = true;8807}8808break;8809}88108811if (inst == halt_start)8812depth--;88138814flag_liveout |= inst->flags_read(devinfo);8815}8816}88178818if (progress)8819invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);88208821return progress;8822}88238824void8825fs_visitor::allocate_registers(bool allow_spilling)8826{8827bool allocated;88288829static const enum instruction_scheduler_mode pre_modes[] = {8830SCHEDULE_PRE,8831SCHEDULE_PRE_NON_LIFO,8832SCHEDULE_PRE_LIFO,8833};88348835static const char *scheduler_mode_name[] = {8836"top-down",8837"non-lifo",8838"lifo"8839};88408841bool spill_all = allow_spilling && (INTEL_DEBUG & DEBUG_SPILL_FS);88428843/* Try each scheduling heuristic to see if it can successfully register8844* allocate without spilling. They should be ordered by decreasing8845* performance but increasing likelihood of allocating.8846*/8847for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {8848schedule_instructions(pre_modes[i]);8849this->shader_stats.scheduler_mode = scheduler_mode_name[i];88508851if (0) {8852assign_regs_trivial();8853allocated = true;8854break;8855}88568857/* Scheduling may create additional opportunities for CMOD propagation,8858* so let's do it again. If CMOD propagation made any progress,8859* eliminate dead code one more time.8860*/8861bool progress = false;8862const int iteration = 99;8863int pass_num = 0;88648865if (OPT(opt_cmod_propagation)) {8866/* dead_code_eliminate "undoes" the fixing done by8867* fixup_3src_null_dest, so we have to do it again if8868* dead_code_eliminiate makes any progress.8869*/8870if (OPT(dead_code_eliminate))8871fixup_3src_null_dest();8872}88738874bool can_spill = allow_spilling &&8875(i == ARRAY_SIZE(pre_modes) - 1);88768877/* We should only spill registers on the last scheduling. */8878assert(!spilled_any_registers);88798880allocated = assign_regs(can_spill, spill_all);8881if (allocated)8882break;8883}88848885if (!allocated) {8886fail("Failure to register allocate. 
Reduce number of "8887"live scalar values to avoid this.");8888} else if (spilled_any_registers) {8889compiler->shader_perf_log(log_data,8890"%s shader triggered register spilling. "8891"Try reducing the number of live scalar "8892"values to improve performance.\n",8893stage_name);8894}88958896/* This must come after all optimization and register allocation, since8897* it inserts dead code that happens to have side effects, and it does8898* so based on the actual physical registers in use.8899*/8900insert_gfx4_send_dependency_workarounds();89018902if (failed)8903return;89048905opt_bank_conflicts();89068907schedule_instructions(SCHEDULE_POST);89088909if (last_scratch > 0) {8910ASSERTED unsigned max_scratch_size = 2 * 1024 * 1024;89118912prog_data->total_scratch = brw_get_scratch_size(last_scratch);89138914if (stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL) {8915if (devinfo->is_haswell) {8916/* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"8917* field documentation, Haswell supports a minimum of 2kB of8918* scratch space for compute shaders, unlike every other stage8919* and platform.8920*/8921prog_data->total_scratch = MAX2(prog_data->total_scratch, 2048);8922} else if (devinfo->ver <= 7) {8923/* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"8924* field documentation, platforms prior to Haswell measure scratch8925* size linearly with a range of [1kB, 12kB] and 1kB granularity.8926*/8927prog_data->total_scratch = ALIGN(last_scratch, 1024);8928max_scratch_size = 12 * 1024;8929}8930}89318932/* We currently only support up to 2MB of scratch space. If we8933* need to support more eventually, the documentation suggests8934* that we could allocate a larger buffer, and partition it out8935* ourselves. We'd just have to undo the hardware's address8936* calculation by subtracting (FFTID * Per Thread Scratch Space)8937* and then add FFTID * (Larger Per Thread Scratch Space).8938*8939* See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >8940* Thread Group Tracking > Local Memory/Scratch Space.8941*/8942assert(prog_data->total_scratch < max_scratch_size);8943}89448945lower_scoreboard();8946}89478948bool8949fs_visitor::run_vs()8950{8951assert(stage == MESA_SHADER_VERTEX);89528953setup_vs_payload();89548955if (shader_time_index >= 0)8956emit_shader_time_begin();89578958emit_nir_code();89598960if (failed)8961return false;89628963emit_urb_writes();89648965if (shader_time_index >= 0)8966emit_shader_time_end();89678968calculate_cfg();89698970optimize();89718972assign_curb_setup();8973assign_vs_urb_setup();89748975fixup_3src_null_dest();8976allocate_registers(true /* allow_spilling */);89778978return !failed;8979}89808981void8982fs_visitor::set_tcs_invocation_id()8983{8984struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);8985struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;89868987const unsigned instance_id_mask =8988devinfo->ver >= 11 ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17);8989const unsigned instance_id_shift =8990devinfo->ver >= 11 ? 
16 : 17;89918992/* Get instance number from g0.2 bits 22:16 or 23:17 */8993fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);8994bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),8995brw_imm_ud(instance_id_mask));89968997invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);89988999if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH) {9000/* gl_InvocationID is just the thread number */9001bld.SHR(invocation_id, t, brw_imm_ud(instance_id_shift));9002return;9003}90049005assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH);90069007fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW);9008fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD);9009bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210)));9010bld.MOV(channels_ud, channels_uw);90119012if (tcs_prog_data->instances == 1) {9013invocation_id = channels_ud;9014} else {9015fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD);9016bld.SHR(instance_times_8, t, brw_imm_ud(instance_id_shift - 3));9017bld.ADD(invocation_id, instance_times_8, channels_ud);9018}9019}90209021bool9022fs_visitor::run_tcs()9023{9024assert(stage == MESA_SHADER_TESS_CTRL);90259026struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);9027struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);9028struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;90299030assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH ||9031vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH);90329033if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH) {9034/* r1-r4 contain the ICP handles. */9035payload.num_regs = 5;9036} else {9037assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH);9038assert(tcs_key->input_vertices > 0);9039/* r1 contains output handles, r2 may contain primitive ID, then the9040* ICP handles occupy the next 1-32 registers.9041*/9042payload.num_regs = 2 + tcs_prog_data->include_primitive_id +9043tcs_key->input_vertices;9044}90459046if (shader_time_index >= 0)9047emit_shader_time_begin();90489049/* Initialize gl_InvocationID */9050set_tcs_invocation_id();90519052const bool fix_dispatch_mask =9053vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH &&9054(nir->info.tess.tcs_vertices_out % 8) != 0;90559056/* Fix the disptach mask */9057if (fix_dispatch_mask) {9058bld.CMP(bld.null_reg_ud(), invocation_id,9059brw_imm_ud(nir->info.tess.tcs_vertices_out), BRW_CONDITIONAL_L);9060bld.IF(BRW_PREDICATE_NORMAL);9061}90629063emit_nir_code();90649065if (fix_dispatch_mask) {9066bld.emit(BRW_OPCODE_ENDIF);9067}90689069/* Emit EOT write; set TR DS Cache bit */9070fs_reg srcs[3] = {9071fs_reg(get_tcs_output_urb_handle()),9072fs_reg(brw_imm_ud(WRITEMASK_X << 16)),9073fs_reg(brw_imm_ud(0)),9074};9075fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);9076bld.LOAD_PAYLOAD(payload, srcs, 3, 2);90779078fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8_MASKED,9079bld.null_reg_ud(), payload);9080inst->mlen = 3;9081inst->eot = true;90829083if (shader_time_index >= 0)9084emit_shader_time_end();90859086if (failed)9087return false;90889089calculate_cfg();90909091optimize();90929093assign_curb_setup();9094assign_tcs_urb_setup();90959096fixup_3src_null_dest();9097allocate_registers(true /* allow_spilling */);90989099return !failed;9100}91019102bool9103fs_visitor::run_tes()9104{9105assert(stage == MESA_SHADER_TESS_EVAL);91069107/* R0: thread header, R1-3: gl_TessCoord.xyz, R4: URB handles */9108payload.num_regs = 5;91099110if (shader_time_index >= 
0)9111emit_shader_time_begin();91129113emit_nir_code();91149115if (failed)9116return false;91179118emit_urb_writes();91199120if (shader_time_index >= 0)9121emit_shader_time_end();91229123calculate_cfg();91249125optimize();91269127assign_curb_setup();9128assign_tes_urb_setup();91299130fixup_3src_null_dest();9131allocate_registers(true /* allow_spilling */);91329133return !failed;9134}91359136bool9137fs_visitor::run_gs()9138{9139assert(stage == MESA_SHADER_GEOMETRY);91409141setup_gs_payload();91429143this->final_gs_vertex_count = vgrf(glsl_type::uint_type);91449145if (gs_compile->control_data_header_size_bits > 0) {9146/* Create a VGRF to store accumulated control data bits. */9147this->control_data_bits = vgrf(glsl_type::uint_type);91489149/* If we're outputting more than 32 control data bits, then EmitVertex()9150* will set control_data_bits to 0 after emitting the first vertex.9151* Otherwise, we need to initialize it to 0 here.9152*/9153if (gs_compile->control_data_header_size_bits <= 32) {9154const fs_builder abld = bld.annotate("initialize control data bits");9155abld.MOV(this->control_data_bits, brw_imm_ud(0u));9156}9157}91589159if (shader_time_index >= 0)9160emit_shader_time_begin();91619162emit_nir_code();91639164emit_gs_thread_end();91659166if (shader_time_index >= 0)9167emit_shader_time_end();91689169if (failed)9170return false;91719172calculate_cfg();91739174optimize();91759176assign_curb_setup();9177assign_gs_urb_setup();91789179fixup_3src_null_dest();9180allocate_registers(true /* allow_spilling */);91819182return !failed;9183}91849185/* From the SKL PRM, Volume 16, Workarounds:9186*9187* 0877 3D Pixel Shader Hang possible when pixel shader dispatched with9188* only header phases (R0-R2)9189*9190* WA: Enable a non-header phase (e.g. push constant) when dispatch would9191* have been header only.9192*9193* Instead of enabling push constants one can alternatively enable one of the9194* inputs. 
Here one simply chooses "layer" which shouldn't impose much9195* overhead.9196*/9197static void9198gfx9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data)9199{9200if (wm_prog_data->num_varying_inputs)9201return;92029203if (wm_prog_data->base.curb_read_length)9204return;92059206wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;9207wm_prog_data->num_varying_inputs = 1;92089209brw_compute_urb_setup_index(wm_prog_data);9210}92119212bool9213fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)9214{9215struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);9216brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;92179218assert(stage == MESA_SHADER_FRAGMENT);92199220if (devinfo->ver >= 6)9221setup_fs_payload_gfx6();9222else9223setup_fs_payload_gfx4();92249225if (0) {9226emit_dummy_fs();9227} else if (do_rep_send) {9228assert(dispatch_width == 16);9229emit_repclear_shader();9230} else {9231if (shader_time_index >= 0)9232emit_shader_time_begin();92339234if (nir->info.inputs_read > 0 ||9235BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) ||9236(nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {9237if (devinfo->ver < 6)9238emit_interpolation_setup_gfx4();9239else9240emit_interpolation_setup_gfx6();9241}92429243/* We handle discards by keeping track of the still-live pixels in f0.1.9244* Initialize it with the dispatched pixels.9245*/9246if (wm_prog_data->uses_kill) {9247const unsigned lower_width = MIN2(dispatch_width, 16);9248for (unsigned i = 0; i < dispatch_width / lower_width; i++) {9249const fs_reg dispatch_mask =9250devinfo->ver >= 6 ? brw_vec1_grf((i ? 2 : 1), 7) :9251brw_vec1_grf(0, 0);9252bld.exec_all().group(1, 0)9253.MOV(sample_mask_reg(bld.group(lower_width, i)),9254retype(dispatch_mask, BRW_REGISTER_TYPE_UW));9255}9256}92579258if (nir->info.writes_memory)9259wm_prog_data->has_side_effects = true;92609261emit_nir_code();92629263if (failed)9264return false;92659266if (wm_key->alpha_test_func)9267emit_alpha_test();92689269emit_fb_writes();92709271if (shader_time_index >= 0)9272emit_shader_time_end();92739274calculate_cfg();92759276optimize();92779278assign_curb_setup();92799280if (devinfo->ver >= 9)9281gfx9_ps_header_only_workaround(wm_prog_data);92829283assign_urb_setup();92849285fixup_3src_null_dest();92869287allocate_registers(allow_spilling);92889289if (failed)9290return false;9291}92929293return !failed;9294}92959296bool9297fs_visitor::run_cs(bool allow_spilling)9298{9299assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL);93009301setup_cs_payload();93029303if (shader_time_index >= 0)9304emit_shader_time_begin();93059306if (devinfo->is_haswell && prog_data->total_shared > 0) {9307/* Move SLM index from g0.0[27:24] to sr0.1[11:8] */9308const fs_builder abld = bld.exec_all().group(1, 0);9309abld.MOV(retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW),9310suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1));9311}93129313emit_nir_code();93149315if (failed)9316return false;93179318emit_cs_terminate();93199320if (shader_time_index >= 0)9321emit_shader_time_end();93229323calculate_cfg();93249325optimize();93269327assign_curb_setup();93289329fixup_3src_null_dest();9330allocate_registers(allow_spilling);93319332if (failed)9333return false;93349335return !failed;9336}93379338bool9339fs_visitor::run_bs(bool allow_spilling)9340{9341assert(stage >= MESA_SHADER_RAYGEN && stage <= MESA_SHADER_CALLABLE);93429343/* R0: thread header, R1: stack IDs, R2: argument addresses */9344payload.num_regs = 3;93459346if 
(shader_time_index >= 0)9347emit_shader_time_begin();93489349emit_nir_code();93509351if (failed)9352return false;93539354/* TODO(RT): Perhaps rename this? */9355emit_cs_terminate();93569357if (shader_time_index >= 0)9358emit_shader_time_end();93599360calculate_cfg();93619362optimize();93639364assign_curb_setup();93659366fixup_3src_null_dest();9367allocate_registers(allow_spilling);93689369if (failed)9370return false;93719372return !failed;9373}93749375static bool9376is_used_in_not_interp_frag_coord(nir_ssa_def *def)9377{9378nir_foreach_use(src, def) {9379if (src->parent_instr->type != nir_instr_type_intrinsic)9380return true;93819382nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(src->parent_instr);9383if (intrin->intrinsic != nir_intrinsic_load_frag_coord)9384return true;9385}93869387nir_foreach_if_use(src, def)9388return true;93899390return false;9391}93929393/**9394* Return a bitfield where bit n is set if barycentric interpolation mode n9395* (see enum brw_barycentric_mode) is needed by the fragment shader.9396*9397* We examine the load_barycentric intrinsics rather than looking at input9398* variables so that we catch interpolateAtCentroid() messages too, which9399* also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.9400*/9401static unsigned9402brw_compute_barycentric_interp_modes(const struct intel_device_info *devinfo,9403const nir_shader *shader)9404{9405unsigned barycentric_interp_modes = 0;94069407nir_foreach_function(f, shader) {9408if (!f->impl)9409continue;94109411nir_foreach_block(block, f->impl) {9412nir_foreach_instr(instr, block) {9413if (instr->type != nir_instr_type_intrinsic)9414continue;94159416nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);9417switch (intrin->intrinsic) {9418case nir_intrinsic_load_barycentric_pixel:9419case nir_intrinsic_load_barycentric_centroid:9420case nir_intrinsic_load_barycentric_sample:9421break;9422default:9423continue;9424}94259426/* Ignore WPOS; it doesn't require interpolation. 
*/9427assert(intrin->dest.is_ssa);9428if (!is_used_in_not_interp_frag_coord(&intrin->dest.ssa))9429continue;94309431enum glsl_interp_mode interp = (enum glsl_interp_mode)9432nir_intrinsic_interp_mode(intrin);9433nir_intrinsic_op bary_op = intrin->intrinsic;9434enum brw_barycentric_mode bary =9435brw_barycentric_mode(interp, bary_op);94369437barycentric_interp_modes |= 1 << bary;94389439if (devinfo->needs_unlit_centroid_workaround &&9440bary_op == nir_intrinsic_load_barycentric_centroid)9441barycentric_interp_modes |= 1 << centroid_to_pixel(bary);9442}9443}9444}94459446return barycentric_interp_modes;9447}94489449static void9450brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,9451const nir_shader *shader)9452{9453prog_data->flat_inputs = 0;94549455nir_foreach_shader_in_variable(var, shader) {9456unsigned slots = glsl_count_attribute_slots(var->type, false);9457for (unsigned s = 0; s < slots; s++) {9458int input_index = prog_data->urb_setup[var->data.location + s];94599460if (input_index < 0)9461continue;94629463/* flat shading */9464if (var->data.interpolation == INTERP_MODE_FLAT)9465prog_data->flat_inputs |= 1 << input_index;9466}9467}9468}94699470static uint8_t9471computed_depth_mode(const nir_shader *shader)9472{9473if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {9474switch (shader->info.fs.depth_layout) {9475case FRAG_DEPTH_LAYOUT_NONE:9476case FRAG_DEPTH_LAYOUT_ANY:9477return BRW_PSCDEPTH_ON;9478case FRAG_DEPTH_LAYOUT_GREATER:9479return BRW_PSCDEPTH_ON_GE;9480case FRAG_DEPTH_LAYOUT_LESS:9481return BRW_PSCDEPTH_ON_LE;9482case FRAG_DEPTH_LAYOUT_UNCHANGED:9483return BRW_PSCDEPTH_OFF;9484}9485}9486return BRW_PSCDEPTH_OFF;9487}94889489/**9490* Move load_interpolated_input with simple (payload-based) barycentric modes9491* to the top of the program so we don't emit multiple PLNs for the same input.9492*9493* This works around CSE not being able to handle non-dominating cases9494* such as:9495*9496* if (...) {9497* interpolate input9498* } else {9499* interpolate the same exact input9500* }9501*9502* This should be replaced by global value numbering someday.9503*/9504bool9505brw_nir_move_interpolation_to_top(nir_shader *nir)9506{9507bool progress = false;95089509nir_foreach_function(f, nir) {9510if (!f->impl)9511continue;95129513nir_block *top = nir_start_block(f->impl);9514exec_node *cursor_node = NULL;95159516nir_foreach_block(block, f->impl) {9517if (block == top)9518continue;95199520nir_foreach_instr_safe(instr, block) {9521if (instr->type != nir_instr_type_intrinsic)9522continue;95239524nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);9525if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)9526continue;9527nir_intrinsic_instr *bary_intrinsic =9528nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);9529nir_intrinsic_op op = bary_intrinsic->intrinsic;95309531/* Leave interpolateAtSample/Offset() where they are. 
*/9532if (op == nir_intrinsic_load_barycentric_at_sample ||9533op == nir_intrinsic_load_barycentric_at_offset)9534continue;95359536nir_instr *move[3] = {9537&bary_intrinsic->instr,9538intrin->src[1].ssa->parent_instr,9539instr9540};95419542for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {9543if (move[i]->block != top) {9544move[i]->block = top;9545exec_node_remove(&move[i]->node);9546if (cursor_node) {9547exec_node_insert_after(cursor_node, &move[i]->node);9548} else {9549exec_list_push_head(&top->instr_list, &move[i]->node);9550}9551cursor_node = &move[i]->node;9552progress = true;9553}9554}9555}9556}9557nir_metadata_preserve(f->impl, nir_metadata_block_index |9558nir_metadata_dominance);9559}95609561return progress;9562}95639564/**9565* Demote per-sample barycentric intrinsics to centroid.9566*9567* Useful when rendering to a non-multisampled buffer.9568*/9569bool9570brw_nir_demote_sample_qualifiers(nir_shader *nir)9571{9572bool progress = true;95739574nir_foreach_function(f, nir) {9575if (!f->impl)9576continue;95779578nir_builder b;9579nir_builder_init(&b, f->impl);95809581nir_foreach_block(block, f->impl) {9582nir_foreach_instr_safe(instr, block) {9583if (instr->type != nir_instr_type_intrinsic)9584continue;95859586nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);9587if (intrin->intrinsic != nir_intrinsic_load_barycentric_sample &&9588intrin->intrinsic != nir_intrinsic_load_barycentric_at_sample)9589continue;95909591b.cursor = nir_before_instr(instr);9592nir_ssa_def *centroid =9593nir_load_barycentric(&b, nir_intrinsic_load_barycentric_centroid,9594nir_intrinsic_interp_mode(intrin));9595nir_ssa_def_rewrite_uses(&intrin->dest.ssa,9596centroid);9597nir_instr_remove(instr);9598progress = true;9599}9600}96019602nir_metadata_preserve(f->impl, nir_metadata_block_index |9603nir_metadata_dominance);9604}96059606return progress;9607}96089609void9610brw_nir_populate_wm_prog_data(const nir_shader *shader,9611const struct intel_device_info *devinfo,9612const struct brw_wm_prog_key *key,9613struct brw_wm_prog_data *prog_data)9614{9615/* key->alpha_test_func means simulating alpha testing via discards,9616* so the shader definitely kills pixels.9617*/9618prog_data->uses_kill = shader->info.fs.uses_discard ||9619key->alpha_test_func;9620prog_data->uses_omask = !key->ignore_sample_mask_out &&9621(shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));9622prog_data->computed_depth_mode = computed_depth_mode(shader);9623prog_data->computed_stencil =9624shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);96259626prog_data->persample_dispatch =9627key->multisample_fbo &&9628(key->persample_interp ||9629BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_ID) ||9630BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS) ||9631shader->info.fs.uses_sample_qualifier ||9632shader->info.outputs_read);96339634if (devinfo->ver >= 6) {9635prog_data->uses_sample_mask =9636BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN);96379638/* From the Ivy Bridge PRM documentation for 3DSTATE_PS:9639*9640* "MSDISPMODE_PERSAMPLE is required in order to select9641* POSOFFSET_SAMPLE"9642*9643* So we can only really get sample positions if we are doing real9644* per-sample dispatch. 
If we need gl_SamplePosition and we don't have9645* persample dispatch, we hard-code it to 0.5.9646*/9647prog_data->uses_pos_offset = prog_data->persample_dispatch &&9648BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS);9649}96509651prog_data->has_render_target_reads = shader->info.outputs_read != 0ull;96529653prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;9654prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;9655prog_data->inner_coverage = shader->info.fs.inner_coverage;96569657prog_data->barycentric_interp_modes =9658brw_compute_barycentric_interp_modes(devinfo, shader);96599660prog_data->per_coarse_pixel_dispatch =9661key->coarse_pixel &&9662!prog_data->persample_dispatch &&9663!prog_data->uses_sample_mask &&9664(prog_data->computed_depth_mode == BRW_PSCDEPTH_OFF) &&9665!prog_data->computed_stencil;96669667prog_data->uses_src_w =9668BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);9669prog_data->uses_src_depth =9670BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&9671!prog_data->per_coarse_pixel_dispatch;9672prog_data->uses_depth_w_coefficients =9673BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&9674prog_data->per_coarse_pixel_dispatch;96759676calculate_urb_setup(devinfo, key, prog_data, shader);9677brw_compute_flat_inputs(prog_data, shader);9678}96799680/**9681* Pre-gfx6, the register file of the EUs was shared between threads,9682* and each thread used some subset allocated on a 16-register block9683* granularity. The unit states wanted these block counts.9684*/9685static inline int9686brw_register_blocks(int reg_count)9687{9688return ALIGN(reg_count, 16) / 16 - 1;9689}96909691const unsigned *9692brw_compile_fs(const struct brw_compiler *compiler,9693void *mem_ctx,9694struct brw_compile_fs_params *params)9695{9696struct nir_shader *nir = params->nir;9697const struct brw_wm_prog_key *key = params->key;9698struct brw_wm_prog_data *prog_data = params->prog_data;9699bool allow_spilling = params->allow_spilling;9700const bool debug_enabled =9701INTEL_DEBUG & (params->debug_flag ? params->debug_flag : DEBUG_WM);97029703prog_data->base.stage = MESA_SHADER_FRAGMENT;97049705const struct intel_device_info *devinfo = compiler->devinfo;9706const unsigned max_subgroup_size = compiler->devinfo->ver >= 6 ? 
32 : 16;97079708brw_nir_apply_key(nir, compiler, &key->base, max_subgroup_size, true);9709brw_nir_lower_fs_inputs(nir, devinfo, key);9710brw_nir_lower_fs_outputs(nir);97119712if (devinfo->ver < 6)9713brw_setup_vue_interpolation(params->vue_map, nir, prog_data);97149715/* From the SKL PRM, Volume 7, "Alpha Coverage":9716* "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in9717* hardware, regardless of the state setting for this feature."9718*/9719if (devinfo->ver > 6 && key->alpha_to_coverage) {9720/* Run constant fold optimization in order to get the correct source9721* offset to determine render target 0 store instruction in9722* emit_alpha_to_coverage pass.9723*/9724NIR_PASS_V(nir, nir_opt_constant_folding);9725NIR_PASS_V(nir, brw_nir_lower_alpha_to_coverage);9726}97279728if (!key->multisample_fbo)9729NIR_PASS_V(nir, brw_nir_demote_sample_qualifiers);9730NIR_PASS_V(nir, brw_nir_move_interpolation_to_top);9731brw_postprocess_nir(nir, compiler, true, debug_enabled,9732key->base.robust_buffer_access);97339734brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data);97359736fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;9737cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL;9738float throughput = 0;9739bool has_spilled = false;97409741v8 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,9742&prog_data->base, nir, 8,9743params->shader_time ? params->shader_time_index8 : -1,9744debug_enabled);9745if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) {9746params->error_str = ralloc_strdup(mem_ctx, v8->fail_msg);9747delete v8;9748return NULL;9749} else if (!(INTEL_DEBUG & DEBUG_NO8)) {9750simd8_cfg = v8->cfg;9751prog_data->base.dispatch_grf_start_reg = v8->payload.num_regs;9752prog_data->reg_blocks_8 = brw_register_blocks(v8->grf_used);9753const performance &perf = v8->performance_analysis.require();9754throughput = MAX2(throughput, perf.throughput);9755has_spilled = v8->spilled_any_registers;9756allow_spilling = false;9757}97589759/* Limit dispatch width to simd8 with dual source blending on gfx8.9760* See: https://gitlab.freedesktop.org/mesa/mesa/-/issues/19179761*/9762if (devinfo->ver == 8 && prog_data->dual_src_blend &&9763!(INTEL_DEBUG & DEBUG_NO8)) {9764assert(!params->use_rep_send);9765v8->limit_dispatch_width(8, "gfx8 workaround: "9766"using SIMD8 when dual src blending.\n");9767}97689769if (key->coarse_pixel) {9770if (prog_data->dual_src_blend) {9771v8->limit_dispatch_width(8, "SIMD16 coarse pixel shading cannot"9772" use SIMD8 messages.\n");9773}9774v8->limit_dispatch_width(16, "SIMD32 not supported with coarse"9775" pixel shading.\n");9776}97779778if (!has_spilled &&9779v8->max_dispatch_width >= 16 &&9780(!(INTEL_DEBUG & DEBUG_NO16) || params->use_rep_send)) {9781/* Try a SIMD16 compile */9782v16 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,9783&prog_data->base, nir, 16,9784params->shader_time ? 
params->shader_time_index16 : -1,9785debug_enabled);9786v16->import_uniforms(v8);9787if (!v16->run_fs(allow_spilling, params->use_rep_send)) {9788compiler->shader_perf_log(params->log_data,9789"SIMD16 shader failed to compile: %s",9790v16->fail_msg);9791} else {9792simd16_cfg = v16->cfg;9793prog_data->dispatch_grf_start_reg_16 = v16->payload.num_regs;9794prog_data->reg_blocks_16 = brw_register_blocks(v16->grf_used);9795const performance &perf = v16->performance_analysis.require();9796throughput = MAX2(throughput, perf.throughput);9797has_spilled = v16->spilled_any_registers;9798allow_spilling = false;9799}9800}98019802const bool simd16_failed = v16 && !simd16_cfg;98039804/* Currently, the compiler only supports SIMD32 on SNB+ */9805if (!has_spilled &&9806v8->max_dispatch_width >= 32 && !params->use_rep_send &&9807devinfo->ver >= 6 && !simd16_failed &&9808!(INTEL_DEBUG & DEBUG_NO32)) {9809/* Try a SIMD32 compile */9810v32 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,9811&prog_data->base, nir, 32,9812params->shader_time ? params->shader_time_index32 : -1,9813debug_enabled);9814v32->import_uniforms(v8);9815if (!v32->run_fs(allow_spilling, false)) {9816compiler->shader_perf_log(params->log_data,9817"SIMD32 shader failed to compile: %s",9818v32->fail_msg);9819} else {9820const performance &perf = v32->performance_analysis.require();98219822if (!(INTEL_DEBUG & DEBUG_DO32) && throughput >= perf.throughput) {9823compiler->shader_perf_log(params->log_data, "SIMD32 shader inefficient\n");9824} else {9825simd32_cfg = v32->cfg;9826prog_data->dispatch_grf_start_reg_32 = v32->payload.num_regs;9827prog_data->reg_blocks_32 = brw_register_blocks(v32->grf_used);9828throughput = MAX2(throughput, perf.throughput);9829}9830}9831}98329833/* When the caller requests a repclear shader, they want SIMD16-only */9834if (params->use_rep_send)9835simd8_cfg = NULL;98369837/* Prior to Iron Lake, the PS had a single shader offset with a jump table9838* at the top to select the shader. We've never implemented that.9839* Instead, we just give them exactly one shader and we pick the widest one9840* available.9841*/9842if (compiler->devinfo->ver < 5) {9843if (simd32_cfg || simd16_cfg)9844simd8_cfg = NULL;9845if (simd32_cfg)9846simd16_cfg = NULL;9847}98489849/* If computed depth is enabled SNB only allows SIMD8. */9850if (compiler->devinfo->ver == 6 &&9851prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF)9852assert(simd16_cfg == NULL && simd32_cfg == NULL);98539854if (compiler->devinfo->ver <= 5 && !simd8_cfg) {9855/* Iron lake and earlier only have one Dispatch GRF start field. Make9856* the data available in the base prog data struct for convenience.9857*/9858if (simd16_cfg) {9859prog_data->base.dispatch_grf_start_reg =9860prog_data->dispatch_grf_start_reg_16;9861} else if (simd32_cfg) {9862prog_data->base.dispatch_grf_start_reg =9863prog_data->dispatch_grf_start_reg_32;9864}9865}98669867if (prog_data->persample_dispatch) {9868/* Starting with SandyBridge (where we first get MSAA), the different9869* pixel dispatch combinations are grouped into classifications A9870* through F (SNB PRM Vol. 2 Part 1 Section 7.7.1). 
On most hardware9871* generations, the only configurations supporting persample dispatch9872* are those in which only one dispatch width is enabled.9873*9874* The Gfx12 hardware spec has a similar dispatch grouping table, but9875* the following conflicting restriction applies (from the page on9876* "Structure_3DSTATE_PS_BODY"), so we need to keep the SIMD16 shader:9877*9878* "SIMD32 may only be enabled if SIMD16 or (dual)SIMD8 is also9879* enabled."9880*/9881if (simd32_cfg || simd16_cfg)9882simd8_cfg = NULL;9883if (simd32_cfg && devinfo->ver < 12)9884simd16_cfg = NULL;9885}98869887fs_generator g(compiler, params->log_data, mem_ctx, &prog_data->base,9888v8->runtime_check_aads_emit, MESA_SHADER_FRAGMENT);98899890if (unlikely(debug_enabled)) {9891g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s",9892nir->info.label ?9893nir->info.label : "unnamed",9894nir->info.name));9895}98969897struct brw_compile_stats *stats = params->stats;98989899if (simd8_cfg) {9900prog_data->dispatch_8 = true;9901g.generate_code(simd8_cfg, 8, v8->shader_stats,9902v8->performance_analysis.require(), stats);9903stats = stats ? stats + 1 : NULL;9904}99059906if (simd16_cfg) {9907prog_data->dispatch_16 = true;9908prog_data->prog_offset_16 = g.generate_code(9909simd16_cfg, 16, v16->shader_stats,9910v16->performance_analysis.require(), stats);9911stats = stats ? stats + 1 : NULL;9912}99139914if (simd32_cfg) {9915prog_data->dispatch_32 = true;9916prog_data->prog_offset_32 = g.generate_code(9917simd32_cfg, 32, v32->shader_stats,9918v32->performance_analysis.require(), stats);9919stats = stats ? stats + 1 : NULL;9920}99219922g.add_const_data(nir->constant_data, nir->constant_data_size);99239924delete v8;9925delete v16;9926delete v32;99279928return g.get_assembly();9929}99309931fs_reg *9932fs_visitor::emit_cs_work_group_id_setup()9933{9934assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL);99359936fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));99379938struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));9939struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD));9940struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD));99419942bld.MOV(*reg, r0_1);9943bld.MOV(offset(*reg, bld, 1), r0_6);9944bld.MOV(offset(*reg, bld, 2), r0_7);99459946return reg;9947}99489949unsigned9950brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,9951unsigned threads)9952{9953assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);9954assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);9955return cs_prog_data->push.per_thread.size * threads +9956cs_prog_data->push.cross_thread.size;9957}99589959static void9960fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords)9961{9962block->dwords = dwords;9963block->regs = DIV_ROUND_UP(dwords, 8);9964block->size = block->regs * 32;9965}99669967static void9968cs_fill_push_const_info(const struct intel_device_info *devinfo,9969struct brw_cs_prog_data *cs_prog_data)9970{9971const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;9972int subgroup_id_index = get_subgroup_id_param_index(devinfo, prog_data);9973bool cross_thread_supported = devinfo->verx10 >= 75;99749975/* The thread ID should be stored in the last param dword */9976assert(subgroup_id_index == -1 ||9977subgroup_id_index == (int)prog_data->nr_params - 1);99789979unsigned cross_thread_dwords, per_thread_dwords;9980if (!cross_thread_supported) {9981cross_thread_dwords = 0u;9982per_thread_dwords = 
prog_data->nr_params;9983} else if (subgroup_id_index >= 0) {9984/* Fill all but the last register with cross-thread payload */9985cross_thread_dwords = 8 * (subgroup_id_index / 8);9986per_thread_dwords = prog_data->nr_params - cross_thread_dwords;9987assert(per_thread_dwords > 0 && per_thread_dwords <= 8);9988} else {9989/* Fill all data using cross-thread payload */9990cross_thread_dwords = prog_data->nr_params;9991per_thread_dwords = 0u;9992}99939994fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords);9995fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords);99969997assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 ||9998cs_prog_data->push.per_thread.size == 0);9999assert(cs_prog_data->push.cross_thread.dwords +10000cs_prog_data->push.per_thread.dwords ==10001prog_data->nr_params);10002}1000310004static bool10005filter_simd(const nir_instr *instr, const void * /* options */)10006{10007if (instr->type != nir_instr_type_intrinsic)10008return false;1000910010switch (nir_instr_as_intrinsic(instr)->intrinsic) {10011case nir_intrinsic_load_simd_width_intel:10012case nir_intrinsic_load_subgroup_id:10013return true;1001410015default:10016return false;10017}10018}1001910020static nir_ssa_def *10021lower_simd(nir_builder *b, nir_instr *instr, void *options)10022{10023uintptr_t simd_width = (uintptr_t)options;1002410025switch (nir_instr_as_intrinsic(instr)->intrinsic) {10026case nir_intrinsic_load_simd_width_intel:10027return nir_imm_int(b, simd_width);1002810029case nir_intrinsic_load_subgroup_id:10030/* If the whole workgroup fits in one thread, we can lower subgroup_id10031* to a constant zero.10032*/10033if (!b->shader->info.workgroup_size_variable) {10034unsigned local_workgroup_size = b->shader->info.workgroup_size[0] *10035b->shader->info.workgroup_size[1] *10036b->shader->info.workgroup_size[2];10037if (local_workgroup_size <= simd_width)10038return nir_imm_int(b, 0);10039}10040return NULL;1004110042default:10043return NULL;10044}10045}1004610047static void10048brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width)10049{10050nir_shader_lower_instructions(nir, filter_simd, lower_simd,10051(void *)(uintptr_t)dispatch_width);10052}1005310054static nir_shader *10055compile_cs_to_nir(const struct brw_compiler *compiler,10056void *mem_ctx,10057const struct brw_cs_prog_key *key,10058const nir_shader *src_shader,10059unsigned dispatch_width,10060bool debug_enabled)10061{10062nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);10063brw_nir_apply_key(shader, compiler, &key->base, dispatch_width, true);1006410065NIR_PASS_V(shader, brw_nir_lower_simd, dispatch_width);1006610067/* Clean up after the local index and ID calculations. */10068NIR_PASS_V(shader, nir_opt_constant_folding);10069NIR_PASS_V(shader, nir_opt_dce);1007010071brw_postprocess_nir(shader, compiler, true, debug_enabled,10072key->base.robust_buffer_access);1007310074return shader;10075}1007610077const unsigned *10078brw_compile_cs(const struct brw_compiler *compiler,10079void *mem_ctx,10080struct brw_compile_cs_params *params)10081{10082const nir_shader *nir = params->nir;10083const struct brw_cs_prog_key *key = params->key;10084struct brw_cs_prog_data *prog_data = params->prog_data;10085int shader_time_index = params->shader_time ? 
params->shader_time_index : -1;1008610087const bool debug_enabled = INTEL_DEBUG & DEBUG_CS;1008810089prog_data->base.stage = MESA_SHADER_COMPUTE;10090prog_data->base.total_shared = nir->info.shared_size;1009110092/* Generate code for all the possible SIMD variants. */10093bool generate_all;1009410095unsigned min_dispatch_width;10096unsigned max_dispatch_width;1009710098if (nir->info.workgroup_size_variable) {10099generate_all = true;10100min_dispatch_width = 8;10101max_dispatch_width = 32;10102} else {10103generate_all = false;10104prog_data->local_size[0] = nir->info.workgroup_size[0];10105prog_data->local_size[1] = nir->info.workgroup_size[1];10106prog_data->local_size[2] = nir->info.workgroup_size[2];10107unsigned local_workgroup_size = prog_data->local_size[0] *10108prog_data->local_size[1] *10109prog_data->local_size[2];1011010111/* Limit max_threads to 64 for the GPGPU_WALKER command */10112const uint32_t max_threads = MIN2(64, compiler->devinfo->max_cs_threads);10113min_dispatch_width = util_next_power_of_two(10114MAX2(8, DIV_ROUND_UP(local_workgroup_size, max_threads)));10115assert(min_dispatch_width <= 32);10116max_dispatch_width = 32;10117}1011810119if ((int)key->base.subgroup_size_type >= (int)BRW_SUBGROUP_SIZE_REQUIRE_8) {10120/* These enum values are expressly chosen to be equal to the subgroup10121* size that they require.10122*/10123const unsigned required_dispatch_width =10124(unsigned)key->base.subgroup_size_type;10125assert(required_dispatch_width == 8 ||10126required_dispatch_width == 16 ||10127required_dispatch_width == 32);10128if (required_dispatch_width < min_dispatch_width ||10129required_dispatch_width > max_dispatch_width) {10130params->error_str = ralloc_strdup(mem_ctx,10131"Cannot satisfy explicit subgroup size");10132return NULL;10133}10134min_dispatch_width = max_dispatch_width = required_dispatch_width;10135}1013610137assert(min_dispatch_width <= max_dispatch_width);1013810139fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;10140fs_visitor *v = NULL;1014110142if (!(INTEL_DEBUG & DEBUG_NO8) &&10143min_dispatch_width <= 8 && max_dispatch_width >= 8) {10144nir_shader *nir8 = compile_cs_to_nir(compiler, mem_ctx, key,10145nir, 8, debug_enabled);10146v8 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,10147&prog_data->base,10148nir8, 8, shader_time_index, debug_enabled);10149if (!v8->run_cs(true /* allow_spilling */)) {10150params->error_str = ralloc_strdup(mem_ctx, v8->fail_msg);10151delete v8;10152return NULL;10153}1015410155/* We should always be able to do SIMD32 for compute shaders */10156assert(v8->max_dispatch_width >= 32);1015710158v = v8;10159prog_data->prog_mask |= 1 << 0;10160if (v8->spilled_any_registers)10161prog_data->prog_spilled |= 1 << 0;10162cs_fill_push_const_info(compiler->devinfo, prog_data);10163}1016410165if (!(INTEL_DEBUG & DEBUG_NO16) &&10166(generate_all || !prog_data->prog_spilled) &&10167min_dispatch_width <= 16 && max_dispatch_width >= 16) {10168/* Try a SIMD16 compile */10169nir_shader *nir16 = compile_cs_to_nir(compiler, mem_ctx, key,10170nir, 16, debug_enabled);10171v16 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,10172&prog_data->base,10173nir16, 16, shader_time_index, debug_enabled);10174if (v8)10175v16->import_uniforms(v8);1017610177const bool allow_spilling = generate_all || v == NULL;10178if (!v16->run_cs(allow_spilling)) {10179compiler->shader_perf_log(params->log_data,10180"SIMD16 shader failed to compile: %s",10181v16->fail_msg);10182if (!v) {10183assert(v8 == NULL);10184params->error_str = 
ralloc_asprintf(10185mem_ctx, "Not enough threads for SIMD8 and "10186"couldn't generate SIMD16: %s", v16->fail_msg);10187delete v16;10188return NULL;10189}10190} else {10191/* We should always be able to do SIMD32 for compute shaders */10192assert(v16->max_dispatch_width >= 32);1019310194v = v16;10195prog_data->prog_mask |= 1 << 1;10196if (v16->spilled_any_registers)10197prog_data->prog_spilled |= 1 << 1;10198cs_fill_push_const_info(compiler->devinfo, prog_data);10199}10200}1020110202/* The SIMD32 is only enabled for cases it is needed unless forced.10203*10204* TODO: Use performance_analysis and drop this boolean.10205*/10206const bool needs_32 = v == NULL ||10207(INTEL_DEBUG & DEBUG_DO32) ||10208generate_all;1020910210if (!(INTEL_DEBUG & DEBUG_NO32) &&10211(generate_all || !prog_data->prog_spilled) &&10212needs_32 &&10213min_dispatch_width <= 32 && max_dispatch_width >= 32) {10214/* Try a SIMD32 compile */10215nir_shader *nir32 = compile_cs_to_nir(compiler, mem_ctx, key,10216nir, 32, debug_enabled);10217v32 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,10218&prog_data->base,10219nir32, 32, shader_time_index, debug_enabled);10220if (v8)10221v32->import_uniforms(v8);10222else if (v16)10223v32->import_uniforms(v16);1022410225const bool allow_spilling = generate_all || v == NULL;10226if (!v32->run_cs(allow_spilling)) {10227compiler->shader_perf_log(params->log_data,10228"SIMD32 shader failed to compile: %s",10229v32->fail_msg);10230if (!v) {10231assert(v8 == NULL);10232assert(v16 == NULL);10233params->error_str = ralloc_asprintf(10234mem_ctx, "Not enough threads for SIMD16 and "10235"couldn't generate SIMD32: %s", v32->fail_msg);10236delete v32;10237return NULL;10238}10239} else {10240v = v32;10241prog_data->prog_mask |= 1 << 2;10242if (v32->spilled_any_registers)10243prog_data->prog_spilled |= 1 << 2;10244cs_fill_push_const_info(compiler->devinfo, prog_data);10245}10246}1024710248if (unlikely(!v) && (INTEL_DEBUG & (DEBUG_NO8 | DEBUG_NO16 | DEBUG_NO32))) {10249params->error_str =10250ralloc_strdup(mem_ctx,10251"Cannot satisfy INTEL_DEBUG flags SIMD restrictions");10252return NULL;10253}1025410255assert(v);1025610257const unsigned *ret = NULL;1025810259fs_generator g(compiler, params->log_data, mem_ctx, &prog_data->base,10260v->runtime_check_aads_emit, MESA_SHADER_COMPUTE);10261if (unlikely(debug_enabled)) {10262char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",10263nir->info.label ?10264nir->info.label : "unnamed",10265nir->info.name);10266g.enable_debug(name);10267}1026810269struct brw_compile_stats *stats = params->stats;10270if (generate_all) {10271if (prog_data->prog_mask & (1 << 0)) {10272assert(v8);10273prog_data->prog_offset[0] =10274g.generate_code(v8->cfg, 8, v8->shader_stats,10275v8->performance_analysis.require(), stats);10276stats = stats ? stats + 1 : NULL;10277}1027810279if (prog_data->prog_mask & (1 << 1)) {10280assert(v16);10281prog_data->prog_offset[1] =10282g.generate_code(v16->cfg, 16, v16->shader_stats,10283v16->performance_analysis.require(), stats);10284stats = stats ? stats + 1 : NULL;10285}1028610287if (prog_data->prog_mask & (1 << 2)) {10288assert(v32);10289prog_data->prog_offset[2] =10290g.generate_code(v32->cfg, 32, v32->shader_stats,10291v32->performance_analysis.require(), stats);10292stats = stats ? 
stats + 1 : NULL;10293}10294} else {10295/* Only one dispatch width will be valid, and will be at offset 0,10296* which is already the default value of prog_offset_* fields.10297*/10298prog_data->prog_mask = 1 << (v->dispatch_width / 16);10299g.generate_code(v->cfg, v->dispatch_width, v->shader_stats,10300v->performance_analysis.require(), stats);10301}1030210303g.add_const_data(nir->constant_data, nir->constant_data_size);1030410305ret = g.get_assembly();1030610307delete v8;10308delete v16;10309delete v32;1031010311return ret;10312}1031310314static unsigned10315brw_cs_simd_size_for_group_size(const struct intel_device_info *devinfo,10316const struct brw_cs_prog_data *cs_prog_data,10317unsigned group_size)10318{10319const unsigned mask = cs_prog_data->prog_mask;10320assert(mask != 0);1032110322static const unsigned simd8 = 1 << 0;10323static const unsigned simd16 = 1 << 1;10324static const unsigned simd32 = 1 << 2;1032510326if ((INTEL_DEBUG & DEBUG_DO32) && (mask & simd32))10327return 32;1032810329/* Limit max_threads to 64 for the GPGPU_WALKER command */10330const uint32_t max_threads = MIN2(64, devinfo->max_cs_threads);1033110332if ((mask & simd8) && group_size <= 8 * max_threads) {10333/* Prefer SIMD16 if can do without spilling. Matches logic in10334* brw_compile_cs.10335*/10336if ((mask & simd16) && (~cs_prog_data->prog_spilled & simd16))10337return 16;10338return 8;10339}1034010341if ((mask & simd16) && group_size <= 16 * max_threads)10342return 16;1034310344assert(mask & simd32);10345assert(group_size <= 32 * max_threads);10346return 32;10347}1034810349struct brw_cs_dispatch_info10350brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,10351const struct brw_cs_prog_data *prog_data,10352const unsigned *override_local_size)10353{10354struct brw_cs_dispatch_info info = {};1035510356const unsigned *sizes =10357override_local_size ? 
override_local_size :10358prog_data->local_size;1035910360info.group_size = sizes[0] * sizes[1] * sizes[2];10361info.simd_size =10362brw_cs_simd_size_for_group_size(devinfo, prog_data, info.group_size);10363info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);1036410365const uint32_t remainder = info.group_size & (info.simd_size - 1);10366if (remainder > 0)10367info.right_mask = ~0u >> (32 - remainder);10368else10369info.right_mask = ~0u >> (32 - info.simd_size);1037010371return info;10372}1037310374static uint8_t10375compile_single_bs(const struct brw_compiler *compiler, void *log_data,10376void *mem_ctx,10377const struct brw_bs_prog_key *key,10378struct brw_bs_prog_data *prog_data,10379nir_shader *shader,10380fs_generator *g,10381struct brw_compile_stats *stats,10382int *prog_offset,10383char **error_str)10384{10385const bool debug_enabled = INTEL_DEBUG & DEBUG_RT;1038610387prog_data->base.stage = shader->info.stage;10388prog_data->max_stack_size = MAX2(prog_data->max_stack_size,10389shader->scratch_size);1039010391const unsigned max_dispatch_width = 16;10392brw_nir_apply_key(shader, compiler, &key->base, max_dispatch_width, true);10393brw_postprocess_nir(shader, compiler, true, debug_enabled,10394key->base.robust_buffer_access);1039510396fs_visitor *v = NULL, *v8 = NULL, *v16 = NULL;10397bool has_spilled = false;1039810399uint8_t simd_size = 0;10400if (likely(!(INTEL_DEBUG & DEBUG_NO8))) {10401v8 = new fs_visitor(compiler, log_data, mem_ctx, &key->base,10402&prog_data->base, shader,104038, -1 /* shader time */, debug_enabled);10404const bool allow_spilling = true;10405if (!v8->run_bs(allow_spilling)) {10406if (error_str)10407*error_str = ralloc_strdup(mem_ctx, v8->fail_msg);10408delete v8;10409return 0;10410} else {10411v = v8;10412simd_size = 8;10413if (v8->spilled_any_registers)10414has_spilled = true;10415}10416}1041710418if (!has_spilled && likely(!(INTEL_DEBUG & DEBUG_NO16))) {10419v16 = new fs_visitor(compiler, log_data, mem_ctx, &key->base,10420&prog_data->base, shader,1042116, -1 /* shader time */, debug_enabled);10422const bool allow_spilling = (v == NULL);10423if (!v16->run_bs(allow_spilling)) {10424compiler->shader_perf_log(log_data,10425"SIMD16 shader failed to compile: %s",10426v16->fail_msg);10427if (v == NULL) {10428assert(v8 == NULL);10429if (error_str) {10430*error_str = ralloc_asprintf(10431mem_ctx, "SIMD8 disabled and couldn't generate SIMD16: %s",10432v16->fail_msg);10433}10434delete v16;10435return 0;10436}10437} else {10438v = v16;10439simd_size = 16;10440if (v16->spilled_any_registers)10441has_spilled = true;10442}10443}1044410445if (unlikely(v == NULL)) {10446assert(INTEL_DEBUG & (DEBUG_NO8 | DEBUG_NO16));10447if (error_str) {10448*error_str = ralloc_strdup(mem_ctx,10449"Cannot satisfy INTEL_DEBUG flags SIMD restrictions");10450}10451return false;10452}1045310454assert(v);1045510456int offset = g->generate_code(v->cfg, simd_size, v->shader_stats,10457v->performance_analysis.require(), stats);10458if (prog_offset)10459*prog_offset = offset;10460else10461assert(offset == 0);1046210463delete v8;10464delete v16;1046510466return simd_size;10467}1046810469uint64_t10470brw_bsr(const struct intel_device_info *devinfo,10471uint32_t offset, uint8_t simd_size, uint8_t local_arg_offset)10472{10473assert(offset % 64 == 0);10474assert(simd_size == 8 || simd_size == 16);10475assert(local_arg_offset % 8 == 0);1047610477return offset |10478SET_BITS(simd_size > 8, 4, 4) |10479SET_BITS(local_arg_offset / 8, 2, 0);10480}1048110482const unsigned *10483brw_compile_bs(const 
struct brw_compiler *compiler, void *log_data,10484void *mem_ctx,10485const struct brw_bs_prog_key *key,10486struct brw_bs_prog_data *prog_data,10487nir_shader *shader,10488unsigned num_resume_shaders,10489struct nir_shader **resume_shaders,10490struct brw_compile_stats *stats,10491char **error_str)10492{10493const bool debug_enabled = INTEL_DEBUG & DEBUG_RT;1049410495prog_data->base.stage = shader->info.stage;10496prog_data->max_stack_size = 0;1049710498fs_generator g(compiler, log_data, mem_ctx, &prog_data->base,10499false, shader->info.stage);10500if (unlikely(debug_enabled)) {10501char *name = ralloc_asprintf(mem_ctx, "%s %s shader %s",10502shader->info.label ?10503shader->info.label : "unnamed",10504gl_shader_stage_name(shader->info.stage),10505shader->info.name);10506g.enable_debug(name);10507}1050810509prog_data->simd_size =10510compile_single_bs(compiler, log_data, mem_ctx, key, prog_data,10511shader, &g, stats, NULL, error_str);10512if (prog_data->simd_size == 0)10513return NULL;1051410515uint64_t *resume_sbt = ralloc_array(mem_ctx, uint64_t, num_resume_shaders);10516for (unsigned i = 0; i < num_resume_shaders; i++) {10517if (INTEL_DEBUG & DEBUG_RT) {10518char *name = ralloc_asprintf(mem_ctx, "%s %s resume(%u) shader %s",10519shader->info.label ?10520shader->info.label : "unnamed",10521gl_shader_stage_name(shader->info.stage),10522i, shader->info.name);10523g.enable_debug(name);10524}1052510526/* TODO: Figure out shader stats etc. for resume shaders */10527int offset = 0;10528uint8_t simd_size =10529compile_single_bs(compiler, log_data, mem_ctx, key, prog_data,10530resume_shaders[i], &g, NULL, &offset, error_str);10531if (simd_size == 0)10532return NULL;1053310534assert(offset > 0);10535resume_sbt[i] = brw_bsr(compiler->devinfo, offset, simd_size, 0);10536}1053710538/* We only have one constant data so we want to make sure they're all the10539* same.10540*/10541for (unsigned i = 0; i < num_resume_shaders; i++) {10542assert(resume_shaders[i]->constant_data_size ==10543shader->constant_data_size);10544assert(memcmp(resume_shaders[i]->constant_data,10545shader->constant_data,10546shader->constant_data_size) == 0);10547}1054810549g.add_const_data(shader->constant_data, shader->constant_data_size);10550g.add_resume_sbt(num_resume_shaders, resume_sbt);1055110552return g.get_assembly();10553}1055410555/**10556* Test the dispatch mask packing assumptions of10557* brw_stage_has_packed_dispatch(). Call this from e.g. the top of10558* fs_visitor::emit_nir_code() to cause a GPU hang if any shader invocation is10559* executed with an unexpected dispatch mask.10560*/10561static UNUSED void10562brw_fs_test_dispatch_packing(const fs_builder &bld)10563{10564const gl_shader_stage stage = bld.shader->stage;1056510566if (brw_stage_has_packed_dispatch(bld.shader->devinfo, stage,10567bld.shader->stage_prog_data)) {10568const fs_builder ubld = bld.exec_all().group(1, 0);10569const fs_reg tmp = component(bld.vgrf(BRW_REGISTER_TYPE_UD), 0);10570const fs_reg mask = (stage == MESA_SHADER_FRAGMENT ? 
brw_vmask_reg() :10571brw_dmask_reg());1057210573ubld.ADD(tmp, mask, brw_imm_ud(1));10574ubld.AND(tmp, mask, tmp);1057510576/* This will loop forever if the dispatch mask doesn't have the expected10577* form '2^n-1', in which case tmp will be non-zero.10578*/10579bld.emit(BRW_OPCODE_DO);10580bld.CMP(bld.null_reg_ud(), tmp, brw_imm_ud(0), BRW_CONDITIONAL_NZ);10581set_predicate(BRW_PREDICATE_NORMAL, bld.emit(BRW_OPCODE_WHILE));10582}10583}1058410585unsigned10586fs_visitor::workgroup_size() const10587{10588assert(stage == MESA_SHADER_COMPUTE);10589const struct brw_cs_prog_data *cs = brw_cs_prog_data(prog_data);10590return cs->local_size[0] * cs->local_size[1] * cs->local_size[2];10591}105921059310594