Path: blob/21.2-virgl/src/intel/compiler/brw_eu_emit.c
4550 views
/*1Copyright (C) Intel Corp. 2006. All Rights Reserved.2Intel funded Tungsten Graphics to3develop this 3D driver.45Permission is hereby granted, free of charge, to any person obtaining6a copy of this software and associated documentation files (the7"Software"), to deal in the Software without restriction, including8without limitation the rights to use, copy, modify, merge, publish,9distribute, sublicense, and/or sell copies of the Software, and to10permit persons to whom the Software is furnished to do so, subject to11the following conditions:1213The above copyright notice and this permission notice (including the14next paragraph) shall be included in all copies or substantial15portions of the Software.1617THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,18EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF19MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.20IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE21LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION22OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION23WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.2425**********************************************************************/26/*27* Authors:28* Keith Whitwell <[email protected]>29*/303132#include "brw_eu_defines.h"33#include "brw_eu.h"3435#include "util/ralloc.h"3637/**38* Prior to Sandybridge, the SEND instruction accepted non-MRF source39* registers, implicitly moving the operand to a message register.40*41* On Sandybridge, this is no longer the case. 
This function performs the42* explicit move; it should be called before emitting a SEND instruction.43*/44void45gfx6_resolve_implied_move(struct brw_codegen *p,46struct brw_reg *src,47unsigned msg_reg_nr)48{49const struct intel_device_info *devinfo = p->devinfo;50if (devinfo->ver < 6)51return;5253if (src->file == BRW_MESSAGE_REGISTER_FILE)54return;5556if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {57assert(devinfo->ver < 12);58brw_push_insn_state(p);59brw_set_default_exec_size(p, BRW_EXECUTE_8);60brw_set_default_mask_control(p, BRW_MASK_DISABLE);61brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);62brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),63retype(*src, BRW_REGISTER_TYPE_UD));64brw_pop_insn_state(p);65}66*src = brw_message_reg(msg_reg_nr);67}6869static void70gfx7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)71{72/* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):73* "The send with EOT should use register space R112-R127 for <src>. This is74* to enable loading of a new thread into the same slot while the message75* with EOT for current thread is pending dispatch."76*77* Since we're pretending to have 16 MRFs anyway, we may as well use the78* registers required for messages with EOT.79*/80const struct intel_device_info *devinfo = p->devinfo;81if (devinfo->ver >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {82reg->file = BRW_GENERAL_REGISTER_FILE;83reg->nr += GFX7_MRF_HACK_START;84}85}8687void88brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)89{90const struct intel_device_info *devinfo = p->devinfo;9192if (dest.file == BRW_MESSAGE_REGISTER_FILE)93assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver));94else if (dest.file == BRW_GENERAL_REGISTER_FILE)95assert(dest.nr < 128);9697/* The hardware has a restriction where a destination of size Byte with98* a stride of 1 is only allowed for a packed byte MOV. 
For any other99* instruction, the stride must be at least 2, even when the destination100* is the NULL register.101*/102if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&103dest.nr == BRW_ARF_NULL &&104type_sz(dest.type) == 1 &&105dest.hstride == BRW_HORIZONTAL_STRIDE_1) {106dest.hstride = BRW_HORIZONTAL_STRIDE_2;107}108109gfx7_convert_mrf_to_grf(p, &dest);110111if (devinfo->ver >= 12 &&112(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||113brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {114assert(dest.file == BRW_GENERAL_REGISTER_FILE ||115dest.file == BRW_ARCHITECTURE_REGISTER_FILE);116assert(dest.address_mode == BRW_ADDRESS_DIRECT);117assert(dest.subnr == 0);118assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||119(dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&120dest.vstride == dest.width + 1));121assert(!dest.negate && !dest.abs);122brw_inst_set_dst_reg_file(devinfo, inst, dest.file);123brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);124125} else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||126brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {127assert(devinfo->ver < 12);128assert(dest.file == BRW_GENERAL_REGISTER_FILE ||129dest.file == BRW_ARCHITECTURE_REGISTER_FILE);130assert(dest.address_mode == BRW_ADDRESS_DIRECT);131assert(dest.subnr % 16 == 0);132assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&133dest.vstride == dest.width + 1);134assert(!dest.negate && !dest.abs);135brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);136brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);137brw_inst_set_send_dst_reg_file(devinfo, inst, dest.file);138} else {139brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);140brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);141142if (dest.address_mode == BRW_ADDRESS_DIRECT) {143brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);144145if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {146brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);147if 
(dest.hstride == BRW_HORIZONTAL_STRIDE_0)148dest.hstride = BRW_HORIZONTAL_STRIDE_1;149brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);150} else {151brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);152brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);153if (dest.file == BRW_GENERAL_REGISTER_FILE ||154dest.file == BRW_MESSAGE_REGISTER_FILE) {155assert(dest.writemask != 0);156}157/* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:158* Although Dst.HorzStride is a don't care for Align16, HW needs159* this to be programmed as "01".160*/161brw_inst_set_dst_hstride(devinfo, inst, 1);162}163} else {164brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);165166/* These are different sizes in align1 vs align16:167*/168if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {169brw_inst_set_dst_ia1_addr_imm(devinfo, inst,170dest.indirect_offset);171if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)172dest.hstride = BRW_HORIZONTAL_STRIDE_1;173brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);174} else {175brw_inst_set_dst_ia16_addr_imm(devinfo, inst,176dest.indirect_offset);177/* even ignored in da16, still need to set as '01' */178brw_inst_set_dst_hstride(devinfo, inst, 1);179}180}181}182183/* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)184* or 16 (SIMD16), as that's normally correct. However, when dealing with185* small registers, it can be useful for us to automatically reduce it to186* match the register size.187*/188if (p->automatic_exec_sizes) {189/*190* In platforms that support fp64 we can emit instructions with a width191* of 4 that need two SIMD8 registers and an exec_size of 8 or 16. 
In192* these cases we need to make sure that these instructions have their193* exec sizes set properly when they are emitted and we can't rely on194* this code to fix it.195*/196bool fix_exec_size;197if (devinfo->ver >= 6)198fix_exec_size = dest.width < BRW_EXECUTE_4;199else200fix_exec_size = dest.width < BRW_EXECUTE_8;201202if (fix_exec_size)203brw_inst_set_exec_size(devinfo, inst, dest.width);204}205}206207void208brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)209{210const struct intel_device_info *devinfo = p->devinfo;211212if (reg.file == BRW_MESSAGE_REGISTER_FILE)213assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver));214else if (reg.file == BRW_GENERAL_REGISTER_FILE)215assert(reg.nr < 128);216217gfx7_convert_mrf_to_grf(p, ®);218219if (devinfo->ver >= 6 &&220(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||221brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC ||222brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||223brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC)) {224/* Any source modifiers or regions will be ignored, since this just225* identifies the MRF/GRF to start reading the message contents from.226* Check for some likely failures.227*/228assert(!reg.negate);229assert(!reg.abs);230assert(reg.address_mode == BRW_ADDRESS_DIRECT);231}232233if (devinfo->ver >= 12 &&234(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||235brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {236assert(reg.file != BRW_IMMEDIATE_VALUE);237assert(reg.address_mode == BRW_ADDRESS_DIRECT);238assert(reg.subnr == 0);239assert(has_scalar_region(reg) ||240(reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&241reg.vstride == reg.width + 1));242assert(!reg.negate && !reg.abs);243brw_inst_set_send_src0_reg_file(devinfo, inst, reg.file);244brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);245246} else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||247brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {248assert(reg.file == 
BRW_GENERAL_REGISTER_FILE);249assert(reg.address_mode == BRW_ADDRESS_DIRECT);250assert(reg.subnr % 16 == 0);251assert(has_scalar_region(reg) ||252(reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&253reg.vstride == reg.width + 1));254assert(!reg.negate && !reg.abs);255brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);256brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);257} else {258brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);259brw_inst_set_src0_abs(devinfo, inst, reg.abs);260brw_inst_set_src0_negate(devinfo, inst, reg.negate);261brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);262263if (reg.file == BRW_IMMEDIATE_VALUE) {264if (reg.type == BRW_REGISTER_TYPE_DF ||265brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)266brw_inst_set_imm_df(devinfo, inst, reg.df);267else if (reg.type == BRW_REGISTER_TYPE_UQ ||268reg.type == BRW_REGISTER_TYPE_Q)269brw_inst_set_imm_uq(devinfo, inst, reg.u64);270else271brw_inst_set_imm_ud(devinfo, inst, reg.ud);272273if (devinfo->ver < 12 && type_sz(reg.type) < 8) {274brw_inst_set_src1_reg_file(devinfo, inst,275BRW_ARCHITECTURE_REGISTER_FILE);276brw_inst_set_src1_reg_hw_type(devinfo, inst,277brw_inst_src0_reg_hw_type(devinfo, inst));278}279} else {280if (reg.address_mode == BRW_ADDRESS_DIRECT) {281brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);282if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {283brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);284} else {285brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);286}287} else {288brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);289290if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {291brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);292} else {293brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);294}295}296297if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {298if (reg.width == BRW_WIDTH_1 &&299brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) 
{300brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);301brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);302brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);303} else {304brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);305brw_inst_set_src0_width(devinfo, inst, reg.width);306brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);307}308} else {309brw_inst_set_src0_da16_swiz_x(devinfo, inst,310BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));311brw_inst_set_src0_da16_swiz_y(devinfo, inst,312BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));313brw_inst_set_src0_da16_swiz_z(devinfo, inst,314BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));315brw_inst_set_src0_da16_swiz_w(devinfo, inst,316BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));317318if (reg.vstride == BRW_VERTICAL_STRIDE_8) {319/* This is an oddity of the fact we're using the same320* descriptions for registers in align_16 as align_1:321*/322brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);323} else if (devinfo->verx10 == 70 &&324reg.type == BRW_REGISTER_TYPE_DF &&325reg.vstride == BRW_VERTICAL_STRIDE_2) {326/* From SNB PRM:327*328* "For Align16 access mode, only encodings of 0000 and 0011329* are allowed. 
Other codes are reserved."330*331* Presumably the DevSNB behavior applies to IVB as well.332*/333brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);334} else {335brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);336}337}338}339}340}341342343void344brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)345{346const struct intel_device_info *devinfo = p->devinfo;347348if (reg.file == BRW_GENERAL_REGISTER_FILE)349assert(reg.nr < 128);350351if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||352brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC ||353(devinfo->ver >= 12 &&354(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||355brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC))) {356assert(reg.file == BRW_GENERAL_REGISTER_FILE ||357reg.file == BRW_ARCHITECTURE_REGISTER_FILE);358assert(reg.address_mode == BRW_ADDRESS_DIRECT);359assert(reg.subnr == 0);360assert(has_scalar_region(reg) ||361(reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&362reg.vstride == reg.width + 1));363assert(!reg.negate && !reg.abs);364brw_inst_set_send_src1_reg_nr(devinfo, inst, reg.nr);365brw_inst_set_send_src1_reg_file(devinfo, inst, reg.file);366} else {367/* From the IVB PRM Vol. 4, Pt. 
3, Section 3.3.3.5:368*369* "Accumulator registers may be accessed explicitly as src0370* operands only."371*/372assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||373reg.nr != BRW_ARF_ACCUMULATOR);374375gfx7_convert_mrf_to_grf(p, ®);376assert(reg.file != BRW_MESSAGE_REGISTER_FILE);377378brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);379brw_inst_set_src1_abs(devinfo, inst, reg.abs);380brw_inst_set_src1_negate(devinfo, inst, reg.negate);381382/* Only src1 can be immediate in two-argument instructions.383*/384assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);385386if (reg.file == BRW_IMMEDIATE_VALUE) {387/* two-argument instructions can only use 32-bit immediates */388assert(type_sz(reg.type) < 8);389brw_inst_set_imm_ud(devinfo, inst, reg.ud);390} else {391/* This is a hardware restriction, which may or may not be lifted392* in the future:393*/394assert (reg.address_mode == BRW_ADDRESS_DIRECT);395/* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */396397brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);398if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {399brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);400} else {401brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);402}403404if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {405if (reg.width == BRW_WIDTH_1 &&406brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {407brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);408brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);409brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);410} else {411brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);412brw_inst_set_src1_width(devinfo, inst, reg.width);413brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);414}415} else {416brw_inst_set_src1_da16_swiz_x(devinfo, inst,417BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));418brw_inst_set_src1_da16_swiz_y(devinfo, inst,419BRW_GET_SWZ(reg.swizzle, 
BRW_CHANNEL_Y));420brw_inst_set_src1_da16_swiz_z(devinfo, inst,421BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));422brw_inst_set_src1_da16_swiz_w(devinfo, inst,423BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));424425if (reg.vstride == BRW_VERTICAL_STRIDE_8) {426/* This is an oddity of the fact we're using the same427* descriptions for registers in align_16 as align_1:428*/429brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);430} else if (devinfo->verx10 == 70 &&431reg.type == BRW_REGISTER_TYPE_DF &&432reg.vstride == BRW_VERTICAL_STRIDE_2) {433/* From SNB PRM:434*435* "For Align16 access mode, only encodings of 0000 and 0011436* are allowed. Other codes are reserved."437*438* Presumably the DevSNB behavior applies to IVB as well.439*/440brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);441} else {442brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);443}444}445}446}447}448449/**450* Specify the descriptor and extended descriptor immediate for a SEND(C)451* message instruction.452*/453void454brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst,455unsigned desc, unsigned ex_desc)456{457const struct intel_device_info *devinfo = p->devinfo;458assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||459brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC);460if (devinfo->ver < 12)461brw_inst_set_src1_file_type(devinfo, inst,462BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD);463brw_inst_set_send_desc(devinfo, inst, desc);464if (devinfo->ver >= 9)465brw_inst_set_send_ex_desc(devinfo, inst, ex_desc);466}467468static void brw_set_math_message( struct brw_codegen *p,469brw_inst *inst,470unsigned function,471unsigned integer_type,472bool low_precision,473unsigned dataType )474{475const struct intel_device_info *devinfo = p->devinfo;476unsigned msg_length;477unsigned response_length;478479/* Infer message length from the function */480switch (function) {481case BRW_MATH_FUNCTION_POW:482case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:483case 
BRW_MATH_FUNCTION_INT_DIV_REMAINDER:484case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:485msg_length = 2;486break;487default:488msg_length = 1;489break;490}491492/* Infer response length from the function */493switch (function) {494case BRW_MATH_FUNCTION_SINCOS:495case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:496response_length = 2;497break;498default:499response_length = 1;500break;501}502503brw_set_desc(p, inst, brw_message_desc(504devinfo, msg_length, response_length, false));505506brw_inst_set_sfid(devinfo, inst, BRW_SFID_MATH);507brw_inst_set_math_msg_function(devinfo, inst, function);508brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);509brw_inst_set_math_msg_precision(devinfo, inst, low_precision);510brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));511brw_inst_set_math_msg_data_type(devinfo, inst, dataType);512brw_inst_set_saturate(devinfo, inst, 0);513}514515516static void brw_set_ff_sync_message(struct brw_codegen *p,517brw_inst *insn,518bool allocate,519unsigned response_length,520bool end_of_thread)521{522const struct intel_device_info *devinfo = p->devinfo;523524brw_set_desc(p, insn, brw_message_desc(525devinfo, 1, response_length, true));526527brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);528brw_inst_set_eot(devinfo, insn, end_of_thread);529brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */530brw_inst_set_urb_allocate(devinfo, insn, allocate);531/* The following fields are not used by FF_SYNC: */532brw_inst_set_urb_global_offset(devinfo, insn, 0);533brw_inst_set_urb_swizzle_control(devinfo, insn, 0);534brw_inst_set_urb_used(devinfo, insn, 0);535brw_inst_set_urb_complete(devinfo, insn, 0);536}537538static void brw_set_urb_message( struct brw_codegen *p,539brw_inst *insn,540enum brw_urb_write_flags flags,541unsigned msg_length,542unsigned response_length,543unsigned offset,544unsigned swizzle_control )545{546const struct intel_device_info *devinfo = p->devinfo;547548assert(devinfo->ver < 7 
|| swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);549assert(devinfo->ver < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));550assert(devinfo->ver >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));551552brw_set_desc(p, insn, brw_message_desc(553devinfo, msg_length, response_length, true));554555brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);556brw_inst_set_eot(devinfo, insn, !!(flags & BRW_URB_WRITE_EOT));557558if (flags & BRW_URB_WRITE_OWORD) {559assert(msg_length == 2); /* header + one OWORD of data */560brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);561} else {562brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);563}564565brw_inst_set_urb_global_offset(devinfo, insn, offset);566brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);567568if (devinfo->ver < 8) {569brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));570}571572if (devinfo->ver < 7) {573brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));574brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));575} else {576brw_inst_set_urb_per_slot_offset(devinfo, insn,577!!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));578}579}580581static void582gfx7_set_dp_scratch_message(struct brw_codegen *p,583brw_inst *inst,584bool write,585bool dword,586bool invalidate_after_read,587unsigned num_regs,588unsigned addr_offset,589unsigned mlen,590unsigned rlen,591bool header_present)592{593const struct intel_device_info *devinfo = p->devinfo;594assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||595(devinfo->ver >= 8 && num_regs == 8));596const unsigned block_size = (devinfo->ver >= 8 ? 
util_logbase2(num_regs) :597num_regs - 1);598599brw_set_desc(p, inst, brw_message_desc(600devinfo, mlen, rlen, header_present));601602brw_inst_set_sfid(devinfo, inst, GFX7_SFID_DATAPORT_DATA_CACHE);603brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */604brw_inst_set_scratch_read_write(devinfo, inst, write);605brw_inst_set_scratch_type(devinfo, inst, dword);606brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);607brw_inst_set_scratch_block_size(devinfo, inst, block_size);608brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);609}610611static void612brw_inst_set_state(const struct intel_device_info *devinfo,613brw_inst *insn,614const struct brw_insn_state *state)615{616brw_inst_set_exec_size(devinfo, insn, state->exec_size);617brw_inst_set_group(devinfo, insn, state->group);618brw_inst_set_compression(devinfo, insn, state->compressed);619brw_inst_set_access_mode(devinfo, insn, state->access_mode);620brw_inst_set_mask_control(devinfo, insn, state->mask_control);621if (devinfo->ver >= 12)622brw_inst_set_swsb(devinfo, insn, tgl_swsb_encode(devinfo, state->swsb));623brw_inst_set_saturate(devinfo, insn, state->saturate);624brw_inst_set_pred_control(devinfo, insn, state->predicate);625brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);626627if (is_3src(devinfo, brw_inst_opcode(devinfo, insn)) &&628state->access_mode == BRW_ALIGN_16) {629brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);630if (devinfo->ver >= 7)631brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);632} else {633brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);634if (devinfo->ver >= 7)635brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);636}637638if (devinfo->ver >= 6)639brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);640}641642static brw_inst *643brw_append_insns(struct brw_codegen *p, unsigned nr_insn, unsigned 
align)644{645assert(util_is_power_of_two_or_zero(sizeof(brw_inst)));646assert(util_is_power_of_two_or_zero(align));647const unsigned align_insn = MAX2(align / sizeof(brw_inst), 1);648const unsigned start_insn = ALIGN(p->nr_insn, align_insn);649const unsigned new_nr_insn = start_insn + nr_insn;650651if (p->store_size < new_nr_insn) {652p->store_size = util_next_power_of_two(new_nr_insn * sizeof(brw_inst));653p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);654}655656/* Memset any padding due to alignment to 0. We don't want to be hashing657* or caching a bunch of random bits we got from a memory allocation.658*/659if (p->nr_insn < start_insn) {660memset(&p->store[p->nr_insn], 0,661(start_insn - p->nr_insn) * sizeof(brw_inst));662}663664assert(p->next_insn_offset == p->nr_insn * sizeof(brw_inst));665p->nr_insn = new_nr_insn;666p->next_insn_offset = new_nr_insn * sizeof(brw_inst);667668return &p->store[start_insn];669}670671void672brw_realign(struct brw_codegen *p, unsigned align)673{674brw_append_insns(p, 0, align);675}676677int678brw_append_data(struct brw_codegen *p, void *data,679unsigned size, unsigned align)680{681unsigned nr_insn = DIV_ROUND_UP(size, sizeof(brw_inst));682void *dst = brw_append_insns(p, nr_insn, align);683memcpy(dst, data, size);684685/* If it's not a whole number of instructions, memset the end */686if (size < nr_insn * sizeof(brw_inst))687memset(dst + size, 0, nr_insn * sizeof(brw_inst) - size);688689return dst - (void *)p->store;690}691692#define next_insn brw_next_insn693brw_inst *694brw_next_insn(struct brw_codegen *p, unsigned opcode)695{696const struct intel_device_info *devinfo = p->devinfo;697brw_inst *insn = brw_append_insns(p, 1, sizeof(brw_inst));698699memset(insn, 0, sizeof(*insn));700brw_inst_set_opcode(devinfo, insn, opcode);701702/* Apply the default instruction state */703brw_inst_set_state(devinfo, insn, p->current);704705return insn;706}707708void709brw_add_reloc(struct brw_codegen *p, uint32_t id,710enum 
brw_shader_reloc_type type,711uint32_t offset, uint32_t delta)712{713if (p->num_relocs + 1 > p->reloc_array_size) {714p->reloc_array_size = MAX2(16, p->reloc_array_size * 2);715p->relocs = reralloc(p->mem_ctx, p->relocs,716struct brw_shader_reloc, p->reloc_array_size);717}718719p->relocs[p->num_relocs++] = (struct brw_shader_reloc) {720.id = id,721.type = type,722.offset = offset,723.delta = delta,724};725}726727static brw_inst *728brw_alu1(struct brw_codegen *p, unsigned opcode,729struct brw_reg dest, struct brw_reg src)730{731brw_inst *insn = next_insn(p, opcode);732brw_set_dest(p, insn, dest);733brw_set_src0(p, insn, src);734return insn;735}736737static brw_inst *738brw_alu2(struct brw_codegen *p, unsigned opcode,739struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)740{741/* 64-bit immediates are only supported on 1-src instructions */742assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);743assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);744745brw_inst *insn = next_insn(p, opcode);746brw_set_dest(p, insn, dest);747brw_set_src0(p, insn, src0);748brw_set_src1(p, insn, src1);749return insn;750}751752static int753get_3src_subreg_nr(struct brw_reg reg)754{755/* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions756* use 32-bit units (components 0..7). 
Since they only support F/D/UD757* types, this doesn't lose any flexibility, but uses fewer bits.758*/759return reg.subnr / 4;760}761762static enum gfx10_align1_3src_vertical_stride763to_3src_align1_vstride(const struct intel_device_info *devinfo,764enum brw_vertical_stride vstride)765{766switch (vstride) {767case BRW_VERTICAL_STRIDE_0:768return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;769case BRW_VERTICAL_STRIDE_1:770assert(devinfo->ver >= 12);771return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1;772case BRW_VERTICAL_STRIDE_2:773assert(devinfo->ver < 12);774return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;775case BRW_VERTICAL_STRIDE_4:776return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;777case BRW_VERTICAL_STRIDE_8:778case BRW_VERTICAL_STRIDE_16:779return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;780default:781unreachable("invalid vstride");782}783}784785786static enum gfx10_align1_3src_src_horizontal_stride787to_3src_align1_hstride(enum brw_horizontal_stride hstride)788{789switch (hstride) {790case BRW_HORIZONTAL_STRIDE_0:791return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;792case BRW_HORIZONTAL_STRIDE_1:793return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;794case BRW_HORIZONTAL_STRIDE_2:795return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;796case BRW_HORIZONTAL_STRIDE_4:797return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;798default:799unreachable("invalid hstride");800}801}802803static brw_inst *804brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,805struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)806{807const struct intel_device_info *devinfo = p->devinfo;808brw_inst *inst = next_insn(p, opcode);809810gfx7_convert_mrf_to_grf(p, &dest);811812assert(dest.nr < 128);813814if (devinfo->ver >= 10)815assert(!(src0.file == BRW_IMMEDIATE_VALUE &&816src2.file == BRW_IMMEDIATE_VALUE));817818assert(src0.file == BRW_IMMEDIATE_VALUE || src0.nr < 128);819assert(src1.file != BRW_IMMEDIATE_VALUE && src1.nr < 128);820assert(src2.file == BRW_IMMEDIATE_VALUE || src2.nr < 
128);821assert(dest.address_mode == BRW_ADDRESS_DIRECT);822assert(src0.address_mode == BRW_ADDRESS_DIRECT);823assert(src1.address_mode == BRW_ADDRESS_DIRECT);824assert(src2.address_mode == BRW_ADDRESS_DIRECT);825826if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {827assert(dest.file == BRW_GENERAL_REGISTER_FILE ||828dest.file == BRW_ARCHITECTURE_REGISTER_FILE);829830if (devinfo->ver >= 12) {831brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, dest.file);832brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);833} else {834if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {835brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,836BRW_ALIGN1_3SRC_ACCUMULATOR);837brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);838} else {839brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,840BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);841brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);842}843}844brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);845846brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);847848if (brw_reg_type_is_floating_point(dest.type)) {849brw_inst_set_3src_a1_exec_type(devinfo, inst,850BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);851} else {852brw_inst_set_3src_a1_exec_type(devinfo, inst,853BRW_ALIGN1_3SRC_EXEC_TYPE_INT);854}855856brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);857brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);858brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);859brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);860861if (src0.file == BRW_IMMEDIATE_VALUE) {862brw_inst_set_3src_a1_src0_imm(devinfo, inst, src0.ud);863} else {864brw_inst_set_3src_a1_src0_vstride(865devinfo, inst, to_3src_align1_vstride(devinfo, src0.vstride));866brw_inst_set_3src_a1_src0_hstride(devinfo, inst,867to_3src_align1_hstride(src0.hstride));868brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);869if (src0.type == BRW_REGISTER_TYPE_NF) 
{870brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);871} else {872brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);873}874brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);875brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);876}877brw_inst_set_3src_a1_src1_vstride(878devinfo, inst, to_3src_align1_vstride(devinfo, src1.vstride));879brw_inst_set_3src_a1_src1_hstride(devinfo, inst,880to_3src_align1_hstride(src1.hstride));881882brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);883if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {884brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);885} else {886brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);887}888brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);889brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);890891if (src2.file == BRW_IMMEDIATE_VALUE) {892brw_inst_set_3src_a1_src2_imm(devinfo, inst, src2.ud);893} else {894brw_inst_set_3src_a1_src2_hstride(devinfo, inst,895to_3src_align1_hstride(src2.hstride));896/* no vstride on src2 */897brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);898brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);899brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);900brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);901}902903assert(src0.file == BRW_GENERAL_REGISTER_FILE ||904src0.file == BRW_IMMEDIATE_VALUE ||905(src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&906src0.type == BRW_REGISTER_TYPE_NF));907assert(src1.file == BRW_GENERAL_REGISTER_FILE ||908src1.file == BRW_ARCHITECTURE_REGISTER_FILE);909assert(src2.file == BRW_GENERAL_REGISTER_FILE ||910src2.file == BRW_IMMEDIATE_VALUE);911912if (devinfo->ver >= 12) {913if (src0.file == BRW_IMMEDIATE_VALUE) {914brw_inst_set_3src_a1_src0_is_imm(devinfo, inst, 1);915} else {916brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, src0.file);917}918919brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, src1.file);920921if (src2.file == BRW_IMMEDIATE_VALUE) 
{922brw_inst_set_3src_a1_src2_is_imm(devinfo, inst, 1);923} else {924brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, src2.file);925}926} else {927brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,928src0.file == BRW_GENERAL_REGISTER_FILE ?929BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :930BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);931brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,932src1.file == BRW_GENERAL_REGISTER_FILE ?933BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :934BRW_ALIGN1_3SRC_ACCUMULATOR);935brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,936src2.file == BRW_GENERAL_REGISTER_FILE ?937BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :938BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);939}940941} else {942assert(dest.file == BRW_GENERAL_REGISTER_FILE ||943dest.file == BRW_MESSAGE_REGISTER_FILE);944assert(dest.type == BRW_REGISTER_TYPE_F ||945dest.type == BRW_REGISTER_TYPE_DF ||946dest.type == BRW_REGISTER_TYPE_D ||947dest.type == BRW_REGISTER_TYPE_UD ||948(dest.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 8));949if (devinfo->ver == 6) {950brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,951dest.file == BRW_MESSAGE_REGISTER_FILE);952}953brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);954brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 4);955brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);956957assert(src0.file == BRW_GENERAL_REGISTER_FILE);958brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);959brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));960brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);961brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);962brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);963brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,964src0.vstride == BRW_VERTICAL_STRIDE_0);965966assert(src1.file == BRW_GENERAL_REGISTER_FILE);967brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);968brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, 
get_3src_subreg_nr(src1));969brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);970brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);971brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);972brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,973src1.vstride == BRW_VERTICAL_STRIDE_0);974975assert(src2.file == BRW_GENERAL_REGISTER_FILE);976brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);977brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));978brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);979brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);980brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);981brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,982src2.vstride == BRW_VERTICAL_STRIDE_0);983984if (devinfo->ver >= 7) {985/* Set both the source and destination types based on dest.type,986* ignoring the source register types. The MAD and LRP emitters ensure987* that all four types are float. The BFE and BFI2 emitters, however,988* may send us mixed D and UD types and want us to ignore that and use989* the destination type.990*/991brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);992brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);993994/* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:995*996* "Three source instructions can use operands with mixed-mode997* precision. When SrcType field is set to :f or :hf it defines998* precision for source 0 only, and fields Src1Type and Src2Type999* define precision for other source operands:1000*1001* 0b = :f. Single precision Float (32-bit).1002* 1b = :hf. 
Half precision Float (16-bit)."
          */
         if (src1.type == BRW_REGISTER_TYPE_HF)
            brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);

         if (src2.type == BRW_REGISTER_TYPE_HF)
            brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
      }
   }

   return inst;
}


/***********************************************************************
 * Convenience routines.
 */

/* Emit a one-source ALU instruction through brw_alu1(). */
#define ALU1(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
                   struct brw_reg dest,                     \
                   struct brw_reg src0)                     \
{                                                           \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);         \
}

/* Emit a two-source ALU instruction through brw_alu2(). */
#define ALU2(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
                   struct brw_reg dest,                     \
                   struct brw_reg src0,                     \
                   struct brw_reg src1)                     \
{                                                           \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);   \
}

/* Emit a three-source ALU instruction.  In Align16 mode a scalar operand
 * (vertical stride 0) must broadcast its X component, so force an XXXX
 * swizzle on any such source before encoding.
 */
#define ALU3(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
                   struct brw_reg dest,                     \
                   struct brw_reg src0,                     \
                   struct brw_reg src1,                     \
                   struct brw_reg src2)                     \
{                                                           \
   if (p->current->access_mode == BRW_ALIGN_16) {           \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)            \
         src0.swizzle = BRW_SWIZZLE_XXXX;                   \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)            \
         src1.swizzle = BRW_SWIZZLE_XXXX;                   \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)            \
         src2.swizzle = BRW_SWIZZLE_XXXX;                   \
   }                                                        \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}

/* Like ALU3, but additionally asserts that all operands use the same
 * floating-point type (F or DF) as the destination.
 */
#define ALU3F(OP)                                           \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
                   struct brw_reg dest,                     \
                   struct brw_reg src0,                     \
                   struct brw_reg src1,                     \
                   struct brw_reg src2)                     \
{                                                           \
   assert(dest.type == BRW_REGISTER_TYPE_F ||               \
          dest.type == BRW_REGISTER_TYPE_DF);               \
   if (dest.type == BRW_REGISTER_TYPE_F) {                  \
      assert(src0.type == BRW_REGISTER_TYPE_F);             \
      assert(src1.type == BRW_REGISTER_TYPE_F);             \
      assert(src2.type == BRW_REGISTER_TYPE_F);             \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {          \
      assert(src0.type == BRW_REGISTER_TYPE_DF);            \
      assert(src1.type == BRW_REGISTER_TYPE_DF);            \
      assert(src2.type == BRW_REGISTER_TYPE_DF);            \
   }                                                        \
                                                            \
   if (p->current->access_mode == BRW_ALIGN_16) {           \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)            \
         src0.swizzle = BRW_SWIZZLE_XXXX;                   \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)            \
         src1.swizzle = BRW_SWIZZLE_XXXX;                   \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)            \
         src2.swizzle = BRW_SWIZZLE_XXXX;                   \
   }                                                        \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}

ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU2(ROL)
ALU2(ROR)
ALU3(CSEL)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDU)
ALU1(RNDZ)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

brw_inst *
brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
    * To avoid the problems that causes, we use an <X,2,0> source region to
    * read each element twice.
    */
   if (devinfo->verx10 == 70 &&
       brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
       dest.type == BRW_REGISTER_TYPE_DF &&
       (src0.type == BRW_REGISTER_TYPE_F ||
        src0.type == BRW_REGISTER_TYPE_D ||
        src0.type == BRW_REGISTER_TYPE_UD) &&
       !has_scalar_region(src0)) {
      assert(src0.vstride == src0.width + src0.hstride);
      src0.vstride = src0.hstride;
      src0.width = BRW_WIDTH_2;
      src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   }

   return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
}

brw_inst *
brw_ADD(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.2.2: add */
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type ==
BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
}

/* Emit an AVG instruction.  AVG only accepts integer operands; the switch
 * below rejects anything else, and all three types must match.
 */
brw_inst *
brw_AVG(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      break;
   default:
      unreachable("Bad type for brw_AVG");
   }

   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
}

brw_inst *
brw_MUL(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   /* MUL may not read either source from the accumulator. */
   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}

/* Emit a LINE instruction.  src0 is forced to a <0;1,0> scalar region. */
brw_inst *
brw_LINE(struct brw_codegen *p, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
}

/* Emit a PLN instruction.  src0 is forced to a <0;1,0> scalar region and
 * src1 to a full <8;8,1> region.
 */
brw_inst *
brw_PLN(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   src1.vstride = BRW_VERTICAL_STRIDE_8;
   src1.width = BRW_WIDTH_8;
   src1.hstride = BRW_HORIZONTAL_STRIDE_1;
   return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
}

brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gfx8 implementation in terms of a
    * converting MOV.
Gfx7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->ver >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      /* Write the low halves of a W-typed view so the high 16 bits can be
       * explicitly zeroed below.
       */
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->ver >= 8) {
      /* Gfx8+ has no F32TO16 opcode; a converting MOV to HF replaces it. */
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->ver == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      if (devinfo->ver < 12)
         brw_inst_set_no_dd_clear(devinfo, inst, true);
      brw_set_default_swsb(p, tgl_swsb_null());
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      if (devinfo->ver < 12)
         brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}

brw_inst *
brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct intel_device_info *devinfo = p->devinfo;
   bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;

   if (align16) {
      assert(src.type == BRW_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *   Because this instruction does not have a 16-bit floating-point
       *   type, the source data type must be Word (W).  The destination type
       *   must be F (Float).
       */
      if (src.type == BRW_REGISTER_TYPE_UD)
         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

      assert(src.type == BRW_REGISTER_TYPE_W ||
             src.type == BRW_REGISTER_TYPE_UW ||
             src.type == BRW_REGISTER_TYPE_HF);
   }

   if (devinfo->ver >= 8) {
      /* Gfx8+ has no F16TO32 opcode; a converting MOV from HF replaces it. */
      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
   } else {
      assert(devinfo->ver == 7);
      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
   }
}

/* Emit a NOP.  The instruction is fully cleared first so no stale default
 * state leaks into the encoding.
 */
void brw_NOP(struct brw_codegen *p)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
}

/* Emit a SYNC instruction; the sync function is encoded in the conditional
 * modifier field.
 */
void brw_SYNC(struct brw_codegen *p, enum tgl_sync_function func)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_SYNC);
   brw_inst_set_cond_modifier(p->devinfo, insn, func);
}

/***********************************************************************
 * Comparisons, if/else/endif
 */

/* Emit a scalar (SIMD1, WE_all) indexed jump: ip := ip + index. */
brw_inst *
brw_JMPI(struct brw_codegen *p, struct brw_reg index,
         unsigned predicate_control)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct brw_reg ip = brw_ip_reg();
   brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);

   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
   brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_inst_set_pred_control(devinfo, inst, predicate_control);

   return inst;
}

/* Push an IF/ELSE instruction onto the if-stack, growing the stack as
 * needed.  Instructions are stored as indices into p->store rather than
 * pointers because the store may be reallocated by later next_insn() calls.
 */
static void
push_if_stack(struct brw_codegen *p, brw_inst *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}

/* Pop the most recent IF/ELSE instruction and return a pointer into the
 * current store.
 */
static brw_inst *
pop_if_stack(struct brw_codegen *p)
{
   p->if_stack_depth--;
   return
&p->store[p->if_stack[p->if_stack_depth]];
}

/* Push a DO instruction (or loop start position) onto the loop stack,
 * growing both the loop stack and the per-loop if-depth array as needed.
 * Indices, not pointers, are stored since p->store may be reallocated.
 */
static void
push_loop_stack(struct brw_codegen *p, brw_inst *inst)
{
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}

/* Return a pointer to the innermost enclosing DO instruction. */
static brw_inst *
get_inner_do_insn(struct brw_codegen *p)
{
   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
}

/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, eg. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->ver < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->ver == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gfx6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->ver == 7) {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      /* JIP/UIP are placeholders here; brw_ENDIF patches them later. */
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      if (devinfo->ver < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->ver < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}

/* This function is only used for gfx6-style IF instructions with an
 * embedded comparison (conditional modifier).
It is not used on gfx7.
 */
brw_inst *
gfx6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   /* Jump count is a placeholder; patch_IF_ELSE fills it in later. */
   brw_inst_set_gfx6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}

/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      /* Jump distances are in bytes: 16 bytes per instruction. */
      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}

/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gfx6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.
So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->ver < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   /* br is the number of encoding units per instruction for jump offsets. */
   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->ver < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gfx4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gfx4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->ver == 6) {
         /* As of gfx6, there is no IFF and IF must point to the ENDIF.
          */
         brw_inst_set_gfx6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->ver < 6) {
         brw_inst_set_gfx4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gfx4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->ver == 6) {
         brw_inst_set_gfx6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->ver < 6) {
         /* BRW_OPCODE_ELSE pre-gfx6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gfx4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gfx4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->ver == 6) {
         /* BRW_OPCODE_ELSE on gfx6 should point to the matching ENDIF.
          */
         brw_inst_set_gfx6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->ver >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}

/* Emit an ELSE and push it on the if-stack; jump fields are placeholders
 * that brw_ENDIF patches via patch_IF_ELSE().
 */
void
brw_ELSE(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (devinfo->ver < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->ver == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gfx6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->ver == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      if (devinfo->ver < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->ver <
6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
}

/* Close the innermost IF/ELSE block: pop the if-stack and either emit a
 * real ENDIF and patch jump targets, or (pre-gfx6 SPF mode) rewrite the
 * IF/ELSE into IP-relative ADDs.
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gfx6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gfx6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gfx4 and
    * Gfx5.
    */
   if (devinfo->ver < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it.
       */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   if (devinfo->ver < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->ver == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->ver == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (devinfo->ver < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->ver < 6) {
      brw_inst_set_gfx4_jump_count(devinfo, insn, 0);
      brw_inst_set_gfx4_pop_count(devinfo, insn, 1);
   } else if (devinfo->ver == 6) {
      brw_inst_set_gfx6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}

/* Emit a BREAK; on pre-gfx6 the pop count records how many if-stack entries
 * the break must discard.
 */
brw_inst *
brw_BREAK(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (devinfo->ver >= 8) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else if (devinfo->ver >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p,
insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      brw_inst_set_gfx4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   return insn;
}

/* Emit a CONTINUE; like BREAK, pre-gfx6 must record how many if-stack
 * entries to pop when jumping back to the loop head.
 */
brw_inst *
brw_CONT(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   if (devinfo->ver >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   }

   if (devinfo->ver < 6) {
      brw_inst_set_gfx4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}

/* Emit a HALT; its jump targets (exitcode / UIP+JIP) are filled in later by
 * the caller.
 */
brw_inst *
brw_HALT(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   if (devinfo->ver < 6) {
      /* From the Gfx4 PRM:
       *
       *    "IP register must be put (for example, by the assembler) at <dst>
       *    and <src0> locations.
       */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0)); /* exitcode updated later. */
   } else if (devinfo->ver < 8) {
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later.
                                              */
   } else if (devinfo->ver < 12) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}

/* DO/WHILE loop:
 *
 * The DO/WHILE is just an unterminated loop -- break or continue are
 * used for control within the loop.  We have a few ways they can be
 * done.
 *
 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
 * jip and no DO instruction.
 *
 * For non-uniform control flow pre-gfx6, there's a DO instruction to
 * push the mask, and a WHILE to jump back, and BREAK to get out and
 * pop the mask.
 *
 * For gfx6, there's no more mask stack, so no need for DO.  WHILE
 * just points back to the first instruction of the loop.
 */
brw_inst *
brw_DO(struct brw_codegen *p, unsigned execute_size)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (devinfo->ver >= 6 || p->single_program_flow) {
      /* No DO instruction is emitted; just remember where the loop starts. */
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(devinfo, insn, execute_size);
      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);

      return insn;
   }
}

/**
 * For pre-gfx6, we patch BREAK/CONT instructions to point at the WHILE
 * instruction here.
 *
 * For gfx6+, see brw_set_uip_jip(), which doesn't care so much about the loop
 * nesting, since it can always just point to the end of the block/current loop.
 */
static void
brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
{
   const
struct intel_device_info *devinfo = p->devinfo;1872brw_inst *do_inst = get_inner_do_insn(p);1873brw_inst *inst;1874unsigned br = brw_jump_scale(devinfo);18751876assert(devinfo->ver < 6);18771878for (inst = while_inst - 1; inst != do_inst; inst--) {1879/* If the jump count is != 0, that means that this instruction has already1880* been patched because it's part of a loop inside of the one we're1881* patching.1882*/1883if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&1884brw_inst_gfx4_jump_count(devinfo, inst) == 0) {1885brw_inst_set_gfx4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));1886} else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&1887brw_inst_gfx4_jump_count(devinfo, inst) == 0) {1888brw_inst_set_gfx4_jump_count(devinfo, inst, br * (while_inst - inst));1889}1890}1891}18921893brw_inst *1894brw_WHILE(struct brw_codegen *p)1895{1896const struct intel_device_info *devinfo = p->devinfo;1897brw_inst *insn, *do_insn;1898unsigned br = brw_jump_scale(devinfo);18991900if (devinfo->ver >= 6) {1901insn = next_insn(p, BRW_OPCODE_WHILE);1902do_insn = get_inner_do_insn(p);19031904if (devinfo->ver >= 8) {1905brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));1906if (devinfo->ver < 12)1907brw_set_src0(p, insn, brw_imm_d(0));1908brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));1909} else if (devinfo->ver == 7) {1910brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));1911brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));1912brw_set_src1(p, insn, brw_imm_w(0));1913brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));1914} else {1915brw_set_dest(p, insn, brw_imm_w(0));1916brw_inst_set_gfx6_jump_count(devinfo, insn, br * (do_insn - insn));1917brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));1918brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));1919}19201921brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));19221923} else {1924if 
(p->single_program_flow) {1925insn = next_insn(p, BRW_OPCODE_ADD);1926do_insn = get_inner_do_insn(p);19271928brw_set_dest(p, insn, brw_ip_reg());1929brw_set_src0(p, insn, brw_ip_reg());1930brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));1931brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);1932} else {1933insn = next_insn(p, BRW_OPCODE_WHILE);1934do_insn = get_inner_do_insn(p);19351936assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);19371938brw_set_dest(p, insn, brw_ip_reg());1939brw_set_src0(p, insn, brw_ip_reg());1940brw_set_src1(p, insn, brw_imm_d(0));19411942brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));1943brw_inst_set_gfx4_jump_count(devinfo, insn, br * (do_insn - insn + 1));1944brw_inst_set_gfx4_pop_count(devinfo, insn, 0);19451946brw_patch_break_cont(p, insn);1947}1948}1949brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);19501951p->loop_stack_depth--;19521953return insn;1954}19551956/* FORWARD JUMPS:1957*/1958void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)1959{1960const struct intel_device_info *devinfo = p->devinfo;1961brw_inst *jmp_insn = &p->store[jmp_insn_idx];1962unsigned jmpi = 1;19631964if (devinfo->ver >= 5)1965jmpi = 2;19661967assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);1968assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);19691970brw_inst_set_gfx4_jump_count(devinfo, jmp_insn,1971jmpi * (p->nr_insn - jmp_insn_idx - 1));1972}19731974/* To integrate with the above, it makes sense that the comparison1975* instruction should populate the flag register. 
It might be simpler1976* just to use the flag reg for most WM tasks?1977*/1978void brw_CMP(struct brw_codegen *p,1979struct brw_reg dest,1980unsigned conditional,1981struct brw_reg src0,1982struct brw_reg src1)1983{1984const struct intel_device_info *devinfo = p->devinfo;1985brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);19861987brw_inst_set_cond_modifier(devinfo, insn, conditional);1988brw_set_dest(p, insn, dest);1989brw_set_src0(p, insn, src0);1990brw_set_src1(p, insn, src1);19911992/* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds1993* page says:1994* "Any CMP instruction with a null destination must use a {switch}."1995*1996* It also applies to other Gfx7 platforms (IVB, BYT) even though it isn't1997* mentioned on their work-arounds pages.1998*/1999if (devinfo->ver == 7) {2000if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&2001dest.nr == BRW_ARF_NULL) {2002brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);2003}2004}2005}20062007void brw_CMPN(struct brw_codegen *p,2008struct brw_reg dest,2009unsigned conditional,2010struct brw_reg src0,2011struct brw_reg src1)2012{2013const struct intel_device_info *devinfo = p->devinfo;2014brw_inst *insn = next_insn(p, BRW_OPCODE_CMPN);20152016brw_inst_set_cond_modifier(devinfo, insn, conditional);2017brw_set_dest(p, insn, dest);2018brw_set_src0(p, insn, src0);2019brw_set_src1(p, insn, src1);20202021/* Page 166 of the Ivy Bridge PRM Volume 4 part 3 (Execution Unit ISA)2022* says:2023*2024* If the destination is the null register, the {Switch} instruction2025* option must be used.2026*2027* Page 77 of the Haswell PRM Volume 2b contains the same text.2028*/2029if (devinfo->ver == 7) {2030if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&2031dest.nr == BRW_ARF_NULL) {2032brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);2033}2034}2035}20362037/***********************************************************************2038* Helpers for the various SEND message types:2039*/20402041/** 
Extended math function, float[8].2042*/2043void gfx4_math(struct brw_codegen *p,2044struct brw_reg dest,2045unsigned function,2046unsigned msg_reg_nr,2047struct brw_reg src,2048unsigned precision )2049{2050const struct intel_device_info *devinfo = p->devinfo;2051brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);2052unsigned data_type;2053if (has_scalar_region(src)) {2054data_type = BRW_MATH_DATA_SCALAR;2055} else {2056data_type = BRW_MATH_DATA_VECTOR;2057}20582059assert(devinfo->ver < 6);20602061/* Example code doesn't set predicate_control for send2062* instructions.2063*/2064brw_inst_set_pred_control(devinfo, insn, 0);2065brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);20662067brw_set_dest(p, insn, dest);2068brw_set_src0(p, insn, src);2069brw_set_math_message(p,2070insn,2071function,2072src.type == BRW_REGISTER_TYPE_D,2073precision,2074data_type);2075}20762077void gfx6_math(struct brw_codegen *p,2078struct brw_reg dest,2079unsigned function,2080struct brw_reg src0,2081struct brw_reg src1)2082{2083const struct intel_device_info *devinfo = p->devinfo;2084brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);20852086assert(devinfo->ver >= 6);20872088assert(dest.file == BRW_GENERAL_REGISTER_FILE ||2089(devinfo->ver >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));20902091assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);2092if (devinfo->ver == 6) {2093assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);2094assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);2095}20962097if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||2098function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||2099function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {2100assert(src0.type != BRW_REGISTER_TYPE_F);2101assert(src1.type != BRW_REGISTER_TYPE_F);2102assert(src1.file == BRW_GENERAL_REGISTER_FILE ||2103(devinfo->ver >= 8 && src1.file == BRW_IMMEDIATE_VALUE));2104} else {2105assert(src0.type == BRW_REGISTER_TYPE_F ||2106(src0.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 9));2107assert(src1.type == 
BRW_REGISTER_TYPE_F ||2108(src1.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 9));2109}21102111/* Source modifiers are ignored for extended math instructions on Gfx6. */2112if (devinfo->ver == 6) {2113assert(!src0.negate);2114assert(!src0.abs);2115assert(!src1.negate);2116assert(!src1.abs);2117}21182119brw_inst_set_math_function(devinfo, insn, function);21202121brw_set_dest(p, insn, dest);2122brw_set_src0(p, insn, src0);2123brw_set_src1(p, insn, src1);2124}21252126/**2127* Return the right surface index to access the thread scratch space using2128* stateless dataport messages.2129*/2130unsigned2131brw_scratch_surface_idx(const struct brw_codegen *p)2132{2133/* The scratch space is thread-local so IA coherency is unnecessary. */2134if (p->devinfo->ver >= 8)2135return GFX8_BTI_STATELESS_NON_COHERENT;2136else2137return BRW_BTI_STATELESS;2138}21392140/**2141* Write a block of OWORDs (half a GRF each) from the scratch buffer,2142* using a constant offset per channel.2143*2144* The offset must be aligned to oword size (16 bytes). Used for2145* register spilling.2146*/2147void brw_oword_block_write_scratch(struct brw_codegen *p,2148struct brw_reg mrf,2149int num_regs,2150unsigned offset)2151{2152const struct intel_device_info *devinfo = p->devinfo;2153const unsigned target_cache =2154(devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :2155devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :2156BRW_SFID_DATAPORT_WRITE);2157const struct tgl_swsb swsb = brw_get_default_swsb(p);2158uint32_t msg_type;21592160if (devinfo->ver >= 6)2161offset /= 16;21622163mrf = retype(mrf, BRW_REGISTER_TYPE_UD);21642165const unsigned mlen = 1 + num_regs;21662167/* Set up the message header. This is g0, with g0.2 filled with2168* the offset. 
We don't want to leave our offset around in g0 or2169* it'll screw up texture samples, so set it up inside the message2170* reg.2171*/2172{2173brw_push_insn_state(p);2174brw_set_default_exec_size(p, BRW_EXECUTE_8);2175brw_set_default_mask_control(p, BRW_MASK_DISABLE);2176brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);2177brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));21782179brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));21802181/* set message header global offset field (reg 0, element 2) */2182brw_set_default_exec_size(p, BRW_EXECUTE_1);2183brw_set_default_swsb(p, tgl_swsb_null());2184brw_MOV(p,2185retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,2186mrf.nr,21872), BRW_REGISTER_TYPE_UD),2188brw_imm_ud(offset));21892190brw_pop_insn_state(p);2191brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));2192}21932194{2195struct brw_reg dest;2196brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);2197int send_commit_msg;2198struct brw_reg src_header = retype(brw_vec8_grf(0, 0),2199BRW_REGISTER_TYPE_UW);22002201brw_inst_set_sfid(devinfo, insn, target_cache);2202brw_inst_set_compression(devinfo, insn, false);22032204if (brw_inst_exec_size(devinfo, insn) >= 16)2205src_header = vec16(src_header);22062207assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);2208if (devinfo->ver < 6)2209brw_inst_set_base_mrf(devinfo, insn, mrf.nr);22102211/* Until gfx6, writes followed by reads from the same location2212* are not guaranteed to be ordered unless write_commit is set.2213* If set, then a no-op write is issued to the destination2214* register to set a dependency, and a read from the destination2215* can be used to ensure the ordering.2216*2217* For gfx6, only writes between different threads need ordering2218* protection. 
Our use of DP writes is all about register2219* spilling within a thread.2220*/2221if (devinfo->ver >= 6) {2222dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);2223send_commit_msg = 0;2224} else {2225dest = src_header;2226send_commit_msg = 1;2227}22282229brw_set_dest(p, insn, dest);2230if (devinfo->ver >= 6) {2231brw_set_src0(p, insn, mrf);2232} else {2233brw_set_src0(p, insn, brw_null_reg());2234}22352236if (devinfo->ver >= 6)2237msg_type = GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;2238else2239msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;22402241brw_set_desc(p, insn,2242brw_message_desc(devinfo, mlen, send_commit_msg, true) |2243brw_dp_write_desc(devinfo, brw_scratch_surface_idx(p),2244BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),2245msg_type, send_commit_msg));2246}2247}224822492250/**2251* Read a block of owords (half a GRF each) from the scratch buffer2252* using a constant index per channel.2253*2254* Offset must be aligned to oword size (16 bytes). Used for register2255* spilling.2256*/2257void2258brw_oword_block_read_scratch(struct brw_codegen *p,2259struct brw_reg dest,2260struct brw_reg mrf,2261int num_regs,2262unsigned offset)2263{2264const struct intel_device_info *devinfo = p->devinfo;2265const struct tgl_swsb swsb = brw_get_default_swsb(p);22662267if (devinfo->ver >= 6)2268offset /= 16;22692270if (p->devinfo->ver >= 7) {2271/* On gen 7 and above, we no longer have message registers and we can2272* send from any register we want. By using the destination register2273* for the message, we guarantee that the implied message write won't2274* accidentally overwrite anything. 
This has been a problem because2275* the MRF registers and source for the final FB write are both fixed2276* and may overlap.2277*/2278mrf = retype(dest, BRW_REGISTER_TYPE_UD);2279} else {2280mrf = retype(mrf, BRW_REGISTER_TYPE_UD);2281}2282dest = retype(dest, BRW_REGISTER_TYPE_UW);22832284const unsigned rlen = num_regs;2285const unsigned target_cache =2286(devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :2287devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :2288BRW_SFID_DATAPORT_READ);22892290{2291brw_push_insn_state(p);2292brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));2293brw_set_default_exec_size(p, BRW_EXECUTE_8);2294brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);2295brw_set_default_mask_control(p, BRW_MASK_DISABLE);22962297brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));22982299/* set message header global offset field (reg 0, element 2) */2300brw_set_default_exec_size(p, BRW_EXECUTE_1);2301brw_set_default_swsb(p, tgl_swsb_null());2302brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));23032304brw_pop_insn_state(p);2305brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));2306}23072308{2309brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);23102311brw_inst_set_sfid(devinfo, insn, target_cache);2312assert(brw_inst_pred_control(devinfo, insn) == 0);2313brw_inst_set_compression(devinfo, insn, false);23142315brw_set_dest(p, insn, dest); /* UW? 
*/2316if (devinfo->ver >= 6) {2317brw_set_src0(p, insn, mrf);2318} else {2319brw_set_src0(p, insn, brw_null_reg());2320brw_inst_set_base_mrf(devinfo, insn, mrf.nr);2321}23222323brw_set_desc(p, insn,2324brw_message_desc(devinfo, 1, rlen, true) |2325brw_dp_read_desc(devinfo, brw_scratch_surface_idx(p),2326BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),2327BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,2328BRW_DATAPORT_READ_TARGET_RENDER_CACHE));2329}2330}23312332void2333gfx7_block_read_scratch(struct brw_codegen *p,2334struct brw_reg dest,2335int num_regs,2336unsigned offset)2337{2338brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);2339assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);23402341brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));23422343/* The HW requires that the header is present; this is to get the g0.52344* scratch offset.2345*/2346brw_set_src0(p, insn, brw_vec8_grf(0, 0));23472348/* According to the docs, offset is "A 12-bit HWord offset into the memory2349* Immediate Memory buffer as specified by binding table 0xFF." An HWORD2350* is 32 bytes, which happens to be the size of a register.2351*/2352offset /= REG_SIZE;2353assert(offset < (1 << 12));23542355gfx7_set_dp_scratch_message(p, insn,2356false, /* scratch read */2357false, /* OWords */2358false, /* invalidate after read */2359num_regs,2360offset,23611, /* mlen: just g0 */2362num_regs, /* rlen */2363true); /* header present */2364}23652366/**2367* Read float[4] vectors from the data port constant cache.2368* Location (in buffer) should be a multiple of 16.2369* Used for fetching shader constants.2370*/2371void brw_oword_block_read(struct brw_codegen *p,2372struct brw_reg dest,2373struct brw_reg mrf,2374uint32_t offset,2375uint32_t bind_table_index)2376{2377const struct intel_device_info *devinfo = p->devinfo;2378const unsigned target_cache =2379(devinfo->ver >= 6 ? 
GFX6_SFID_DATAPORT_CONSTANT_CACHE :2380BRW_SFID_DATAPORT_READ);2381const unsigned exec_size = 1 << brw_get_default_exec_size(p);2382const struct tgl_swsb swsb = brw_get_default_swsb(p);23832384/* On newer hardware, offset is in units of owords. */2385if (devinfo->ver >= 6)2386offset /= 16;23872388mrf = retype(mrf, BRW_REGISTER_TYPE_UD);23892390brw_push_insn_state(p);2391brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);2392brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);2393brw_set_default_mask_control(p, BRW_MASK_DISABLE);23942395brw_push_insn_state(p);2396brw_set_default_exec_size(p, BRW_EXECUTE_8);2397brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));2398brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));23992400/* set message header global offset field (reg 0, element 2) */2401brw_set_default_exec_size(p, BRW_EXECUTE_1);2402brw_set_default_swsb(p, tgl_swsb_null());2403brw_MOV(p,2404retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,2405mrf.nr,24062), BRW_REGISTER_TYPE_UD),2407brw_imm_ud(offset));2408brw_pop_insn_state(p);24092410brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));24112412brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);24132414brw_inst_set_sfid(devinfo, insn, target_cache);24152416/* cast dest to a uword[8] vector */2417dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);24182419brw_set_dest(p, insn, dest);2420if (devinfo->ver >= 6) {2421brw_set_src0(p, insn, mrf);2422} else {2423brw_set_src0(p, insn, brw_null_reg());2424brw_inst_set_base_mrf(devinfo, insn, mrf.nr);2425}24262427brw_set_desc(p, insn,2428brw_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |2429brw_dp_read_desc(devinfo, bind_table_index,2430BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),2431BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,2432BRW_DATAPORT_READ_TARGET_DATA_CACHE));24332434brw_pop_insn_state(p);2435}24362437brw_inst *2438brw_fb_WRITE(struct brw_codegen *p,2439struct brw_reg payload,2440struct brw_reg implied_header,2441unsigned 
msg_control,2442unsigned binding_table_index,2443unsigned msg_length,2444unsigned response_length,2445bool eot,2446bool last_render_target,2447bool header_present)2448{2449const struct intel_device_info *devinfo = p->devinfo;2450const unsigned target_cache =2451(devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :2452BRW_SFID_DATAPORT_WRITE);2453brw_inst *insn;2454struct brw_reg dest, src0;24552456if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)2457dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);2458else2459dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);24602461if (devinfo->ver >= 6) {2462insn = next_insn(p, BRW_OPCODE_SENDC);2463} else {2464insn = next_insn(p, BRW_OPCODE_SEND);2465}2466brw_inst_set_sfid(devinfo, insn, target_cache);2467brw_inst_set_compression(devinfo, insn, false);24682469if (devinfo->ver >= 6) {2470/* headerless version, just submit color payload */2471src0 = payload;2472} else {2473assert(payload.file == BRW_MESSAGE_REGISTER_FILE);2474brw_inst_set_base_mrf(devinfo, insn, payload.nr);2475src0 = implied_header;2476}24772478brw_set_dest(p, insn, dest);2479brw_set_src0(p, insn, src0);2480brw_set_desc(p, insn,2481brw_message_desc(devinfo, msg_length, response_length,2482header_present) |2483brw_fb_write_desc(devinfo, binding_table_index, msg_control,2484last_render_target,2485false /* coarse_write */));2486brw_inst_set_eot(devinfo, insn, eot);24872488return insn;2489}24902491brw_inst *2492gfx9_fb_READ(struct brw_codegen *p,2493struct brw_reg dst,2494struct brw_reg payload,2495unsigned binding_table_index,2496unsigned msg_length,2497unsigned response_length,2498bool per_sample)2499{2500const struct intel_device_info *devinfo = p->devinfo;2501assert(devinfo->ver >= 9);2502brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);25032504brw_inst_set_sfid(devinfo, insn, GFX6_SFID_DATAPORT_RENDER_CACHE);2505brw_set_dest(p, insn, dst);2506brw_set_src0(p, insn, payload);2507brw_set_desc(2508p, insn,2509brw_message_desc(devinfo, msg_length, 
response_length, true) |2510brw_fb_read_desc(devinfo, binding_table_index, 0 /* msg_control */,25111 << brw_get_default_exec_size(p), per_sample));2512brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);25132514return insn;2515}25162517/**2518* Texture sample instruction.2519* Note: the msg_type plus msg_length values determine exactly what kind2520* of sampling operation is performed. See volume 4, page 161 of docs.2521*/2522void brw_SAMPLE(struct brw_codegen *p,2523struct brw_reg dest,2524unsigned msg_reg_nr,2525struct brw_reg src0,2526unsigned binding_table_index,2527unsigned sampler,2528unsigned msg_type,2529unsigned response_length,2530unsigned msg_length,2531unsigned header_present,2532unsigned simd_mode,2533unsigned return_format)2534{2535const struct intel_device_info *devinfo = p->devinfo;2536brw_inst *insn;25372538if (msg_reg_nr != -1)2539gfx6_resolve_implied_move(p, &src0, msg_reg_nr);25402541insn = next_insn(p, BRW_OPCODE_SEND);2542brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);2543brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */25442545/* From the 965 PRM (volume 4, part 1, section 14.2.41):2546*2547* "Instruction compression is not allowed for this instruction (that2548* is, send). The hardware behavior is undefined if this instruction is2549* set as compressed. However, compress control can be set to "SecHalf"2550* to affect the EMask generation."2551*2552* No similar wording is found in later PRMs, but there are examples2553* utilizing send with SecHalf. More importantly, SIMD8 sampler messages2554* are allowed in SIMD16 mode and they could not work without SecHalf. 
For2555* these reasons, we allow BRW_COMPRESSION_2NDHALF here.2556*/2557brw_inst_set_compression(devinfo, insn, false);25582559if (devinfo->ver < 6)2560brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);25612562brw_set_dest(p, insn, dest);2563brw_set_src0(p, insn, src0);2564brw_set_desc(p, insn,2565brw_message_desc(devinfo, msg_length, response_length,2566header_present) |2567brw_sampler_desc(devinfo, binding_table_index, sampler,2568msg_type, simd_mode, return_format));2569}25702571/* Adjust the message header's sampler state pointer to2572* select the correct group of 16 samplers.2573*/2574void brw_adjust_sampler_state_pointer(struct brw_codegen *p,2575struct brw_reg header,2576struct brw_reg sampler_index)2577{2578/* The "Sampler Index" field can only store values between 0 and 15.2579* However, we can add an offset to the "Sampler State Pointer"2580* field, effectively selecting a different set of 16 samplers.2581*2582* The "Sampler State Pointer" needs to be aligned to a 32-byte2583* offset, and each sampler state is only 16-bytes, so we can't2584* exclusively use the offset - we have to use both.2585*/25862587const struct intel_device_info *devinfo = p->devinfo;25882589if (sampler_index.file == BRW_IMMEDIATE_VALUE) {2590const int sampler_state_size = 16; /* 16 bytes */2591uint32_t sampler = sampler_index.ud;25922593if (sampler >= 16) {2594assert(devinfo->verx10 >= 75);2595brw_ADD(p,2596get_element_ud(header, 3),2597get_element_ud(brw_vec8_grf(0, 0), 3),2598brw_imm_ud(16 * (sampler / 16) * sampler_state_size));2599}2600} else {2601/* Non-const sampler array indexing case */2602if (devinfo->verx10 <= 70) {2603return;2604}26052606struct brw_reg temp = get_element_ud(header, 3);26072608brw_push_insn_state(p);2609brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));2610brw_set_default_swsb(p, tgl_swsb_regdist(1));2611brw_SHL(p, temp, temp, brw_imm_ud(4));2612brw_ADD(p,2613get_element_ud(header, 3),2614get_element_ud(brw_vec8_grf(0, 0), 
3),2615temp);2616brw_pop_insn_state(p);2617}2618}26192620/* All these variables are pretty confusing - we might be better off2621* using bitmasks and macros for this, in the old style. Or perhaps2622* just having the caller instantiate the fields in dword3 itself.2623*/2624void brw_urb_WRITE(struct brw_codegen *p,2625struct brw_reg dest,2626unsigned msg_reg_nr,2627struct brw_reg src0,2628enum brw_urb_write_flags flags,2629unsigned msg_length,2630unsigned response_length,2631unsigned offset,2632unsigned swizzle)2633{2634const struct intel_device_info *devinfo = p->devinfo;2635brw_inst *insn;26362637gfx6_resolve_implied_move(p, &src0, msg_reg_nr);26382639if (devinfo->ver >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {2640/* Enable Channel Masks in the URB_WRITE_HWORD message header */2641brw_push_insn_state(p);2642brw_set_default_access_mode(p, BRW_ALIGN_1);2643brw_set_default_mask_control(p, BRW_MASK_DISABLE);2644brw_set_default_exec_size(p, BRW_EXECUTE_1);2645brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),2646BRW_REGISTER_TYPE_UD),2647retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),2648brw_imm_ud(0xff00));2649brw_pop_insn_state(p);2650}26512652insn = next_insn(p, BRW_OPCODE_SEND);26532654assert(msg_length < BRW_MAX_MRF(devinfo->ver));26552656brw_set_dest(p, insn, dest);2657brw_set_src0(p, insn, src0);2658brw_set_src1(p, insn, brw_imm_d(0));26592660if (devinfo->ver < 6)2661brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);26622663brw_set_urb_message(p,2664insn,2665flags,2666msg_length,2667response_length,2668offset,2669swizzle);2670}26712672void2673brw_send_indirect_message(struct brw_codegen *p,2674unsigned sfid,2675struct brw_reg dst,2676struct brw_reg payload,2677struct brw_reg desc,2678unsigned desc_imm,2679bool eot)2680{2681const struct intel_device_info *devinfo = p->devinfo;2682struct brw_inst *send;26832684dst = retype(dst, BRW_REGISTER_TYPE_UW);26852686assert(desc.type == BRW_REGISTER_TYPE_UD);26872688if (desc.file == 
BRW_IMMEDIATE_VALUE) {2689send = next_insn(p, BRW_OPCODE_SEND);2690brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));2691brw_set_desc(p, send, desc.ud | desc_imm);2692} else {2693const struct tgl_swsb swsb = brw_get_default_swsb(p);2694struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);26952696brw_push_insn_state(p);2697brw_set_default_access_mode(p, BRW_ALIGN_1);2698brw_set_default_mask_control(p, BRW_MASK_DISABLE);2699brw_set_default_exec_size(p, BRW_EXECUTE_1);2700brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);2701brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));27022703/* Load the indirect descriptor to an address register using OR so the2704* caller can specify additional descriptor bits with the desc_imm2705* immediate.2706*/2707brw_OR(p, addr, desc, brw_imm_ud(desc_imm));27082709brw_pop_insn_state(p);27102711brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));2712send = next_insn(p, BRW_OPCODE_SEND);2713brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));27142715if (devinfo->ver >= 12)2716brw_inst_set_send_sel_reg32_desc(devinfo, send, true);2717else2718brw_set_src1(p, send, addr);2719}27202721brw_set_dest(p, send, dst);2722brw_inst_set_sfid(devinfo, send, sfid);2723brw_inst_set_eot(devinfo, send, eot);2724}27252726void2727brw_send_indirect_split_message(struct brw_codegen *p,2728unsigned sfid,2729struct brw_reg dst,2730struct brw_reg payload0,2731struct brw_reg payload1,2732struct brw_reg desc,2733unsigned desc_imm,2734struct brw_reg ex_desc,2735unsigned ex_desc_imm,2736bool eot)2737{2738const struct intel_device_info *devinfo = p->devinfo;2739struct brw_inst *send;27402741dst = retype(dst, BRW_REGISTER_TYPE_UW);27422743assert(desc.type == BRW_REGISTER_TYPE_UD);27442745if (desc.file == BRW_IMMEDIATE_VALUE) {2746desc.ud |= desc_imm;2747} else {2748const struct tgl_swsb swsb = brw_get_default_swsb(p);2749struct brw_reg addr = retype(brw_address_reg(0), 
                                          BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);
      desc = addr;

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   if (ex_desc.file == BRW_IMMEDIATE_VALUE &&
       (devinfo->ver >= 12 ||
        ((ex_desc.ud | ex_desc_imm) & INTEL_MASK(15, 12)) == 0)) {
      /* The extended descriptor can stay immediate: either the encoding has
       * room for all of its bits (Gfx12+) or bits 15:12 are zero (see the
       * fallback note below).
       */
      ex_desc.ud |= ex_desc_imm;
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect extended descriptor to an address register using OR
       * so the caller can specify additional descriptor bits with the
       * desc_imm immediate.
       *
       * Even though the instruction dispatcher always pulls the SFID and EOT
       * fields from the instruction itself, actual external unit which
       * processes the message gets the SFID and EOT from the extended
       * descriptor which comes from the address register.  If we don't OR
       * those two bits in, the external unit may get confused and hang.
       */
      unsigned imm_part = ex_desc_imm | sfid | eot << 5;

      if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
         /* ex_desc bits 15:12 don't exist in the instruction encoding prior
          * to Gfx12, so we may have fallen back to an indirect extended
          * descriptor.
          */
         brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part));
      } else {
         brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part));
      }

      brw_pop_insn_state(p);
      ex_desc = addr;

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   /* Gfx12+ encodes the extended descriptor on plain SEND; earlier gens need
    * the split-send SENDS opcode.
    */
   send = next_insn(p, devinfo->ver >= 12 ? BRW_OPCODE_SEND : BRW_OPCODE_SENDS);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload0, BRW_REGISTER_TYPE_UD));
   brw_set_src1(p, send, retype(payload1, BRW_REGISTER_TYPE_UD));

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 0);
      brw_inst_set_send_desc(devinfo, send, desc.ud);
   } else {
      /* Indirect descriptors must live in a0.0 (whole subregister). */
      assert(desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(desc.nr == BRW_ARF_ADDRESS);
      assert(desc.subnr == 0);
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 1);
   }

   if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0);
      brw_inst_set_sends_ex_desc(devinfo, send, ex_desc.ud);
   } else {
      /* Indirect extended descriptors must be DWord-aligned in the address
       * register file; the encoding stores the subregister in DWord units.
       */
      assert(ex_desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(ex_desc.nr == BRW_ARF_ADDRESS);
      assert((ex_desc.subnr & 0x3) == 0);
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1);
      brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, ex_desc.subnr >> 2);
   }

   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}

/* Emit a surface-access SEND where the surface (binding table) index may be
 * given in a register rather than as an immediate.
 */
static void
brw_send_indirect_surface_message(struct brw_codegen *p,
                                  unsigned sfid,
                                  struct brw_reg dst,
                                  struct brw_reg payload,
                                  struct brw_reg surface,
                                  unsigned desc_imm)
{
   if (surface.file != BRW_IMMEDIATE_VALUE) {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
       * some surface array is accessed out of bounds.
       */
      brw_AND(p, addr,
              suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
                        BRW_GET_SWZ(surface.swizzle, 0)),
              brw_imm_ud(0xff));

      brw_pop_insn_state(p);

      surface = addr;
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false);
}

/* Returns true if the WHILE instruction at while_offset jumps back to a
 * point at or before start_offset.  JIP is negative for a backwards jump.
 */
static bool
while_jumps_before_offset(const struct intel_device_info *devinfo,
                          brw_inst *insn, int while_offset, int start_offset)
{
   int scale = 16 / brw_jump_scale(devinfo);
   int jip = devinfo->ver == 6 ? brw_inst_gfx6_jump_count(devinfo, insn)
                               : brw_inst_jip(devinfo, insn);
   assert(jip < 0);
   return while_offset + jip * scale <= start_offset;
}

/* Scan forwards from start_offset for the end of the current control-flow
 * block: the matching ENDIF, ELSE, HALT, or a WHILE that closes the loop
 * containing start_offset.  Returns 0 if no block end is found.
 */
static int
brw_find_next_block_end(struct brw_codegen *p, int start_offset)
{
   int offset;
   void *store = p->store;
   const struct intel_device_info *devinfo = p->devinfo;

   /* Nesting depth of IF/ENDIF pairs opened after start_offset. */
   int depth = 0;

   for (offset = next_offset(devinfo, store, start_offset);
        offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_IF:
         depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if (depth == 0)
            return offset;
         depth--;
         break;
      case BRW_OPCODE_WHILE:
         /* If the while doesn't jump before our instruction, it's the end
          * of a sibling do...while loop.  Ignore it.
          */
         if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
            continue;
         FALLTHROUGH;
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_HALT:
         if (depth == 0)
            return offset;
         break;
      default:
         break;
      }
   }

   return 0;
}

/* There is no DO instruction on gfx6, so to find the end of the loop
 * we have to see if the loop is jumping back before our start
 * instruction.
 */
static int
brw_find_loop_end(struct brw_codegen *p, int start_offset)
{
   const struct intel_device_info *devinfo = p->devinfo;
   int offset;
   void *store = p->store;

   assert(devinfo->ver >= 6);

   /* Always start after the instruction (such as a WHILE) we're trying to fix
    * up.
    */
   for (offset = next_offset(devinfo, store, start_offset);
        offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
         if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
            return offset;
      }
   }
   assert(!"not reached");
   return start_offset;
}

/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 */
void
brw_set_uip_jip(struct brw_codegen *p, int start_offset)
{
   const struct intel_device_info *devinfo = p->devinfo;
   int offset;
   int br = brw_jump_scale(devinfo);
   int scale = 16 / br;
   void *store = p->store;

   if (devinfo->ver < 6)
      return;

   for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
      brw_inst *insn = store + offset;
      /* Compaction must not have run yet; jump fields are patched in the
       * uncompacted encoding.
       */
      assert(brw_inst_cmpt_control(devinfo, insn) == 0);

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         /* Gfx7 UIP
          points to WHILE; Gfx6 points just after it */
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset +
                           (devinfo->ver == 6 ? 16 : 0)) / scale);
         break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;

      case BRW_OPCODE_ENDIF: {
         /* An ENDIF with no following block end jumps to the next
          * instruction (a distance of one jump unit).
          */
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->ver >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gfx6_jump_count(devinfo, insn, jump);
         break;
      }

      case BRW_OPCODE_HALT:
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same.  In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
         } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;

      default:
         break;
      }
   }
}

/* Emit the FF (fixed-function) synchronization SEND used by pre-Gfx6-style
 * URB handling; the message payload is moved into msg_reg_nr first.
 */
void brw_ff_sync(struct brw_codegen *p,
                 struct brw_reg dest,
                 unsigned msg_reg_nr,
                 struct brw_reg src0,
                 bool allocate,
                 unsigned response_length,
                 bool eot)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gfx6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (devinfo->ver < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_ff_sync_message(p,
                           insn,
                           allocate,
                           response_length,
                           eot);
}

/**
 * Emit the SEND instruction necessary to generate stream output data on Gfx6
 * (for transform feedback).
 *
 * If send_commit_msg is true, this is the last piece of stream output data
 * from this thread, so send the data as a committed write.  According to the
 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
 *
 *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
 *    writes are complete by sending the final write as a committed write."
 */
void
brw_svb_write(struct brw_codegen *p,
              struct brw_reg dest,
              unsigned msg_reg_nr,
              struct brw_reg src0,
              unsigned binding_table_index,
              bool send_commit_msg)
{
   const struct intel_device_info *devinfo = p->devinfo;
   assert(devinfo->ver == 6);
   const unsigned target_cache = GFX6_SFID_DATAPORT_RENDER_CACHE;
   brw_inst *insn;

   gfx6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(devinfo, insn, target_cache);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, 1, send_commit_msg, true) |
                brw_dp_write_desc(devinfo, binding_table_index,
                                  0, /* msg_control: ignored */
                                  GFX6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
                                  send_commit_msg)); /* send_commit_msg */
}

/* Number of response/payload registers for a surface message: one register
 * for SIMD4x2, one per channel for SIMD8 and below, two per channel above.
 */
static unsigned
brw_surface_payload_size(unsigned num_channels,
                         unsigned exec_size /**< 0 for SIMD4x2 */)
{
   if (exec_size == 0)
      return 1; /* SIMD4x2 */
   else if (exec_size <= 8)
      return num_channels;
   else
      return 2 * num_channels;
}

/* Emit an untyped atomic dataport message to the given surface. */
void
brw_untyped_atomic(struct brw_codegen *p,
                   struct brw_reg dst,
                   struct brw_reg payload,
                   struct brw_reg surface,
                   unsigned atomic_op,
                   unsigned msg_length,
                   bool response_expected,
                   bool header_present)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->verx10 >= 75 ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GFX7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
   const bool has_simd4x2 = devinfo->verx10 >= 75;
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
                              has_simd4x2 ? 0 : 8;
   const unsigned response_length =
      brw_surface_payload_size(response_expected, exec_size);
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, response_length, header_present) |
      brw_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op,
                                 response_expected);
   /* Mask out unused components -- This is especially important in Align16
    * mode on generations that don't have native support for SIMD4x2 atomics,
    * because unused but enabled components will cause the dataport to perform
    * additional atomic operations on the addresses that happen to be in the
    * uninitialized Y, Z and W coordinates of the payload.
    */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;

   brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
                                     payload, surface, desc);
}

/* Emit an untyped surface read message returning num_channels per lane. */
void
brw_untyped_surface_read(struct brw_codegen *p,
                         struct brw_reg dst,
                         struct brw_reg payload,
                         struct brw_reg surface,
                         unsigned msg_length,
                         unsigned num_channels)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->verx10 >= 75 ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GFX7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   const unsigned exec_size = align1 ?
      1 << brw_get_default_exec_size(p) : 0;
   const unsigned response_length =
      brw_surface_payload_size(num_channels, exec_size);
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, response_length, false) |
      brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false);

   brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
}

/* Emit an untyped surface write message; no response is expected. */
void
brw_untyped_surface_write(struct brw_codegen *p,
                          struct brw_reg payload,
                          struct brw_reg surface,
                          unsigned msg_length,
                          unsigned num_channels,
                          bool header_present)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->verx10 >= 75 ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GFX7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
   const bool has_simd4x2 = devinfo->verx10 >= 75;
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
                              has_simd4x2 ? 0 : 8;
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, 0, header_present) |
      brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true);
   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
   const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW;

   brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask),
                                     payload, surface, desc);
}

/* Fill in the descriptor fields of a legacy (pre-LSC) memory fence SEND. */
static void
brw_set_memory_fence_message(struct brw_codegen *p,
                             struct brw_inst *insn,
                             enum brw_message_target sfid,
                             bool commit_enable,
                             unsigned bti)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* One payload register (header); one response register iff a commit
    * write-back was requested.
    */
   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, 1, (commit_enable ? 1 : 0), true));

   brw_inst_set_sfid(devinfo, insn, sfid);

   switch (sfid) {
   case GFX6_SFID_DATAPORT_RENDER_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_RC_MEMORY_FENCE);
      break;
   case GFX7_SFID_DATAPORT_DATA_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_DC_MEMORY_FENCE);
      break;
   default:
      unreachable("Not reached");
   }

   if (commit_enable)
      brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);

   /* A non-zero BTI on a fence is only valid on Gfx11+. */
   assert(devinfo->ver >= 11 || bti == 0);
   brw_inst_set_binding_table_index(devinfo, insn, bti);
}

/* Fill in the descriptor fields of an LSC (Gfx12+) memory fence SEND. */
static void
gfx12_set_memory_fence_message(struct brw_codegen *p,
                               struct brw_inst *insn,
                               enum brw_message_target sfid)
{
   const unsigned mlen = 1; /* g0 header */
   /* Completion signaled by write to register. No data returned. */
   const unsigned rlen = 1;

   brw_inst_set_sfid(p->devinfo, insn, sfid);

   enum lsc_fence_scope scope = LSC_FENCE_THREADGROUP;
   enum lsc_flush_type flush_type = LSC_FLUSH_TYPE_NONE;

   if (sfid == GFX12_SFID_TGM) {
      scope = LSC_FENCE_GPU;
      flush_type = LSC_FLUSH_TYPE_EVICT;
   }

   brw_set_desc(p, insn, lsc_fence_msg_desc(p->devinfo, scope,
                                            flush_type, false) |
                         brw_message_desc(p->devinfo, mlen, rlen, false));
}

/* Emit a memory fence SEND, selecting the LSC or legacy dataport encoding
 * based on hardware support.
 */
void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst,
                 struct brw_reg src,
                 enum opcode send_op,
                 enum brw_message_target sfid,
                 bool commit_enable,
                 unsigned bti)
{
   const struct intel_device_info *devinfo = p->devinfo;

   dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW);
   src = retype(vec1(src), BRW_REGISTER_TYPE_UD);

   /* Set dst as destination for dependency tracking, the MEMORY_FENCE
    * message doesn't write anything back.
    */
   struct brw_inst *insn = next_insn(p, send_op);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, src);

   /* All DG2 hardware requires LSC for fence messages, even A-step */
   if (devinfo->has_lsc)
      gfx12_set_memory_fence_message(p, insn, sfid);
   else
      brw_set_memory_fence_message(p, insn, sfid, commit_enable, bti);
}

/* Emit a pixel interpolator SEND querying barycentrics for the given mode. */
void
brw_pixel_interpolator_query(struct brw_codegen *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             bool noperspective,
                             bool coarse_pixel_rate,
                             unsigned mode,
                             struct brw_reg data,
                             unsigned msg_length,
                             unsigned response_length)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const uint16_t exec_size = brw_get_default_exec_size(p);
   const unsigned slot_group = brw_get_default_group(p) / 16;
   const unsigned simd_mode = (exec_size == BRW_EXECUTE_16);
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, response_length, false) |
      brw_pixel_interp_desc(devinfo, mode, noperspective, coarse_pixel_rate,
                            simd_mode, slot_group);

   /* brw_send_indirect_message will automatically use a direct send message
    * if data is actually immediate.
    */
   brw_send_indirect_message(p,
                             GFX7_SFID_PIXEL_INTERPOLATOR,
                             dest,
                             mrf,
                             vec1(data),
                             desc,
                             false);
}

/* Write the index of the first enabled channel (per the execution mask ANDed
 * with the given dispatch mask) into dst.
 */
void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
                      struct brw_reg mask)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
   const unsigned qtr_control = brw_get_default_group(p) / 8;
   brw_inst *inst;

   assert(devinfo->ver >= 7);
   assert(mask.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);

   /* The flag register is only used on Gfx7 in align1 mode, so avoid setting
    * unnecessary bits in the instruction words, get the information we need
    * and reset the default flag register.  This allows more instructions to be
    * compacted.
    */
   const unsigned flag_subreg = p->current->flag_subreg;
   brw_set_default_flag_reg(p, 0, 0);

   if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->ver >= 8) {
         /* Getting the first active channel index is easy on Gfx8: Just find
          * the first bit set in the execution mask.  The register exists on
          * HSW already but it reads back as all ones when the current
          * instruction has execution masking disabled, so it's kind of
          * useless.
          */
         struct brw_reg exec_mask =
            retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
            /* Unfortunately, ce0 does not take into account the thread
             * dispatch mask, which may be a problem in cases where it's not
             * tightly packed (i.e. it doesn't have the form '2^n - 1' for
             * some n).  Combine ce0 with the given dispatch (or vector) mask
             * to mask off those channels which were never dispatched by the
             * hardware.
             */
            brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
            brw_AND(p, vec1(dst), exec_mask, vec1(dst));
            exec_mask = vec1(dst);
         }

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first active channel relative to the
          * specified quarter control as result.
          */
         inst = brw_FBL(p, vec1(dst), exec_mask);
      } else {
         const struct brw_reg flag = brw_flag_subreg(flag_subreg);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));

         /* Run enough instructions returning zero with execution masking and
          * a conditional modifier enabled in order to get the full execution
          * mask in f1.0.  We could use a single 32-wide move here if it
          * weren't because of the hardware bug that causes channel enables to
          * be applied incorrectly to the second half of 32-wide instructions
          * on Gfx7.
          */
         const unsigned lower_size = MIN2(16, exec_size);
         for (unsigned i = 0; i < exec_size / lower_size; i++) {
            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                           brw_imm_uw(0));
            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
            brw_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2);
            brw_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2);
         }

         /* Find the first bit set in the exec_size-wide portion of the flag
          * register that was updated by the last sequence of MOV
          * instructions.
          */
         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->ver >= 8 &&
          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register.  Note that ce0
          * doesn't take into account the dispatch mask, so the Gfx7 path
          * should be used instead unless you have the guarantee that the
          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
          * for some n).
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination without and with execution masking to
          * find out which of the channels is active.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}

/* Copy the channel of src selected by idx into every channel of dst. */
void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);
   assert(!src.abs && !src.negate);
   assert(src.type == dst.type);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job, but
       * asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      src = align1 ?
            stride(suboffset(src, i), 0, 1, 0) :
            stride(suboffset(src, 4 * i), 0, 4, 1);

      if (type_sz(src.type) > 4 && !devinfo->has_64bit_float) {
         /* No native 64-bit float moves: copy the value as two DWords. */
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    subscript(src, BRW_REGISTER_TYPE_D, 0));
         brw_set_default_swsb(p, tgl_swsb_null());
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    subscript(src, BRW_REGISTER_TYPE_D, 1));
      } else {
         brw_MOV(p, dst, src);
      }
   } else {
      /* From the Haswell PRM section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address.  The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Fortunately, for broadcast, we never have a sub-register offset so
       * this isn't an issue.
       */
      assert(src.subnr == 0);

      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(util_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit) {
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
            offset = offset % limit;
         }

         brw_pop_insn_state(p);

         brw_set_default_swsb(p, tgl_swsb_regdist(1));

         /* Use indirect addressing to fetch the specified component. */
         if (type_sz(src.type) > 4 &&
             (devinfo->is_cherryview || intel_device_info_is_9lp(devinfo) ||
              !devinfo->has_64bit_float)) {
            /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *     integer DWord multiply, indirect addressing must not be
             *     used."
             *
             * To work around both of this issue, we do two integer MOVs
             * insead of one 64-bit MOV.  Because no double value should ever
             * cross a register boundary, it's safe to use the immediate
             * offset in the indirect here to handle adding 4 bytes to the
             * offset and avoid the extra ADD to the register file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    retype(brw_vec1_indirect(addr.subnr, offset),
                           BRW_REGISTER_TYPE_D));
            brw_set_default_swsb(p, tgl_swsb_null());
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    retype(brw_vec1_indirect(addr.subnr, offset + 4),
                           BRW_REGISTER_TYPE_D));
         } else {
            brw_MOV(p, dst,
                    retype(brw_vec1_indirect(addr.subnr, offset), src.type));
         }
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}

/**
 * This instruction is generated as a single-channel align1 instruction by
 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
 *
 * We can't use the typed atomic op in the FS because that has the execution
 * mask ANDed with the pixel mask, but we just want to write the one dword for
 * all the pixels.
 *
 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
 * one u32.  So we use the same untyped atomic write message as the pixel
 * shader.
 *
 * The untyped atomic operation requires a BUFFER surface type with RAW
 * format, and is only accessible through the legacy DATA_CACHE dataport
 * messages.
 */
void brw_shader_time_add(struct brw_codegen *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->verx10 >= 75 ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GFX7_SFID_DATAPORT_DATA_CACHE);
   assert(devinfo->ver >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                      payload.nr, 0));
   brw_set_desc(p, send, (brw_message_desc(devinfo, 2, 0, false) |
                          brw_dp_untyped_atomic_desc(devinfo, 1, BRW_AOP_ADD,
                                                     false)));

   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_binding_table_index(devinfo, send, surf_index);

   brw_pop_insn_state(p);
}


/**
 * Emit the SEND message for a barrier
 */
void
brw_barrier(struct brw_codegen *p, struct brw_reg src)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct brw_inst *inst;

   assert(devinfo->ver >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   inst = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, inst, src);
   brw_set_src1(p, inst, brw_null_reg());
   brw_set_desc(p, inst, brw_message_desc(devinfo, 1, 0, false));

   brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY);
   brw_inst_set_gateway_subfuncid(devinfo, inst,
                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);

   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_pop_insn_state(p);
}


/**
 * Emit the wait instruction for a barrier
 */
void
brw_WAIT(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   /* WAIT uses the notification register as both destination and source. */
   struct brw_reg src = brw_notification_reg();

   insn = next_insn(p, BRW_OPCODE_WAIT);
   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());

   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}

/* Update cr0 to clear the bits in mask and set the bits in mode (the
 * floating-point control fields).
 */
void
brw_float_controls_mode(struct brw_codegen *p,
                        unsigned mode, unsigned mask)
{
   /* From the Skylake PRM, Volume 7, page 760:
    *  "Implementation Restriction on Register Access: When the control
    *   register is used as an explicit source and/or destination, hardware
    *   does not ensure execution pipeline coherency. Software must set the
    *   thread control field to 'switch' for an instruction that uses
    *   control register as an explicit operand."
    *
    * On Gfx12+ this is implemented in terms of SWSB annotations instead.
    */
   brw_set_default_swsb(p, tgl_swsb_regdist(1));

   brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
                            brw_imm_ud(~mask));
   brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
   if (p->devinfo->ver < 12)
      brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);

   if (mode) {
      brw_inst *inst_or = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
                                 brw_imm_ud(mode));
      brw_inst_set_exec_size(p->devinfo, inst_or, BRW_EXECUTE_1);
      if (p->devinfo->ver < 12)
         brw_inst_set_thread_control(p->devinfo, inst_or, BRW_THREAD_SWITCH);
   }

   if (p->devinfo->ver >= 12)
      brw_SYNC(p, TGL_SYNC_NOP);
}

/* Patch the immediate of a previously emitted relocatable MOV (see
 * brw_MOV_reloc_imm) with its final value.
 */
void
brw_update_reloc_imm(const struct intel_device_info *devinfo,
                     brw_inst *inst,
                     uint32_t value)
{
   /* Sanity check that the instruction is a MOV of an immediate */
   assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MOV);
   assert(brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE);

   /* If it was compacted, we can't safely rewrite */
   assert(brw_inst_cmpt_control(devinfo, inst) == 0);

   brw_inst_set_imm_ud(devinfo, inst, value);
}

/* A default value for constants that will be patched at run-time.
 * We pick an arbitrary value that prevents instruction compaction.
 */
#define DEFAULT_PATCH_IMM 0x4a7cc037

/* Emit a MOV of a placeholder immediate and record a relocation so the real
 * value can be patched in later via brw_update_reloc_imm.
 */
void
brw_MOV_reloc_imm(struct brw_codegen *p,
                  struct brw_reg dst,
                  enum brw_reg_type src_type,
                  uint32_t id)
{
   assert(type_sz(src_type) == 4);
   assert(type_sz(dst.type) == 4);

   brw_add_reloc(p, id, BRW_SHADER_RELOC_TYPE_MOV_IMM,
                 p->next_insn_offset, 0);

   brw_MOV(p, dst, retype(brw_imm_ud(DEFAULT_PATCH_IMM), src_type));
}