Path: blob/21.2-virgl/src/intel/compiler/brw_fs_nir.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "compiler/glsl/ir.h"
#include "brw_fs.h"
#include "brw_nir.h"
#include "brw_rt.h"
#include "brw_eu.h"
#include "nir_search_helpers.h"
#include "util/u_math.h"
#include "util/bitscan.h"

using namespace brw;

void
fs_visitor::emit_nir_code()
{
   emit_shader_float_controls_execution_mode();

   /* emit the arrays used for inputs and outputs - load/store intrinsics will
    * be converted to reads/writes of these arrays
    */
   nir_setup_outputs();
   nir_setup_uniforms();
   nir_emit_system_values();
   last_scratch = ALIGN(nir->scratch_size, 4) * dispatch_width;

   nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir));

   bld.emit(SHADER_OPCODE_HALT_TARGET);
}

void
fs_visitor::nir_setup_outputs()
{
   if (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT)
      return;

   unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };

   /* Calculate the size of output registers in a separate pass, before
    * allocating them.  With ARB_enhanced_layouts, multiple output variables
    * may occupy the same slot, but have different type sizes.
    */
   nir_foreach_shader_out_variable(var, nir) {
      const int loc = var->data.driver_location;
      const unsigned var_vec4s =
         var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4)
                           : type_size_vec4(var->type, true);
      vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
   }

   for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
      if (vec4s[loc] == 0) {
         loc++;
         continue;
      }

      unsigned reg_size = vec4s[loc];

      /* Check if there are any ranges that start within this range and extend
       * past it. If so, include them in this allocation.
       */
      for (unsigned i = 1; i < reg_size; i++) {
         assert(i + loc < ARRAY_SIZE(vec4s));
         reg_size = MAX2(vec4s[i + loc] + i, reg_size);
      }

      fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * reg_size);
      for (unsigned i = 0; i < reg_size; i++) {
         assert(loc + i < ARRAY_SIZE(outputs));
         outputs[loc + i] = offset(reg, bld, 4 * i);
      }

      loc += reg_size;
   }
}

void
fs_visitor::nir_setup_uniforms()
{
   /* Only the first compile gets to set up uniforms. */
   if (push_constant_loc) {
      assert(pull_constant_loc);
      return;
   }

   uniforms = nir->num_uniforms / 4;

   if ((stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL) &&
       devinfo->verx10 < 125) {
      /* Add uniforms for builtins after regular NIR uniforms.
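       *
       * These builtins are pushed just like regular NIR uniforms: the
       * variable workgroup size (three extra params) when it is used and
       * lowered by the compiler, then the subgroup ID, so each one is
       * counted in `uniforms` and recorded in prog_data's param list.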
       */
      assert(uniforms == prog_data->nr_params);

      uint32_t *param;
      if (nir->info.workgroup_size_variable &&
          compiler->lower_variable_group_size) {
         param = brw_stage_prog_data_add_params(prog_data, 3);
         for (unsigned i = 0; i < 3; i++) {
            param[i] = (BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X + i);
            group_size[i] = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
         }
      }

      /* Subgroup ID must be the last uniform on the list.  This will make
       * it easier later to split between cross thread and per thread
       * uniforms.
       */
      param = brw_stage_prog_data_add_params(prog_data, 1);
      *param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
      subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
   }
}

static bool
emit_system_values_block(nir_block *block, fs_visitor *v)
{
   fs_reg *reg;

   nir_foreach_instr(instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_vertex_id:
      case nir_intrinsic_load_base_vertex:
         unreachable("should be lowered by nir_lower_system_values().");

      case nir_intrinsic_load_vertex_id_zero_base:
      case nir_intrinsic_load_is_indexed_draw:
      case nir_intrinsic_load_first_vertex:
      case nir_intrinsic_load_instance_id:
      case nir_intrinsic_load_base_instance:
      case nir_intrinsic_load_draw_id:
         unreachable("should be lowered by brw_nir_lower_vs_inputs().");

      case nir_intrinsic_load_invocation_id:
         if (v->stage == MESA_SHADER_TESS_CTRL)
            break;
         assert(v->stage == MESA_SHADER_GEOMETRY);
         reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
         if (reg->file == BAD_FILE) {
            const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
            fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
            fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
            abld.SHR(iid, g1, brw_imm_ud(27u));
            *reg = iid;
         }
         break;

      case nir_intrinsic_load_sample_pos:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
         if (reg->file == BAD_FILE)
            *reg = *v->emit_samplepos_setup();
         break;

      case nir_intrinsic_load_sample_id:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
         if (reg->file == BAD_FILE)
            *reg = *v->emit_sampleid_setup();
         break;

      case nir_intrinsic_load_sample_mask_in:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         assert(v->devinfo->ver >= 7);
         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
         if (reg->file == BAD_FILE)
            *reg = *v->emit_samplemaskin_setup();
         break;

      case nir_intrinsic_load_workgroup_id:
         assert(v->stage == MESA_SHADER_COMPUTE ||
                v->stage == MESA_SHADER_KERNEL);
         reg = &v->nir_system_values[SYSTEM_VALUE_WORKGROUP_ID];
         if (reg->file == BAD_FILE)
            *reg = *v->emit_cs_work_group_id_setup();
         break;

      case nir_intrinsic_load_helper_invocation:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION];
         if (reg->file == BAD_FILE) {
            const fs_builder abld =
               v->bld.annotate("gl_HelperInvocation", NULL);

            /* On Gfx6+ (gl_HelperInvocation is only exposed on Gfx7+) the
             * pixel mask is in g1.7 of the thread payload.
             *
             * We move the per-channel pixel enable bit to the low bit of each
             * channel by shifting the byte containing the pixel mask by the
             * vector immediate 0x76543210UV.
             *
             * The region of <1,8,0> reads only 1 byte (the pixel masks for
             * subspans 0 and 1) in SIMD8 and an additional byte
             * (the pixel masks for 2 and 3) in SIMD16.
             */
            fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1);

            for (unsigned i = 0; i < DIV_ROUND_UP(v->dispatch_width, 16); i++) {
               const fs_builder hbld = abld.group(MIN2(16, v->dispatch_width), i);
               hbld.SHR(offset(shifted, hbld, i),
                        stride(retype(brw_vec1_grf(1 + i, 7),
                                      BRW_REGISTER_TYPE_UB),
                               1, 8, 0),
                        brw_imm_v(0x76543210));
            }

            /* A set bit in the pixel mask means the channel is enabled, but
             * that is the opposite of gl_HelperInvocation so we need to invert
             * the mask.
             *
             * The negate source-modifier bit of logical instructions on Gfx8+
             * performs 1's complement negation, so we can use that instead of
             * a NOT instruction.
             */
            fs_reg inverted = negate(shifted);
            if (v->devinfo->ver < 8) {
               inverted = abld.vgrf(BRW_REGISTER_TYPE_UW);
               abld.NOT(inverted, shifted);
            }

            /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
             * with 1 and negating.
             */
            fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
            abld.AND(anded, inverted, brw_imm_uw(1));

            fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1);
            abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D)));
            *reg = dst;
         }
         break;

      case nir_intrinsic_load_frag_shading_rate:
         reg = &v->nir_system_values[SYSTEM_VALUE_FRAG_SHADING_RATE];
         if (reg->file == BAD_FILE)
            *reg = *v->emit_shading_rate_setup();
         break;

      default:
         break;
      }
   }

   return true;
}

void
fs_visitor::nir_emit_system_values()
{
   nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
   for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
      nir_system_values[i] = fs_reg();
   }

   /* Always emit SUBGROUP_INVOCATION.  Dead code will clean it up if we
    * never end up using it.
    */
   {
      const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
      fs_reg &reg = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
      reg = abld.vgrf(BRW_REGISTER_TYPE_UW);

      const fs_builder allbld8 = abld.group(8, 0).exec_all();
      allbld8.MOV(reg, brw_imm_v(0x76543210));
      if (dispatch_width > 8)
         allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u));
      if (dispatch_width > 16) {
         const fs_builder allbld16 = abld.group(16, 0).exec_all();
         allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u));
      }
   }

   nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)nir);
   nir_foreach_block(block, impl)
      emit_system_values_block(block, this);
}

void
fs_visitor::nir_emit_impl(nir_function_impl *impl)
{
   nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc);
   for (unsigned i = 0; i < impl->reg_alloc; i++) {
      nir_locals[i] = fs_reg();
   }

   foreach_list_typed(nir_register, reg, node, &impl->registers) {
      unsigned array_elems =
         reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
      unsigned size = array_elems * reg->num_components;
      const brw_reg_type reg_type = reg->bit_size == 8 ?
         BRW_REGISTER_TYPE_B :
         brw_reg_type_from_bit_size(reg->bit_size, BRW_REGISTER_TYPE_F);
      nir_locals[reg->index] = bld.vgrf(reg_type, size);
   }

   nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg,
                             impl->ssa_alloc);

   nir_emit_cf_list(&impl->body);
}

void
fs_visitor::nir_emit_cf_list(exec_list *list)
{
   exec_list_validate(list);
   foreach_list_typed(nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_if:
         nir_emit_if(nir_cf_node_as_if(node));
         break;

      case nir_cf_node_loop:
         nir_emit_loop(nir_cf_node_as_loop(node));
         break;

      case nir_cf_node_block:
         nir_emit_block(nir_cf_node_as_block(node));
         break;

      default:
         unreachable("Invalid CFG node block");
      }
   }
}

void
fs_visitor::nir_emit_if(nir_if *if_stmt)
{
   bool invert;
   fs_reg cond_reg;

   /* If the condition has the form !other_condition, use other_condition as
    * the source, but invert the predicate on the if instruction.
    */
   nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition);
   if (cond != NULL && cond->op == nir_op_inot) {
      invert = true;
      cond_reg = get_nir_src(cond->src[0].src);
      cond_reg = offset(cond_reg, bld, cond->src[0].swizzle[0]);
   } else {
      invert = false;
      cond_reg = get_nir_src(if_stmt->condition);
   }

   /* first, put the condition into f0 */
   fs_inst *inst = bld.MOV(bld.null_reg_d(),
                           retype(cond_reg, BRW_REGISTER_TYPE_D));
   inst->conditional_mod = BRW_CONDITIONAL_NZ;

   bld.IF(BRW_PREDICATE_NORMAL)->predicate_inverse = invert;

   nir_emit_cf_list(&if_stmt->then_list);

   if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
      bld.emit(BRW_OPCODE_ELSE);
      nir_emit_cf_list(&if_stmt->else_list);
   }

   bld.emit(BRW_OPCODE_ENDIF);

   if (devinfo->ver < 7)
      limit_dispatch_width(16, "Non-uniform control flow unsupported "
                           "in SIMD32 mode.");
}

void
fs_visitor::nir_emit_loop(nir_loop *loop)
{
   bld.emit(BRW_OPCODE_DO);

   nir_emit_cf_list(&loop->body);

   bld.emit(BRW_OPCODE_WHILE);

   if (devinfo->ver < 7)
      limit_dispatch_width(16, "Non-uniform control flow unsupported "
                           "in SIMD32 mode.");
}

void
fs_visitor::nir_emit_block(nir_block *block)
{
   nir_foreach_instr(instr, block) {
      nir_emit_instr(instr);
   }
}

void
fs_visitor::nir_emit_instr(nir_instr *instr)
{
   const fs_builder abld = bld.annotate(NULL, instr);

   switch (instr->type) {
   case nir_instr_type_alu:
      nir_emit_alu(abld, nir_instr_as_alu(instr), true);
      break;

   case nir_instr_type_deref:
      unreachable("All derefs should've been lowered");
      break;

   case nir_instr_type_intrinsic:
      switch (stage) {
      case MESA_SHADER_VERTEX:
         nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      case MESA_SHADER_TESS_CTRL:
         nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      case MESA_SHADER_TESS_EVAL:
         nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      case MESA_SHADER_GEOMETRY:
         nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      case MESA_SHADER_FRAGMENT:
         nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      case MESA_SHADER_COMPUTE:
      case MESA_SHADER_KERNEL:
         nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      case MESA_SHADER_RAYGEN:
      case MESA_SHADER_ANY_HIT:
      case MESA_SHADER_CLOSEST_HIT:
      case MESA_SHADER_MISS:
      case MESA_SHADER_INTERSECTION:
      case MESA_SHADER_CALLABLE:
         nir_emit_bs_intrinsic(abld,
                               nir_instr_as_intrinsic(instr));
         break;
      default:
         unreachable("unsupported shader stage");
      }
      break;

   case nir_instr_type_tex:
      nir_emit_texture(abld, nir_instr_as_tex(instr));
      break;

   case nir_instr_type_load_const:
      nir_emit_load_const(abld, nir_instr_as_load_const(instr));
      break;

   case nir_instr_type_ssa_undef:
      /* We create a new VGRF for undefs on every use (by handling
       * them in get_nir_src()), rather than for each definition.
       * This helps register coalescing eliminate MOVs from undef.
       */
      break;

   case nir_instr_type_jump:
      nir_emit_jump(abld, nir_instr_as_jump(instr));
      break;

   default:
      unreachable("unknown instruction type");
   }
}

/**
 * Recognizes a parent instruction of nir_op_extract_* and changes the type to
 * match instr.
 */
bool
fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
                                      const fs_reg &result)
{
   if (!instr->src[0].src.is_ssa ||
       !instr->src[0].src.ssa->parent_instr)
      return false;

   if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *src0 =
      nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);

   if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 &&
       src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
      return false;

   unsigned element = nir_src_as_uint(src0->src[1].src);

   /* Element type to extract.*/
   const brw_reg_type type = brw_int_type(
      src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1,
      src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8);

   fs_reg op0 = get_nir_src(src0->src[0].src);
   op0.type = brw_type_for_nir_type(devinfo,
      (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
                     nir_src_bit_size(src0->src[0].src)));
   op0 = offset(op0, bld, src0->src[0].swizzle[0]);

   bld.MOV(result, subscript(op0, type, element));
   return true;
}

bool
fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
                                         const fs_reg &result)
{
   nir_intrinsic_instr *src0 = nir_src_as_intrinsic(instr->src[0].src);
   if (src0 == NULL || src0->intrinsic != nir_intrinsic_load_front_face)
      return false;

   if (!nir_src_is_const(instr->src[1].src) ||
       !nir_src_is_const(instr->src[2].src))
      return false;

   const float value1 = nir_src_as_float(instr->src[1].src);
   const float value2 = nir_src_as_float(instr->src[2].src);
   if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f)
      return false;

   /* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */
   assert(value1 == -value2);

   fs_reg tmp = vgrf(glsl_type::int_type);

   if (devinfo->ver >= 12) {
      /* Bit 15 of g1.1 is 0 if the polygon is front facing. */
      fs_reg g1 = fs_reg(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_W));

      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
       *
       *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
       *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
       *
       * and negate the result for (gl_FrontFacing ? -1.0 : 1.0).
       */
      bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
             g1, brw_imm_uw(0x3f80));

      if (value1 == -1.0f)
         bld.MOV(tmp, negate(tmp));

   } else if (devinfo->ver >= 6) {
      /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));

      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
       *
       *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
       *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
       *
       * and negate g0.0<0,1,0>W for (gl_FrontFacing ?
       * -1.0 : 1.0).
       *
       * This negation looks like it's safe in practice, because bits 0:4 will
       * surely be TRIANGLES
       */

      if (value1 == -1.0f) {
         g0.negate = true;
      }

      bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
             g0, brw_imm_uw(0x3f80));
   } else {
      /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));

      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
       *
       *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
       *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
       *
       * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
       *
       * This negation looks like it's safe in practice, because bits 0:4 will
       * surely be TRIANGLES
       */

      if (value1 == -1.0f) {
         g1_6.negate = true;
      }

      bld.OR(tmp, g1_6, brw_imm_d(0x3f800000));
   }
   bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));

   return true;
}

static void
emit_find_msb_using_lzd(const fs_builder &bld,
                        const fs_reg &result,
                        const fs_reg &src,
                        bool is_signed)
{
   fs_inst *inst;
   fs_reg temp = src;

   if (is_signed) {
      /* LZD of an absolute value source almost always does the right
       * thing.  There are two problem values:
       *
       * * 0x80000000.  Since abs(0x80000000) == 0x80000000, LZD returns
       *   0.  However, findMSB(int(0x80000000)) == 30.
       *
       * * 0xffffffff.  Since abs(0xffffffff) == 1, LZD returns
       *   31.  Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
       *
       *      For a value of zero or negative one, -1 will be returned.
       *
       * * Negative powers of two.  LZD(abs(-(1<<x))) returns x, but
       *   findMSB(-(1<<x)) should return x-1.
       *
       * For all negative number cases, including 0x80000000 and
       * 0xffffffff, the correct value is obtained from LZD if instead of
       * negating the (already negative) value the logical-not is used.  A
       * conditional logical-not can be achieved in two instructions.
       */
      temp = bld.vgrf(BRW_REGISTER_TYPE_D);

      bld.ASR(temp, src, brw_imm_d(31));
      bld.XOR(temp, temp, src);
   }

   bld.LZD(retype(result, BRW_REGISTER_TYPE_UD),
           retype(temp, BRW_REGISTER_TYPE_UD));

   /* LZD counts from the MSB side, while GLSL's findMSB() wants the count
    * from the LSB side.  Subtract the result from 31 to convert the MSB
    * count into an LSB count.  If no bits are set, LZD will return 32.
    * 31-32 = -1, which is exactly what findMSB() is supposed to return.
    */
   inst = bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D), brw_imm_d(31));
   inst->src[0].negate = true;
}

static brw_rnd_mode
brw_rnd_mode_from_nir_op (const nir_op op) {
   switch (op) {
   case nir_op_f2f16_rtz:
      return BRW_RND_MODE_RTZ;
   case nir_op_f2f16_rtne:
      return BRW_RND_MODE_RTNE;
   default:
      unreachable("Operation doesn't support rounding mode");
   }
}

static brw_rnd_mode
brw_rnd_mode_from_execution_mode(unsigned execution_mode)
{
   if (nir_has_any_rounding_mode_rtne(execution_mode))
      return BRW_RND_MODE_RTNE;
   if (nir_has_any_rounding_mode_rtz(execution_mode))
      return BRW_RND_MODE_RTZ;
   return BRW_RND_MODE_UNSPECIFIED;
}

fs_reg
fs_visitor::prepare_alu_destination_and_sources(const fs_builder &bld,
                                                nir_alu_instr *instr,
                                                fs_reg *op,
                                                bool need_dest)
{
   fs_reg result =
      need_dest ?
      get_nir_dest(instr->dest.dest) : bld.null_reg_ud();

   result.type = brw_type_for_nir_type(devinfo,
      (nir_alu_type)(nir_op_infos[instr->op].output_type |
                     nir_dest_bit_size(instr->dest.dest)));

   assert(!instr->dest.saturate);

   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
      /* We don't lower to source modifiers so they should not exist. */
      assert(!instr->src[i].abs);
      assert(!instr->src[i].negate);

      op[i] = get_nir_src(instr->src[i].src);
      op[i].type = brw_type_for_nir_type(devinfo,
         (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
                        nir_src_bit_size(instr->src[i].src)));
   }

   /* Move and vecN instructions may still be vectored.  Return the raw,
    * vectored source and destination so that fs_visitor::nir_emit_alu can
    * handle it.  Other callers should not have to handle these kinds of
    * instructions.
    */
   switch (instr->op) {
   case nir_op_mov:
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
   case nir_op_vec8:
   case nir_op_vec16:
      return result;
   default:
      break;
   }

   /* At this point, we have dealt with any instruction that operates on
    * more than a single channel.  Therefore, we can just adjust the source
    * and destination registers for that channel and emit the instruction.
    */
   unsigned channel = 0;
   if (nir_op_infos[instr->op].output_size == 0) {
      /* Since NIR is doing the scalarizing for us, we should only ever see
       * vectorized operations with a single channel.
       */
      assert(util_bitcount(instr->dest.write_mask) == 1);
      channel = ffs(instr->dest.write_mask) - 1;

      result = offset(result, bld, channel);
   }

   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
      assert(nir_op_infos[instr->op].input_sizes[i] < 2);
      op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
   }

   return result;
}

void
fs_visitor::resolve_inot_sources(const fs_builder &bld, nir_alu_instr *instr,
                                 fs_reg *op)
{
   for (unsigned i = 0; i < 2; i++) {
      nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src);

      if (inot_instr != NULL && inot_instr->op == nir_op_inot) {
         /* The source of the inot is now the source of instr. */
         prepare_alu_destination_and_sources(bld, inot_instr, &op[i], false);

         assert(!op[i].negate);
         op[i].negate = true;
      } else {
         op[i] = resolve_source_modifiers(op[i]);
      }
   }
}

bool
fs_visitor::try_emit_b2fi_of_inot(const fs_builder &bld,
                                  fs_reg result,
                                  nir_alu_instr *instr)
{
   if (devinfo->ver < 6 || devinfo->ver >= 12)
      return false;

   nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src);

   if (inot_instr == NULL || inot_instr->op != nir_op_inot)
      return false;

   /* HF is also possible as a destination on BDW+.  For nir_op_b2i, the set
    * of valid size-changing combinations is a bit more complex.
    *
    * The source restriction is just because I was lazy about generating the
    * constant below.
    */
   if (nir_dest_bit_size(instr->dest.dest) != 32 ||
       nir_src_bit_size(inot_instr->src[0].src) != 32)
      return false;

   /* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0.  Since a can only be 0 or -1,
    * this is float(1 + a).
    */
   fs_reg op;

   prepare_alu_destination_and_sources(bld, inot_instr, &op, false);

   /* Ignore the saturate modifier, if there is one.  The result of the
The result of the792* arithmetic can only be 0 or 1, so the clamping will do nothing anyway.793*/794bld.ADD(result, op, brw_imm_d(1));795796return true;797}798799/**800* Emit code for nir_op_fsign possibly fused with a nir_op_fmul801*802* If \c instr is not the \c nir_op_fsign, then \c fsign_src is the index of803* the source of \c instr that is a \c nir_op_fsign.804*/805void806fs_visitor::emit_fsign(const fs_builder &bld, const nir_alu_instr *instr,807fs_reg result, fs_reg *op, unsigned fsign_src)808{809fs_inst *inst;810811assert(instr->op == nir_op_fsign || instr->op == nir_op_fmul);812assert(fsign_src < nir_op_infos[instr->op].num_inputs);813814if (instr->op != nir_op_fsign) {815const nir_alu_instr *const fsign_instr =816nir_src_as_alu_instr(instr->src[fsign_src].src);817818/* op[fsign_src] has the nominal result of the fsign, and op[1 -819* fsign_src] has the other multiply source. This must be rearranged so820* that op[0] is the source of the fsign op[1] is the other multiply821* source.822*/823if (fsign_src != 0)824op[1] = op[0];825826op[0] = get_nir_src(fsign_instr->src[0].src);827828const nir_alu_type t =829(nir_alu_type)(nir_op_infos[instr->op].input_types[0] |830nir_src_bit_size(fsign_instr->src[0].src));831832op[0].type = brw_type_for_nir_type(devinfo, t);833834unsigned channel = 0;835if (nir_op_infos[instr->op].output_size == 0) {836/* Since NIR is doing the scalarizing for us, we should only ever see837* vectorized operations with a single channel.838*/839assert(util_bitcount(instr->dest.write_mask) == 1);840channel = ffs(instr->dest.write_mask) - 1;841}842843op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]);844}845846if (type_sz(op[0].type) == 2) {847/* AND(val, 0x8000) gives the sign bit.848*849* Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero.850*/851fs_reg zero = retype(brw_imm_uw(0), BRW_REGISTER_TYPE_HF);852bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ);853854op[0].type = BRW_REGISTER_TYPE_UW;855result.type = BRW_REGISTER_TYPE_UW;856bld.AND(result, op[0], brw_imm_uw(0x8000u));857858if (instr->op == nir_op_fsign)859inst = bld.OR(result, result, brw_imm_uw(0x3c00u));860else {861/* Use XOR here to get the result sign correct. */862inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UW));863}864865inst->predicate = BRW_PREDICATE_NORMAL;866} else if (type_sz(op[0].type) == 4) {867/* AND(val, 0x80000000) gives the sign bit.868*869* Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not870* zero.871*/872bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);873874op[0].type = BRW_REGISTER_TYPE_UD;875result.type = BRW_REGISTER_TYPE_UD;876bld.AND(result, op[0], brw_imm_ud(0x80000000u));877878if (instr->op == nir_op_fsign)879inst = bld.OR(result, result, brw_imm_ud(0x3f800000u));880else {881/* Use XOR here to get the result sign correct. 
          */
         inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UD));
      }

      inst->predicate = BRW_PREDICATE_NORMAL;
   } else {
      /* For doubles we do the same but we need to consider:
       *
       * - 2-src instructions can't operate with 64-bit immediates
       * - The sign is encoded in the high 32-bit of each DF
       * - We need to produce a DF result.
       */

      fs_reg zero = vgrf(glsl_type::double_type);
      bld.MOV(zero, setup_imm_df(bld, 0.0));
      bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ);

      bld.MOV(result, zero);

      fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1);
      bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1),
              brw_imm_ud(0x80000000u));

      if (instr->op == nir_op_fsign) {
         set_predicate(BRW_PREDICATE_NORMAL,
                       bld.OR(r, r, brw_imm_ud(0x3ff00000u)));
      } else {
         /* This could be done better in some cases.  If the scale is an
          * immediate with the low 32-bits all 0, emitting a separate XOR and
          * OR would allow an algebraic optimization to remove the OR.  There
          * are currently zero instances of fsign(double(x))*IMM in shader-db
          * or any test suite, so it is hard to care at this time.
          */
         fs_reg result_int64 = retype(result, BRW_REGISTER_TYPE_UQ);
         inst = bld.XOR(result_int64, result_int64,
                        retype(op[1], BRW_REGISTER_TYPE_UQ));
      }
   }
}

/**
 * Determine whether sources of a nir_op_fmul can be fused with a nir_op_fsign
 *
 * Checks the operands of a \c nir_op_fmul to determine whether or not
 * \c emit_fsign could fuse the multiplication with the \c sign() calculation.
 *
 * \param instr  The multiplication instruction
 *
 * \param fsign_src  The source of \c instr that may or may not be a
 *                   \c nir_op_fsign
 */
static bool
can_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src)
{
   assert(instr->op == nir_op_fmul);

   nir_alu_instr *const fsign_instr =
      nir_src_as_alu_instr(instr->src[fsign_src].src);

   /* Rules:
    *
    * 1. instr->src[fsign_src] must be a nir_op_fsign.
    * 2. The nir_op_fsign can only be used by this multiplication.
    * 3.
    *    The source that is the nir_op_fsign does not have source modifiers.
    *    \c emit_fsign only examines the source modifiers of the source of the
    *    \c nir_op_fsign.
    *
    * The nir_op_fsign must also not have the saturate modifier, but steps
    * have already been taken (in nir_opt_algebraic) to ensure that.
    */
   return fsign_instr != NULL && fsign_instr->op == nir_op_fsign &&
          is_used_once(fsign_instr);
}

void
fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr,
                         bool need_dest)
{
   struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
   fs_inst *inst;
   unsigned execution_mode =
      bld.shader->nir->info.float_controls_execution_mode;

   fs_reg op[NIR_MAX_VEC_COMPONENTS];
   fs_reg result = prepare_alu_destination_and_sources(bld, instr, op, need_dest);

   switch (instr->op) {
   case nir_op_mov:
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
   case nir_op_vec8:
   case nir_op_vec16: {
      fs_reg temp = result;
      bool need_extra_copy = false;
      for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
         if (!instr->src[i].src.is_ssa &&
             instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
            need_extra_copy = true;
            temp = bld.vgrf(result.type, 4);
            break;
         }
      }

      for (unsigned i = 0; i < 4; i++) {
         if (!(instr->dest.write_mask & (1 << i)))
            continue;

         if (instr->op == nir_op_mov) {
            bld.MOV(offset(temp, bld, i),
                    offset(op[0], bld, instr->src[0].swizzle[i]));
         } else {
            bld.MOV(offset(temp, bld, i),
                    offset(op[i], bld, instr->src[i].swizzle[0]));
         }
      }

      /* In this case the source and destination registers were the same,
       * so we need to insert an extra set of moves in order to deal with
       * any swizzling.
       */
      if (need_extra_copy) {
         for (unsigned i = 0; i < 4; i++) {
            if (!(instr->dest.write_mask & (1 << i)))
               continue;

            bld.MOV(offset(result, bld, i), offset(temp, bld, i));
         }
      }
      return;
   }

   case nir_op_i2f32:
   case nir_op_u2f32:
      if (optimize_extract_to_float(instr, result))
         return;
      inst = bld.MOV(result, op[0]);
      break;

   case nir_op_f2f16_rtne:
   case nir_op_f2f16_rtz:
   case nir_op_f2f16: {
      brw_rnd_mode rnd = BRW_RND_MODE_UNSPECIFIED;

      if (nir_op_f2f16 == instr->op)
         rnd = brw_rnd_mode_from_execution_mode(execution_mode);
      else
         rnd = brw_rnd_mode_from_nir_op(instr->op);

      if (BRW_RND_MODE_UNSPECIFIED != rnd)
         bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), brw_imm_d(rnd));

      /* In theory, it would be better to use BRW_OPCODE_F32TO16.  Depending
       * on the HW gen, it is a special hw opcode or just a MOV, and
       * brw_F32TO16 (at brw_eu_emit) would do the work to choose.
       *
       * But if we want to use that opcode, we need to provide support on
       * different optimizations and lowerings.
       * As right now HF support is only for gfx8+, it will be better to use
       * directly the MOV, and use BRW_OPCODE_F32TO16 when/if we work for HF
       * support on gfx7.
       */
      assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
      inst = bld.MOV(result, op[0]);
      break;
   }

   case nir_op_b2i8:
   case nir_op_b2i16:
   case nir_op_b2i32:
   case nir_op_b2i64:
   case nir_op_b2f16:
   case nir_op_b2f32:
   case nir_op_b2f64:
      if (try_emit_b2fi_of_inot(bld, result, instr))
         break;
      op[0].type = BRW_REGISTER_TYPE_D;
      op[0].negate = !op[0].negate;
      FALLTHROUGH;
   case nir_op_i2f64:
   case nir_op_i2i64:
   case nir_op_u2f64:
   case nir_op_u2u64:
   case nir_op_f2f64:
   case nir_op_f2i64:
   case nir_op_f2u64:
   case nir_op_i2i32:
   case nir_op_u2u32:
   case nir_op_f2i32:
   case nir_op_f2u32:
   case nir_op_i2f16:
   case nir_op_i2i16:
   case nir_op_u2f16:
   case nir_op_u2u16:
   case nir_op_f2i16:
   case nir_op_f2u16:
   case nir_op_i2i8:
   case nir_op_u2u8:
   case nir_op_f2i8:
   case nir_op_f2u8:
      if (result.type == BRW_REGISTER_TYPE_B ||
          result.type == BRW_REGISTER_TYPE_UB ||
          result.type == BRW_REGISTER_TYPE_HF)
         assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */

      if (op[0].type == BRW_REGISTER_TYPE_B ||
          op[0].type == BRW_REGISTER_TYPE_UB ||
          op[0].type == BRW_REGISTER_TYPE_HF)
         assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */

      inst = bld.MOV(result, op[0]);
      break;

   case nir_op_fsat:
      inst = bld.MOV(result, op[0]);
      inst->saturate = true;
      break;

   case nir_op_fneg:
   case nir_op_ineg:
      op[0].negate = true;
      inst = bld.MOV(result, op[0]);
      break;

   case nir_op_fabs:
   case nir_op_iabs:
      op[0].negate = false;
      op[0].abs = true;
      inst = bld.MOV(result, op[0]);
      break;

   case nir_op_f2f32:
      if (nir_has_any_rounding_mode_enabled(execution_mode)) {
         brw_rnd_mode rnd =
            brw_rnd_mode_from_execution_mode(execution_mode);
         bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
                  brw_imm_d(rnd));
      }

      if (op[0].type == BRW_REGISTER_TYPE_HF)
         assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */

      inst = bld.MOV(result, op[0]);
      break;

   case nir_op_fsign:
      emit_fsign(bld, instr, result, op, 0);
      break;

   case nir_op_frcp:
      inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
      break;

   case nir_op_fexp2:
      inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
      break;

   case nir_op_flog2:
      inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
      break;

   case nir_op_fsin:
      inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
      break;

   case nir_op_fcos:
      inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
      break;

   case nir_op_fddx:
      if (fs_key->high_quality_derivatives) {
         inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
      } else {
         inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
      }
      break;
   case nir_op_fddx_fine:
      inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
      break;
   case nir_op_fddx_coarse:
      inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
      break;
   case nir_op_fddy:
      if (fs_key->high_quality_derivatives) {
         inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
      } else {
         inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
      }
      break;
   case nir_op_fddy_fine:
      inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
      break;
   case nir_op_fddy_coarse:
      inst = bld.emit(FS_OPCODE_DDY_COARSE, result,
                      op[0]);
      break;

   case nir_op_fadd:
      if (nir_has_any_rounding_mode_enabled(execution_mode)) {
         brw_rnd_mode rnd =
            brw_rnd_mode_from_execution_mode(execution_mode);
         bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
                  brw_imm_d(rnd));
      }
      FALLTHROUGH;
   case nir_op_iadd:
      inst = bld.ADD(result, op[0], op[1]);
      break;

   case nir_op_iadd_sat:
   case nir_op_uadd_sat:
      inst = bld.ADD(result, op[0], op[1]);
      inst->saturate = true;
      break;

   case nir_op_isub_sat:
      bld.emit(SHADER_OPCODE_ISUB_SAT, result, op[0], op[1]);
      break;

   case nir_op_usub_sat:
      bld.emit(SHADER_OPCODE_USUB_SAT, result, op[0], op[1]);
      break;

   case nir_op_irhadd:
   case nir_op_urhadd:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      inst = bld.AVG(result, op[0], op[1]);
      break;

   case nir_op_ihadd:
   case nir_op_uhadd: {
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      fs_reg tmp = bld.vgrf(result.type);

      if (devinfo->ver >= 8) {
         op[0] = resolve_source_modifiers(op[0]);
         op[1] = resolve_source_modifiers(op[1]);
      }

      /* AVG(x, y) - ((x ^ y) & 1) */
      bld.XOR(tmp, op[0], op[1]);
      bld.AND(tmp, tmp, retype(brw_imm_ud(1), result.type));
      bld.AVG(result, op[0], op[1]);
      inst = bld.ADD(result, result, tmp);
      inst->src[1].negate = true;
      break;
   }

   case nir_op_fmul:
      for (unsigned i = 0; i < 2; i++) {
         if (can_fuse_fmul_fsign(instr, i)) {
            emit_fsign(bld, instr, result, op, i);
            return;
         }
      }

      /* We emit the rounding mode after the previous fsign optimization since
       * it won't result in a MUL, but will try to negate the value by other
       * means.
       */
      if (nir_has_any_rounding_mode_enabled(execution_mode)) {
         brw_rnd_mode rnd =
            brw_rnd_mode_from_execution_mode(execution_mode);
         bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
                  brw_imm_d(rnd));
      }

      inst = bld.MUL(result, op[0], op[1]);
      break;

   case nir_op_imul_2x32_64:
   case nir_op_umul_2x32_64:
      bld.MUL(result, op[0], op[1]);
      break;

   case nir_op_imul_32x16:
   case nir_op_umul_32x16: {
      const bool ud = instr->op == nir_op_umul_32x16;

      assert(nir_dest_bit_size(instr->dest.dest) == 32);

      /* Before Gfx7, the order of the 32-bit source and the 16-bit source was
       * swapped.  The extension isn't enabled on those platforms, so don't
       * pretend to support the differences.
       */
      assert(devinfo->ver >= 7);

      if (op[1].file == IMM)
         op[1] = ud ? brw_imm_uw(op[1].ud) : brw_imm_w(op[1].d);
      else {
         const enum brw_reg_type word_type =
            ud ? BRW_REGISTER_TYPE_UW : BRW_REGISTER_TYPE_W;

         op[1] = subscript(op[1], word_type, 0);
      }

      const enum brw_reg_type dword_type =
         ud ?
         BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;

      bld.MUL(result, retype(op[0], dword_type), op[1]);
      break;
   }

   case nir_op_imul:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.MUL(result, op[0], op[1]);
      break;

   case nir_op_imul_high:
   case nir_op_umul_high:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
      break;

   case nir_op_idiv:
   case nir_op_udiv:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
      break;

   case nir_op_uadd_carry:
      unreachable("Should have been lowered by carry_to_arith().");

   case nir_op_usub_borrow:
      unreachable("Should have been lowered by borrow_to_arith().");

   case nir_op_umod:
   case nir_op_irem:
      /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
       * appears that our hardware just does the right thing for signed
       * remainder.
       */
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
      break;

   case nir_op_imod: {
      /* Get a regular C-style remainder.  If a % b == 0, set the predicate. */
      bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);

      /* Math instructions don't support conditional mod */
      inst = bld.MOV(bld.null_reg_d(), result);
      inst->conditional_mod = BRW_CONDITIONAL_NZ;

      /* Now, we need to determine if signs of the sources are different.
       * When we XOR the sources, the top bit is 0 if they are the same and 1
       * if they are different.  We can then use a conditional modifier to
       * turn that into a predicate.  This leads us to an XOR.l instruction.
       *
       * Technically, according to the PRM, you're not allowed to use .l on a
       * XOR instruction.  However, empirical experiments and Curro's reading
However, emperical experiments and Curro's reading1330* of the simulator source both indicate that it's safe.1331*/1332fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);1333inst = bld.XOR(tmp, op[0], op[1]);1334inst->predicate = BRW_PREDICATE_NORMAL;1335inst->conditional_mod = BRW_CONDITIONAL_L;13361337/* If the result of the initial remainder operation is non-zero and the1338* two sources have different signs, add in a copy of op[1] to get the1339* final integer modulus value.1340*/1341inst = bld.ADD(result, result, op[1]);1342inst->predicate = BRW_PREDICATE_NORMAL;1343break;1344}13451346case nir_op_flt32:1347case nir_op_fge32:1348case nir_op_feq32:1349case nir_op_fneu32: {1350fs_reg dest = result;13511352const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);1353if (bit_size != 32)1354dest = bld.vgrf(op[0].type, 1);13551356bld.CMP(dest, op[0], op[1], brw_cmod_for_nir_comparison(instr->op));13571358if (bit_size > 32) {1359bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));1360} else if(bit_size < 32) {1361/* When we convert the result to 32-bit we need to be careful and do1362* it as a signed conversion to get sign extension (for 32-bit true)1363*/1364const brw_reg_type src_type =1365brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);13661367bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));1368}1369break;1370}13711372case nir_op_ilt32:1373case nir_op_ult32:1374case nir_op_ige32:1375case nir_op_uge32:1376case nir_op_ieq32:1377case nir_op_ine32: {1378fs_reg dest = result;13791380const uint32_t bit_size = type_sz(op[0].type) * 8;1381if (bit_size != 32)1382dest = bld.vgrf(op[0].type, 1);13831384bld.CMP(dest, op[0], op[1],1385brw_cmod_for_nir_comparison(instr->op));13861387if (bit_size > 32) {1388bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));1389} else if (bit_size < 32) {1390/* When we convert the result to 32-bit we need to be careful and do1391* it as a signed conversion to get sign extension (for 32-bit true)1392*/1393const brw_reg_type src_type =1394brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);13951396bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));1397}1398break;1399}14001401case nir_op_inot:1402if (devinfo->ver >= 8) {1403nir_alu_instr *inot_src_instr = nir_src_as_alu_instr(instr->src[0].src);14041405if (inot_src_instr != NULL &&1406(inot_src_instr->op == nir_op_ior ||1407inot_src_instr->op == nir_op_ixor ||1408inot_src_instr->op == nir_op_iand)) {1409/* The sources of the source logical instruction are now the1410* sources of the instruction that will be generated.1411*/1412prepare_alu_destination_and_sources(bld, inot_src_instr, op, false);1413resolve_inot_sources(bld, inot_src_instr, op);14141415/* Smash all of the sources and destination to be signed. This1416* doesn't matter for the operation of the instruction, but cmod1417* propagation fails on unsigned sources with negation (due to1418* fs_inst::can_do_cmod returning false).1419*/1420result.type =1421brw_type_for_nir_type(devinfo,1422(nir_alu_type)(nir_type_int |1423nir_dest_bit_size(instr->dest.dest)));1424op[0].type =1425brw_type_for_nir_type(devinfo,1426(nir_alu_type)(nir_type_int |1427nir_src_bit_size(inot_src_instr->src[0].src)));1428op[1].type =1429brw_type_for_nir_type(devinfo,1430(nir_alu_type)(nir_type_int |1431nir_src_bit_size(inot_src_instr->src[1].src)));14321433/* For XOR, only invert one of the sources. 
             * Arbitrarily choose the first source.
             */
            op[0].negate = !op[0].negate;
            if (inot_src_instr->op != nir_op_ixor)
               op[1].negate = !op[1].negate;

            switch (inot_src_instr->op) {
            case nir_op_ior:
               bld.AND(result, op[0], op[1]);
               return;

            case nir_op_iand:
               bld.OR(result, op[0], op[1]);
               return;

            case nir_op_ixor:
               bld.XOR(result, op[0], op[1]);
               return;

            default:
               unreachable("impossible opcode");
            }
         }
         op[0] = resolve_source_modifiers(op[0]);
      }
      bld.NOT(result, op[0]);
      break;
   case nir_op_ixor:
      if (devinfo->ver >= 8) {
         resolve_inot_sources(bld, instr, op);
      }
      bld.XOR(result, op[0], op[1]);
      break;
   case nir_op_ior:
      if (devinfo->ver >= 8) {
         resolve_inot_sources(bld, instr, op);
      }
      bld.OR(result, op[0], op[1]);
      break;
   case nir_op_iand:
      if (devinfo->ver >= 8) {
         resolve_inot_sources(bld, instr, op);
      }
      bld.AND(result, op[0], op[1]);
      break;

   case nir_op_fdot2:
   case nir_op_fdot3:
   case nir_op_fdot4:
   case nir_op_b32all_fequal2:
   case nir_op_b32all_iequal2:
   case nir_op_b32all_fequal3:
   case nir_op_b32all_iequal3:
   case nir_op_b32all_fequal4:
   case nir_op_b32all_iequal4:
   case nir_op_b32any_fnequal2:
   case nir_op_b32any_inequal2:
   case nir_op_b32any_fnequal3:
   case nir_op_b32any_inequal3:
   case nir_op_b32any_fnequal4:
   case nir_op_b32any_inequal4:
      unreachable("Lowered by nir_lower_alu_reductions");

   case nir_op_ldexp:
      unreachable("not reached: should be handled by ldexp_to_arith()");

   case nir_op_fsqrt:
      inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
      break;

   case nir_op_frsq:
      inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
      break;

   case nir_op_i2b32:
   case nir_op_f2b32: {
      uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
      if (bit_size == 64) {
         /* two-argument instructions can't take 64-bit immediates */
         fs_reg zero;
         fs_reg tmp;

         if (instr->op == nir_op_f2b32) {
            zero = vgrf(glsl_type::double_type);
            tmp = vgrf(glsl_type::double_type);
            bld.MOV(zero, setup_imm_df(bld, 0.0));
         } else {
            zero = vgrf(glsl_type::int64_t_type);
            tmp = vgrf(glsl_type::int64_t_type);
            bld.MOV(zero, brw_imm_q(0));
         }

         /* A SIMD16 execution needs to be split in two instructions, so use
          * a vgrf instead of the flag register as dst so instruction splitting
          * works
          */
         bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ);
         bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0));
      } else {
         fs_reg zero;
         if (bit_size == 32) {
            zero = instr->op == nir_op_f2b32 ?
               brw_imm_f(0.0f) : brw_imm_d(0);
         } else {
            assert(bit_size == 16);
            zero = instr->op == nir_op_f2b32 ?
               retype(brw_imm_w(0), BRW_REGISTER_TYPE_HF) : brw_imm_w(0);
         }
         bld.CMP(result, op[0], zero, BRW_CONDITIONAL_NZ);
      }
      break;
   }

   case nir_op_ftrunc:
      inst = bld.RNDZ(result, op[0]);
      if (devinfo->ver < 6) {
         set_condmod(BRW_CONDITIONAL_R, inst);
         set_predicate(BRW_PREDICATE_NORMAL,
                       bld.ADD(result, result, brw_imm_f(1.0f)));
         inst = bld.MOV(result, result); /* for potential saturation */
      }
      break;

   case nir_op_fceil: {
      op[0].negate = !op[0].negate;
      fs_reg temp = vgrf(glsl_type::float_type);
      bld.RNDD(temp, op[0]);
      temp.negate = true;
      inst = bld.MOV(result, temp);
      break;
   }
   case nir_op_ffloor:
      inst = bld.RNDD(result, op[0]);
      break;
   case nir_op_ffract:
      inst = bld.FRC(result, op[0]);
      break;
   case nir_op_fround_even:
      inst = bld.RNDE(result, op[0]);
      if (devinfo->ver < 6) {
         set_condmod(BRW_CONDITIONAL_R, inst);
         set_predicate(BRW_PREDICATE_NORMAL,
                       bld.ADD(result, result, brw_imm_f(1.0f)));
         inst = bld.MOV(result, result); /* for potential saturation */
      }
      break;

   case nir_op_fquantize2f16: {
      fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D);
      fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F);
      fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F);

      /* The destination stride must be at least as big as the source stride. */
      tmp16.type = BRW_REGISTER_TYPE_W;
      tmp16.stride = 2;

      /* Check for denormal */
      fs_reg abs_src0 = op[0];
      abs_src0.abs = true;
      bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
              BRW_CONDITIONAL_L);
      /* Get the appropriately signed zero */
      bld.AND(retype(zero, BRW_REGISTER_TYPE_UD),
              retype(op[0], BRW_REGISTER_TYPE_UD),
              brw_imm_ud(0x80000000));
      /* Do the actual F32 -> F16 -> F32 conversion */
      bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]);
      bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16);
      /* Select that or zero based on normal status */
      inst = bld.SEL(result, zero, tmp32);
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;
   }

   case nir_op_imin:
   case nir_op_umin:
   case nir_op_fmin:
      inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
      break;

   case nir_op_imax:
   case nir_op_umax:
   case nir_op_fmax:
      inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
      break;

   case nir_op_pack_snorm_2x16:
   case nir_op_pack_snorm_4x8:
   case nir_op_pack_unorm_2x16:
   case nir_op_pack_unorm_4x8:
   case nir_op_unpack_snorm_2x16:
   case nir_op_unpack_snorm_4x8:
   case nir_op_unpack_unorm_2x16:
   case nir_op_unpack_unorm_4x8:
   case nir_op_unpack_half_2x16:
   case nir_op_pack_half_2x16:
      unreachable("not reached: should be handled by lower_packing_builtins");

   case nir_op_unpack_half_2x16_split_x_flush_to_zero:
      assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode);
      FALLTHROUGH;
   case nir_op_unpack_half_2x16_split_x:
      inst = bld.emit(BRW_OPCODE_F16TO32, result,
                      subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
      break;

   case nir_op_unpack_half_2x16_split_y_flush_to_zero:
      assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode);
      FALLTHROUGH;
   case nir_op_unpack_half_2x16_split_y:
      inst = bld.emit(BRW_OPCODE_F16TO32, result,
                      subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
      break;

   case nir_op_pack_64_2x32_split:
   case nir_op_pack_32_2x16_split:
      bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
      break;
   case nir_op_unpack_64_2x32_split_x:
   case nir_op_unpack_64_2x32_split_y: {
      if (instr->op == nir_op_unpack_64_2x32_split_x)
         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0));
      else
         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
      break;
   }

   case nir_op_unpack_32_2x16_split_x:
   case nir_op_unpack_32_2x16_split_y: {
      if (instr->op == nir_op_unpack_32_2x16_split_x)
         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
      else
         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
      break;
   }

   case nir_op_fpow:
      inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
      break;

   case nir_op_bitfield_reverse:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.BFREV(result, op[0]);
      break;

   case nir_op_bit_count:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.CBIT(result, op[0]);
      break;

   case nir_op_ufind_msb: {
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      emit_find_msb_using_lzd(bld, result, op[0], false);
      break;
   }

   case nir_op_uclz:
      assert(nir_dest_bit_size(instr->dest.dest) == 32);
      bld.LZD(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
      break;

   case nir_op_ifind_msb: {
      assert(nir_dest_bit_size(instr->dest.dest) < 64);

      if (devinfo->ver < 7) {
         emit_find_msb_using_lzd(bld, result, op[0], true);
      } else {
         bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);

         /* FBH counts from the MSB side, while GLSL's findMSB() wants the
          * count from the LSB side.  If FBH didn't return an error
          * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
          * count into an LSB count.
          */
         bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);

         inst = bld.ADD(result, result, brw_imm_d(31));
         inst->predicate = BRW_PREDICATE_NORMAL;
         inst->src[0].negate = true;
      }
      break;
   }

   case nir_op_find_lsb:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);

      if (devinfo->ver < 7) {
         fs_reg temp = vgrf(glsl_type::int_type);

         /* (x & -x) generates a value that consists of only the LSB of x.
          * For all powers of 2, findMSB(y) == findLSB(y).
          */
         fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D);
         fs_reg negated_src = src;

         /* One must be negated, and the other must be non-negated.
          * It doesn't matter which is which.
          */
         negated_src.negate = true;
         src.negate = false;

         bld.AND(temp, src, negated_src);
         emit_find_msb_using_lzd(bld, result, temp, false);
      } else {
         bld.FBL(result, op[0]);
      }
      break;

   case nir_op_ubitfield_extract:
   case nir_op_ibitfield_extract:
      unreachable("should have been lowered");
   case nir_op_ubfe:
   case nir_op_ibfe:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.BFE(result, op[2], op[1], op[0]);
      break;
   case nir_op_bfm:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.BFI1(result, op[0], op[1]);
      break;
   case nir_op_bfi:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.BFI2(result, op[0], op[1], op[2]);
      break;

   case nir_op_bitfield_insert:
      unreachable("not reached: should have been lowered");

   case nir_op_ishl:
      bld.SHL(result, op[0], op[1]);
      break;
   case nir_op_ishr:
      bld.ASR(result, op[0], op[1]);
      break;
   case nir_op_ushr:
      bld.SHR(result, op[0], op[1]);
      break;

   case nir_op_urol:
      bld.ROL(result, op[0], op[1]);
      break;
   case nir_op_uror:
      bld.ROR(result, op[0], op[1]);
      break;

   case nir_op_pack_half_2x16_split:
      bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
      break;

   case nir_op_ffma:
      if (nir_has_any_rounding_mode_enabled(execution_mode)) {
         brw_rnd_mode rnd =
            brw_rnd_mode_from_execution_mode(execution_mode);
         bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
                  brw_imm_d(rnd));
      }

      inst = bld.MAD(result, op[2], op[1], op[0]);
      break;

   case nir_op_flrp:
      if (nir_has_any_rounding_mode_enabled(execution_mode)) {
         brw_rnd_mode rnd =
            brw_rnd_mode_from_execution_mode(execution_mode);
         bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
                  brw_imm_d(rnd));
      }

      inst = bld.LRP(result, op[0], op[1], op[2]);
      break;

   case nir_op_b32csel:
      if (optimize_frontfacing_ternary(instr, result))
         return;

      bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
      inst = bld.SEL(result, op[1], op[2]);
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;

   case nir_op_extract_u8:
   case nir_op_extract_i8: {
      unsigned byte = nir_src_as_uint(instr->src[1].src);

      /* The PRMs say:
       *
       *    BDW+
       *    There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
       *    Use two instructions and a word or DWord intermediate integer type.
       */
      if (nir_dest_bit_size(instr->dest.dest) == 64) {
         const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);

         if (instr->op == nir_op_extract_i8) {
            /* If we need to sign extend, extract to a word first */
            fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W);
            bld.MOV(w_temp, subscript(op[0], type, byte));
            bld.MOV(result, w_temp);
         } else if (byte & 1) {
            /* Extract the high byte from the word containing the desired byte
             * offset.
             */
            bld.SHR(result,
                    subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2),
                    brw_imm_uw(8));
         } else {
            /* Otherwise use an AND with 0xff and a word type */
            bld.AND(result,
                    subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2),
                    brw_imm_uw(0xff));
         }
      } else {
         const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
         bld.MOV(result, subscript(op[0], type, byte));
      }
      break;
   }

   case nir_op_extract_u16:
   case nir_op_extract_i16: {
      const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16);
      unsigned word =
         nir_src_as_uint(instr->src[1].src);
      bld.MOV(result, subscript(op[0], type, word));
      break;
   }

   default:
      unreachable("unhandled instruction");
   }

   /* If we need to do a boolean resolve, replace the result with -(x & 1)
    * to sign extend the low bit to 0/~0
    */
   if (devinfo->ver <= 5 &&
       !result.is_null() &&
       (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
      fs_reg masked = vgrf(glsl_type::int_type);
      bld.AND(masked, result, brw_imm_d(1));
      masked.negate = true;
      bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked);
   }
}

void
fs_visitor::nir_emit_load_const(const fs_builder &bld,
                                nir_load_const_instr *instr)
{
   const brw_reg_type reg_type =
      brw_reg_type_from_bit_size(instr->def.bit_size, BRW_REGISTER_TYPE_D);
   fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);

   switch (instr->def.bit_size) {
   case 8:
      for (unsigned i = 0; i < instr->def.num_components; i++)
         bld.MOV(offset(reg, bld, i), setup_imm_b(bld, instr->value[i].i8));
      break;

   case 16:
      for (unsigned i = 0; i < instr->def.num_components; i++)
         bld.MOV(offset(reg, bld, i), brw_imm_w(instr->value[i].i16));
      break;

   case 32:
      for (unsigned i = 0; i < instr->def.num_components; i++)
         bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value[i].i32));
      break;

   case 64:
      assert(devinfo->ver >= 7);
      if (devinfo->ver == 7) {
         /* We don't get 64-bit integer types until gfx8 */
         for (unsigned i = 0; i < instr->def.num_components; i++) {
            bld.MOV(retype(offset(reg, bld, i), BRW_REGISTER_TYPE_DF),
                    setup_imm_df(bld, instr->value[i].f64));
         }
      } else {
         for (unsigned i = 0; i < instr->def.num_components; i++)
            bld.MOV(offset(reg, bld, i), brw_imm_q(instr->value[i].i64));
      }
      break;

   default:
      unreachable("Invalid bit size");
   }

   nir_ssa_values[instr->def.index] = reg;
}

fs_reg
fs_visitor::get_nir_src(const nir_src &src)
{
   fs_reg reg;
   if (src.is_ssa) {
      if (nir_src_is_undef(src)) {
         const brw_reg_type reg_type =
            brw_reg_type_from_bit_size(src.ssa->bit_size, BRW_REGISTER_TYPE_D);
         reg = bld.vgrf(reg_type, src.ssa->num_components);
      } else {
         reg = nir_ssa_values[src.ssa->index];
      }
   } else {
      /* We don't handle indirects on locals */
      assert(src.reg.indirect == NULL);
      reg = offset(nir_locals[src.reg.reg->index], bld,
                   src.reg.base_offset * src.reg.reg->num_components);
   }

   if (nir_src_bit_size(src) == 64 && devinfo->ver == 7) {
      /* The only 64-bit type available on gfx7 is DF, so use that.
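       *
       * Gfx7 has no Q/UQ register types, so 64-bit values are carried around
       * as DF bit patterns here, just as nir_emit_load_const() above emits
       * 64-bit constants through setup_imm_df() on this generation.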
*/1943reg.type = BRW_REGISTER_TYPE_DF;1944} else {1945/* To avoid floating-point denorm flushing problems, set the type by1946* default to an integer type - instructions that need floating point1947* semantics will set this to F if they need to1948*/1949reg.type = brw_reg_type_from_bit_size(nir_src_bit_size(src),1950BRW_REGISTER_TYPE_D);1951}19521953return reg;1954}19551956/**1957* Return an IMM for constants; otherwise call get_nir_src() as normal.1958*1959* This function should not be called on any value which may be 64 bits.1960* We could theoretically support 64-bit on gfx8+ but we choose not to1961* because it wouldn't work in general (no gfx7 support) and there are1962* enough restrictions in 64-bit immediates that you can't take the return1963* value and treat it the same as the result of get_nir_src().1964*/1965fs_reg1966fs_visitor::get_nir_src_imm(const nir_src &src)1967{1968assert(nir_src_bit_size(src) == 32);1969return nir_src_is_const(src) ?1970fs_reg(brw_imm_d(nir_src_as_int(src))) : get_nir_src(src);1971}19721973fs_reg1974fs_visitor::get_nir_dest(const nir_dest &dest)1975{1976if (dest.is_ssa) {1977const brw_reg_type reg_type =1978brw_reg_type_from_bit_size(dest.ssa.bit_size,1979dest.ssa.bit_size == 8 ?1980BRW_REGISTER_TYPE_D :1981BRW_REGISTER_TYPE_F);1982nir_ssa_values[dest.ssa.index] =1983bld.vgrf(reg_type, dest.ssa.num_components);1984bld.UNDEF(nir_ssa_values[dest.ssa.index]);1985return nir_ssa_values[dest.ssa.index];1986} else {1987/* We don't handle indirects on locals */1988assert(dest.reg.indirect == NULL);1989return offset(nir_locals[dest.reg.reg->index], bld,1990dest.reg.base_offset * dest.reg.reg->num_components);1991}1992}19931994void1995fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,1996unsigned wr_mask)1997{1998for (unsigned i = 0; i < 4; i++) {1999if (!((wr_mask >> i) & 1))2000continue;20012002fs_inst *new_inst = new(mem_ctx) fs_inst(inst);2003new_inst->dst = offset(new_inst->dst, bld, i);2004for (unsigned j = 0; j < new_inst->sources; j++)2005if (new_inst->src[j].file == VGRF)2006new_inst->src[j] = offset(new_inst->src[j], bld, i);20072008bld.emit(new_inst);2009}2010}20112012static fs_inst *2013emit_pixel_interpolater_send(const fs_builder &bld,2014enum opcode opcode,2015const fs_reg &dst,2016const fs_reg &src,2017const fs_reg &desc,2018glsl_interp_mode interpolation)2019{2020struct brw_wm_prog_data *wm_prog_data =2021brw_wm_prog_data(bld.shader->stage_prog_data);20222023fs_inst *inst = bld.emit(opcode, dst, src, desc);2024/* 2 floats per slot returned */2025inst->size_written = 2 * dst.component_size(inst->exec_size);2026inst->pi_noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE;20272028wm_prog_data->pulls_bary = true;20292030return inst;2031}20322033/**2034* Computes 1 << x, given a D/UD register containing some value x.2035*/2036static fs_reg2037intexp2(const fs_builder &bld, const fs_reg &x)2038{2039assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D);20402041fs_reg result = bld.vgrf(x.type, 1);2042fs_reg one = bld.vgrf(x.type, 1);20432044bld.MOV(one, retype(brw_imm_d(1), one.type));2045bld.SHL(result, one, x);2046return result;2047}20482049void2050fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)2051{2052assert(stage == MESA_SHADER_GEOMETRY);20532054struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);20552056if (gs_compile->control_data_header_size_bits == 0)2057return;20582059/* We can only do EndPrimitive() functionality when the control data2060* consists of cut bits. 
Fortunately, the only time it isn't is when the
    * output type is points, in which case EndPrimitive() is a no-op.
    */
   if (gs_prog_data->control_data_format !=
       GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
      return;
   }

   /* Cut bits use one bit per vertex. */
   assert(gs_compile->control_data_bits_per_vertex == 1);

   fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
   vertex_count.type = BRW_REGISTER_TYPE_UD;

   /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
    * vertex n, 0 otherwise.  So all we need to do here is mark bit
    * (vertex_count - 1) % 32 in the cut_bits register to indicate that
    * EndPrimitive() was called after emitting vertex (vertex_count - 1);
    * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
    *
    * Note that if EndPrimitive() is called before emitting any vertices, this
    * will cause us to set bit 31 of the control_data_bits register to 1.
    * That's fine because:
    *
    * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
    *   output, so the hardware will ignore cut bit 31.
    *
    * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
    *   last vertex, so setting cut bit 31 has no effect (since the primitive
    *   is automatically ended when the GS terminates).
    *
    * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
    *   control_data_bits register to 0 when the first vertex is emitted.
    */

   const fs_builder abld = bld.annotate("end primitive");

   /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
   fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
   abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
   fs_reg mask = intexp2(abld, prev_count);
   /* Note: we're relying on the fact that the GEN SHL instruction only pays
    * attention to the lower 5 bits of its second source argument, so on this
    * architecture, 1 << (vertex_count - 1) is equivalent to
    * 1 << ((vertex_count - 1) % 32).
    */
   abld.OR(this->control_data_bits, this->control_data_bits, mask);
}
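/* A worked illustration of the cut-bit update above.  This helper is only a
 * sketch for illustration and is not called by the compiler; its name and
 * parameters are hypothetical.  It models, with plain integer math, the value
 * one channel's control_data_bits register holds after EndPrimitive() is
 * called once `vertex_count` vertices have been emitted, mirroring the
 * ADD/SHL/OR sequence above (the % 32 stands in for the hardware SHL only
 * honoring the low 5 bits of its shift count).
 */
static inline uint32_t
example_end_primitive_cut_bits(uint32_t control_data_bits,
                               unsigned vertex_count)
{
   /* E.g. vertex_count == 3 sets cut bit 2; vertex_count == 0 wraps around
    * to bit 31, which is harmless for the reasons described above.
    */
   return control_data_bits | (1u << ((vertex_count - 1u) % 32u));
}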
void
fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
{
   assert(stage == MESA_SHADER_GEOMETRY);
   assert(gs_compile->control_data_bits_per_vertex != 0);

   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);

   const fs_builder abld = bld.annotate("emit control data bits");
   const fs_builder fwa_bld = bld.exec_all();

   /* We use a single UD register to accumulate control data bits (32 bits
    * for each of the SIMD8 channels).  So we need to write a DWord (32 bits)
    * at a time.
    *
    * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
    * We have to select a 128-bit group via the Global and Per-Slot Offsets,
    * then use the Channel Mask phase to enable/disable which DWord within
    * that group to write.  (Remember, different SIMD8 channels may have
    * emitted different numbers of vertices, so we may need per-slot offsets.)
    *
    * Channel masking presents an annoying problem: we may have to replicate
    * the data up to 4 times:
    *
    * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
    *
    * To avoid penalizing shaders that emit a small number of vertices, we
    * can avoid these sometimes: if the size of the control data header is
    * <= 128 bits, then there is only 1 OWord.  All SIMD8 channels will land
    * in the same 128-bit group, so we can skip per-slot offsets.
    *
    * Similarly, if the control data header is <= 32 bits, there is only one
    * DWord, so we can skip channel masks.
    */
   enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;

   fs_reg channel_mask, per_slot_offset;

   if (gs_compile->control_data_header_size_bits > 32) {
      opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
      channel_mask = vgrf(glsl_type::uint_type);
   }

   if (gs_compile->control_data_header_size_bits > 128) {
      opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
      per_slot_offset = vgrf(glsl_type::uint_type);
   }

   /* Figure out which DWord we're trying to write to using the formula:
    *
    *    dword_index = (vertex_count - 1) * bits_per_vertex / 32
    *
    * Since bits_per_vertex is a power of two, and is known at compile
    * time, this can be optimized to:
    *
    *    dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
    */
   if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) {
      fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
      fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
      abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
      unsigned log2_bits_per_vertex =
         util_last_bit(gs_compile->control_data_bits_per_vertex);
      abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex));

      if (per_slot_offset.file != BAD_FILE) {
         /* Set the per-slot offset to dword_index / 4, so that we'll write to
          * the appropriate OWord within the control data header.
          */
         abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u));
      }

      /* Set the channel masks to 1 << (dword_index % 4), so that we'll
       * write to the appropriate DWORD within the OWORD.
       */
      fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
      fwa_bld.AND(channel, dword_index, brw_imm_ud(3u));
      channel_mask = intexp2(fwa_bld, channel);
      /* Then the channel masks need to be in bits 23:16. */
      fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u));
   }

   /* Store the control data bits in the message payload and send it. */
   unsigned mlen = 2;
   if (channel_mask.file != BAD_FILE)
      mlen += 4; /* channel masks, plus 3 extra copies of the data */
   if (per_slot_offset.file != BAD_FILE)
      mlen++;

   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen);
   unsigned i = 0;
   sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
   if (per_slot_offset.file != BAD_FILE)
      sources[i++] = per_slot_offset;
   if (channel_mask.file != BAD_FILE)
      sources[i++] = channel_mask;
   while (i < mlen) {
      sources[i++] = this->control_data_bits;
   }

   abld.LOAD_PAYLOAD(payload, sources, mlen, mlen);
   fs_inst *inst = abld.emit(opcode, reg_undef, payload);
   inst->mlen = mlen;
   /* We need to increment Global Offset by 256-bits to make room for
    * Broadwell's extra "Vertex Count" payload at the beginning of the
    * URB entry.  Since this is an OWord message, Global Offset is counted
    * in 128-bit units, so we must set it to 2.
    */
   if (gs_prog_data->static_vertex_count == -1)
      inst->offset = 2;
}
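/* A worked illustration of the DWord addressing above.  This helper is only
 * a sketch and is not called by the compiler; the names are hypothetical.
 * It mirrors, with plain integer math, the per-channel SHR/AND/SHL sequence
 * emitted above.  For example, with bits_per_vertex == 2 and
 * vertex_count == 23: dword_index == 44 / 32 == 1, so the per-slot (OWord)
 * offset is 0 and the channel mask has bit (1 + 16) == 17 set.
 */
static inline void
example_control_data_dword_addr(unsigned vertex_count,
                                unsigned bits_per_vertex,
                                unsigned *per_slot_offset,
                                unsigned *channel_mask)
{
   const unsigned dword_index = (vertex_count - 1) * bits_per_vertex / 32;
   *per_slot_offset = dword_index / 4;             /* which OWord */
   *channel_mask = 1u << ((dword_index % 4) + 16); /* which DWord, bits 23:16 */
}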
void
fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
                                            unsigned stream_id)
{
   /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */

   /* Note: we are calling this *before* increasing vertex_count, so
    * this->vertex_count == vertex_count - 1 in the formula above.
    */

   /* Stream mode uses 2 bits per vertex */
   assert(gs_compile->control_data_bits_per_vertex == 2);

   /* Must be a valid stream */
   assert(stream_id < MAX_VERTEX_STREAMS);

   /* Control data bits are initialized to 0 so we don't have to set any
    * bits when sending vertices to stream 0.
    */
   if (stream_id == 0)
      return;

   const fs_builder abld = bld.annotate("set stream control data bits", NULL);

   /* reg::sid = stream_id */
   fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
   abld.MOV(sid, brw_imm_ud(stream_id));

   /* reg:shift_count = 2 * (vertex_count - 1) */
   fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
   abld.SHL(shift_count, vertex_count, brw_imm_ud(1u));

   /* Note: we're relying on the fact that the GEN SHL instruction only pays
    * attention to the lower 5 bits of its second source argument, so on this
    * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
    * stream_id << ((2 * (vertex_count - 1)) % 32).
    */
   fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
   abld.SHL(mask, sid, shift_count);
   abld.OR(this->control_data_bits, this->control_data_bits, mask);
}
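/* A worked illustration of the stream control data packing above.  This is
 * only a sketch, not used by the compiler, and the names are hypothetical.
 * In stream mode each vertex owns two bits of control data; for the vertex
 * whose 0-based index is `vertex_index` (i.e. the pre-increment vertex count
 * used above), those bits sit at bit position (2 * vertex_index) % 32.
 * E.g. stream_id == 2 for vertex_index == 3 ORs in 2 << 6 == 0x80.
 */
static inline uint32_t
example_stream_control_data_bits(uint32_t control_data_bits,
                                 unsigned stream_id, unsigned vertex_index)
{
   /* The % 32 models the hardware SHL only honoring the low 5 shift bits. */
   return control_data_bits |
          ((uint32_t)stream_id << ((2 * vertex_index) % 32));
}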
void
fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
                           unsigned stream_id)
{
   assert(stage == MESA_SHADER_GEOMETRY);

   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);

   fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
   vertex_count.type = BRW_REGISTER_TYPE_UD;

   /* Haswell and later hardware ignores the "Render Stream Select" bits
    * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
    * and instead sends all primitives down the pipeline for rasterization.
    * If the SOL stage is enabled, "Render Stream Select" is honored and
    * primitives bound to non-zero streams are discarded after stream output.
    *
    * Since the only purpose of primitives sent to non-zero streams is to
    * be recorded by transform feedback, we can simply discard all geometry
    * bound to these streams when transform feedback is disabled.
    */
   if (stream_id > 0 && !nir->info.has_transform_feedback_varyings)
      return;

   /* If we're outputting 32 control data bits or less, then we can wait
    * until the shader is over to output them all.  Otherwise we need to
    * output them as we go.  Now is the time to do it, since we're about to
    * output the vertex_count'th vertex, so it's guaranteed that the
    * control data bits associated with the (vertex_count - 1)th vertex are
    * correct.
    */
   if (gs_compile->control_data_header_size_bits > 32) {
      const fs_builder abld =
         bld.annotate("emit vertex: emit control data bits");

      /* Only emit control data bits if we've finished accumulating a batch
       * of 32 bits.  This is the case when:
       *
       *     (vertex_count * bits_per_vertex) % 32 == 0
       *
       * (in other words, when the last 5 bits of vertex_count *
       * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
       * integer n (which is always the case, since bits_per_vertex is
       * always 1 or 2), this is equivalent to requiring that the last 5-n
       * bits of vertex_count are 0:
       *
       *     vertex_count & (2^(5-n) - 1) == 0
       *
       * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
       * equivalent to:
       *
       *     vertex_count & (32 / bits_per_vertex - 1) == 0
       *
       * TODO: If vertex_count is an immediate, we could do some of this math
       * at compile time...
       */
      fs_inst *inst =
         abld.AND(bld.null_reg_d(), vertex_count,
                  brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u));
      inst->conditional_mod = BRW_CONDITIONAL_Z;

      abld.IF(BRW_PREDICATE_NORMAL);
      /* If vertex_count is 0, then no control data bits have been
       * accumulated yet, so we can skip emitting them.
       */
      abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u),
               BRW_CONDITIONAL_NEQ);
      abld.IF(BRW_PREDICATE_NORMAL);
      emit_gs_control_data_bits(vertex_count);
      abld.emit(BRW_OPCODE_ENDIF);

      /* Reset control_data_bits to 0 so we can start accumulating a new
       * batch.
       *
       * Note: in the case where vertex_count == 0, this neutralizes the
       * effect of any call to EndPrimitive() that the shader may have
       * made before outputting its first vertex.
       */
      inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u));
      inst->force_writemask_all = true;
      abld.emit(BRW_OPCODE_ENDIF);
   }

   emit_urb_writes(vertex_count);

   /* In stream mode we have to set control data bits for all vertices
    * unless we have disabled control data bits completely (which we do
    * for GL_POINTS outputs that don't use streams).
    */
   if (gs_compile->control_data_header_size_bits > 0 &&
       gs_prog_data->control_data_format ==
          GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
      set_gs_stream_control_data_bits(vertex_count, stream_id);
   }
}

void
fs_visitor::emit_gs_input_load(const fs_reg &dst,
                               const nir_src &vertex_src,
                               unsigned base_offset,
                               const nir_src &offset_src,
                               unsigned num_components,
                               unsigned first_component)
{
   assert(type_sz(dst.type) == 4);
   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
   const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;

   /* TODO: figure out push input layout for invocations == 1 */
   if (gs_prog_data->invocations == 1 &&
       nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) &&
       4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) {
      int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 +
                       nir_src_as_uint(vertex_src) * push_reg_count;
      for (unsigned i = 0; i < num_components; i++) {
         bld.MOV(offset(dst, bld, i),
                 fs_reg(ATTR, imm_offset + i + first_component, dst.type));
      }
      return;
   }

   /* Resort to the pull model.  Ensure the VUE handles are provided. */
   assert(gs_prog_data->base.include_vue_handles);

   unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2;
   fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);

   if (gs_prog_data->invocations == 1) {
      if (nir_src_is_const(vertex_src)) {
         /* The vertex index is constant; just select the proper URB handle. */
         icp_handle =
            retype(brw_vec8_grf(first_icp_handle + nir_src_as_uint(vertex_src), 0),
                   BRW_REGISTER_TYPE_UD);
      } else {
         /* The vertex index is non-constant.
We need to use indirect2399* addressing to fetch the proper URB handle.2400*2401* First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>2402* indicating that channel <n> should read the handle from2403* DWord <n>. We convert that to bytes by multiplying by 4.2404*2405* Next, we convert the vertex index to bytes by multiplying2406* by 32 (shifting by 5), and add the two together. This is2407* the final indirect byte offset.2408*/2409fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1);2410fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);2411fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);2412fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);24132414/* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */2415bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));2416/* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */2417bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));2418/* Convert vertex_index to bytes (multiply by 32) */2419bld.SHL(vertex_offset_bytes,2420retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),2421brw_imm_ud(5u));2422bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);24232424/* Use first_icp_handle as the base offset. There is one register2425* of URB handles per vertex, so inform the register allocator that2426* we might read up to nir->info.gs.vertices_in registers.2427*/2428bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,2429retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),2430fs_reg(icp_offset_bytes),2431brw_imm_ud(nir->info.gs.vertices_in * REG_SIZE));2432}2433} else {2434assert(gs_prog_data->invocations > 1);24352436if (nir_src_is_const(vertex_src)) {2437unsigned vertex = nir_src_as_uint(vertex_src);2438assert(devinfo->ver >= 9 || vertex <= 5);2439bld.MOV(icp_handle,2440retype(brw_vec1_grf(first_icp_handle + vertex / 8, vertex % 8),2441BRW_REGISTER_TYPE_UD));2442} else {2443/* The vertex index is non-constant. We need to use indirect2444* addressing to fetch the proper URB handle.2445*2446*/2447fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);24482449/* Convert vertex_index to bytes (multiply by 4) */2450bld.SHL(icp_offset_bytes,2451retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),2452brw_imm_ud(2u));24532454/* Use first_icp_handle as the base offset. There is one DWord2455* of URB handles per vertex, so inform the register allocator that2456* we might read up to ceil(nir->info.gs.vertices_in / 8) registers.2457*/2458bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,2459retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),2460fs_reg(icp_offset_bytes),2461brw_imm_ud(DIV_ROUND_UP(nir->info.gs.vertices_in, 8) *2462REG_SIZE));2463}2464}24652466fs_inst *inst;2467fs_reg indirect_offset = get_nir_src(offset_src);24682469if (nir_src_is_const(offset_src)) {2470/* Constant indexing - use global offset. 
*/2471if (first_component != 0) {2472unsigned read_components = num_components + first_component;2473fs_reg tmp = bld.vgrf(dst.type, read_components);2474inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);2475inst->size_written = read_components *2476tmp.component_size(inst->exec_size);2477for (unsigned i = 0; i < num_components; i++) {2478bld.MOV(offset(dst, bld, i),2479offset(tmp, bld, i + first_component));2480}2481} else {2482inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);2483inst->size_written = num_components *2484dst.component_size(inst->exec_size);2485}2486inst->offset = base_offset + nir_src_as_uint(offset_src);2487inst->mlen = 1;2488} else {2489/* Indirect indexing - use per-slot offsets as well. */2490const fs_reg srcs[] = { icp_handle, indirect_offset };2491unsigned read_components = num_components + first_component;2492fs_reg tmp = bld.vgrf(dst.type, read_components);2493fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);2494bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);2495if (first_component != 0) {2496inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,2497payload);2498inst->size_written = read_components *2499tmp.component_size(inst->exec_size);2500for (unsigned i = 0; i < num_components; i++) {2501bld.MOV(offset(dst, bld, i),2502offset(tmp, bld, i + first_component));2503}2504} else {2505inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, payload);2506inst->size_written = num_components *2507dst.component_size(inst->exec_size);2508}2509inst->offset = base_offset;2510inst->mlen = 2;2511}2512}25132514fs_reg2515fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr)2516{2517nir_src *offset_src = nir_get_io_offset_src(instr);25182519if (nir_src_is_const(*offset_src)) {2520/* The only constant offset we should find is 0. 
brw_nir.c's2521* add_const_offset_to_base() will fold other constant offsets2522* into instr->const_index[0].2523*/2524assert(nir_src_as_uint(*offset_src) == 0);2525return fs_reg();2526}25272528return get_nir_src(*offset_src);2529}25302531void2532fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,2533nir_intrinsic_instr *instr)2534{2535assert(stage == MESA_SHADER_VERTEX);25362537fs_reg dest;2538if (nir_intrinsic_infos[instr->intrinsic].has_dest)2539dest = get_nir_dest(instr->dest);25402541switch (instr->intrinsic) {2542case nir_intrinsic_load_vertex_id:2543case nir_intrinsic_load_base_vertex:2544unreachable("should be lowered by nir_lower_system_values()");25452546case nir_intrinsic_load_input: {2547assert(nir_dest_bit_size(instr->dest) == 32);2548fs_reg src = fs_reg(ATTR, nir_intrinsic_base(instr) * 4, dest.type);2549src = offset(src, bld, nir_intrinsic_component(instr));2550src = offset(src, bld, nir_src_as_uint(instr->src[0]));25512552for (unsigned i = 0; i < instr->num_components; i++)2553bld.MOV(offset(dest, bld, i), offset(src, bld, i));2554break;2555}25562557case nir_intrinsic_load_vertex_id_zero_base:2558case nir_intrinsic_load_instance_id:2559case nir_intrinsic_load_base_instance:2560case nir_intrinsic_load_draw_id:2561case nir_intrinsic_load_first_vertex:2562case nir_intrinsic_load_is_indexed_draw:2563unreachable("lowered by brw_nir_lower_vs_inputs");25642565default:2566nir_emit_intrinsic(bld, instr);2567break;2568}2569}25702571fs_reg2572fs_visitor::get_tcs_single_patch_icp_handle(const fs_builder &bld,2573nir_intrinsic_instr *instr)2574{2575struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);2576const nir_src &vertex_src = instr->src[0];2577nir_intrinsic_instr *vertex_intrin = nir_src_as_intrinsic(vertex_src);2578fs_reg icp_handle;25792580if (nir_src_is_const(vertex_src)) {2581/* Emit a MOV to resolve <0,1,0> regioning. */2582icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);2583unsigned vertex = nir_src_as_uint(vertex_src);2584bld.MOV(icp_handle,2585retype(brw_vec1_grf(1 + (vertex >> 3), vertex & 7),2586BRW_REGISTER_TYPE_UD));2587} else if (tcs_prog_data->instances == 1 && vertex_intrin &&2588vertex_intrin->intrinsic == nir_intrinsic_load_invocation_id) {2589/* For the common case of only 1 instance, an array index of2590* gl_InvocationID means reading g1. Skip all the indirect work.2591*/2592icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);2593} else {2594/* The vertex index is non-constant. We need to use indirect2595* addressing to fetch the proper URB handle.2596*/2597icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);25982599/* Each ICP handle is a single DWord (4 bytes) */2600fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);2601bld.SHL(vertex_offset_bytes,2602retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),2603brw_imm_ud(2u));26042605/* Start at g1. We might read up to 4 registers. */2606bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,2607retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes,2608brw_imm_ud(4 * REG_SIZE));2609}26102611return icp_handle;2612}26132614fs_reg2615fs_visitor::get_tcs_eight_patch_icp_handle(const fs_builder &bld,2616nir_intrinsic_instr *instr)2617{2618struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;2619struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);2620const nir_src &vertex_src = instr->src[0];26212622unsigned first_icp_handle = tcs_prog_data->include_primitive_id ? 
3 : 2;26232624if (nir_src_is_const(vertex_src)) {2625return fs_reg(retype(brw_vec8_grf(first_icp_handle +2626nir_src_as_uint(vertex_src), 0),2627BRW_REGISTER_TYPE_UD));2628}26292630/* The vertex index is non-constant. We need to use indirect2631* addressing to fetch the proper URB handle.2632*2633* First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>2634* indicating that channel <n> should read the handle from2635* DWord <n>. We convert that to bytes by multiplying by 4.2636*2637* Next, we convert the vertex index to bytes by multiplying2638* by 32 (shifting by 5), and add the two together. This is2639* the final indirect byte offset.2640*/2641fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);2642fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1);2643fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);2644fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);2645fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);26462647/* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */2648bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));2649/* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */2650bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));2651/* Convert vertex_index to bytes (multiply by 32) */2652bld.SHL(vertex_offset_bytes,2653retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),2654brw_imm_ud(5u));2655bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);26562657/* Use first_icp_handle as the base offset. There is one register2658* of URB handles per vertex, so inform the register allocator that2659* we might read up to nir->info.gs.vertices_in registers.2660*/2661bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,2662retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),2663icp_offset_bytes, brw_imm_ud(tcs_key->input_vertices * REG_SIZE));26642665return icp_handle;2666}26672668struct brw_reg2669fs_visitor::get_tcs_output_urb_handle()2670{2671struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);26722673if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH) {2674return retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);2675} else {2676assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH);2677return retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);2678}2679}26802681void2682fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,2683nir_intrinsic_instr *instr)2684{2685assert(stage == MESA_SHADER_TESS_CTRL);2686struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;2687struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);2688struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;26892690bool eight_patch =2691vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH;26922693fs_reg dst;2694if (nir_intrinsic_infos[instr->intrinsic].has_dest)2695dst = get_nir_dest(instr->dest);26962697switch (instr->intrinsic) {2698case nir_intrinsic_load_primitive_id:2699bld.MOV(dst, fs_reg(eight_patch ? 
brw_vec8_grf(2, 0)2700: brw_vec1_grf(0, 1)));2701break;2702case nir_intrinsic_load_invocation_id:2703bld.MOV(retype(dst, invocation_id.type), invocation_id);2704break;2705case nir_intrinsic_load_patch_vertices_in:2706bld.MOV(retype(dst, BRW_REGISTER_TYPE_D),2707brw_imm_d(tcs_key->input_vertices));2708break;27092710case nir_intrinsic_control_barrier: {2711if (tcs_prog_data->instances == 1)2712break;27132714fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);2715fs_reg m0_2 = component(m0, 2);27162717const fs_builder chanbld = bld.exec_all().group(1, 0);27182719/* Zero the message header */2720bld.exec_all().MOV(m0, brw_imm_ud(0u));27212722if (devinfo->ver < 11) {2723/* Copy "Barrier ID" from r0.2, bits 16:13 */2724chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),2725brw_imm_ud(INTEL_MASK(16, 13)));27262727/* Shift it up to bits 27:24. */2728chanbld.SHL(m0_2, m0_2, brw_imm_ud(11));2729} else {2730chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),2731brw_imm_ud(INTEL_MASK(30, 24)));2732}27332734/* Set the Barrier Count and the enable bit */2735if (devinfo->ver < 11) {2736chanbld.OR(m0_2, m0_2,2737brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));2738} else {2739chanbld.OR(m0_2, m0_2,2740brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15)));2741}27422743bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);2744break;2745}27462747case nir_intrinsic_load_input:2748unreachable("nir_lower_io should never give us these.");2749break;27502751case nir_intrinsic_load_per_vertex_input: {2752assert(nir_dest_bit_size(instr->dest) == 32);2753fs_reg indirect_offset = get_indirect_offset(instr);2754unsigned imm_offset = instr->const_index[0];2755fs_inst *inst;27562757fs_reg icp_handle =2758eight_patch ? get_tcs_eight_patch_icp_handle(bld, instr)2759: get_tcs_single_patch_icp_handle(bld, instr);27602761/* We can only read two double components with each URB read, so2762* we send two read messages in that case, each one loading up to2763* two double components.2764*/2765unsigned num_components = instr->num_components;2766unsigned first_component = nir_intrinsic_component(instr);27672768if (indirect_offset.file == BAD_FILE) {2769/* Constant indexing - use global offset. */2770if (first_component != 0) {2771unsigned read_components = num_components + first_component;2772fs_reg tmp = bld.vgrf(dst.type, read_components);2773inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);2774for (unsigned i = 0; i < num_components; i++) {2775bld.MOV(offset(dst, bld, i),2776offset(tmp, bld, i + first_component));2777}2778} else {2779inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);2780}2781inst->offset = imm_offset;2782inst->mlen = 1;2783} else {2784/* Indirect indexing - use per-slot offsets as well. 
*/2785const fs_reg srcs[] = { icp_handle, indirect_offset };2786fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);2787bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);2788if (first_component != 0) {2789unsigned read_components = num_components + first_component;2790fs_reg tmp = bld.vgrf(dst.type, read_components);2791inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,2792payload);2793for (unsigned i = 0; i < num_components; i++) {2794bld.MOV(offset(dst, bld, i),2795offset(tmp, bld, i + first_component));2796}2797} else {2798inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,2799payload);2800}2801inst->offset = imm_offset;2802inst->mlen = 2;2803}2804inst->size_written = (num_components + first_component) *2805inst->dst.component_size(inst->exec_size);28062807/* Copy the temporary to the destination to deal with writemasking.2808*2809* Also attempt to deal with gl_PointSize being in the .w component.2810*/2811if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {2812assert(type_sz(dst.type) == 4);2813inst->dst = bld.vgrf(dst.type, 4);2814inst->size_written = 4 * REG_SIZE;2815bld.MOV(dst, offset(inst->dst, bld, 3));2816}2817break;2818}28192820case nir_intrinsic_load_output:2821case nir_intrinsic_load_per_vertex_output: {2822assert(nir_dest_bit_size(instr->dest) == 32);2823fs_reg indirect_offset = get_indirect_offset(instr);2824unsigned imm_offset = instr->const_index[0];2825unsigned first_component = nir_intrinsic_component(instr);28262827struct brw_reg output_handles = get_tcs_output_urb_handle();28282829fs_inst *inst;2830if (indirect_offset.file == BAD_FILE) {2831/* This MOV replicates the output handle to all enabled channels2832* is SINGLE_PATCH mode.2833*/2834fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);2835bld.MOV(patch_handle, output_handles);28362837{2838if (first_component != 0) {2839unsigned read_components =2840instr->num_components + first_component;2841fs_reg tmp = bld.vgrf(dst.type, read_components);2842inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,2843patch_handle);2844inst->size_written = read_components * REG_SIZE;2845for (unsigned i = 0; i < instr->num_components; i++) {2846bld.MOV(offset(dst, bld, i),2847offset(tmp, bld, i + first_component));2848}2849} else {2850inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,2851patch_handle);2852inst->size_written = instr->num_components * REG_SIZE;2853}2854inst->offset = imm_offset;2855inst->mlen = 1;2856}2857} else {2858/* Indirect indexing - use per-slot offsets as well. 
*/2859const fs_reg srcs[] = { output_handles, indirect_offset };2860fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);2861bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);2862if (first_component != 0) {2863unsigned read_components =2864instr->num_components + first_component;2865fs_reg tmp = bld.vgrf(dst.type, read_components);2866inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,2867payload);2868inst->size_written = read_components * REG_SIZE;2869for (unsigned i = 0; i < instr->num_components; i++) {2870bld.MOV(offset(dst, bld, i),2871offset(tmp, bld, i + first_component));2872}2873} else {2874inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,2875payload);2876inst->size_written = instr->num_components * REG_SIZE;2877}2878inst->offset = imm_offset;2879inst->mlen = 2;2880}2881break;2882}28832884case nir_intrinsic_store_output:2885case nir_intrinsic_store_per_vertex_output: {2886assert(nir_src_bit_size(instr->src[0]) == 32);2887fs_reg value = get_nir_src(instr->src[0]);2888fs_reg indirect_offset = get_indirect_offset(instr);2889unsigned imm_offset = instr->const_index[0];2890unsigned mask = instr->const_index[1];2891unsigned header_regs = 0;2892struct brw_reg output_handles = get_tcs_output_urb_handle();28932894fs_reg srcs[7];2895srcs[header_regs++] = output_handles;28962897if (indirect_offset.file != BAD_FILE) {2898srcs[header_regs++] = indirect_offset;2899}29002901if (mask == 0)2902break;29032904unsigned num_components = util_last_bit(mask);2905enum opcode opcode;29062907/* We can only pack two 64-bit components in a single message, so send2908* 2 messages if we have more components2909*/2910unsigned first_component = nir_intrinsic_component(instr);2911mask = mask << first_component;29122913if (mask != WRITEMASK_XYZW) {2914srcs[header_regs++] = brw_imm_ud(mask << 16);2915opcode = indirect_offset.file != BAD_FILE ?2916SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :2917SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;2918} else {2919opcode = indirect_offset.file != BAD_FILE ?2920SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT :2921SHADER_OPCODE_URB_WRITE_SIMD8;2922}29232924for (unsigned i = 0; i < num_components; i++) {2925if (!(mask & (1 << (i + first_component))))2926continue;29272928srcs[header_regs + i + first_component] = offset(value, bld, i);2929}29302931unsigned mlen = header_regs + num_components + first_component;2932fs_reg payload =2933bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);2934bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs);29352936fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload);2937inst->offset = imm_offset;2938inst->mlen = mlen;2939break;2940}29412942default:2943nir_emit_intrinsic(bld, instr);2944break;2945}2946}29472948void2949fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,2950nir_intrinsic_instr *instr)2951{2952assert(stage == MESA_SHADER_TESS_EVAL);2953struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data);29542955fs_reg dest;2956if (nir_intrinsic_infos[instr->intrinsic].has_dest)2957dest = get_nir_dest(instr->dest);29582959switch (instr->intrinsic) {2960case nir_intrinsic_load_primitive_id:2961bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1)));2962break;2963case nir_intrinsic_load_tess_coord:2964/* gl_TessCoord is part of the payload in g1-3 */2965for (unsigned i = 0; i < 3; i++) {2966bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0)));2967}2968break;29692970case nir_intrinsic_load_input:2971case nir_intrinsic_load_per_vertex_input: {2972assert(nir_dest_bit_size(instr->dest) == 32);2973fs_reg indirect_offset = 
get_indirect_offset(instr);2974unsigned imm_offset = instr->const_index[0];2975unsigned first_component = nir_intrinsic_component(instr);29762977fs_inst *inst;2978if (indirect_offset.file == BAD_FILE) {2979/* Arbitrarily only push up to 32 vec4 slots worth of data,2980* which is 16 registers (since each holds 2 vec4 slots).2981*/2982const unsigned max_push_slots = 32;2983if (imm_offset < max_push_slots) {2984fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type);2985for (int i = 0; i < instr->num_components; i++) {2986unsigned comp = 4 * (imm_offset % 2) + i + first_component;2987bld.MOV(offset(dest, bld, i), component(src, comp));2988}29892990tes_prog_data->base.urb_read_length =2991MAX2(tes_prog_data->base.urb_read_length,2992(imm_offset / 2) + 1);2993} else {2994/* Replicate the patch handle to all enabled channels */2995const fs_reg srcs[] = {2996retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)2997};2998fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);2999bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0);30003001if (first_component != 0) {3002unsigned read_components =3003instr->num_components + first_component;3004fs_reg tmp = bld.vgrf(dest.type, read_components);3005inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,3006patch_handle);3007inst->size_written = read_components * REG_SIZE;3008for (unsigned i = 0; i < instr->num_components; i++) {3009bld.MOV(offset(dest, bld, i),3010offset(tmp, bld, i + first_component));3011}3012} else {3013inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest,3014patch_handle);3015inst->size_written = instr->num_components * REG_SIZE;3016}3017inst->mlen = 1;3018inst->offset = imm_offset;3019}3020} else {3021/* Indirect indexing - use per-slot offsets as well. */30223023/* We can only read two double components with each URB read, so3024* we send two read messages in that case, each one loading up to3025* two double components.3026*/3027unsigned num_components = instr->num_components;3028const fs_reg srcs[] = {3029retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),3030indirect_offset3031};3032fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);3033bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);30343035if (first_component != 0) {3036unsigned read_components =3037num_components + first_component;3038fs_reg tmp = bld.vgrf(dest.type, read_components);3039inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,3040payload);3041for (unsigned i = 0; i < num_components; i++) {3042bld.MOV(offset(dest, bld, i),3043offset(tmp, bld, i + first_component));3044}3045} else {3046inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest,3047payload);3048}3049inst->mlen = 2;3050inst->offset = imm_offset;3051inst->size_written = (num_components + first_component) *3052inst->dst.component_size(inst->exec_size);3053}3054break;3055}3056default:3057nir_emit_intrinsic(bld, instr);3058break;3059}3060}30613062void3063fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,3064nir_intrinsic_instr *instr)3065{3066assert(stage == MESA_SHADER_GEOMETRY);3067fs_reg indirect_offset;30683069fs_reg dest;3070if (nir_intrinsic_infos[instr->intrinsic].has_dest)3071dest = get_nir_dest(instr->dest);30723073switch (instr->intrinsic) {3074case nir_intrinsic_load_primitive_id:3075assert(stage == MESA_SHADER_GEOMETRY);3076assert(brw_gs_prog_data(prog_data)->include_primitive_id);3077bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),3078retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));3079break;30803081case nir_intrinsic_load_input:3082unreachable("load_input intrinsics are 
invalid for the GS stage");30833084case nir_intrinsic_load_per_vertex_input:3085emit_gs_input_load(dest, instr->src[0], instr->const_index[0],3086instr->src[1], instr->num_components,3087nir_intrinsic_component(instr));3088break;30893090case nir_intrinsic_emit_vertex_with_counter:3091emit_gs_vertex(instr->src[0], instr->const_index[0]);3092break;30933094case nir_intrinsic_end_primitive_with_counter:3095emit_gs_end_primitive(instr->src[0]);3096break;30973098case nir_intrinsic_set_vertex_and_primitive_count:3099bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));3100break;31013102case nir_intrinsic_load_invocation_id: {3103fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID];3104assert(val.file != BAD_FILE);3105dest.type = val.type;3106bld.MOV(dest, val);3107break;3108}31093110default:3111nir_emit_intrinsic(bld, instr);3112break;3113}3114}31153116/**3117* Fetch the current render target layer index.3118*/3119static fs_reg3120fetch_render_target_array_index(const fs_builder &bld)3121{3122if (bld.shader->devinfo->ver >= 12) {3123/* The render target array index is provided in the thread payload as3124* bits 26:16 of r1.1.3125*/3126const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);3127bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 3),3128brw_imm_uw(0x7ff));3129return idx;3130} else if (bld.shader->devinfo->ver >= 6) {3131/* The render target array index is provided in the thread payload as3132* bits 26:16 of r0.0.3133*/3134const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);3135bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1),3136brw_imm_uw(0x7ff));3137return idx;3138} else {3139/* Pre-SNB we only ever render into the first layer of the framebuffer3140* since layered rendering is not implemented.3141*/3142return brw_imm_ud(0);3143}3144}31453146/**3147* Fake non-coherent framebuffer read implemented using TXF to fetch from the3148* framebuffer at the current fragment coordinates and sample index.3149*/3150fs_inst *3151fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst,3152unsigned target)3153{3154const struct intel_device_info *devinfo = bld.shader->devinfo;31553156assert(bld.shader->stage == MESA_SHADER_FRAGMENT);3157const brw_wm_prog_key *wm_key =3158reinterpret_cast<const brw_wm_prog_key *>(key);3159assert(!wm_key->coherent_fb_fetch);3160const struct brw_wm_prog_data *wm_prog_data =3161brw_wm_prog_data(stage_prog_data);31623163/* Calculate the surface index relative to the start of the texture binding3164* table block, since that's what the texturing messages expect.3165*/3166const unsigned surface = target +3167wm_prog_data->binding_table.render_target_read_start -3168wm_prog_data->base.binding_table.texture_start;31693170/* Calculate the fragment coordinates. */3171const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);3172bld.MOV(offset(coords, bld, 0), pixel_x);3173bld.MOV(offset(coords, bld, 1), pixel_y);3174bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));31753176/* Calculate the sample index and MCS payload when multisampling. 
Luckily3177* the MCS fetch message behaves deterministically for UMS surfaces, so it3178* shouldn't be necessary to recompile based on whether the framebuffer is3179* CMS or UMS.3180*/3181if (wm_key->multisample_fbo &&3182nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)3183nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();31843185const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID];3186const fs_reg mcs = wm_key->multisample_fbo ?3187emit_mcs_fetch(coords, 3, brw_imm_ud(surface), fs_reg()) : fs_reg();31883189/* Use either a normal or a CMS texel fetch message depending on whether3190* the framebuffer is single or multisample. On SKL+ use the wide CMS3191* message just in case the framebuffer uses 16x multisampling, it should3192* be equivalent to the normal CMS fetch for lower multisampling modes.3193*/3194const opcode op = !wm_key->multisample_fbo ? SHADER_OPCODE_TXF_LOGICAL :3195devinfo->ver >= 9 ? SHADER_OPCODE_TXF_CMS_W_LOGICAL :3196SHADER_OPCODE_TXF_CMS_LOGICAL;31973198/* Emit the instruction. */3199fs_reg srcs[TEX_LOGICAL_NUM_SRCS];3200srcs[TEX_LOGICAL_SRC_COORDINATE] = coords;3201srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_ud(0);3202srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = sample;3203srcs[TEX_LOGICAL_SRC_MCS] = mcs;3204srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(surface);3205srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0);3206srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_ud(3);3207srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_ud(0);32083209fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));3210inst->size_written = 4 * inst->dst.component_size(inst->exec_size);32113212return inst;3213}32143215/**3216* Actual coherent framebuffer read implemented using the native render target3217* read message. Requires SKL+.3218*/3219static fs_inst *3220emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target)3221{3222assert(bld.shader->devinfo->ver >= 9);3223fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst);3224inst->target = target;3225inst->size_written = 4 * inst->dst.component_size(inst->exec_size);32263227return inst;3228}32293230static fs_reg3231alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n)3232{3233if (n && regs[0].file != BAD_FILE) {3234return regs[0];32353236} else {3237const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size);32383239for (unsigned i = 0; i < n; i++)3240regs[i] = tmp;32413242return tmp;3243}3244}32453246static fs_reg3247alloc_frag_output(fs_visitor *v, unsigned location)3248{3249assert(v->stage == MESA_SHADER_FRAGMENT);3250const brw_wm_prog_key *const key =3251reinterpret_cast<const brw_wm_prog_key *>(v->key);3252const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION);3253const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX);32543255if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1))3256return alloc_temporary(v->bld, 4, &v->dual_src_output, 1);32573258else if (l == FRAG_RESULT_COLOR)3259return alloc_temporary(v->bld, 4, v->outputs,3260MAX2(key->nr_color_regions, 1));32613262else if (l == FRAG_RESULT_DEPTH)3263return alloc_temporary(v->bld, 1, &v->frag_depth, 1);32643265else if (l == FRAG_RESULT_STENCIL)3266return alloc_temporary(v->bld, 1, &v->frag_stencil, 1);32673268else if (l == FRAG_RESULT_SAMPLE_MASK)3269return alloc_temporary(v->bld, 1, &v->sample_mask, 1);32703271else if (l >= FRAG_RESULT_DATA0 &&3272l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS)3273return alloc_temporary(v->bld, 4,3274&v->outputs[l - FRAG_RESULT_DATA0], 
1);32753276else3277unreachable("Invalid location");3278}32793280void3281fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,3282nir_intrinsic_instr *instr)3283{3284assert(stage == MESA_SHADER_FRAGMENT);32853286fs_reg dest;3287if (nir_intrinsic_infos[instr->intrinsic].has_dest)3288dest = get_nir_dest(instr->dest);32893290switch (instr->intrinsic) {3291case nir_intrinsic_load_front_face:3292bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),3293*emit_frontfacing_interpolation());3294break;32953296case nir_intrinsic_load_sample_pos: {3297fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];3298assert(sample_pos.file != BAD_FILE);3299dest.type = sample_pos.type;3300bld.MOV(dest, sample_pos);3301bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));3302break;3303}33043305case nir_intrinsic_load_layer_id:3306dest.type = BRW_REGISTER_TYPE_UD;3307bld.MOV(dest, fetch_render_target_array_index(bld));3308break;33093310case nir_intrinsic_is_helper_invocation: {3311/* Unlike the regular gl_HelperInvocation, that is defined at dispatch,3312* the helperInvocationEXT() (aka SpvOpIsHelperInvocationEXT) takes into3313* consideration demoted invocations. That information is stored in3314* f0.1.3315*/3316dest.type = BRW_REGISTER_TYPE_UD;33173318bld.MOV(dest, brw_imm_ud(0));33193320fs_inst *mov = bld.MOV(dest, brw_imm_ud(~0));3321mov->predicate = BRW_PREDICATE_NORMAL;3322mov->predicate_inverse = true;3323mov->flag_subreg = sample_mask_flag_subreg(this);3324break;3325}33263327case nir_intrinsic_load_helper_invocation:3328case nir_intrinsic_load_sample_mask_in:3329case nir_intrinsic_load_sample_id:3330case nir_intrinsic_load_frag_shading_rate: {3331gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);3332fs_reg val = nir_system_values[sv];3333assert(val.file != BAD_FILE);3334dest.type = val.type;3335bld.MOV(dest, val);3336break;3337}33383339case nir_intrinsic_store_output: {3340const fs_reg src = get_nir_src(instr->src[0]);3341const unsigned store_offset = nir_src_as_uint(instr->src[1]);3342const unsigned location = nir_intrinsic_base(instr) +3343SET_FIELD(store_offset, BRW_NIR_FRAG_OUTPUT_LOCATION);3344const fs_reg new_dest = retype(alloc_frag_output(this, location),3345src.type);33463347for (unsigned j = 0; j < instr->num_components; j++)3348bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j),3349offset(src, bld, j));33503351break;3352}33533354case nir_intrinsic_load_output: {3355const unsigned l = GET_FIELD(nir_intrinsic_base(instr),3356BRW_NIR_FRAG_OUTPUT_LOCATION);3357assert(l >= FRAG_RESULT_DATA0);3358const unsigned load_offset = nir_src_as_uint(instr->src[0]);3359const unsigned target = l - FRAG_RESULT_DATA0 + load_offset;3360const fs_reg tmp = bld.vgrf(dest.type, 4);33613362if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch)3363emit_coherent_fb_read(bld, tmp, target);3364else3365emit_non_coherent_fb_read(bld, tmp, target);33663367for (unsigned j = 0; j < instr->num_components; j++) {3368bld.MOV(offset(dest, bld, j),3369offset(tmp, bld, nir_intrinsic_component(instr) + j));3370}33713372break;3373}33743375case nir_intrinsic_demote:3376case nir_intrinsic_discard:3377case nir_intrinsic_terminate:3378case nir_intrinsic_demote_if:3379case nir_intrinsic_discard_if:3380case nir_intrinsic_terminate_if: {3381/* We track our discarded pixels in f0.1/f1.0. By predicating on it, we3382* can update just the flag bits that aren't yet discarded. 
If there's3383* no condition, we emit a CMP of g0 != g0, so all currently executing3384* channels will get turned off.3385*/3386fs_inst *cmp = NULL;3387if (instr->intrinsic == nir_intrinsic_demote_if ||3388instr->intrinsic == nir_intrinsic_discard_if ||3389instr->intrinsic == nir_intrinsic_terminate_if) {3390nir_alu_instr *alu = nir_src_as_alu_instr(instr->src[0]);33913392if (alu != NULL &&3393alu->op != nir_op_bcsel &&3394(devinfo->ver > 5 ||3395(alu->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) != BRW_NIR_BOOLEAN_NEEDS_RESOLVE ||3396alu->op == nir_op_fneu32 || alu->op == nir_op_feq32 ||3397alu->op == nir_op_flt32 || alu->op == nir_op_fge32 ||3398alu->op == nir_op_ine32 || alu->op == nir_op_ieq32 ||3399alu->op == nir_op_ilt32 || alu->op == nir_op_ige32 ||3400alu->op == nir_op_ult32 || alu->op == nir_op_uge32)) {3401/* Re-emit the instruction that generated the Boolean value, but3402* do not store it. Since this instruction will be conditional,3403* other instructions that want to use the real Boolean value may3404* get garbage. This was a problem for piglit's fs-discard-exit-23405* test.3406*3407* Ideally we'd detect that the instruction cannot have a3408* conditional modifier before emitting the instructions. Alas,3409* that is nigh impossible. Instead, we're going to assume the3410* instruction (or last instruction) generated can have a3411* conditional modifier. If it cannot, fallback to the old-style3412* compare, and hope dead code elimination will clean up the3413* extra instructions generated.3414*/3415nir_emit_alu(bld, alu, false);34163417cmp = (fs_inst *) instructions.get_tail();3418if (cmp->conditional_mod == BRW_CONDITIONAL_NONE) {3419if (cmp->can_do_cmod())3420cmp->conditional_mod = BRW_CONDITIONAL_Z;3421else3422cmp = NULL;3423} else {3424/* The old sequence that would have been generated is,3425* basically, bool_result == false. This is equivalent to3426* !bool_result, so negate the old modifier.3427*/3428cmp->conditional_mod = brw_negate_cmod(cmp->conditional_mod);3429}3430}34313432if (cmp == NULL) {3433cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]),3434brw_imm_d(0), BRW_CONDITIONAL_Z);3435}3436} else {3437fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),3438BRW_REGISTER_TYPE_UW));3439cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);3440}34413442cmp->predicate = BRW_PREDICATE_NORMAL;3443cmp->flag_subreg = sample_mask_flag_subreg(this);34443445fs_inst *jump = bld.emit(BRW_OPCODE_HALT);3446jump->flag_subreg = sample_mask_flag_subreg(this);3447jump->predicate_inverse = true;34483449if (instr->intrinsic == nir_intrinsic_terminate ||3450instr->intrinsic == nir_intrinsic_terminate_if) {3451jump->predicate = BRW_PREDICATE_NORMAL;3452} else {3453/* Only jump when the whole quad is demoted. 
For historical3454* reasons this is also used for discard.3455*/3456jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H;3457}34583459if (devinfo->ver < 7)3460limit_dispatch_width(346116, "Fragment discard/demote not implemented in SIMD32 mode.\n");3462break;3463}34643465case nir_intrinsic_load_input: {3466/* load_input is only used for flat inputs */3467assert(nir_dest_bit_size(instr->dest) == 32);3468unsigned base = nir_intrinsic_base(instr);3469unsigned comp = nir_intrinsic_component(instr);3470unsigned num_components = instr->num_components;34713472/* Special case fields in the VUE header */3473if (base == VARYING_SLOT_LAYER)3474comp = 1;3475else if (base == VARYING_SLOT_VIEWPORT)3476comp = 2;34773478for (unsigned int i = 0; i < num_components; i++) {3479bld.MOV(offset(dest, bld, i),3480retype(component(interp_reg(base, comp + i), 3), dest.type));3481}3482break;3483}34843485case nir_intrinsic_load_fs_input_interp_deltas: {3486assert(stage == MESA_SHADER_FRAGMENT);3487assert(nir_src_as_uint(instr->src[0]) == 0);3488fs_reg interp = interp_reg(nir_intrinsic_base(instr),3489nir_intrinsic_component(instr));3490dest.type = BRW_REGISTER_TYPE_F;3491bld.MOV(offset(dest, bld, 0), component(interp, 3));3492bld.MOV(offset(dest, bld, 1), component(interp, 1));3493bld.MOV(offset(dest, bld, 2), component(interp, 0));3494break;3495}34963497case nir_intrinsic_load_barycentric_pixel:3498case nir_intrinsic_load_barycentric_centroid:3499case nir_intrinsic_load_barycentric_sample: {3500/* Use the delta_xy values computed from the payload */3501const glsl_interp_mode interp_mode =3502(enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);3503enum brw_barycentric_mode bary =3504brw_barycentric_mode(interp_mode, instr->intrinsic);3505const fs_reg srcs[] = { offset(this->delta_xy[bary], bld, 0),3506offset(this->delta_xy[bary], bld, 1) };3507bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);3508break;3509}35103511case nir_intrinsic_load_barycentric_at_sample: {3512const glsl_interp_mode interpolation =3513(enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);35143515if (nir_src_is_const(instr->src[0])) {3516unsigned msg_data = nir_src_as_uint(instr->src[0]) << 4;35173518emit_pixel_interpolater_send(bld,3519FS_OPCODE_INTERPOLATE_AT_SAMPLE,3520dest,3521fs_reg(), /* src */3522brw_imm_ud(msg_data),3523interpolation);3524} else {3525const fs_reg sample_src = retype(get_nir_src(instr->src[0]),3526BRW_REGISTER_TYPE_UD);35273528if (nir_src_is_dynamically_uniform(instr->src[0])) {3529const fs_reg sample_id = bld.emit_uniformize(sample_src);3530const fs_reg msg_data = vgrf(glsl_type::uint_type);3531bld.exec_all().group(1, 0)3532.SHL(msg_data, sample_id, brw_imm_ud(4u));3533emit_pixel_interpolater_send(bld,3534FS_OPCODE_INTERPOLATE_AT_SAMPLE,3535dest,3536fs_reg(), /* src */3537component(msg_data, 0),3538interpolation);3539} else {3540/* Make a loop that sends a message to the pixel interpolater3541* for the sample number in each live channel. 
If there are3542* multiple channels with the same sample number then these3543* will be handled simultaneously with a single interation of3544* the loop.3545*/3546bld.emit(BRW_OPCODE_DO);35473548/* Get the next live sample number into sample_id_reg */3549const fs_reg sample_id = bld.emit_uniformize(sample_src);35503551/* Set the flag register so that we can perform the send3552* message on all channels that have the same sample number3553*/3554bld.CMP(bld.null_reg_ud(),3555sample_src, sample_id,3556BRW_CONDITIONAL_EQ);3557const fs_reg msg_data = vgrf(glsl_type::uint_type);3558bld.exec_all().group(1, 0)3559.SHL(msg_data, sample_id, brw_imm_ud(4u));3560fs_inst *inst =3561emit_pixel_interpolater_send(bld,3562FS_OPCODE_INTERPOLATE_AT_SAMPLE,3563dest,3564fs_reg(), /* src */3565component(msg_data, 0),3566interpolation);3567set_predicate(BRW_PREDICATE_NORMAL, inst);35683569/* Continue the loop if there are any live channels left */3570set_predicate_inv(BRW_PREDICATE_NORMAL,3571true, /* inverse */3572bld.emit(BRW_OPCODE_WHILE));3573}3574}3575break;3576}35773578case nir_intrinsic_load_barycentric_at_offset: {3579const glsl_interp_mode interpolation =3580(enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);35813582nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);35833584if (const_offset) {3585assert(nir_src_bit_size(instr->src[0]) == 32);3586unsigned off_x = const_offset[0].u32 & 0xf;3587unsigned off_y = const_offset[1].u32 & 0xf;35883589emit_pixel_interpolater_send(bld,3590FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,3591dest,3592fs_reg(), /* src */3593brw_imm_ud(off_x | (off_y << 4)),3594interpolation);3595} else {3596fs_reg src = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_D);3597const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;3598emit_pixel_interpolater_send(bld,3599opcode,3600dest,3601src,3602brw_imm_ud(0u),3603interpolation);3604}3605break;3606}36073608case nir_intrinsic_load_frag_coord:3609emit_fragcoord_interpolation(dest);3610break;36113612case nir_intrinsic_load_interpolated_input: {3613assert(instr->src[0].ssa &&3614instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);3615nir_intrinsic_instr *bary_intrinsic =3616nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);3617nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;3618enum glsl_interp_mode interp_mode =3619(enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic);3620fs_reg dst_xy;36213622if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||3623bary_intrin == nir_intrinsic_load_barycentric_at_sample) {3624/* Use the result of the PI message. 
*/3625dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F);3626} else {3627/* Use the delta_xy values computed from the payload */3628enum brw_barycentric_mode bary =3629brw_barycentric_mode(interp_mode, bary_intrin);3630dst_xy = this->delta_xy[bary];3631}36323633for (unsigned int i = 0; i < instr->num_components; i++) {3634fs_reg interp =3635component(interp_reg(nir_intrinsic_base(instr),3636nir_intrinsic_component(instr) + i), 0);3637interp.type = BRW_REGISTER_TYPE_F;3638dest.type = BRW_REGISTER_TYPE_F;36393640if (devinfo->ver < 6 && interp_mode == INTERP_MODE_SMOOTH) {3641fs_reg tmp = vgrf(glsl_type::float_type);3642bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp);3643bld.MUL(offset(dest, bld, i), tmp, this->pixel_w);3644} else {3645bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp);3646}3647}3648break;3649}36503651default:3652nir_emit_intrinsic(bld, instr);3653break;3654}3655}36563657void3658fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,3659nir_intrinsic_instr *instr)3660{3661assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL);3662struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);36633664fs_reg dest;3665if (nir_intrinsic_infos[instr->intrinsic].has_dest)3666dest = get_nir_dest(instr->dest);36673668switch (instr->intrinsic) {3669case nir_intrinsic_control_barrier:3670/* The whole workgroup fits in a single HW thread, so all the3671* invocations are already executed lock-step. Instead of an actual3672* barrier just emit a scheduling fence, that will generate no code.3673*/3674if (!nir->info.workgroup_size_variable &&3675workgroup_size() <= dispatch_width) {3676bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE);3677break;3678}36793680emit_barrier();3681cs_prog_data->uses_barrier = true;3682break;36833684case nir_intrinsic_load_subgroup_id:3685if (devinfo->verx10 >= 125)3686bld.AND(retype(dest, BRW_REGISTER_TYPE_UD),3687retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),3688brw_imm_ud(INTEL_MASK(7, 0)));3689else3690bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), subgroup_id);3691break;36923693case nir_intrinsic_load_local_invocation_id:3694case nir_intrinsic_load_workgroup_id: {3695gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);3696fs_reg val = nir_system_values[sv];3697assert(val.file != BAD_FILE);3698dest.type = val.type;3699for (unsigned i = 0; i < 3; i++)3700bld.MOV(offset(dest, bld, i), offset(val, bld, i));3701break;3702}37033704case nir_intrinsic_load_num_workgroups: {3705assert(nir_dest_bit_size(instr->dest) == 32);3706const unsigned surface =3707cs_prog_data->binding_table.work_groups_start;37083709cs_prog_data->uses_num_work_groups = true;37103711fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];3712srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(surface);3713srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);3714srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(3); /* num components */3715srcs[SURFACE_LOGICAL_SRC_ADDRESS] = brw_imm_ud(0);3716srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);3717fs_inst *inst =3718bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,3719dest, srcs, SURFACE_LOGICAL_NUM_SRCS);3720inst->size_written = 3 * dispatch_width * 4;3721break;3722}37233724case nir_intrinsic_shared_atomic_add:3725case nir_intrinsic_shared_atomic_imin:3726case nir_intrinsic_shared_atomic_umin:3727case nir_intrinsic_shared_atomic_imax:3728case nir_intrinsic_shared_atomic_umax:3729case nir_intrinsic_shared_atomic_and:3730case nir_intrinsic_shared_atomic_or:3731case 
nir_intrinsic_shared_atomic_xor:3732case nir_intrinsic_shared_atomic_exchange:3733case nir_intrinsic_shared_atomic_comp_swap:3734nir_emit_shared_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr);3735break;3736case nir_intrinsic_shared_atomic_fmin:3737case nir_intrinsic_shared_atomic_fmax:3738case nir_intrinsic_shared_atomic_fcomp_swap:3739nir_emit_shared_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr);3740break;37413742case nir_intrinsic_load_shared: {3743assert(devinfo->ver >= 7);3744assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL);37453746const unsigned bit_size = nir_dest_bit_size(instr->dest);3747fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];3748srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM);3749srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[0]);3750srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);3751srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);37523753/* Make dest unsigned because that's what the temporary will be */3754dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);37553756/* Read the vector */3757assert(nir_dest_bit_size(instr->dest) <= 32);3758assert(nir_intrinsic_align(instr) > 0);3759if (nir_dest_bit_size(instr->dest) == 32 &&3760nir_intrinsic_align(instr) >= 4) {3761assert(nir_dest_num_components(instr->dest) <= 4);3762srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);3763fs_inst *inst =3764bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,3765dest, srcs, SURFACE_LOGICAL_NUM_SRCS);3766inst->size_written = instr->num_components * dispatch_width * 4;3767} else {3768assert(nir_dest_num_components(instr->dest) == 1);3769srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);37703771fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);3772bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,3773read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);3774bld.MOV(dest, subscript(read_result, dest.type, 0));3775}3776break;3777}37783779case nir_intrinsic_store_shared: {3780assert(devinfo->ver >= 7);3781assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL);37823783const unsigned bit_size = nir_src_bit_size(instr->src[0]);3784fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];3785srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM);3786srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);3787srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);3788srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);37893790fs_reg data = get_nir_src(instr->src[0]);3791data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);37923793assert(nir_src_bit_size(instr->src[0]) <= 32);3794assert(nir_intrinsic_write_mask(instr) ==3795(1u << instr->num_components) - 1);3796assert(nir_intrinsic_align(instr) > 0);3797if (nir_src_bit_size(instr->src[0]) == 32 &&3798nir_intrinsic_align(instr) >= 4) {3799assert(nir_src_num_components(instr->src[0]) <= 4);3800srcs[SURFACE_LOGICAL_SRC_DATA] = data;3801srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);3802bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,3803fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);3804} else {3805assert(nir_src_num_components(instr->src[0]) == 1);3806srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);38073808srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);3809bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);38103811bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,3812fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);3813}3814break;3815}38163817case 
nir_intrinsic_load_workgroup_size: {3818assert(compiler->lower_variable_group_size);3819assert(nir->info.workgroup_size_variable);3820for (unsigned i = 0; i < 3; i++) {3821bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD),3822group_size[i]);3823}3824break;3825}38263827default:3828nir_emit_intrinsic(bld, instr);3829break;3830}3831}38323833void3834fs_visitor::nir_emit_bs_intrinsic(const fs_builder &bld,3835nir_intrinsic_instr *instr)3836{3837assert(brw_shader_stage_is_bindless(stage));38383839fs_reg dest;3840if (nir_intrinsic_infos[instr->intrinsic].has_dest)3841dest = get_nir_dest(instr->dest);38423843switch (instr->intrinsic) {3844case nir_intrinsic_load_btd_global_arg_addr_intel:3845bld.MOV(dest, retype(brw_vec1_grf(2, 0), dest.type));3846break;38473848case nir_intrinsic_load_btd_local_arg_addr_intel:3849bld.MOV(dest, retype(brw_vec1_grf(2, 2), dest.type));3850break;38513852case nir_intrinsic_trace_ray_initial_intel:3853bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL,3854bld.null_reg_ud(),3855brw_imm_ud(BRW_RT_BVH_LEVEL_WORLD),3856brw_imm_ud(GEN_RT_TRACE_RAY_INITAL));3857break;38583859case nir_intrinsic_trace_ray_commit_intel:3860bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL,3861bld.null_reg_ud(),3862brw_imm_ud(BRW_RT_BVH_LEVEL_OBJECT),3863brw_imm_ud(GEN_RT_TRACE_RAY_COMMIT));3864break;38653866case nir_intrinsic_trace_ray_continue_intel:3867bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL,3868bld.null_reg_ud(),3869brw_imm_ud(BRW_RT_BVH_LEVEL_OBJECT),3870brw_imm_ud(GEN_RT_TRACE_RAY_CONTINUE));3871break;38723873default:3874nir_emit_intrinsic(bld, instr);3875break;3876}3877}38783879static fs_reg3880brw_nir_reduction_op_identity(const fs_builder &bld,3881nir_op op, brw_reg_type type)3882{3883nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8);3884switch (type_sz(type)) {3885case 1:3886if (type == BRW_REGISTER_TYPE_UB) {3887return brw_imm_uw(value.u8);3888} else {3889assert(type == BRW_REGISTER_TYPE_B);3890return brw_imm_w(value.i8);3891}3892case 2:3893return retype(brw_imm_uw(value.u16), type);3894case 4:3895return retype(brw_imm_ud(value.u32), type);3896case 8:3897if (type == BRW_REGISTER_TYPE_DF)3898return setup_imm_df(bld, value.f64);3899else3900return retype(brw_imm_u64(value.u64), type);3901default:3902unreachable("Invalid type size");3903}3904}39053906static opcode3907brw_op_for_nir_reduction_op(nir_op op)3908{3909switch (op) {3910case nir_op_iadd: return BRW_OPCODE_ADD;3911case nir_op_fadd: return BRW_OPCODE_ADD;3912case nir_op_imul: return BRW_OPCODE_MUL;3913case nir_op_fmul: return BRW_OPCODE_MUL;3914case nir_op_imin: return BRW_OPCODE_SEL;3915case nir_op_umin: return BRW_OPCODE_SEL;3916case nir_op_fmin: return BRW_OPCODE_SEL;3917case nir_op_imax: return BRW_OPCODE_SEL;3918case nir_op_umax: return BRW_OPCODE_SEL;3919case nir_op_fmax: return BRW_OPCODE_SEL;3920case nir_op_iand: return BRW_OPCODE_AND;3921case nir_op_ior: return BRW_OPCODE_OR;3922case nir_op_ixor: return BRW_OPCODE_XOR;3923default:3924unreachable("Invalid reduction operation");3925}3926}39273928static brw_conditional_mod3929brw_cond_mod_for_nir_reduction_op(nir_op op)3930{3931switch (op) {3932case nir_op_iadd: return BRW_CONDITIONAL_NONE;3933case nir_op_fadd: return BRW_CONDITIONAL_NONE;3934case nir_op_imul: return BRW_CONDITIONAL_NONE;3935case nir_op_fmul: return BRW_CONDITIONAL_NONE;3936case nir_op_imin: return BRW_CONDITIONAL_L;3937case nir_op_umin: return BRW_CONDITIONAL_L;3938case nir_op_fmin: return BRW_CONDITIONAL_L;3939case nir_op_imax: return BRW_CONDITIONAL_GE;3940case nir_op_umax: return 
      BRW_CONDITIONAL_GE;
   case nir_op_fmax: return BRW_CONDITIONAL_GE;
   case nir_op_iand: return BRW_CONDITIONAL_NONE;
   case nir_op_ior: return BRW_CONDITIONAL_NONE;
   case nir_op_ixor: return BRW_CONDITIONAL_NONE;
   default:
      unreachable("Invalid reduction operation");
   }
}

fs_reg
fs_visitor::get_nir_image_intrinsic_image(const brw::fs_builder &bld,
                                          nir_intrinsic_instr *instr)
{
   fs_reg image = retype(get_nir_src_imm(instr->src[0]), BRW_REGISTER_TYPE_UD);
   fs_reg surf_index = image;

   if (stage_prog_data->binding_table.image_start > 0) {
      if (image.file == BRW_IMMEDIATE_VALUE) {
         surf_index =
            brw_imm_ud(image.d + stage_prog_data->binding_table.image_start);
      } else {
         surf_index = vgrf(glsl_type::uint_type);
         bld.ADD(surf_index, image,
                 brw_imm_d(stage_prog_data->binding_table.image_start));
      }
   }

   return bld.emit_uniformize(surf_index);
}

fs_reg
fs_visitor::get_nir_ssbo_intrinsic_index(const brw::fs_builder &bld,
                                         nir_intrinsic_instr *instr)
{
   /* SSBO stores are weird in that their index is in src[1] */
   const bool is_store =
      instr->intrinsic == nir_intrinsic_store_ssbo ||
      instr->intrinsic == nir_intrinsic_store_ssbo_block_intel;
   const unsigned src = is_store ? 1 : 0;

   if (nir_src_is_const(instr->src[src])) {
      unsigned index = stage_prog_data->binding_table.ssbo_start +
                       nir_src_as_uint(instr->src[src]);
      return brw_imm_ud(index);
   } else {
      fs_reg surf_index = vgrf(glsl_type::uint_type);
      bld.ADD(surf_index, get_nir_src(instr->src[src]),
              brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
      return bld.emit_uniformize(surf_index);
   }
}

/**
 * The offsets we get from NIR act as if each SIMD channel has its own blob
 * of contiguous space.  However, if we actually place each SIMD channel in
 * its own space, we end up with terrible cache performance because each SIMD
 * channel accesses a different cache line even when they're all accessing
 * the same byte offset.  To deal with this problem, we swizzle the address
 * using a simple algorithm which ensures that any time a SIMD message reads
 * or writes the same address, it's all in the same cache line.  We have to
 * keep the bottom two bits fixed so that we can read/write up to a dword at
 * a time and the individual element is contiguous.  We do this by splitting
 * the address as follows:
 *
 *    31                             4-6           2          0
 *    +-------------------------------+------------+----------+
 *    |        Hi address bits        | chan index | addr low |
 *    +-------------------------------+------------+----------+
 *
 * In other words, the bottom two address bits stay, and the top 30 get
 * shifted up so that we can stick the SIMD channel index in the middle.
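 *
 * That is, the non-dword path below computes
 *
 *    swizzled = ((addr & ~0x3) << chan_index_bits) |
 *               (chan_index << 2) | (addr & 0x3)
 *
 * For example (illustrative numbers only): in SIMD8, chan_index_bits == 3,
 * so a per-channel byte address of 0x24 in channel 5 maps to
 * (0x24 << 3) | (5 << 2) | 0 = 0x134.
 *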
This4012* way, we can access 8, 16, or 32-bit elements and, when accessing a 32-bit4013* at the same logical offset, the scratch read/write instruction acts on4014* continuous elements and we get good cache locality.4015*/4016fs_reg4017fs_visitor::swizzle_nir_scratch_addr(const brw::fs_builder &bld,4018const fs_reg &nir_addr,4019bool in_dwords)4020{4021const fs_reg &chan_index =4022nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];4023const unsigned chan_index_bits = ffs(dispatch_width) - 1;40244025fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD);4026if (in_dwords) {4027/* In this case, we know the address is aligned to a DWORD and we want4028* the final address in DWORDs.4029*/4030bld.SHL(addr, nir_addr, brw_imm_ud(chan_index_bits - 2));4031bld.OR(addr, addr, chan_index);4032} else {4033/* This case substantially more annoying because we have to pay4034* attention to those pesky two bottom bits.4035*/4036fs_reg addr_hi = bld.vgrf(BRW_REGISTER_TYPE_UD);4037bld.AND(addr_hi, nir_addr, brw_imm_ud(~0x3u));4038bld.SHL(addr_hi, addr_hi, brw_imm_ud(chan_index_bits));4039fs_reg chan_addr = bld.vgrf(BRW_REGISTER_TYPE_UD);4040bld.SHL(chan_addr, chan_index, brw_imm_ud(2));4041bld.AND(addr, nir_addr, brw_imm_ud(0x3u));4042bld.OR(addr, addr, addr_hi);4043bld.OR(addr, addr, chan_addr);4044}4045return addr;4046}40474048static unsigned4049choose_oword_block_size_dwords(unsigned dwords)4050{4051unsigned block;4052if (dwords >= 32) {4053block = 32;4054} else if (dwords >= 16) {4055block = 16;4056} else {4057block = 8;4058}4059assert(block <= dwords);4060return block;4061}40624063static void4064increment_a64_address(const fs_builder &bld, fs_reg address, uint32_t v)4065{4066if (bld.shader->devinfo->has_64bit_int) {4067bld.ADD(address, address, brw_imm_ud(v));4068} else {4069fs_reg low = retype(address, BRW_REGISTER_TYPE_UD);4070fs_reg high = offset(low, bld, 1);40714072/* Add low and if that overflows, add carry to high. */4073bld.ADD(low, low, brw_imm_ud(v))->conditional_mod = BRW_CONDITIONAL_O;4074bld.ADD(high, high, brw_imm_ud(0x1))->predicate = BRW_PREDICATE_NORMAL;4075}4076}40774078void4079fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)4080{4081fs_reg dest;4082if (nir_intrinsic_infos[instr->intrinsic].has_dest)4083dest = get_nir_dest(instr->dest);40844085switch (instr->intrinsic) {4086case nir_intrinsic_image_load:4087case nir_intrinsic_image_store:4088case nir_intrinsic_image_atomic_add:4089case nir_intrinsic_image_atomic_imin:4090case nir_intrinsic_image_atomic_umin:4091case nir_intrinsic_image_atomic_imax:4092case nir_intrinsic_image_atomic_umax:4093case nir_intrinsic_image_atomic_and:4094case nir_intrinsic_image_atomic_or:4095case nir_intrinsic_image_atomic_xor:4096case nir_intrinsic_image_atomic_exchange:4097case nir_intrinsic_image_atomic_comp_swap:4098case nir_intrinsic_bindless_image_load:4099case nir_intrinsic_bindless_image_store:4100case nir_intrinsic_bindless_image_atomic_add:4101case nir_intrinsic_bindless_image_atomic_imin:4102case nir_intrinsic_bindless_image_atomic_umin:4103case nir_intrinsic_bindless_image_atomic_imax:4104case nir_intrinsic_bindless_image_atomic_umax:4105case nir_intrinsic_bindless_image_atomic_and:4106case nir_intrinsic_bindless_image_atomic_or:4107case nir_intrinsic_bindless_image_atomic_xor:4108case nir_intrinsic_bindless_image_atomic_exchange:4109case nir_intrinsic_bindless_image_atomic_comp_swap: {4110/* Get some metadata from the image intrinsic. 
*/4111const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];41124113fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];41144115switch (instr->intrinsic) {4116case nir_intrinsic_image_load:4117case nir_intrinsic_image_store:4118case nir_intrinsic_image_atomic_add:4119case nir_intrinsic_image_atomic_imin:4120case nir_intrinsic_image_atomic_umin:4121case nir_intrinsic_image_atomic_imax:4122case nir_intrinsic_image_atomic_umax:4123case nir_intrinsic_image_atomic_and:4124case nir_intrinsic_image_atomic_or:4125case nir_intrinsic_image_atomic_xor:4126case nir_intrinsic_image_atomic_exchange:4127case nir_intrinsic_image_atomic_comp_swap:4128srcs[SURFACE_LOGICAL_SRC_SURFACE] =4129get_nir_image_intrinsic_image(bld, instr);4130break;41314132default:4133/* Bindless */4134srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] =4135bld.emit_uniformize(get_nir_src(instr->src[0]));4136break;4137}41384139srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);4140srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] =4141brw_imm_ud(nir_image_intrinsic_coord_components(instr));41424143/* Emit an image load, store or atomic op. */4144if (instr->intrinsic == nir_intrinsic_image_load ||4145instr->intrinsic == nir_intrinsic_bindless_image_load) {4146srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);4147srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);4148fs_inst *inst =4149bld.emit(SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,4150dest, srcs, SURFACE_LOGICAL_NUM_SRCS);4151inst->size_written = instr->num_components * dispatch_width * 4;4152} else if (instr->intrinsic == nir_intrinsic_image_store ||4153instr->intrinsic == nir_intrinsic_bindless_image_store) {4154srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);4155srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[3]);4156srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);4157bld.emit(SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,4158fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);4159} else {4160unsigned num_srcs = info->num_srcs;4161int op = brw_aop_for_nir_intrinsic(instr);4162if (op == BRW_AOP_INC || op == BRW_AOP_DEC) {4163assert(num_srcs == 4);4164num_srcs = 3;4165}41664167srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);41684169fs_reg data;4170if (num_srcs >= 4)4171data = get_nir_src(instr->src[3]);4172if (num_srcs >= 5) {4173fs_reg tmp = bld.vgrf(data.type, 2);4174fs_reg sources[2] = { data, get_nir_src(instr->src[4]) };4175bld.LOAD_PAYLOAD(tmp, sources, 2, 0);4176data = tmp;4177}4178srcs[SURFACE_LOGICAL_SRC_DATA] = data;4179srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);41804181bld.emit(SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,4182dest, srcs, SURFACE_LOGICAL_NUM_SRCS);4183}4184break;4185}41864187case nir_intrinsic_image_size:4188case nir_intrinsic_bindless_image_size: {4189/* Unlike the [un]typed load and store opcodes, the TXS that this turns4190* into will handle the binding table index for us in the geneerator.4191* Incidentally, this means that we can handle bindless with exactly the4192* same code.4193*/4194fs_reg image = retype(get_nir_src_imm(instr->src[0]),4195BRW_REGISTER_TYPE_UD);4196image = bld.emit_uniformize(image);41974198assert(nir_src_as_uint(instr->src[1]) == 0);41994200fs_reg srcs[TEX_LOGICAL_NUM_SRCS];4201if (instr->intrinsic == nir_intrinsic_image_size)4202srcs[TEX_LOGICAL_SRC_SURFACE] = image;4203else4204srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = image;4205srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);4206srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(0);4207srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] 
= brw_imm_d(0);42084209/* Since the image size is always uniform, we can just emit a SIMD84210* query instruction and splat the result out.4211*/4212const fs_builder ubld = bld.exec_all().group(8, 0);42134214fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);4215fs_inst *inst = ubld.emit(SHADER_OPCODE_IMAGE_SIZE_LOGICAL,4216tmp, srcs, ARRAY_SIZE(srcs));4217inst->size_written = 4 * REG_SIZE;42184219for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) {4220if (c == 2 && nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_CUBE) {4221bld.emit(SHADER_OPCODE_INT_QUOTIENT,4222offset(retype(dest, tmp.type), bld, c),4223component(offset(tmp, ubld, c), 0), brw_imm_ud(6));4224} else {4225bld.MOV(offset(retype(dest, tmp.type), bld, c),4226component(offset(tmp, ubld, c), 0));4227}4228}4229break;4230}42314232case nir_intrinsic_image_load_raw_intel: {4233fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];4234srcs[SURFACE_LOGICAL_SRC_SURFACE] =4235get_nir_image_intrinsic_image(bld, instr);4236srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);4237srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);4238srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);4239srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);42404241fs_inst *inst =4242bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,4243dest, srcs, SURFACE_LOGICAL_NUM_SRCS);4244inst->size_written = instr->num_components * dispatch_width * 4;4245break;4246}42474248case nir_intrinsic_image_store_raw_intel: {4249fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];4250srcs[SURFACE_LOGICAL_SRC_SURFACE] =4251get_nir_image_intrinsic_image(bld, instr);4252srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);4253srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[2]);4254srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);4255srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);4256srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);42574258bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,4259fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);4260break;4261}42624263case nir_intrinsic_scoped_barrier:4264assert(nir_intrinsic_execution_scope(instr) == NIR_SCOPE_NONE);4265FALLTHROUGH;4266case nir_intrinsic_group_memory_barrier:4267case nir_intrinsic_memory_barrier_shared:4268case nir_intrinsic_memory_barrier_buffer:4269case nir_intrinsic_memory_barrier_image:4270case nir_intrinsic_memory_barrier:4271case nir_intrinsic_begin_invocation_interlock:4272case nir_intrinsic_end_invocation_interlock: {4273bool l3_fence, slm_fence, tgm_fence = false;4274const enum opcode opcode =4275instr->intrinsic == nir_intrinsic_begin_invocation_interlock ?4276SHADER_OPCODE_INTERLOCK : SHADER_OPCODE_MEMORY_FENCE;42774278switch (instr->intrinsic) {4279case nir_intrinsic_scoped_barrier: {4280nir_variable_mode modes = nir_intrinsic_memory_modes(instr);4281l3_fence = modes & (nir_var_shader_out |4282nir_var_mem_ssbo |4283nir_var_mem_global);4284slm_fence = modes & nir_var_mem_shared;42854286/* NIR currently doesn't have an image mode */4287if (devinfo->has_lsc)4288tgm_fence = modes & nir_var_mem_ssbo;4289break;4290}42914292case nir_intrinsic_begin_invocation_interlock:4293case nir_intrinsic_end_invocation_interlock:4294/* For beginInvocationInterlockARB(), we will generate a memory fence4295* but with a different opcode so that generator can pick SENDC4296* instead of SEND.4297*4298* For endInvocationInterlockARB(), we need to insert a memory fence which4299* stalls in the shader until the memory transactions prior to that4300* fence are 
complete. This ensures that the shader does not end before4301* any writes from its critical section have landed. Otherwise, you can4302* end up with a case where the next invocation on that pixel properly4303* stalls for previous FS invocation on its pixel to complete but4304* doesn't actually wait for the dataport memory transactions from that4305* thread to land before submitting its own.4306*4307* Handling them here will allow the logic for IVB render cache (see4308* below) to be reused.4309*/4310l3_fence = true;4311slm_fence = false;4312break;43134314default:4315l3_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared;4316slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier ||4317instr->intrinsic == nir_intrinsic_memory_barrier ||4318instr->intrinsic == nir_intrinsic_memory_barrier_shared;4319tgm_fence = instr->intrinsic == nir_intrinsic_memory_barrier_image;4320break;4321}43224323if (stage != MESA_SHADER_COMPUTE && stage != MESA_SHADER_KERNEL)4324slm_fence = false;43254326/* If the workgroup fits in a single HW thread, the messages for SLM are4327* processed in-order and the shader itself is already synchronized so4328* the memory fence is not necessary.4329*4330* TODO: Check if applies for many HW threads sharing same Data Port.4331*/4332if (!nir->info.workgroup_size_variable &&4333slm_fence && workgroup_size() <= dispatch_width)4334slm_fence = false;43354336/* Prior to Gfx11, there's only L3 fence, so emit that instead. */4337if (slm_fence && devinfo->ver < 11) {4338slm_fence = false;4339l3_fence = true;4340}43414342/* IVB does typed surface access through the render cache, so we need4343* to flush it too.4344*/4345const bool needs_render_fence =4346devinfo->verx10 == 70;43474348/* Be conservative in Gfx11+ and always stall in a fence. 
Since there4349* are two different fences, and shader might want to synchronize4350* between them.4351*4352* TODO: Use scope and visibility information for the barriers from NIR4353* to make a better decision on whether we need to stall.4354*/4355const bool stall = devinfo->ver >= 11 || needs_render_fence ||4356instr->intrinsic == nir_intrinsic_end_invocation_interlock;43574358const bool commit_enable = stall ||4359devinfo->ver >= 10; /* HSD ES # 1404612949 */43604361unsigned fence_regs_count = 0;4362fs_reg fence_regs[3] = {};43634364const fs_builder ubld = bld.group(8, 0);43654366if (l3_fence) {4367fs_inst *fence =4368ubld.emit(opcode,4369ubld.vgrf(BRW_REGISTER_TYPE_UD),4370brw_vec8_grf(0, 0),4371brw_imm_ud(commit_enable),4372brw_imm_ud(0 /* BTI; ignored for LSC */));43734374fence->sfid = devinfo->has_lsc ?4375GFX12_SFID_UGM :4376GFX7_SFID_DATAPORT_DATA_CACHE;43774378fence_regs[fence_regs_count++] = fence->dst;43794380if (needs_render_fence) {4381fs_inst *render_fence =4382ubld.emit(opcode,4383ubld.vgrf(BRW_REGISTER_TYPE_UD),4384brw_vec8_grf(0, 0),4385brw_imm_ud(commit_enable),4386brw_imm_ud(/* bti */ 0));4387render_fence->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;43884389fence_regs[fence_regs_count++] = render_fence->dst;4390}43914392/* Translate l3_fence into untyped and typed fence on XeHP */4393if (devinfo->has_lsc && tgm_fence) {4394fs_inst *fence =4395ubld.emit(opcode,4396ubld.vgrf(BRW_REGISTER_TYPE_UD),4397brw_vec8_grf(0, 0),4398brw_imm_ud(commit_enable),4399brw_imm_ud(/* ignored */0));44004401fence->sfid = GFX12_SFID_TGM;4402fence_regs[fence_regs_count++] = fence->dst;4403}4404}44054406if (slm_fence) {4407assert(opcode == SHADER_OPCODE_MEMORY_FENCE);4408fs_inst *fence =4409ubld.emit(opcode,4410ubld.vgrf(BRW_REGISTER_TYPE_UD),4411brw_vec8_grf(0, 0),4412brw_imm_ud(commit_enable),4413brw_imm_ud(GFX7_BTI_SLM /* ignored for LSC */));4414if (devinfo->has_lsc)4415fence->sfid = GFX12_SFID_SLM;4416else4417fence->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;44184419fence_regs[fence_regs_count++] = fence->dst;4420}44214422assert(fence_regs_count <= 3);44234424if (stall || fence_regs_count == 0) {4425ubld.exec_all().group(1, 0).emit(4426FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),4427fence_regs, fence_regs_count);4428}44294430break;4431}44324433case nir_intrinsic_memory_barrier_tcs_patch:4434break;44354436case nir_intrinsic_shader_clock: {4437/* We cannot do anything if there is an event, so ignore it for now */4438const fs_reg shader_clock = get_timestamp(bld);4439const fs_reg srcs[] = { component(shader_clock, 0),4440component(shader_clock, 1) };4441bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);4442break;4443}44444445case nir_intrinsic_image_samples:4446/* The driver does not support multi-sampled images. 
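       * The image_samples query therefore just returns a constant 1.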
       */
      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
      break;

   case nir_intrinsic_load_reloc_const_intel: {
      uint32_t id = nir_intrinsic_param_idx(instr);
      bld.emit(SHADER_OPCODE_MOV_RELOC_IMM,
               dest, brw_imm_ud(id));
      break;
   }

   case nir_intrinsic_load_uniform: {
      /* Offsets are in bytes but they should always be aligned to
       * the type size
       */
      assert(instr->const_index[0] % 4 == 0 ||
             instr->const_index[0] % type_sz(dest.type) == 0);

      fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type);

      if (nir_src_is_const(instr->src[0])) {
         unsigned load_offset = nir_src_as_uint(instr->src[0]);
         assert(load_offset % type_sz(dest.type) == 0);
         /* For 16-bit types we add the modulo of the const_index[0]
          * offset to access elements that are not 32-bit aligned.
          */
         src.offset = load_offset + instr->const_index[0] % 4;

         for (unsigned j = 0; j < instr->num_components; j++) {
            bld.MOV(offset(dest, bld, j), offset(src, bld, j));
         }
      } else {
         fs_reg indirect = retype(get_nir_src(instr->src[0]),
                                  BRW_REGISTER_TYPE_UD);

         /* We need to pass a size to the MOV_INDIRECT but we don't want it to
          * go past the end of the uniform.  In order to keep the n'th
          * component from running past, we subtract off the size of all but
          * one component of the vector.
          */
         assert(instr->const_index[1] >=
                instr->num_components * (int) type_sz(dest.type));
         unsigned read_size = instr->const_index[1] -
            (instr->num_components - 1) * type_sz(dest.type);

         bool supports_64bit_indirects =
            !devinfo->is_cherryview && !intel_device_info_is_9lp(devinfo);

         if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
            for (unsigned j = 0; j < instr->num_components; j++) {
               bld.emit(SHADER_OPCODE_MOV_INDIRECT,
                        offset(dest, bld, j), offset(src, bld, j),
                        indirect, brw_imm_ud(read_size));
            }
         } else {
            const unsigned num_mov_indirects =
               type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD);
            /* We read a little bit less per MOV INDIRECT, as they are now
             * 32-bit ones instead of 64-bit.  Fix read_size accordingly.
             */
            const unsigned read_size_32bit = read_size -
               (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD);
            for (unsigned j = 0; j < instr->num_components; j++) {
               for (unsigned i = 0; i < num_mov_indirects; i++) {
                  bld.emit(SHADER_OPCODE_MOV_INDIRECT,
                           subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i),
                           subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i),
                           indirect, brw_imm_ud(read_size_32bit));
               }
            }
         }
      }
      break;
   }

   case nir_intrinsic_load_ubo: {
      fs_reg surf_index;
      if (nir_src_is_const(instr->src[0])) {
         const unsigned index = stage_prog_data->binding_table.ubo_start +
                                nir_src_as_uint(instr->src[0]);
         surf_index = brw_imm_ud(index);
      } else {
         /* The block index is not a constant.
Evaluate the index expression4529* per-channel and add the base UBO index; we have to select a value4530* from any live channel.4531*/4532surf_index = vgrf(glsl_type::uint_type);4533bld.ADD(surf_index, get_nir_src(instr->src[0]),4534brw_imm_ud(stage_prog_data->binding_table.ubo_start));4535surf_index = bld.emit_uniformize(surf_index);4536}45374538if (!nir_src_is_const(instr->src[1])) {4539fs_reg base_offset = retype(get_nir_src(instr->src[1]),4540BRW_REGISTER_TYPE_UD);45414542for (int i = 0; i < instr->num_components; i++)4543VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,4544base_offset, i * type_sz(dest.type),4545nir_dest_bit_size(instr->dest) / 8);45464547prog_data->has_ubo_pull = true;4548} else {4549/* Even if we are loading doubles, a pull constant load will load4550* a 32-bit vec4, so should only reserve vgrf space for that. If we4551* need to load a full dvec4 we will have to emit 2 loads. This is4552* similar to demote_pull_constants(), except that in that case we4553* see individual accesses to each component of the vector and then4554* we let CSE deal with duplicate loads. Here we see a vector access4555* and we have to split it if necessary.4556*/4557const unsigned type_size = type_sz(dest.type);4558const unsigned load_offset = nir_src_as_uint(instr->src[1]);45594560/* See if we've selected this as a push constant candidate */4561if (nir_src_is_const(instr->src[0])) {4562const unsigned ubo_block = nir_src_as_uint(instr->src[0]);4563const unsigned offset_256b = load_offset / 32;45644565fs_reg push_reg;4566for (int i = 0; i < 4; i++) {4567const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];4568if (range->block == ubo_block &&4569offset_256b >= range->start &&4570offset_256b < range->start + range->length) {45714572push_reg = fs_reg(UNIFORM, UBO_START + i, dest.type);4573push_reg.offset = load_offset - 32 * range->start;4574break;4575}4576}45774578if (push_reg.file != BAD_FILE) {4579for (unsigned i = 0; i < instr->num_components; i++) {4580bld.MOV(offset(dest, bld, i),4581byte_offset(push_reg, i * type_size));4582}4583break;4584}4585}45864587prog_data->has_ubo_pull = true;45884589const unsigned block_sz = 64; /* Fetch one cacheline at a time. */4590const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);4591const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);45924593for (unsigned c = 0; c < instr->num_components;) {4594const unsigned base = load_offset + c * type_size;4595/* Number of usable components in the next block-aligned load. 
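             * For example (illustrative numbers only): a dvec4 load at byte
             * offset 48 takes components 0-1 from the cacheline fetched at
             * offset 0 and components 2-3 from the one fetched at offset 64.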
*/4596const unsigned count = MIN2(instr->num_components - c,4597(block_sz - base % block_sz) / type_size);45984599ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,4600packed_consts, surf_index,4601brw_imm_ud(base & ~(block_sz - 1)));46024603const fs_reg consts =4604retype(byte_offset(packed_consts, base & (block_sz - 1)),4605dest.type);46064607for (unsigned d = 0; d < count; d++)4608bld.MOV(offset(dest, bld, c + d), component(consts, d));46094610c += count;4611}4612}4613break;4614}46154616case nir_intrinsic_load_global:4617case nir_intrinsic_load_global_constant: {4618assert(devinfo->ver >= 8);46194620assert(nir_dest_bit_size(instr->dest) <= 32);4621assert(nir_intrinsic_align(instr) > 0);4622if (nir_dest_bit_size(instr->dest) == 32 &&4623nir_intrinsic_align(instr) >= 4) {4624assert(nir_dest_num_components(instr->dest) <= 4);4625fs_inst *inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL,4626dest,4627get_nir_src(instr->src[0]), /* Address */4628fs_reg(), /* No source data */4629brw_imm_ud(instr->num_components));4630inst->size_written = instr->num_components *4631inst->dst.component_size(inst->exec_size);4632} else {4633const unsigned bit_size = nir_dest_bit_size(instr->dest);4634assert(nir_dest_num_components(instr->dest) == 1);4635fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);4636bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL,4637tmp,4638get_nir_src(instr->src[0]), /* Address */4639fs_reg(), /* No source data */4640brw_imm_ud(bit_size));4641bld.MOV(dest, subscript(tmp, dest.type, 0));4642}4643break;4644}46454646case nir_intrinsic_store_global:4647assert(devinfo->ver >= 8);46484649assert(nir_src_bit_size(instr->src[0]) <= 32);4650assert(nir_intrinsic_write_mask(instr) ==4651(1u << instr->num_components) - 1);4652assert(nir_intrinsic_align(instr) > 0);4653if (nir_src_bit_size(instr->src[0]) == 32 &&4654nir_intrinsic_align(instr) >= 4) {4655assert(nir_src_num_components(instr->src[0]) <= 4);4656bld.emit(SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL,4657fs_reg(),4658get_nir_src(instr->src[1]), /* Address */4659get_nir_src(instr->src[0]), /* Data */4660brw_imm_ud(instr->num_components));4661} else {4662assert(nir_src_num_components(instr->src[0]) == 1);4663const unsigned bit_size = nir_src_bit_size(instr->src[0]);4664brw_reg_type data_type =4665brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);4666fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);4667bld.MOV(tmp, retype(get_nir_src(instr->src[0]), data_type));4668bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL,4669fs_reg(),4670get_nir_src(instr->src[1]), /* Address */4671tmp, /* Data */4672brw_imm_ud(nir_src_bit_size(instr->src[0])));4673}4674break;46754676case nir_intrinsic_global_atomic_add:4677case nir_intrinsic_global_atomic_imin:4678case nir_intrinsic_global_atomic_umin:4679case nir_intrinsic_global_atomic_imax:4680case nir_intrinsic_global_atomic_umax:4681case nir_intrinsic_global_atomic_and:4682case nir_intrinsic_global_atomic_or:4683case nir_intrinsic_global_atomic_xor:4684case nir_intrinsic_global_atomic_exchange:4685case nir_intrinsic_global_atomic_comp_swap:4686nir_emit_global_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr);4687break;4688case nir_intrinsic_global_atomic_fmin:4689case nir_intrinsic_global_atomic_fmax:4690case nir_intrinsic_global_atomic_fcomp_swap:4691nir_emit_global_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr);4692break;46934694case nir_intrinsic_load_global_const_block_intel: {4695assert(nir_dest_bit_size(instr->dest) == 32);4696assert(instr->num_components == 8 || 
instr->num_components == 16);46974698const fs_builder ubld = bld.exec_all().group(instr->num_components, 0);4699fs_reg load_val;47004701bool is_pred_const = nir_src_is_const(instr->src[1]);4702if (is_pred_const && nir_src_as_uint(instr->src[1]) == 0) {4703/* In this case, we don't want the UBO load at all. We really4704* shouldn't get here but it's possible.4705*/4706load_val = brw_imm_ud(0);4707} else {4708/* The uniform process may stomp the flag so do this first */4709fs_reg addr = bld.emit_uniformize(get_nir_src(instr->src[0]));47104711load_val = ubld.vgrf(BRW_REGISTER_TYPE_UD);47124713/* If the predicate is constant and we got here, then it's non-zero4714* and we don't need the predicate at all.4715*/4716if (!is_pred_const) {4717/* Load the predicate */4718fs_reg pred = bld.emit_uniformize(get_nir_src(instr->src[1]));4719fs_inst *mov = ubld.MOV(bld.null_reg_d(), pred);4720mov->conditional_mod = BRW_CONDITIONAL_NZ;47214722/* Stomp the destination with 0 if we're OOB */4723mov = ubld.MOV(load_val, brw_imm_ud(0));4724mov->predicate = BRW_PREDICATE_NORMAL;4725mov->predicate_inverse = true;4726}47274728fs_inst *load = ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL,4729load_val, addr,4730fs_reg(), /* No source data */4731brw_imm_ud(instr->num_components));47324733if (!is_pred_const)4734load->predicate = BRW_PREDICATE_NORMAL;4735}47364737/* From the HW perspective, we just did a single SIMD16 instruction4738* which loaded a dword in each SIMD channel. From NIR's perspective,4739* this instruction returns a vec16. Any users of this data in the4740* back-end will expect a vec16 per SIMD channel so we have to emit a4741* pile of MOVs to resolve this discrepancy. Fortunately, copy-prop4742* will generally clean them up for us.4743*/4744for (unsigned i = 0; i < instr->num_components; i++) {4745bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD),4746component(load_val, i));4747}4748break;4749}47504751case nir_intrinsic_load_ssbo: {4752assert(devinfo->ver >= 7);47534754const unsigned bit_size = nir_dest_bit_size(instr->dest);4755fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];4756srcs[SURFACE_LOGICAL_SRC_SURFACE] =4757get_nir_ssbo_intrinsic_index(bld, instr);4758srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);4759srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);4760srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);47614762/* Make dest unsigned because that's what the temporary will be */4763dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);47644765/* Read the vector */4766assert(nir_dest_bit_size(instr->dest) <= 32);4767assert(nir_intrinsic_align(instr) > 0);4768if (nir_dest_bit_size(instr->dest) == 32 &&4769nir_intrinsic_align(instr) >= 4) {4770assert(nir_dest_num_components(instr->dest) <= 4);4771srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);4772fs_inst *inst =4773bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,4774dest, srcs, SURFACE_LOGICAL_NUM_SRCS);4775inst->size_written = instr->num_components * dispatch_width * 4;4776} else {4777assert(nir_dest_num_components(instr->dest) == 1);4778srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);47794780fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);4781bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,4782read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);4783bld.MOV(dest, subscript(read_result, dest.type, 0));4784}4785break;4786}47874788case nir_intrinsic_store_ssbo: {4789assert(devinfo->ver >= 7);47904791const unsigned bit_size = 
nir_src_bit_size(instr->src[0]);4792fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];4793srcs[SURFACE_LOGICAL_SRC_SURFACE] =4794get_nir_ssbo_intrinsic_index(bld, instr);4795srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[2]);4796srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);4797srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);47984799fs_reg data = get_nir_src(instr->src[0]);4800data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);48014802assert(nir_src_bit_size(instr->src[0]) <= 32);4803assert(nir_intrinsic_write_mask(instr) ==4804(1u << instr->num_components) - 1);4805assert(nir_intrinsic_align(instr) > 0);4806if (nir_src_bit_size(instr->src[0]) == 32 &&4807nir_intrinsic_align(instr) >= 4) {4808assert(nir_src_num_components(instr->src[0]) <= 4);4809srcs[SURFACE_LOGICAL_SRC_DATA] = data;4810srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);4811bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,4812fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);4813} else {4814assert(nir_src_num_components(instr->src[0]) == 1);4815srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);48164817srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);4818bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);48194820bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,4821fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);4822}4823break;4824}48254826case nir_intrinsic_store_output: {4827assert(nir_src_bit_size(instr->src[0]) == 32);4828fs_reg src = get_nir_src(instr->src[0]);48294830unsigned store_offset = nir_src_as_uint(instr->src[1]);4831unsigned num_components = instr->num_components;4832unsigned first_component = nir_intrinsic_component(instr);48334834fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,48354 * store_offset), src.type);4836for (unsigned j = 0; j < num_components; j++) {4837bld.MOV(offset(new_dest, bld, j + first_component),4838offset(src, bld, j));4839}4840break;4841}48424843case nir_intrinsic_ssbo_atomic_add:4844case nir_intrinsic_ssbo_atomic_imin:4845case nir_intrinsic_ssbo_atomic_umin:4846case nir_intrinsic_ssbo_atomic_imax:4847case nir_intrinsic_ssbo_atomic_umax:4848case nir_intrinsic_ssbo_atomic_and:4849case nir_intrinsic_ssbo_atomic_or:4850case nir_intrinsic_ssbo_atomic_xor:4851case nir_intrinsic_ssbo_atomic_exchange:4852case nir_intrinsic_ssbo_atomic_comp_swap:4853nir_emit_ssbo_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr);4854break;4855case nir_intrinsic_ssbo_atomic_fmin:4856case nir_intrinsic_ssbo_atomic_fmax:4857case nir_intrinsic_ssbo_atomic_fcomp_swap:4858nir_emit_ssbo_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr);4859break;48604861case nir_intrinsic_get_ssbo_size: {4862assert(nir_src_num_components(instr->src[0]) == 1);4863unsigned ssbo_index = nir_src_is_const(instr->src[0]) ?4864nir_src_as_uint(instr->src[0]) : 0;48654866/* A resinfo's sampler message is used to get the buffer size. The4867* SIMD8's writeback message consists of four registers and SIMD16's4868* writeback message consists of 8 destination registers (two per each4869* component). 
Because we are only interested on the first channel of4870* the first returned component, where resinfo returns the buffer size4871* for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of4872* the dispatch width.4873*/4874const fs_builder ubld = bld.exec_all().group(8, 0);4875fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);4876fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);48774878/* Set LOD = 0 */4879ubld.MOV(src_payload, brw_imm_d(0));48804881const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index;4882fs_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,4883src_payload, brw_imm_ud(index));4884inst->header_size = 0;4885inst->mlen = 1;4886inst->size_written = 4 * REG_SIZE;48874888/* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:4889*4890* "Out-of-bounds checking is always performed at a DWord granularity. If4891* any part of the DWord is out-of-bounds then the whole DWord is4892* considered out-of-bounds."4893*4894* This implies that types with size smaller than 4-bytes need to be4895* padded if they don't complete the last dword of the buffer. But as we4896* need to maintain the original size we need to reverse the padding4897* calculation to return the correct size to know the number of elements4898* of an unsized array. As we stored in the last two bits of the surface4899* size the needed padding for the buffer, we calculate here the4900* original buffer_size reversing the surface_size calculation:4901*4902* surface_size = isl_align(buffer_size, 4) +4903* (isl_align(buffer_size) - buffer_size)4904*4905* buffer_size = surface_size & ~3 - surface_size & 34906*/49074908fs_reg size_aligned4 = ubld.vgrf(BRW_REGISTER_TYPE_UD);4909fs_reg size_padding = ubld.vgrf(BRW_REGISTER_TYPE_UD);4910fs_reg buffer_size = ubld.vgrf(BRW_REGISTER_TYPE_UD);49114912ubld.AND(size_padding, ret_payload, brw_imm_ud(3));4913ubld.AND(size_aligned4, ret_payload, brw_imm_ud(~3));4914ubld.ADD(buffer_size, size_aligned4, negate(size_padding));49154916bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0));4917break;4918}49194920case nir_intrinsic_load_scratch: {4921assert(devinfo->ver >= 7);49224923assert(nir_dest_num_components(instr->dest) == 1);4924const unsigned bit_size = nir_dest_bit_size(instr->dest);4925fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];49264927if (devinfo->verx10 >= 125) {4928const fs_builder ubld = bld.exec_all().group(1, 0);4929fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0);4930ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),4931brw_imm_ud(~0x3ffu));4932srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle;4933} else if (devinfo->ver >= 8) {4934srcs[SURFACE_LOGICAL_SRC_SURFACE] =4935brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);4936} else {4937srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS);4938}49394940srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);4941srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);4942srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);4943const fs_reg nir_addr = get_nir_src(instr->src[0]);49444945/* Make dest unsigned because that's what the temporary will be */4946dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);49474948/* Read the vector */4949assert(nir_dest_num_components(instr->dest) == 1);4950assert(nir_dest_bit_size(instr->dest) <= 32);4951assert(nir_intrinsic_align(instr) > 0);4952if (devinfo->verx10 >= 125) {4953assert(nir_dest_bit_size(instr->dest) == 32 &&4954nir_intrinsic_align(instr) >= 
4);49554956srcs[SURFACE_LOGICAL_SRC_ADDRESS] =4957swizzle_nir_scratch_addr(bld, nir_addr, false);4958srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1);49594960bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,4961dest, srcs, SURFACE_LOGICAL_NUM_SRCS);4962} else if (nir_dest_bit_size(instr->dest) >= 4 &&4963nir_intrinsic_align(instr) >= 4) {4964/* The offset for a DWORD scattered message is in dwords. */4965srcs[SURFACE_LOGICAL_SRC_ADDRESS] =4966swizzle_nir_scratch_addr(bld, nir_addr, true);49674968bld.emit(SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL,4969dest, srcs, SURFACE_LOGICAL_NUM_SRCS);4970} else {4971srcs[SURFACE_LOGICAL_SRC_ADDRESS] =4972swizzle_nir_scratch_addr(bld, nir_addr, false);49734974fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);4975bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,4976read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);4977bld.MOV(dest, read_result);4978}4979break;4980}49814982case nir_intrinsic_store_scratch: {4983assert(devinfo->ver >= 7);49844985assert(nir_src_num_components(instr->src[0]) == 1);4986const unsigned bit_size = nir_src_bit_size(instr->src[0]);4987fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];49884989if (devinfo->verx10 >= 125) {4990const fs_builder ubld = bld.exec_all().group(1, 0);4991fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0);4992ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),4993brw_imm_ud(~0x3ffu));4994srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle;4995} else if (devinfo->ver >= 8) {4996srcs[SURFACE_LOGICAL_SRC_SURFACE] =4997brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);4998} else {4999srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS);5000}50015002srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);5003srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);5004/**5005* While this instruction has side-effects, it should not be predicated5006* on sample mask, because otherwise fs helper invocations would5007* load undefined values from scratch memory. And scratch memory5008* load-stores are produced from operations without side-effects, thus5009* they should not have different behaviour in the helper invocations.5010*/5011srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);5012const fs_reg nir_addr = get_nir_src(instr->src[1]);50135014fs_reg data = get_nir_src(instr->src[0]);5015data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);50165017assert(nir_src_num_components(instr->src[0]) == 1);5018assert(nir_src_bit_size(instr->src[0]) <= 32);5019assert(nir_intrinsic_write_mask(instr) == 1);5020assert(nir_intrinsic_align(instr) > 0);5021if (devinfo->verx10 >= 125) {5022assert(nir_src_bit_size(instr->src[0]) == 32 &&5023nir_intrinsic_align(instr) >= 4);5024srcs[SURFACE_LOGICAL_SRC_DATA] = data;50255026srcs[SURFACE_LOGICAL_SRC_ADDRESS] =5027swizzle_nir_scratch_addr(bld, nir_addr, false);5028srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1);50295030bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,5031dest, srcs, SURFACE_LOGICAL_NUM_SRCS);5032} else if (nir_src_bit_size(instr->src[0]) == 32 &&5033nir_intrinsic_align(instr) >= 4) {5034srcs[SURFACE_LOGICAL_SRC_DATA] = data;50355036/* The offset for a DWORD scattered message is in dwords. 
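          * Passing in_dwords == true makes swizzle_nir_scratch_addr() fold
          * the divide-by-four into its shift, so the address it returns is
          * already in dword units.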
*/5037srcs[SURFACE_LOGICAL_SRC_ADDRESS] =5038swizzle_nir_scratch_addr(bld, nir_addr, true);50395040bld.emit(SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL,5041fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);5042} else {5043srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);5044bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);50455046srcs[SURFACE_LOGICAL_SRC_ADDRESS] =5047swizzle_nir_scratch_addr(bld, nir_addr, false);50485049bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,5050fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);5051}5052break;5053}50545055case nir_intrinsic_load_subgroup_size:5056/* This should only happen for fragment shaders because every other case5057* is lowered in NIR so we can optimize on it.5058*/5059assert(stage == MESA_SHADER_FRAGMENT);5060bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(dispatch_width));5061break;50625063case nir_intrinsic_load_subgroup_invocation:5064bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),5065nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);5066break;50675068case nir_intrinsic_load_subgroup_eq_mask:5069case nir_intrinsic_load_subgroup_ge_mask:5070case nir_intrinsic_load_subgroup_gt_mask:5071case nir_intrinsic_load_subgroup_le_mask:5072case nir_intrinsic_load_subgroup_lt_mask:5073unreachable("not reached");50745075case nir_intrinsic_vote_any: {5076const fs_builder ubld = bld.exec_all().group(1, 0);50775078/* The any/all predicates do not consider channel enables. To prevent5079* dead channels from affecting the result, we initialize the flag with5080* with the identity value for the logical operation.5081*/5082if (dispatch_width == 32) {5083/* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */5084ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),5085brw_imm_ud(0));5086} else {5087ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0));5088}5089bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);50905091/* For some reason, the any/all predicates don't work properly with5092* SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H5093* doesn't read the correct subset of the flag register and you end up5094* getting garbage in the second half. Work around this by using a pair5095* of 1-wide MOVs and scattering the result.5096*/5097fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);5098ubld.MOV(res1, brw_imm_d(0));5099set_predicate(dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ANY8H :5100dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H :5101BRW_PREDICATE_ALIGN1_ANY32H,5102ubld.MOV(res1, brw_imm_d(-1)));51035104bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));5105break;5106}5107case nir_intrinsic_vote_all: {5108const fs_builder ubld = bld.exec_all().group(1, 0);51095110/* The any/all predicates do not consider channel enables. To prevent5111* dead channels from affecting the result, we initialize the flag with5112* with the identity value for the logical operation.5113*/5114if (dispatch_width == 32) {5115/* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */5116ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),5117brw_imm_ud(0xffffffff));5118} else {5119ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));5120}5121bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);51225123/* For some reason, the any/all predicates don't work properly with5124* SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H5125* doesn't read the correct subset of the flag register and you end up5126* getting garbage in the second half. 
Work around this by using a pair5127* of 1-wide MOVs and scattering the result.5128*/5129fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);5130ubld.MOV(res1, brw_imm_d(0));5131set_predicate(dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ALL8H :5132dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :5133BRW_PREDICATE_ALIGN1_ALL32H,5134ubld.MOV(res1, brw_imm_d(-1)));51355136bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));5137break;5138}5139case nir_intrinsic_vote_feq:5140case nir_intrinsic_vote_ieq: {5141fs_reg value = get_nir_src(instr->src[0]);5142if (instr->intrinsic == nir_intrinsic_vote_feq) {5143const unsigned bit_size = nir_src_bit_size(instr->src[0]);5144value.type = bit_size == 8 ? BRW_REGISTER_TYPE_B :5145brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F);5146}51475148fs_reg uniformized = bld.emit_uniformize(value);5149const fs_builder ubld = bld.exec_all().group(1, 0);51505151/* The any/all predicates do not consider channel enables. To prevent5152* dead channels from affecting the result, we initialize the flag with5153* with the identity value for the logical operation.5154*/5155if (dispatch_width == 32) {5156/* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */5157ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),5158brw_imm_ud(0xffffffff));5159} else {5160ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));5161}5162bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z);51635164/* For some reason, the any/all predicates don't work properly with5165* SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H5166* doesn't read the correct subset of the flag register and you end up5167* getting garbage in the second half. Work around this by using a pair5168* of 1-wide MOVs and scattering the result.5169*/5170fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);5171ubld.MOV(res1, brw_imm_d(0));5172set_predicate(dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ALL8H :5173dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :5174BRW_PREDICATE_ALIGN1_ALL32H,5175ubld.MOV(res1, brw_imm_d(-1)));51765177bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));5178break;5179}51805181case nir_intrinsic_ballot: {5182const fs_reg value = retype(get_nir_src(instr->src[0]),5183BRW_REGISTER_TYPE_UD);5184struct brw_reg flag = brw_flag_reg(0, 0);5185/* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well5186* as f0.0. This is a problem for fragment programs as we currently use5187* f0.1 for discards. Fortunately, we don't support SIMD32 fragment5188* programs yet so this isn't a problem. 
When we do, something will5189* have to change.5190*/5191if (dispatch_width == 32)5192flag.type = BRW_REGISTER_TYPE_UD;51935194bld.exec_all().group(1, 0).MOV(flag, brw_imm_ud(0u));5195bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ);51965197if (instr->dest.ssa.bit_size > 32) {5198dest.type = BRW_REGISTER_TYPE_UQ;5199} else {5200dest.type = BRW_REGISTER_TYPE_UD;5201}5202bld.MOV(dest, flag);5203break;5204}52055206case nir_intrinsic_read_invocation: {5207const fs_reg value = get_nir_src(instr->src[0]);5208const fs_reg invocation = get_nir_src(instr->src[1]);5209fs_reg tmp = bld.vgrf(value.type);52105211bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value,5212bld.emit_uniformize(invocation));52135214bld.MOV(retype(dest, value.type), fs_reg(component(tmp, 0)));5215break;5216}52175218case nir_intrinsic_read_first_invocation: {5219const fs_reg value = get_nir_src(instr->src[0]);5220bld.MOV(retype(dest, value.type), bld.emit_uniformize(value));5221break;5222}52235224case nir_intrinsic_shuffle: {5225const fs_reg value = get_nir_src(instr->src[0]);5226const fs_reg index = get_nir_src(instr->src[1]);52275228bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);5229break;5230}52315232case nir_intrinsic_first_invocation: {5233fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);5234bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);5235bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),5236fs_reg(component(tmp, 0)));5237break;5238}52395240case nir_intrinsic_quad_broadcast: {5241const fs_reg value = get_nir_src(instr->src[0]);5242const unsigned index = nir_src_as_uint(instr->src[1]);52435244bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type),5245value, brw_imm_ud(index), brw_imm_ud(4));5246break;5247}52485249case nir_intrinsic_quad_swap_horizontal: {5250const fs_reg value = get_nir_src(instr->src[0]);5251const fs_reg tmp = bld.vgrf(value.type);5252if (devinfo->ver <= 7) {5253/* The hardware doesn't seem to support these crazy regions with5254* compressed instructions on gfx7 and earlier so we fall back to5255* using quad swizzles. 
Fortunately, we don't support 64-bit5256* anything in Vulkan on gfx7.5257*/5258assert(nir_src_bit_size(instr->src[0]) == 32);5259const fs_builder ubld = bld.exec_all();5260ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,5261brw_imm_ud(BRW_SWIZZLE4(1,0,3,2)));5262bld.MOV(retype(dest, value.type), tmp);5263} else {5264const fs_builder ubld = bld.exec_all().group(dispatch_width / 2, 0);52655266const fs_reg src_left = horiz_stride(value, 2);5267const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2);5268const fs_reg tmp_left = horiz_stride(tmp, 2);5269const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);52705271ubld.MOV(tmp_left, src_right);5272ubld.MOV(tmp_right, src_left);52735274}5275bld.MOV(retype(dest, value.type), tmp);5276break;5277}52785279case nir_intrinsic_quad_swap_vertical: {5280const fs_reg value = get_nir_src(instr->src[0]);5281if (nir_src_bit_size(instr->src[0]) == 32) {5282/* For 32-bit, we can use a SIMD4x2 instruction to do this easily */5283const fs_reg tmp = bld.vgrf(value.type);5284const fs_builder ubld = bld.exec_all();5285ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,5286brw_imm_ud(BRW_SWIZZLE4(2,3,0,1)));5287bld.MOV(retype(dest, value.type), tmp);5288} else {5289/* For larger data types, we have to either emit dispatch_width many5290* MOVs or else fall back to doing indirects.5291*/5292fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);5293bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],5294brw_imm_w(0x2));5295bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);5296}5297break;5298}52995300case nir_intrinsic_quad_swap_diagonal: {5301const fs_reg value = get_nir_src(instr->src[0]);5302if (nir_src_bit_size(instr->src[0]) == 32) {5303/* For 32-bit, we can use a SIMD4x2 instruction to do this easily */5304const fs_reg tmp = bld.vgrf(value.type);5305const fs_builder ubld = bld.exec_all();5306ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,5307brw_imm_ud(BRW_SWIZZLE4(3,2,1,0)));5308bld.MOV(retype(dest, value.type), tmp);5309} else {5310/* For larger data types, we have to either emit dispatch_width many5311* MOVs or else fall back to doing indirects.5312*/5313fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);5314bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],5315brw_imm_w(0x3));5316bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);5317}5318break;5319}53205321case nir_intrinsic_reduce: {5322fs_reg src = get_nir_src(instr->src[0]);5323nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);5324unsigned cluster_size = nir_intrinsic_cluster_size(instr);5325if (cluster_size == 0 || cluster_size > dispatch_width)5326cluster_size = dispatch_width;53275328/* Figure out the source type */5329src.type = brw_type_for_nir_type(devinfo,5330(nir_alu_type)(nir_op_infos[redop].input_types[0] |5331nir_src_bit_size(instr->src[0])));53325333fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);5334opcode brw_op = brw_op_for_nir_reduction_op(redop);5335brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);53365337/* Set up a register for all of our scratching around and initialize it5338* to reduction operation's identity value.5339*/5340fs_reg scan = bld.vgrf(src.type);5341bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);53425343bld.emit_scan(brw_op, scan, cluster_size, cond_mod);53445345dest.type = src.type;5346if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) {5347/* In this case, CLUSTER_BROADCAST instruction isn't needed because5348* the distance between 
clusters is at least 2 GRFs. In this case,5349* we don't need the weird striding of the CLUSTER_BROADCAST5350* instruction and can just do regular MOVs.5351*/5352assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0);5353const unsigned groups =5354(dispatch_width * type_sz(src.type)) / (REG_SIZE * 2);5355const unsigned group_size = dispatch_width / groups;5356for (unsigned i = 0; i < groups; i++) {5357const unsigned cluster = (i * group_size) / cluster_size;5358const unsigned comp = cluster * cluster_size + (cluster_size - 1);5359bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size),5360component(scan, comp));5361}5362} else {5363bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan,5364brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size));5365}5366break;5367}53685369case nir_intrinsic_inclusive_scan:5370case nir_intrinsic_exclusive_scan: {5371fs_reg src = get_nir_src(instr->src[0]);5372nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);53735374/* Figure out the source type */5375src.type = brw_type_for_nir_type(devinfo,5376(nir_alu_type)(nir_op_infos[redop].input_types[0] |5377nir_src_bit_size(instr->src[0])));53785379fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);5380opcode brw_op = brw_op_for_nir_reduction_op(redop);5381brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);53825383/* Set up a register for all of our scratching around and initialize it5384* to reduction operation's identity value.5385*/5386fs_reg scan = bld.vgrf(src.type);5387const fs_builder allbld = bld.exec_all();5388allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);53895390if (instr->intrinsic == nir_intrinsic_exclusive_scan) {5391/* Exclusive scan is a bit harder because we have to do an annoying5392* shift of the contents before we can begin. To make things worse,5393* we can't do this with a normal stride; we have to use indirects.5394*/5395fs_reg shifted = bld.vgrf(src.type);5396fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);5397allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],5398brw_imm_w(-1));5399allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);5400allbld.group(1, 0).MOV(component(shifted, 0), identity);5401scan = shifted;5402}54035404bld.emit_scan(brw_op, scan, dispatch_width, cond_mod);54055406bld.MOV(retype(dest, src.type), scan);5407break;5408}54095410case nir_intrinsic_load_global_block_intel: {5411assert(nir_dest_bit_size(instr->dest) == 32);54125413fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[0]));54145415const fs_builder ubld1 = bld.exec_all().group(1, 0);5416const fs_builder ubld8 = bld.exec_all().group(8, 0);5417const fs_builder ubld16 = bld.exec_all().group(16, 0);54185419const unsigned total = instr->num_components * dispatch_width;5420unsigned loaded = 0;54215422while (loaded < total) {5423const unsigned block =5424choose_oword_block_size_dwords(total - loaded);5425const unsigned block_bytes = block * 4;54265427const fs_builder &ubld = block == 8 ? 
ubld8 : ubld16;5428ubld.emit(SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,5429retype(byte_offset(dest, loaded * 4), BRW_REGISTER_TYPE_UD),5430address,5431fs_reg(), /* No source data */5432brw_imm_ud(block))->size_written = block_bytes;54335434increment_a64_address(ubld1, address, block_bytes);5435loaded += block;5436}54375438assert(loaded == total);5439break;5440}54415442case nir_intrinsic_store_global_block_intel: {5443assert(nir_src_bit_size(instr->src[0]) == 32);54445445fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[1]));5446fs_reg src = get_nir_src(instr->src[0]);54475448const fs_builder ubld1 = bld.exec_all().group(1, 0);5449const fs_builder ubld8 = bld.exec_all().group(8, 0);5450const fs_builder ubld16 = bld.exec_all().group(16, 0);54515452const unsigned total = instr->num_components * dispatch_width;5453unsigned written = 0;54545455while (written < total) {5456const unsigned block =5457choose_oword_block_size_dwords(total - written);54585459const fs_builder &ubld = block == 8 ? ubld8 : ubld16;5460ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL,5461fs_reg(),5462address,5463retype(byte_offset(src, written * 4), BRW_REGISTER_TYPE_UD),5464brw_imm_ud(block));54655466const unsigned block_bytes = block * 4;5467increment_a64_address(ubld1, address, block_bytes);5468written += block;5469}54705471assert(written == total);5472break;5473}54745475case nir_intrinsic_load_shared_block_intel:5476case nir_intrinsic_load_ssbo_block_intel: {5477assert(nir_dest_bit_size(instr->dest) == 32);54785479const bool is_ssbo =5480instr->intrinsic == nir_intrinsic_load_ssbo_block_intel;5481fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[is_ssbo ? 1 : 0]));54825483fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];5484srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ?5485get_nir_ssbo_intrinsic_index(bld, instr) : fs_reg(brw_imm_ud(GFX7_BTI_SLM));5486srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address;54875488const fs_builder ubld1 = bld.exec_all().group(1, 0);5489const fs_builder ubld8 = bld.exec_all().group(8, 0);5490const fs_builder ubld16 = bld.exec_all().group(16, 0);54915492const unsigned total = instr->num_components * dispatch_width;5493unsigned loaded = 0;54945495while (loaded < total) {5496const unsigned block =5497choose_oword_block_size_dwords(total - loaded);5498const unsigned block_bytes = block * 4;54995500srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block);55015502const fs_builder &ubld = block == 8 ? ubld8 : ubld16;5503ubld.emit(SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,5504retype(byte_offset(dest, loaded * 4), BRW_REGISTER_TYPE_UD),5505srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written = block_bytes;55065507ubld1.ADD(address, address, brw_imm_ud(block_bytes));5508loaded += block;5509}55105511assert(loaded == total);5512break;5513}55145515case nir_intrinsic_store_shared_block_intel:5516case nir_intrinsic_store_ssbo_block_intel: {5517assert(nir_src_bit_size(instr->src[0]) == 32);55185519const bool is_ssbo =5520instr->intrinsic == nir_intrinsic_store_ssbo_block_intel;55215522fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[is_ssbo ? 
2 : 1]));5523fs_reg src = get_nir_src(instr->src[0]);55245525fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];5526srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ?5527get_nir_ssbo_intrinsic_index(bld, instr) : fs_reg(brw_imm_ud(GFX7_BTI_SLM));5528srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address;55295530const fs_builder ubld1 = bld.exec_all().group(1, 0);5531const fs_builder ubld8 = bld.exec_all().group(8, 0);5532const fs_builder ubld16 = bld.exec_all().group(16, 0);55335534const unsigned total = instr->num_components * dispatch_width;5535unsigned written = 0;55365537while (written < total) {5538const unsigned block =5539choose_oword_block_size_dwords(total - written);55405541srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block);5542srcs[SURFACE_LOGICAL_SRC_DATA] =5543retype(byte_offset(src, written * 4), BRW_REGISTER_TYPE_UD);55445545const fs_builder &ubld = block == 8 ? ubld8 : ubld16;5546ubld.emit(SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL,5547fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);55485549const unsigned block_bytes = block * 4;5550ubld1.ADD(address, address, brw_imm_ud(block_bytes));5551written += block;5552}55535554assert(written == total);5555break;5556}55575558case nir_intrinsic_load_btd_dss_id_intel:5559bld.emit(SHADER_OPCODE_GET_DSS_ID,5560retype(dest, BRW_REGISTER_TYPE_UD));5561break;55625563case nir_intrinsic_load_btd_stack_id_intel:5564if (stage == MESA_SHADER_COMPUTE) {5565assert(brw_cs_prog_data(prog_data)->uses_btd_stack_ids);5566} else {5567assert(brw_shader_stage_is_bindless(stage));5568}5569/* Stack IDs are always in R1 regardless of whether we're coming from a5570* bindless shader or a regular compute shader.5571*/5572bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),5573retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW));5574break;55755576case nir_intrinsic_btd_spawn_intel:5577if (stage == MESA_SHADER_COMPUTE) {5578assert(brw_cs_prog_data(prog_data)->uses_btd_stack_ids);5579} else {5580assert(brw_shader_stage_is_bindless(stage));5581}5582bld.emit(SHADER_OPCODE_BTD_SPAWN_LOGICAL, bld.null_reg_ud(),5583bld.emit_uniformize(get_nir_src(instr->src[0])),5584get_nir_src(instr->src[1]));5585break;55865587case nir_intrinsic_btd_retire_intel:5588if (stage == MESA_SHADER_COMPUTE) {5589assert(brw_cs_prog_data(prog_data)->uses_btd_stack_ids);5590} else {5591assert(brw_shader_stage_is_bindless(stage));5592}5593bld.emit(SHADER_OPCODE_BTD_RETIRE_LOGICAL);5594break;55955596default:5597unreachable("unknown intrinsic");5598}5599}56005601void5602fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,5603int op, nir_intrinsic_instr *instr)5604{5605/* The BTI untyped atomic messages only support 32-bit atomics. If you5606* just look at the big table of messages in the Vol 7 of the SKL PRM, they5607* appear to exist. 
However, if you look at Vol 2a, there are no message5608* descriptors provided for Qword atomic ops except for A64 messages.5609*/5610assert(nir_dest_bit_size(instr->dest) == 32);56115612fs_reg dest;5613if (nir_intrinsic_infos[instr->intrinsic].has_dest)5614dest = get_nir_dest(instr->dest);56155616fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];5617srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr);5618srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);5619srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);5620srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);5621srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);56225623fs_reg data;5624if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)5625data = get_nir_src(instr->src[2]);56265627if (op == BRW_AOP_CMPWR) {5628fs_reg tmp = bld.vgrf(data.type, 2);5629fs_reg sources[2] = { data, get_nir_src(instr->src[3]) };5630bld.LOAD_PAYLOAD(tmp, sources, 2, 0);5631data = tmp;5632}5633srcs[SURFACE_LOGICAL_SRC_DATA] = data;56345635/* Emit the actual atomic operation */56365637bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,5638dest, srcs, SURFACE_LOGICAL_NUM_SRCS);5639}56405641void5642fs_visitor::nir_emit_ssbo_atomic_float(const fs_builder &bld,5643int op, nir_intrinsic_instr *instr)5644{5645fs_reg dest;5646if (nir_intrinsic_infos[instr->intrinsic].has_dest)5647dest = get_nir_dest(instr->dest);56485649fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];5650srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr);5651srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);5652srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);5653srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);5654srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);56555656fs_reg data = get_nir_src(instr->src[2]);5657if (op == BRW_AOP_FCMPWR) {5658fs_reg tmp = bld.vgrf(data.type, 2);5659fs_reg sources[2] = { data, get_nir_src(instr->src[3]) };5660bld.LOAD_PAYLOAD(tmp, sources, 2, 0);5661data = tmp;5662}5663srcs[SURFACE_LOGICAL_SRC_DATA] = data;56645665/* Emit the actual atomic operation */56665667bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL,5668dest, srcs, SURFACE_LOGICAL_NUM_SRCS);5669}56705671void5672fs_visitor::nir_emit_shared_atomic(const fs_builder &bld,5673int op, nir_intrinsic_instr *instr)5674{5675fs_reg dest;5676if (nir_intrinsic_infos[instr->intrinsic].has_dest)5677dest = get_nir_dest(instr->dest);56785679fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];5680srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM);5681srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);5682srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);5683srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);56845685fs_reg data;5686if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)5687data = get_nir_src(instr->src[1]);5688if (op == BRW_AOP_CMPWR) {5689fs_reg tmp = bld.vgrf(data.type, 2);5690fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };5691bld.LOAD_PAYLOAD(tmp, sources, 2, 0);5692data = tmp;5693}5694srcs[SURFACE_LOGICAL_SRC_DATA] = data;56955696/* Get the offset */5697if (nir_src_is_const(instr->src[0])) {5698srcs[SURFACE_LOGICAL_SRC_ADDRESS] =5699brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0]));5700} else {5701srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type);5702bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],5703retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),5704brw_imm_ud(instr->const_index[0]));5705}57065707/* Emit the actual atomic operation operation 
*/57085709bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,5710dest, srcs, SURFACE_LOGICAL_NUM_SRCS);5711}57125713void5714fs_visitor::nir_emit_shared_atomic_float(const fs_builder &bld,5715int op, nir_intrinsic_instr *instr)5716{5717fs_reg dest;5718if (nir_intrinsic_infos[instr->intrinsic].has_dest)5719dest = get_nir_dest(instr->dest);57205721fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];5722srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM);5723srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);5724srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);5725srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);57265727fs_reg data = get_nir_src(instr->src[1]);5728if (op == BRW_AOP_FCMPWR) {5729fs_reg tmp = bld.vgrf(data.type, 2);5730fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };5731bld.LOAD_PAYLOAD(tmp, sources, 2, 0);5732data = tmp;5733}5734srcs[SURFACE_LOGICAL_SRC_DATA] = data;57355736/* Get the offset */5737if (nir_src_is_const(instr->src[0])) {5738srcs[SURFACE_LOGICAL_SRC_ADDRESS] =5739brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0]));5740} else {5741srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type);5742bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],5743retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),5744brw_imm_ud(instr->const_index[0]));5745}57465747/* Emit the actual atomic operation operation */57485749bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL,5750dest, srcs, SURFACE_LOGICAL_NUM_SRCS);5751}57525753static fs_reg5754expand_to_32bit(const fs_builder &bld, const fs_reg &src)5755{5756if (type_sz(src.type) == 2) {5757fs_reg src32 = bld.vgrf(BRW_REGISTER_TYPE_UD);5758bld.MOV(src32, retype(src, BRW_REGISTER_TYPE_UW));5759return src32;5760} else {5761return src;5762}5763}57645765void5766fs_visitor::nir_emit_global_atomic(const fs_builder &bld,5767int op, nir_intrinsic_instr *instr)5768{5769fs_reg dest;5770if (nir_intrinsic_infos[instr->intrinsic].has_dest)5771dest = get_nir_dest(instr->dest);57725773fs_reg addr = get_nir_src(instr->src[0]);57745775fs_reg data;5776if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)5777data = expand_to_32bit(bld, get_nir_src(instr->src[1]));57785779if (op == BRW_AOP_CMPWR) {5780fs_reg tmp = bld.vgrf(data.type, 2);5781fs_reg sources[2] = {5782data,5783expand_to_32bit(bld, get_nir_src(instr->src[2]))5784};5785bld.LOAD_PAYLOAD(tmp, sources, 2, 0);5786data = tmp;5787}57885789switch (nir_dest_bit_size(instr->dest)) {5790case 16: {5791fs_reg dest32 = bld.vgrf(BRW_REGISTER_TYPE_UD);5792bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL,5793dest32, addr, data, brw_imm_ud(op));5794bld.MOV(retype(dest, BRW_REGISTER_TYPE_UW), dest32);5795break;5796}5797case 32:5798bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,5799dest, addr, data, brw_imm_ud(op));5800break;5801case 64:5802bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL,5803dest, addr, data, brw_imm_ud(op));5804break;5805default:5806unreachable("Unsupported bit size");5807}5808}58095810void5811fs_visitor::nir_emit_global_atomic_float(const fs_builder &bld,5812int op, nir_intrinsic_instr *instr)5813{5814assert(nir_intrinsic_infos[instr->intrinsic].has_dest);5815fs_reg dest = get_nir_dest(instr->dest);58165817fs_reg addr = get_nir_src(instr->src[0]);58185819assert(op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC);5820fs_reg data = expand_to_32bit(bld, get_nir_src(instr->src[1]));58215822if (op == BRW_AOP_FCMPWR) {5823fs_reg tmp = bld.vgrf(data.type, 2);5824fs_reg sources[2] = {5825data,5826expand_to_32bit(bld, 
get_nir_src(instr->src[2]))5827};5828bld.LOAD_PAYLOAD(tmp, sources, 2, 0);5829data = tmp;5830}58315832switch (nir_dest_bit_size(instr->dest)) {5833case 16: {5834fs_reg dest32 = bld.vgrf(BRW_REGISTER_TYPE_UD);5835bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL,5836dest32, addr, data, brw_imm_ud(op));5837bld.MOV(retype(dest, BRW_REGISTER_TYPE_UW), dest32);5838break;5839}5840case 32:5841bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL,5842dest, addr, data, brw_imm_ud(op));5843break;5844default:5845unreachable("Unsupported bit size");5846}5847}58485849void5850fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)5851{5852unsigned texture = instr->texture_index;5853unsigned sampler = instr->sampler_index;58545855fs_reg srcs[TEX_LOGICAL_NUM_SRCS];58565857srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture);5858srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(sampler);58595860int lod_components = 0;58615862/* The hardware requires a LOD for buffer textures */5863if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)5864srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0);58655866uint32_t header_bits = 0;5867for (unsigned i = 0; i < instr->num_srcs; i++) {5868fs_reg src = get_nir_src(instr->src[i].src);5869switch (instr->src[i].src_type) {5870case nir_tex_src_bias:5871srcs[TEX_LOGICAL_SRC_LOD] =5872retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);5873break;5874case nir_tex_src_comparator:5875srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F);5876break;5877case nir_tex_src_coord:5878switch (instr->op) {5879case nir_texop_txf:5880case nir_texop_txf_ms:5881case nir_texop_txf_ms_mcs:5882case nir_texop_samples_identical:5883srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D);5884break;5885default:5886srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F);5887break;5888}58895890/* Wa_14013363432:5891*5892* Compiler should send U,V,R parameters even if V,R are 0.5893*/5894if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && devinfo->verx10 == 125)5895assert(instr->coord_components == 3u + instr->is_array);5896break;5897case nir_tex_src_ddx:5898srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F);5899lod_components = nir_tex_instr_src_size(instr, i);5900break;5901case nir_tex_src_ddy:5902srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F);5903break;5904case nir_tex_src_lod:5905switch (instr->op) {5906case nir_texop_txs:5907srcs[TEX_LOGICAL_SRC_LOD] =5908retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD);5909break;5910case nir_texop_txf:5911srcs[TEX_LOGICAL_SRC_LOD] =5912retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D);5913break;5914default:5915srcs[TEX_LOGICAL_SRC_LOD] =5916retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);5917break;5918}5919break;5920case nir_tex_src_min_lod:5921srcs[TEX_LOGICAL_SRC_MIN_LOD] =5922retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);5923break;5924case nir_tex_src_ms_index:5925srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD);5926break;59275928case nir_tex_src_offset: {5929uint32_t offset_bits = 0;5930if (brw_texture_offset(instr, i, &offset_bits)) {5931header_bits |= offset_bits;5932} else {5933srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =5934retype(src, BRW_REGISTER_TYPE_D);5935}5936break;5937}59385939case nir_tex_src_projector:5940unreachable("should be lowered");59415942case nir_tex_src_texture_offset: {5943/* Emit code to evaluate the actual indexing expression */5944fs_reg tmp = 
vgrf(glsl_type::uint_type);5945bld.ADD(tmp, src, brw_imm_ud(texture));5946srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp);5947break;5948}59495950case nir_tex_src_sampler_offset: {5951/* Emit code to evaluate the actual indexing expression */5952fs_reg tmp = vgrf(glsl_type::uint_type);5953bld.ADD(tmp, src, brw_imm_ud(sampler));5954srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp);5955break;5956}59575958case nir_tex_src_texture_handle:5959assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1);5960srcs[TEX_LOGICAL_SRC_SURFACE] = fs_reg();5961srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = bld.emit_uniformize(src);5962break;59635964case nir_tex_src_sampler_handle:5965assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);5966srcs[TEX_LOGICAL_SRC_SAMPLER] = fs_reg();5967srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = bld.emit_uniformize(src);5968break;59695970case nir_tex_src_ms_mcs:5971assert(instr->op == nir_texop_txf_ms);5972srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D);5973break;59745975case nir_tex_src_plane: {5976const uint32_t plane = nir_src_as_uint(instr->src[i].src);5977const uint32_t texture_index =5978instr->texture_index +5979stage_prog_data->binding_table.plane_start[plane] -5980stage_prog_data->binding_table.texture_start;59815982srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture_index);5983break;5984}59855986default:5987unreachable("unknown texture source");5988}5989}59905991if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&5992(instr->op == nir_texop_txf_ms ||5993instr->op == nir_texop_samples_identical)) {5994if (devinfo->ver >= 7 &&5995key_tex->compressed_multisample_layout_mask & (1 << texture)) {5996srcs[TEX_LOGICAL_SRC_MCS] =5997emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE],5998instr->coord_components,5999srcs[TEX_LOGICAL_SRC_SURFACE],6000srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE]);6001} else {6002srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u);6003}6004}60056006srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);6007srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);60086009enum opcode opcode;6010switch (instr->op) {6011case nir_texop_tex:6012opcode = SHADER_OPCODE_TEX_LOGICAL;6013break;6014case nir_texop_txb:6015opcode = FS_OPCODE_TXB_LOGICAL;6016break;6017case nir_texop_txl:6018opcode = SHADER_OPCODE_TXL_LOGICAL;6019break;6020case nir_texop_txd:6021opcode = SHADER_OPCODE_TXD_LOGICAL;6022break;6023case nir_texop_txf:6024opcode = SHADER_OPCODE_TXF_LOGICAL;6025break;6026case nir_texop_txf_ms:6027if ((key_tex->msaa_16 & (1 << sampler)))6028opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;6029else6030opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;6031break;6032case nir_texop_txf_ms_mcs:6033opcode = SHADER_OPCODE_TXF_MCS_LOGICAL;6034break;6035case nir_texop_query_levels:6036case nir_texop_txs:6037opcode = SHADER_OPCODE_TXS_LOGICAL;6038break;6039case nir_texop_lod:6040opcode = SHADER_OPCODE_LOD_LOGICAL;6041break;6042case nir_texop_tg4:6043if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE)6044opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL;6045else6046opcode = SHADER_OPCODE_TG4_LOGICAL;6047break;6048case nir_texop_texture_samples:6049opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL;6050break;6051case nir_texop_samples_identical: {6052fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D);60536054/* If mcs is an immediate value, it means there is no MCS. 
In that case6055* just return false.6056*/6057if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) {6058bld.MOV(dst, brw_imm_ud(0u));6059} else if ((key_tex->msaa_16 & (1 << sampler))) {6060fs_reg tmp = vgrf(glsl_type::uint_type);6061bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS],6062offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1));6063bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ);6064} else {6065bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u),6066BRW_CONDITIONAL_EQ);6067}6068return;6069}6070default:6071unreachable("unknown texture opcode");6072}60736074if (instr->op == nir_texop_tg4) {6075if (instr->component == 1 &&6076key_tex->gather_channel_quirk_mask & (1 << texture)) {6077/* gather4 sampler is broken for green channel on RG32F --6078* we must ask for blue instead.6079*/6080header_bits |= 2 << 16;6081} else {6082header_bits |= instr->component << 16;6083}6084}60856086fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4);6087fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));6088inst->offset = header_bits;60896090const unsigned dest_size = nir_tex_instr_dest_size(instr);6091if (devinfo->ver >= 9 &&6092instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) {6093unsigned write_mask = instr->dest.is_ssa ?6094nir_ssa_def_components_read(&instr->dest.ssa):6095(1 << dest_size) - 1;6096assert(write_mask != 0); /* dead code should have been eliminated */6097inst->size_written = util_last_bit(write_mask) *6098inst->dst.component_size(inst->exec_size);6099} else {6100inst->size_written = 4 * inst->dst.component_size(inst->exec_size);6101}61026103if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)6104inst->shadow_compare = true;61056106if (instr->op == nir_texop_tg4 && devinfo->ver == 6)6107emit_gfx6_gather_wa(key_tex->gfx6_gather_wa[texture], dst);61086109fs_reg nir_dest[5];6110for (unsigned i = 0; i < dest_size; i++)6111nir_dest[i] = offset(dst, bld, i);61126113if (instr->op == nir_texop_query_levels) {6114/* # levels is in .w */6115if (devinfo->ver <= 9) {6116/**6117* Wa_1940217:6118*6119* When a surface of type SURFTYPE_NULL is accessed by resinfo, the6120* MIPCount returned is undefined instead of 0.6121*/6122fs_inst *mov = bld.MOV(bld.null_reg_d(), dst);6123mov->conditional_mod = BRW_CONDITIONAL_NZ;6124nir_dest[0] = bld.vgrf(BRW_REGISTER_TYPE_D);6125fs_inst *sel = bld.SEL(nir_dest[0], offset(dst, bld, 3), brw_imm_d(0));6126sel->predicate = BRW_PREDICATE_NORMAL;6127} else {6128nir_dest[0] = offset(dst, bld, 3);6129}6130} else if (instr->op == nir_texop_txs &&6131dest_size >= 3 && devinfo->ver < 7) {6132/* Gfx4-6 return 0 instead of 1 for single layer surfaces. */6133fs_reg depth = offset(dst, bld, 2);6134nir_dest[2] = vgrf(glsl_type::int_type);6135bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE);6136}61376138bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0);6139}61406141void6142fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)6143{6144switch (instr->type) {6145case nir_jump_break:6146bld.emit(BRW_OPCODE_BREAK);6147break;6148case nir_jump_continue:6149bld.emit(BRW_OPCODE_CONTINUE);6150break;6151case nir_jump_halt:6152bld.emit(BRW_OPCODE_HALT);6153break;6154case nir_jump_return:6155default:6156unreachable("unknown jump");6157}6158}61596160/*6161* This helper takes a source register and un/shuffles it into the destination6162* register.6163*6164* If source type size is smaller than destination type size the operation6165* needed is a component shuffle. 
The opposite case would be an unshuffle. If6166* source/destination type size is equal a shuffle is done that would be6167* equivalent to a simple MOV.6168*6169* For example, if source is a 16-bit type and destination is 32-bit. A 36170* components .xyz 16-bit vector on SIMD8 would be.6171*6172* |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|6173* |z1|z2|z3|z4|z5|z6|z7|z8| | | | | | | | |6174*6175* This helper will return the following 2 32-bit components with the 16-bit6176* values shuffled:6177*6178* |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|6179* |z1 |z2 |z3 |z4 |z5 |z6 |z7 |z8 |6180*6181* For unshuffle, the example would be the opposite, a 64-bit type source6182* and a 32-bit destination. A 2 component .xy 64-bit vector on SIMD86183* would be:6184*6185* | x1l x1h | x2l x2h | x3l x3h | x4l x4h |6186* | x5l x5h | x6l x6h | x7l x7h | x8l x8h |6187* | y1l y1h | y2l y2h | y3l y3h | y4l y4h |6188* | y5l y5h | y6l y6h | y7l y7h | y8l y8h |6189*6190* The returned result would be the following 4 32-bit components unshuffled:6191*6192* | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |6193* | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |6194* | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |6195* | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |6196*6197* - Source and destination register must not be overlapped.6198* - components units are measured in terms of the smaller type between6199* source and destination because we are un/shuffling the smaller6200* components from/into the bigger ones.6201* - first_component parameter allows skipping source components.6202*/6203void6204shuffle_src_to_dst(const fs_builder &bld,6205const fs_reg &dst,6206const fs_reg &src,6207uint32_t first_component,6208uint32_t components)6209{6210if (type_sz(src.type) == type_sz(dst.type)) {6211assert(!regions_overlap(dst,6212type_sz(dst.type) * bld.dispatch_width() * components,6213offset(src, bld, first_component),6214type_sz(src.type) * bld.dispatch_width() * components));6215for (unsigned i = 0; i < components; i++) {6216bld.MOV(retype(offset(dst, bld, i), src.type),6217offset(src, bld, i + first_component));6218}6219} else if (type_sz(src.type) < type_sz(dst.type)) {6220/* Source is shuffled into destination */6221unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);6222assert(!regions_overlap(dst,6223type_sz(dst.type) * bld.dispatch_width() *6224DIV_ROUND_UP(components, size_ratio),6225offset(src, bld, first_component),6226type_sz(src.type) * bld.dispatch_width() * components));62276228brw_reg_type shuffle_type =6229brw_reg_type_from_bit_size(8 * type_sz(src.type),6230BRW_REGISTER_TYPE_D);6231for (unsigned i = 0; i < components; i++) {6232fs_reg shuffle_component_i =6233subscript(offset(dst, bld, i / size_ratio),6234shuffle_type, i % size_ratio);6235bld.MOV(shuffle_component_i,6236retype(offset(src, bld, i + first_component), shuffle_type));6237}6238} else {6239/* Source is unshuffled into destination */6240unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);6241assert(!regions_overlap(dst,6242type_sz(dst.type) * bld.dispatch_width() * components,6243offset(src, bld, first_component / size_ratio),6244type_sz(src.type) * bld.dispatch_width() *6245DIV_ROUND_UP(components + (first_component % size_ratio),6246size_ratio)));62476248brw_reg_type shuffle_type =6249brw_reg_type_from_bit_size(8 * type_sz(dst.type),6250BRW_REGISTER_TYPE_D);6251for (unsigned i = 0; i < components; i++) {6252fs_reg shuffle_component_i =6253subscript(offset(src, bld, (first_component + i) / size_ratio),6254shuffle_type, 
                      (first_component + i) % size_ratio);
         bld.MOV(retype(offset(dst, bld, i), shuffle_type),
                 shuffle_component_i);
      }
   }
}

void
shuffle_from_32bit_read(const fs_builder &bld,
                        const fs_reg &dst,
                        const fs_reg &src,
                        uint32_t first_component,
                        uint32_t components)
{
   assert(type_sz(src.type) == 4);

   /* This function takes components in units of the destination type while
    * shuffle_src_to_dst takes components in units of the smallest type.
    */
   if (type_sz(dst.type) > 4) {
      assert(type_sz(dst.type) == 8);
      first_component *= 2;
      components *= 2;
   }

   shuffle_src_to_dst(bld, dst, src, first_component, components);
}

fs_reg
setup_imm_df(const fs_builder &bld, double v)
{
   const struct intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->ver >= 7);

   if (devinfo->ver >= 8)
      return brw_imm_df(v);

   /* gfx7.5 does not support DF immediates directly, but the DIM
    * instruction allows setting the 64-bit immediate value.
    */
   if (devinfo->is_haswell) {
      const fs_builder ubld = bld.exec_all().group(1, 0);
      fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1);
      ubld.DIM(dst, brw_imm_df(v));
      return component(dst, 0);
   }

   /* gfx7 does not support DF immediates, so we generate a 64-bit constant by
    * writing the low 32 bits of the constant to suboffset 0 of a VGRF and
    * the high 32 bits to suboffset 4 and then applying a stride of 0.
    *
    * Alternatively, we could produce a normal VGRF (without stride 0)
    * by writing to all the channels in the VGRF, however, that would hit the
    * gfx7 bug where we have to split writes that span more than 1 register
    * into instructions with a width of 4 (otherwise the write to the second
    * register written runs into an execmask hardware bug) which isn't very
    * nice.
    */
   union {
      double d;
      struct {
         uint32_t i1;
         uint32_t i2;
      };
   } di;

   di.d = v;

   const fs_builder ubld = bld.exec_all().group(1, 0);
   const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
   ubld.MOV(tmp, brw_imm_ud(di.i1));
   ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2));

   return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0);
}

fs_reg
setup_imm_b(const fs_builder &bld, int8_t v)
{
   const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_B);
   bld.MOV(tmp, brw_imm_w(v));
   return tmp;
}

fs_reg
setup_imm_ub(const fs_builder &bld, uint8_t v)
{
   const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UB);
   bld.MOV(tmp, brw_imm_uw(v));
   return tmp;
}
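/* Illustrative sketch, not part of the driver: why the vote_any/vote_all/
 * vote_ieq lowering earlier in this file seeds the flag register with the
 * identity value of the reduction before the CMP.  Channels disabled by the
 * execution mask never write their flag bit, so whatever value they inherit
 * must not change the result: 0 for the OR-style "any" predicates, all ones
 * for the AND-style "all" predicates.  The exec_mask/channel_predicate
 * bitmasks below are stand-ins for the hardware state (one bit per channel),
 * simplified to a full 32-bit flag regardless of dispatch width.
 */
static inline bool
sketch_vote_any(unsigned exec_mask, unsigned channel_predicate)
{
   unsigned flag = 0;                                  /* identity for OR  */
   flag = (flag & ~exec_mask) | (channel_predicate & exec_mask);
   return flag != 0;                                   /* ANY8H/16H/32H    */
}

static inline bool
sketch_vote_all(unsigned exec_mask, unsigned channel_predicate)
{
   unsigned flag = ~0u;                                /* identity for AND */
   flag = (flag & ~exec_mask) | (channel_predicate & exec_mask);
   return flag == ~0u;                                 /* ALL8H/16H/32H    */
}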
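/* Illustrative sketch, not driver code: the shape of the *_block_intel
 * load/store loops above, which chop num_components * dispatch_width dwords
 * into a series of OWord block messages while bumping the address by
 * block * 4 bytes per message.  choose_oword_block_size_dwords() is the real
 * helper; based only on the 8-wide/16-wide builder selection in those loops,
 * this sketch assumes it returns a 16-dword block when at least 16 dwords
 * remain and an 8-dword block otherwise.
 */
static inline unsigned
sketch_split_into_oword_blocks(unsigned total_dwords,
                               unsigned *block_sizes, unsigned max_blocks)
{
   unsigned n = 0, done = 0;
   while (done < total_dwords && n < max_blocks) {
      const unsigned block = (total_dwords - done) >= 16 ? 16 : 8;
      block_sizes[n++] = block;   /* one block read/write of `block` dwords */
      done += block;              /* address advances by block * 4 bytes    */
   }
   return n;                      /* e.g. 40 dwords -> blocks {16, 16, 8}   */
}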
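/* Illustrative sketch, not driver code, of the per-channel packing described
 * in the shuffle_src_to_dst() comment above: when shuffling, the i-th small
 * component lands in subword (i % size_ratio) of dword (i / size_ratio), and
 * unshuffling is the inverse.  Shown for the 16-bit -> 32-bit and
 * 64-bit -> 32-bit cases with plain scalars standing in for one SIMD channel,
 * assuming subscript index 0 corresponds to the low-order subword.
 */
static inline unsigned
sketch_shuffle_16_to_32(unsigned short comp_even, unsigned short comp_odd)
{
   /* components 2k and 2k+1 of the 16-bit vector share one 32-bit slot */
   return (unsigned)comp_even | ((unsigned)comp_odd << 16);
}

static inline void
sketch_unshuffle_64_to_32(unsigned long long v64,
                          unsigned *low_dword, unsigned *high_dword)
{
   *low_dword  = (unsigned)(v64 & 0xffffffffu);
   *high_dword = (unsigned)(v64 >> 32);
}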