/* Path: blob/21.2-virgl/src/broadcom/compiler/v3d_nir_lower_io.c */
/*
 * Copyright © 2015 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "compiler/v3d_compiler.h"
#include "compiler/nir/nir_builder.h"

/**
 * Walks the NIR generated by TGSI-to-NIR or GLSL-to-NIR to lower its io
 * intrinsics into something amenable to the V3D architecture.
 *
 * Most of the work is turning the VS's store_output intrinsics from working
 * on a base representing the gallium-level vec4 driver_location to an offset
 * within the VPM, and emitting the header that's read by the fixed function
 * hardware between the VS and FS.
 *
 * We also adjust the offsets on uniform loads to be in bytes, since that's
 * what we need for indirect addressing with general TMU access.
 */

/* Per-shader state for this pass: the VPM slot assigned to each
 * fixed-function output (-1 when that output is not present), plus
 * bookkeeping for geometry-shader vertex emission.
 */
struct v3d_nir_lower_io_state {
        int pos_vpm_offset;
        int vp_vpm_offset;
        int zs_vpm_offset;
        int rcp_wc_vpm_offset;
        int psiz_vpm_offset;
        int varyings_vpm_offset;

        /* Geometry shader state */
        struct {
                /* VPM offset for the current vertex data output */
                nir_variable *output_offset_var;
                /* VPM offset for the current vertex header */
                nir_variable *header_offset_var;
                /* VPM header for the current vertex */
                nir_variable *header_var;

                /* Size of the complete VPM output header */
                uint32_t output_header_size;
                /* Size of the output data for a single vertex */
                uint32_t output_vertex_data_size;
        } gs;

        /* Bitset of VPM varying slots actually written by the shader; used
         * at the end to zero-fill slots the FS reads but we never stored.
         */
        BITSET_WORD varyings_stored[BITSET_WORDS(V3D_MAX_ANY_STAGE_INPUTS)];

        /* Components of gl_Position, captured as they are stored so the
         * fixed-function outputs (clip/viewport/Z/1/W) can be derived.
         */
        nir_ssa_def *pos[4];
};

static void
v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
                            struct v3d_nir_lower_io_state *state);

/* Emits a single scalar store_output at VPM offset base (+ optional dynamic
 * offset, used by GS for the current-vertex VPM position).
 */
static void
v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *offset,
                     nir_ssa_def *chan)
{
        if (offset) {
                /* When generating the VIR instruction, the base and the offset
                 * are just going to get added together with an ADD instruction
                 * so we might as well do the add here at the NIR level instead
                 * and let the constant folding do its magic.
                 */
                offset = nir_iadd_imm(b, offset, base);
                base = 0;
        } else {
                offset = nir_imm_int(b, 0);
        }

        nir_store_output(b, chan, offset,
                         .base = base, .write_mask = 0x1, .component = 0);
}

/* Convert the uniform offset to bytes. If it happens to be a constant,
 * constant-folding will clean up the shift for us.
 */
static void
v3d_nir_lower_uniform(struct v3d_compile *c, nir_builder *b,
                      nir_intrinsic_instr *intr)
{
        /* On SPIR-V/Vulkan we are already getting our offsets in
         * bytes.
         */
        if (c->key->environment == V3D_ENVIRONMENT_VULKAN)
                return;

        b->cursor = nir_before_instr(&intr->instr);

        /* Scale base and offset from vec4 units to bytes (x16). */
        nir_intrinsic_set_base(intr, nir_intrinsic_base(intr) * 16);

        nir_instr_rewrite_src(&intr->instr,
                              &intr->src[0],
                              nir_src_for_ssa(nir_ishl(b, intr->src[0].ssa,
                                                       nir_imm_int(b, 4))));
}

/* Returns the index of (location, component) in the shader key's list of
 * outputs consumed by the next stage, or -1 if the next stage never reads it.
 * That index is the varying's scalar slot in the VPM.
 */
static int
v3d_varying_slot_vpm_offset(struct v3d_compile *c, unsigned location, unsigned component)
{
        uint32_t num_used_outputs = 0;
        struct v3d_varying_slot *used_outputs = NULL;
        switch (c->s->info.stage) {
        case MESA_SHADER_VERTEX:
                num_used_outputs = c->vs_key->num_used_outputs;
                used_outputs = c->vs_key->used_outputs;
                break;
        case MESA_SHADER_GEOMETRY:
                num_used_outputs = c->gs_key->num_used_outputs;
                used_outputs = c->gs_key->used_outputs;
                break;
        default:
                unreachable("Unsupported shader stage");
        }

        for (int i = 0; i < num_used_outputs; i++) {
                struct v3d_varying_slot slot = used_outputs[i];

                if (v3d_slot_get_slot(slot) == location &&
                    v3d_slot_get_component(slot) == component) {
                        return i;
                }
        }

        return -1;
}

/* Lowers a store_output(gallium driver location) to a series of store_outputs
 * with a driver_location equal to the offset in the VPM.
 *
 * For geometry shaders we need to emit multiple vertices so the VPM offsets
 * need to be computed in the shader code based on the current vertex index.
 */
static void
v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b,
                         nir_intrinsic_instr *intr,
                         struct v3d_nir_lower_io_state *state)
{
        b->cursor = nir_before_instr(&intr->instr);

        /* If this is a geometry shader we need to emit our outputs
         * to the current vertex offset in the VPM.
         */
        nir_ssa_def *offset_reg =
                c->s->info.stage == MESA_SHADER_GEOMETRY ?
                        nir_load_var(b, state->gs.output_offset_var) : NULL;

        int start_comp = nir_intrinsic_component(intr);
        unsigned location = nir_intrinsic_io_semantics(intr).location;
        nir_ssa_def *src = nir_ssa_for_src(b, intr->src[0],
                                           intr->num_components);
        /* Save off the components of the position for the setup of VPM inputs
         * read by fixed function HW.
         */
        if (location == VARYING_SLOT_POS) {
                for (int i = 0; i < intr->num_components; i++) {
                        state->pos[start_comp + i] = nir_channel(b, src, i);
                }
        }

        /* Just psiz goes to the position in the FF header right now. */
        if (location == VARYING_SLOT_PSIZ &&
            state->psiz_vpm_offset != -1) {
                v3d_nir_store_output(b, state->psiz_vpm_offset, offset_reg, src);
        }

        if (location == VARYING_SLOT_LAYER) {
                assert(c->s->info.stage == MESA_SHADER_GEOMETRY);
                nir_ssa_def *header = nir_load_var(b, state->gs.header_var);
                /* Clear the layer field (bits 16-23) before inserting. */
                header = nir_iand(b, header, nir_imm_int(b, 0xff00ffff));

                /* From the GLES 3.2 spec:
                 *
                 *    "When fragments are written to a layered framebuffer, the
                 *     fragment’s layer number selects an image from the array
                 *     of images at each attachment (...). If the fragment’s
                 *     layer number is negative, or greater than or equal to
                 *     the minimum number of layers of any attachment, the
                 *     effects of the fragment on the framebuffer contents are
                 *     undefined."
                 *
                 * This suggests we can just ignore that situation, however,
                 * for V3D an out-of-bounds layer index means that the binner
                 * might do out-of-bounds writes access to the tile state. The
                 * simulator has an assert to catch this, so we play safe here
                 * and we make sure that doesn't happen by setting gl_Layer
                 * to 0 in that case (we always allocate tile state for at
                 * least one layer).
                 */
                nir_ssa_def *fb_layers = nir_load_fb_layers_v3d(b, 32);
                nir_ssa_def *cond = nir_ige(b, src, fb_layers);
                nir_ssa_def *layer_id =
                        nir_bcsel(b, cond,
                                  nir_imm_int(b, 0),
                                  nir_ishl(b, src, nir_imm_int(b, 16)));
                header = nir_ior(b, header, layer_id);
                nir_store_var(b, state->gs.header_var, header, 0x1);
        }

        /* Scalarize outputs if it hasn't happened already, since we want to
         * schedule each VPM write individually. We can skip any output
         * components not read by the FS.
         */
        for (int i = 0; i < intr->num_components; i++) {
                int vpm_offset =
                        v3d_varying_slot_vpm_offset(c, location, start_comp + i);


                if (vpm_offset == -1)
                        continue;

                /* Constant array index on the output: advance by whole vec4
                 * slots (4 scalars each).
                 */
                if (nir_src_is_const(intr->src[1]))
                        vpm_offset += nir_src_as_uint(intr->src[1]) * 4;

                BITSET_SET(state->varyings_stored, vpm_offset);

                v3d_nir_store_output(b, state->varyings_vpm_offset + vpm_offset,
                                     offset_reg, nir_channel(b, src, i));
        }

        nir_instr_remove(&intr->instr);
}

/* Initializes the per-vertex GS header: New Primitive bit set and the vertex
 * data length in bits 8+. Called at shader start and after each EndPrimitive.
 */
static inline void
reset_gs_header(nir_builder *b, struct v3d_nir_lower_io_state *state)
{
        const uint8_t NEW_PRIMITIVE_OFFSET = 0;
        const uint8_t VERTEX_DATA_LENGTH_OFFSET = 8;

        uint32_t vertex_data_size = state->gs.output_vertex_data_size;
        /* The length field is 8 bits wide, so the per-vertex data must fit. */
        assert((vertex_data_size & 0xffffff00) == 0);

        uint32_t header;
        header = 1 << NEW_PRIMITIVE_OFFSET;
        header |= vertex_data_size << VERTEX_DATA_LENGTH_OFFSET;
        nir_store_var(b, state->gs.header_var, nir_imm_int(b, header), 0x1);
}

/* Lowers emit_vertex: writes the FF outputs and the vertex header for the
 * current vertex, then advances the VPM data/header offsets for the next one.
 */
static void
v3d_nir_lower_emit_vertex(struct v3d_compile *c, nir_builder *b,
                          nir_intrinsic_instr *instr,
                          struct v3d_nir_lower_io_state *state)
{
        b->cursor = nir_before_instr(&instr->instr);

        nir_ssa_def *header = nir_load_var(b, state->gs.header_var);
        nir_ssa_def *header_offset = nir_load_var(b, state->gs.header_offset_var);
        nir_ssa_def *output_offset = nir_load_var(b, state->gs.output_offset_var);

        /* Emit fixed function outputs */
        v3d_nir_emit_ff_vpm_outputs(c, b, state);

        /* Emit vertex header */
        v3d_nir_store_output(b, 0, header_offset, header);

        /* Update VPM offset for next vertex output data and header */
        output_offset =
                nir_iadd(b, output_offset,
                         nir_imm_int(b, state->gs.output_vertex_data_size));

        header_offset = nir_iadd(b, header_offset, nir_imm_int(b, 1));

        /* Reset the New Primitive bit */
        header = nir_iand(b, header, nir_imm_int(b, 0xfffffffe));

        nir_store_var(b, state->gs.output_offset_var, output_offset, 0x1);
        nir_store_var(b, state->gs.header_offset_var, header_offset, 0x1);
        nir_store_var(b, state->gs.header_var, header, 0x1);

        nir_instr_remove(&instr->instr);
}

/* Lowers end_primitive by re-arming the New Primitive bit in the GS header
 * so the next emitted vertex starts a new primitive.
 */
static void
v3d_nir_lower_end_primitive(struct v3d_compile *c, nir_builder *b,
                            nir_intrinsic_instr *instr,
                            struct v3d_nir_lower_io_state *state)
{
        assert(state->gs.header_var);
        b->cursor = nir_before_instr(&instr->instr);
        reset_gs_header(b, state);

        nir_instr_remove(&instr->instr);
}

/* Some vertex attribute formats may require to apply a swizzle but the hardware
 * doesn't provide means to do that, so we need to apply the swizzle in the
 * vertex shader.
 *
 * This is required at least in Vulkan to support mandatory vertex attribute
 * format VK_FORMAT_B8G8R8A8_UNORM.
 */
static void
v3d_nir_lower_vertex_input(struct v3d_compile *c, nir_builder *b,
                           nir_intrinsic_instr *instr)
{
        assert(c->s->info.stage == MESA_SHADER_VERTEX);

        if (!c->vs_key->va_swap_rb_mask)
                return;

        const uint32_t location = nir_intrinsic_io_semantics(instr).location;

        if (!(c->vs_key->va_swap_rb_mask & (1 << location)))
                return;

        /* Swap R (component 0) and B (component 2); G and A are untouched. */
        assert(instr->num_components == 1);
        const uint32_t comp = nir_intrinsic_component(instr);
        if (comp == 0 || comp == 2)
                nir_intrinsic_set_component(instr, (comp + 2) % 4);
}

/* Dispatches a single instruction to the appropriate lowering above. */
static void
v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b,
                       struct nir_instr *instr,
                       struct v3d_nir_lower_io_state *state)
{
        if (instr->type != nir_instr_type_intrinsic)
                return;
        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

        switch (intr->intrinsic) {
        case nir_intrinsic_load_input:
                if (c->s->info.stage == MESA_SHADER_VERTEX)
                        v3d_nir_lower_vertex_input(c, b, intr);
                break;

        case nir_intrinsic_load_uniform:
                v3d_nir_lower_uniform(c, b, intr);
                break;

        case nir_intrinsic_store_output:
                if (c->s->info.stage == MESA_SHADER_VERTEX ||
                    c->s->info.stage == MESA_SHADER_GEOMETRY) {
                        v3d_nir_lower_vpm_output(c, b, intr, state);
                }
                break;

        case nir_intrinsic_emit_vertex:
                v3d_nir_lower_emit_vertex(c, b, intr, state);
                break;

        case nir_intrinsic_end_primitive:
                v3d_nir_lower_end_primitive(c, b, intr, state);
                break;

        default:
                break;
        }
}

/* Remap the output var's .driver_location. This is purely for
 * nir_print_shader() so that store_output can map back to a variable name.
 */
static void
v3d_nir_lower_io_update_output_var_base(struct v3d_compile *c,
                                        struct v3d_nir_lower_io_state *state)
{
        nir_foreach_shader_out_variable_safe(var, c->s) {
                if (var->data.location == VARYING_SLOT_POS &&
                    state->pos_vpm_offset != -1) {
                        var->data.driver_location = state->pos_vpm_offset;
                        continue;
                }

                if (var->data.location == VARYING_SLOT_PSIZ &&
                    state->psiz_vpm_offset != -1) {
                        var->data.driver_location = state->psiz_vpm_offset;
                        continue;
                }

                int vpm_offset =
                        v3d_varying_slot_vpm_offset(c,
                                                    var->data.location,
                                                    var->data.location_frac);
                if (vpm_offset != -1) {
                        var->data.driver_location =
                                state->varyings_vpm_offset + vpm_offset;
                } else {
                        /* If we couldn't find a mapping for the var, delete
                         * it so that its old .driver_location doesn't confuse
                         * nir_print_shader().
                         */
                        exec_node_remove(&var->node);
                }
        }
}

/* Lays out the VS's VPM output segment: optional fixed-function slots
 * (clip-space position, viewport XY, Z, 1/Wc, point size) followed by the
 * varyings, and records the total size in c->vpm_output_size.
 */
static void
v3d_nir_setup_vpm_layout_vs(struct v3d_compile *c,
                            struct v3d_nir_lower_io_state *state)
{
        uint32_t vpm_offset = 0;

        state->pos_vpm_offset = -1;
        state->vp_vpm_offset = -1;
        state->zs_vpm_offset = -1;
        state->rcp_wc_vpm_offset = -1;
        state->psiz_vpm_offset = -1;

        bool needs_ff_outputs = c->vs_key->base.is_last_geometry_stage;
        if (needs_ff_outputs) {
                if (c->vs_key->is_coord) {
                        state->pos_vpm_offset = vpm_offset;
                        vpm_offset += 4;
                }

                state->vp_vpm_offset = vpm_offset;
                vpm_offset += 2;

                if (!c->vs_key->is_coord) {
                        state->zs_vpm_offset = vpm_offset++;
                        state->rcp_wc_vpm_offset = vpm_offset++;
                }

                if (c->vs_key->per_vertex_point_size)
                        state->psiz_vpm_offset = vpm_offset++;
        }

        state->varyings_vpm_offset = vpm_offset;

        c->vpm_output_size = MAX2(1, vpm_offset + c->vs_key->num_used_outputs);
}

/* Lays out the GS's VPM output segment: a global header plus one header slot
 * per vertex, followed by per-vertex data (FF outputs + varyings).
 */
static void
v3d_nir_setup_vpm_layout_gs(struct v3d_compile *c,
                            struct v3d_nir_lower_io_state *state)
{
        /* 1 header slot for number of output vertices */
        uint32_t vpm_offset = 1;

        /* 1 header slot per output vertex */
        const uint32_t num_vertices = c->s->info.gs.vertices_out;
        vpm_offset += num_vertices;

        state->gs.output_header_size = vpm_offset;

        /* Vertex data: here we only compute offsets into a generic vertex data
         * elements. When it is time to actually write a particular vertex to
         * the VPM, we will add the offset for that vertex into the VPM output
         * to these offsets.
         *
         * If geometry shaders are present, they are always the last shader
         * stage before rasterization, so we always emit fixed function outputs.
         */
        vpm_offset = 0;
        if (c->gs_key->is_coord) {
                state->pos_vpm_offset = vpm_offset;
                vpm_offset += 4;
        } else {
                state->pos_vpm_offset = -1;
        }

        state->vp_vpm_offset = vpm_offset;
        vpm_offset += 2;

        if (!c->gs_key->is_coord) {
                state->zs_vpm_offset = vpm_offset++;
                state->rcp_wc_vpm_offset = vpm_offset++;
        } else {
                state->zs_vpm_offset = -1;
                state->rcp_wc_vpm_offset = -1;
        }

        /* Mesa enables OES_geometry_shader_point_size automatically with
         * OES_geometry_shader so we always need to handle point size
         * writes if present.
         */
        if (c->gs_key->per_vertex_point_size)
                state->psiz_vpm_offset = vpm_offset++;

        state->varyings_vpm_offset = vpm_offset;

        state->gs.output_vertex_data_size =
                state->varyings_vpm_offset + c->gs_key->num_used_outputs;

        c->vpm_output_size =
                state->gs.output_header_size +
                state->gs.output_vertex_data_size * num_vertices;
}

/* Emits the fixed-function VPM outputs derived from gl_Position (clip-space
 * position, viewport-transformed XY, scaled Z, 1/Wc) and zero-fills any
 * varying slots the FS reads but the shader never wrote.
 */
static void
v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
                            struct v3d_nir_lower_io_state *state)
{
        /* If this is a geometry shader we need to emit our fixed function
         * outputs to the current vertex offset in the VPM.
         */
        nir_ssa_def *offset_reg =
                c->s->info.stage == MESA_SHADER_GEOMETRY ?
                        nir_load_var(b, state->gs.output_offset_var) : NULL;

        /* Position components the shader never stored become undefs. */
        for (int i = 0; i < 4; i++) {
                if (!state->pos[i])
                        state->pos[i] = nir_ssa_undef(b, 1, 32);
        }

        nir_ssa_def *rcp_wc = nir_frcp(b, state->pos[3]);

        if (state->pos_vpm_offset != -1) {
                for (int i = 0; i < 4; i++) {
                        v3d_nir_store_output(b, state->pos_vpm_offset + i,
                                             offset_reg, state->pos[i]);
                }
        }

        if (state->vp_vpm_offset != -1) {
                for (int i = 0; i < 2; i++) {
                        nir_ssa_def *pos;
                        nir_ssa_def *scale;
                        pos = state->pos[i];
                        if (i == 0)
                                scale = nir_load_viewport_x_scale(b);
                        else
                                scale = nir_load_viewport_y_scale(b);
                        pos = nir_fmul(b, pos, scale);
                        pos = nir_fmul(b, pos, rcp_wc);
                        /* Pre-V3D 4.3 hardware has a quirk where it expects XY
                         * coordinates in .8 fixed-point format, but then it
                         * will internally round it to .6 fixed-point,
                         * introducing a double rounding. The double rounding
                         * can cause very slight differences in triangle
                         * rasterization coverage that can actually be noticed by
                         * some CTS tests.
                         *
                         * The correct fix for this as recommended by Broadcom
                         * is to convert to .8 fixed-point with ffloor().
                         */
                        pos = nir_f2i32(b, nir_ffloor(b, pos));
                        v3d_nir_store_output(b, state->vp_vpm_offset + i,
                                             offset_reg, pos);
                }
        }

        if (state->zs_vpm_offset != -1) {
                nir_ssa_def *z = state->pos[2];
                z = nir_fmul(b, z, nir_load_viewport_z_scale(b));
                z = nir_fmul(b, z, rcp_wc);
                z = nir_fadd(b, z, nir_load_viewport_z_offset(b));
                v3d_nir_store_output(b, state->zs_vpm_offset, offset_reg, z);
        }

        if (state->rcp_wc_vpm_offset != -1) {
                v3d_nir_store_output(b, state->rcp_wc_vpm_offset,
                                     offset_reg, rcp_wc);
        }

        /* Store 0 to varyings requested by the FS but not stored by the
         * previous stage. This should be undefined behavior, but
         * glsl-routing seems to rely on it.
         */
        uint32_t num_used_outputs;
        switch (c->s->info.stage) {
        case MESA_SHADER_VERTEX:
                num_used_outputs = c->vs_key->num_used_outputs;
                break;
        case MESA_SHADER_GEOMETRY:
                num_used_outputs = c->gs_key->num_used_outputs;
                break;
        default:
                unreachable("Unsupported shader stage");
        }

        for (int i = 0; i < num_used_outputs; i++) {
                if (!BITSET_TEST(state->varyings_stored, i)) {
                        v3d_nir_store_output(b, state->varyings_vpm_offset + i,
                                             offset_reg, nir_imm_int(b, 0));
                }
        }
}

/* Creates and initializes the GS bookkeeping variables (current data offset,
 * current header offset, current header value) at the top of the shader.
 */
static void
emit_gs_prolog(struct v3d_compile *c, nir_builder *b,
               nir_function_impl *impl,
               struct v3d_nir_lower_io_state *state)
{
        nir_block *first = nir_start_block(impl);
        b->cursor = nir_before_block(first);

        const struct glsl_type *uint_type = glsl_uint_type();

        assert(!state->gs.output_offset_var);
        state->gs.output_offset_var =
                nir_local_variable_create(impl, uint_type, "output_offset");
        /* Vertex data starts right after the complete output header. */
        nir_store_var(b, state->gs.output_offset_var,
                      nir_imm_int(b, state->gs.output_header_size), 0x1);

        assert(!state->gs.header_offset_var);
        state->gs.header_offset_var =
                nir_local_variable_create(impl, uint_type, "header_offset");
        /* Per-vertex headers start after the single generic header slot. */
        nir_store_var(b, state->gs.header_offset_var, nir_imm_int(b, 1), 0x1);

        assert(!state->gs.header_var);
        state->gs.header_var =
                nir_local_variable_create(impl, uint_type, "header");
        reset_gs_header(b, state);
}

/* Writes the generic GS output header (at VPM offset 0): total header size
 * plus the number of emitted vertices. Called at the end of the shader, once
 * the final vertex count is known.
 */
static void
emit_gs_vpm_output_header_prolog(struct v3d_compile *c, nir_builder *b,
                                 struct v3d_nir_lower_io_state *state)
{
        const uint8_t VERTEX_COUNT_OFFSET = 16;

        /* Our GS header has 1 generic header slot (at VPM offset 0) and then
         * one slot per output vertex after it. This means we don't need to
         * have a variable just to keep track of the number of vertices we
         * emitted and instead we can just compute it here from the header
         * offset variable by removing the one generic header slot that always
         * goes at the beginning of our header.
         */
        nir_ssa_def *header_offset =
                nir_load_var(b, state->gs.header_offset_var);
        nir_ssa_def *vertex_count =
                nir_isub(b, header_offset, nir_imm_int(b, 1));
        nir_ssa_def *header =
                nir_ior(b, nir_imm_int(b, state->gs.output_header_size),
                        nir_ishl(b, vertex_count,
                                 nir_imm_int(b, VERTEX_COUNT_OFFSET)));

        v3d_nir_store_output(b, 0, NULL, header);
}

/* Pass entry point: sets up the VPM layout for the stage, lowers all io
 * intrinsics, and appends the fixed-function epilogue (VS) or the output
 * header write (GS).
 */
void
v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c)
{
        struct v3d_nir_lower_io_state state = { 0 };

        /* Set up the layout of the VPM outputs. */
        switch (s->info.stage) {
        case MESA_SHADER_VERTEX:
                v3d_nir_setup_vpm_layout_vs(c, &state);
                break;
        case MESA_SHADER_GEOMETRY:
                v3d_nir_setup_vpm_layout_gs(c, &state);
                break;
        case MESA_SHADER_FRAGMENT:
        case MESA_SHADER_COMPUTE:
                break;
        default:
                unreachable("Unsupported shader stage");
        }

        nir_foreach_function(function, s) {
                if (function->impl) {
                        nir_builder b;
                        nir_builder_init(&b, function->impl);

                        if (c->s->info.stage == MESA_SHADER_GEOMETRY)
                                emit_gs_prolog(c, &b, function->impl, &state);

                        nir_foreach_block(block, function->impl) {
                                nir_foreach_instr_safe(instr, block)
                                        v3d_nir_lower_io_instr(c, &b, instr,
                                                               &state);
                        }

                        nir_block *last = nir_impl_last_block(function->impl);
                        b.cursor = nir_after_block(last);
                        if (s->info.stage == MESA_SHADER_VERTEX) {
                                v3d_nir_emit_ff_vpm_outputs(c, &b, &state);
                        } else if (s->info.stage == MESA_SHADER_GEOMETRY) {
                                emit_gs_vpm_output_header_prolog(c, &b, &state);
                        }

                        nir_metadata_preserve(function->impl,
                                              nir_metadata_block_index |
                                              nir_metadata_dominance);
                }
        }

        if (s->info.stage == MESA_SHADER_VERTEX ||
            s->info.stage == MESA_SHADER_GEOMETRY) {
                v3d_nir_lower_io_update_output_var_base(c, &state);
        }
}