Path: blob/21.2-virgl/src/compiler/nir/nir_linking_helpers.c
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir.h"
#include "nir_builder.h"
#include "util/set.h"
#include "util/hash_table.h"

/* This file contains various little helpers for doing simple linking in
 * NIR. Eventually, we'll probably want a full-blown varying packing
 * implementation in here. Right now, it just deletes unused things.
 */

/**
 * Returns the bits in the inputs_read, or outputs_written
 * bitfield corresponding to this variable.
 */
static uint64_t
get_variable_io_mask(nir_variable *var, gl_shader_stage stage)
{
   if (var->data.location < 0)
      return 0;

   unsigned location = var->data.patch ?
      var->data.location - VARYING_SLOT_PATCH0 : var->data.location;

   assert(var->data.mode == nir_var_shader_in ||
          var->data.mode == nir_var_shader_out);
   assert(var->data.location >= 0);

   const struct glsl_type *type = var->type;
   if (nir_is_arrayed_io(var, stage) || var->data.per_view) {
      assert(glsl_type_is_array(type));
      type = glsl_get_array_element(type);
   }

   unsigned slots = glsl_count_attribute_slots(type, false);
   return ((1ull << slots) - 1) << location;
}
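
/* For illustration: a non-arrayed "float foo[3]" varying at
 * VARYING_SLOT_VAR2 counts as three consecutive slots, so the mask is
 *
 *    ((1ull << 3) - 1) << VARYING_SLOT_VAR2
 *
 * i.e. the VAR2, VAR3 and VAR4 bits of inputs_read/outputs_written.
 */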

static uint8_t
get_num_components(nir_variable *var)
{
   if (glsl_type_is_struct_or_ifc(glsl_without_array(var->type)))
      return 4;

   return glsl_get_vector_elements(glsl_without_array(var->type));
}

static void
tcs_add_output_reads(nir_shader *shader, uint64_t *read, uint64_t *patches_read)
{
   nir_foreach_function(function, shader) {
      if (!function->impl)
         continue;

      nir_foreach_block(block, function->impl) {
         nir_foreach_instr(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
            if (intrin->intrinsic != nir_intrinsic_load_deref)
               continue;

            nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
            if (!nir_deref_mode_is(deref, nir_var_shader_out))
               continue;

            nir_variable *var = nir_deref_instr_get_variable(deref);
            for (unsigned i = 0; i < get_num_components(var); i++) {
               if (var->data.patch) {
                  patches_read[var->data.location_frac + i] |=
                     get_variable_io_mask(var, shader->info.stage);
               } else {
                  read[var->data.location_frac + i] |=
                     get_variable_io_mask(var, shader->info.stage);
               }
            }
         }
      }
   }
}

/**
 * Helper for removing unused shader I/O variables, by demoting them to global
 * variables (which may then be dead-code eliminated).
 *
 * Example usage is:
 *
 * progress = nir_remove_unused_io_vars(producer, nir_var_shader_out,
 *                                      read, patches_read) ||
 *                                      progress;
 *
 * The "used" should be an array of 4 uint64_ts (probably of VARYING_BIT_*)
 * representing each .location_frac used. Note that for vector variables,
 * only the first channel (.location_frac) is examined for deciding if the
 * variable is used!
 */
bool
nir_remove_unused_io_vars(nir_shader *shader,
                          nir_variable_mode mode,
                          uint64_t *used_by_other_stage,
                          uint64_t *used_by_other_stage_patches)
{
   bool progress = false;
   uint64_t *used;

   assert(mode == nir_var_shader_in || mode == nir_var_shader_out);

   nir_foreach_variable_with_modes_safe(var, shader, mode) {
      if (var->data.patch)
         used = used_by_other_stage_patches;
      else
         used = used_by_other_stage;

      if (var->data.location < VARYING_SLOT_VAR0 && var->data.location >= 0)
         continue;

      if (var->data.always_active_io)
         continue;

      if (var->data.explicit_xfb_buffer)
         continue;

      uint64_t other_stage = used[var->data.location_frac];

      if (!(other_stage & get_variable_io_mask(var, shader->info.stage))) {
         /* This one is invalid, make it a global variable instead */
         var->data.location = 0;
         var->data.mode = nir_var_shader_temp;

         progress = true;
      }
   }

   if (progress)
      nir_fixup_deref_modes(shader);

   return progress;
}

bool
nir_remove_unused_varyings(nir_shader *producer, nir_shader *consumer)
{
   assert(producer->info.stage != MESA_SHADER_FRAGMENT);
   assert(consumer->info.stage != MESA_SHADER_VERTEX);

   uint64_t read[4] = { 0 }, written[4] = { 0 };
   uint64_t patches_read[4] = { 0 }, patches_written[4] = { 0 };

   nir_foreach_shader_out_variable(var, producer) {
      for (unsigned i = 0; i < get_num_components(var); i++) {
         if (var->data.patch) {
            patches_written[var->data.location_frac + i] |=
               get_variable_io_mask(var, producer->info.stage);
         } else {
            written[var->data.location_frac + i] |=
               get_variable_io_mask(var, producer->info.stage);
         }
      }
   }

   nir_foreach_shader_in_variable(var, consumer) {
      for (unsigned i = 0; i < get_num_components(var); i++) {
         if (var->data.patch) {
            patches_read[var->data.location_frac + i] |=
               get_variable_io_mask(var, consumer->info.stage);
         } else {
            read[var->data.location_frac + i] |=
               get_variable_io_mask(var, consumer->info.stage);
         }
      }
   }

   /* Each TCS invocation can read data written by other TCS invocations,
    * so even if the outputs are not used by the TES we must also make
    * sure they are not read by the TCS before demoting them to globals.
    */
   if (producer->info.stage == MESA_SHADER_TESS_CTRL)
      tcs_add_output_reads(producer, read, patches_read);

   bool progress = false;
   progress = nir_remove_unused_io_vars(producer, nir_var_shader_out, read,
                                        patches_read);

   progress = nir_remove_unused_io_vars(consumer, nir_var_shader_in, written,
                                        patches_written) || progress;

   return progress;
}
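
/* A rough usage sketch (hypothetical caller): demoted variables end up in
 * nir_var_shader_temp, so a linker typically follows this pass with the
 * lowering and DCE that actually delete them, e.g.:
 *
 *    if (nir_remove_unused_varyings(producer, consumer)) {
 *       nir_lower_global_vars_to_local(producer);
 *       nir_lower_global_vars_to_local(consumer);
 *    }
 */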

static uint8_t
get_interp_type(nir_variable *var, const struct glsl_type *type,
                bool default_to_smooth_interp)
{
   if (glsl_type_is_integer(type))
      return INTERP_MODE_FLAT;
   else if (var->data.interpolation != INTERP_MODE_NONE)
      return var->data.interpolation;
   else if (default_to_smooth_interp)
      return INTERP_MODE_SMOOTH;
   else
      return INTERP_MODE_NONE;
}

#define INTERPOLATE_LOC_SAMPLE 0
#define INTERPOLATE_LOC_CENTROID 1
#define INTERPOLATE_LOC_CENTER 2

static uint8_t
get_interp_loc(nir_variable *var)
{
   if (var->data.sample)
      return INTERPOLATE_LOC_SAMPLE;
   else if (var->data.centroid)
      return INTERPOLATE_LOC_CENTROID;
   else
      return INTERPOLATE_LOC_CENTER;
}

static bool
is_packing_supported_for_type(const struct glsl_type *type)
{
   /* We ignore complex types such as arrays, matrices, structs and bitsizes
    * other than 32-bit. All other vector types should have been split into
    * scalar variables by the lower_io_to_scalar pass. The only exception
    * should be OpenGL xfb varyings.
    * TODO: add support for more complex types?
    */
   return glsl_type_is_scalar(type) && glsl_type_is_32bit(type);
}

struct assigned_comps
{
   uint8_t comps;
   uint8_t interp_type;
   uint8_t interp_loc;
   bool is_32bit;
   bool is_mediump;
};

/* Packing arrays and dual slot varyings is difficult, so to avoid complex
 * algorithms this function just assigns them their existing location for now.
 * TODO: allow better packing of complex types.
 */
static void
get_unmoveable_components_masks(nir_shader *shader,
                                nir_variable_mode mode,
                                struct assigned_comps *comps,
                                gl_shader_stage stage,
                                bool default_to_smooth_interp)
{
   nir_foreach_variable_with_modes_safe(var, shader, mode) {
      assert(var->data.location >= 0);

      /* Only remap things that aren't built-ins. */
      if (var->data.location >= VARYING_SLOT_VAR0 &&
          var->data.location - VARYING_SLOT_VAR0 < MAX_VARYINGS_INCL_PATCH) {

         const struct glsl_type *type = var->type;
         if (nir_is_arrayed_io(var, stage) || var->data.per_view) {
            assert(glsl_type_is_array(type));
            type = glsl_get_array_element(type);
         }

         /* If we can pack this varying then don't mark the components as
          * used.
          */
         if (is_packing_supported_for_type(type))
            continue;

         unsigned location = var->data.location - VARYING_SLOT_VAR0;

         unsigned elements =
            glsl_type_is_vector_or_scalar(glsl_without_array(type)) ?
            glsl_get_vector_elements(glsl_without_array(type)) : 4;

         bool dual_slot = glsl_type_is_dual_slot(glsl_without_array(type));
         unsigned slots = glsl_count_attribute_slots(type, false);
         unsigned dmul = glsl_type_is_64bit(glsl_without_array(type)) ? 2 : 1;
         unsigned comps_slot2 = 0;
         for (unsigned i = 0; i < slots; i++) {
            if (dual_slot) {
               if (i & 1) {
                  comps[location + i].comps |= ((1 << comps_slot2) - 1);
               } else {
                  unsigned num_comps = 4 - var->data.location_frac;
                  comps_slot2 = (elements * dmul) - num_comps;

                  /* Assume ARB_enhanced_layouts packing rules for doubles */
                  assert(var->data.location_frac == 0 ||
                         var->data.location_frac == 2);
                  assert(comps_slot2 <= 4);

                  comps[location + i].comps |=
                     ((1 << num_comps) - 1) << var->data.location_frac;
               }
            } else {
               comps[location + i].comps |=
                  ((1 << (elements * dmul)) - 1) << var->data.location_frac;
            }

            comps[location + i].interp_type =
               get_interp_type(var, type, default_to_smooth_interp);
            comps[location + i].interp_loc = get_interp_loc(var);
            comps[location + i].is_32bit =
               glsl_type_is_32bit(glsl_without_array(type));
            comps[location + i].is_mediump =
               var->data.precision == GLSL_PRECISION_MEDIUM ||
               var->data.precision == GLSL_PRECISION_LOW;
         }
      }
   }
}
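
/* Worked example: a dvec3 with location_frac == 0 has elements == 3,
 * dmul == 2 and dual_slot == true, i.e. 6 components over 2 slots. The
 * first slot takes num_comps = 4 - 0 = 4 (mask 0b1111), leaving
 * comps_slot2 = 6 - 4 = 2, so the second slot gets mask 0b0011.
 */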

struct varying_loc
{
   uint8_t component;
   uint32_t location;
};

static void
mark_all_used_slots(nir_variable *var, uint64_t *slots_used,
                    uint64_t slots_used_mask, unsigned num_slots)
{
   unsigned loc_offset = var->data.patch ? VARYING_SLOT_PATCH0 : 0;

   slots_used[var->data.patch ? 1 : 0] |= slots_used_mask &
      BITFIELD64_RANGE(var->data.location - loc_offset, num_slots);
}

static void
mark_used_slot(nir_variable *var, uint64_t *slots_used, unsigned offset)
{
   unsigned loc_offset = var->data.patch ? VARYING_SLOT_PATCH0 : 0;

   slots_used[var->data.patch ? 1 : 0] |=
      BITFIELD64_BIT(var->data.location - loc_offset + offset);
}

static void
remap_slots_and_components(nir_shader *shader, nir_variable_mode mode,
                           struct varying_loc (*remap)[4],
                           uint64_t *slots_used, uint64_t *out_slots_read,
                           uint32_t *p_slots_used, uint32_t *p_out_slots_read)
{
   const gl_shader_stage stage = shader->info.stage;
   uint64_t out_slots_read_tmp[2] = {0};
   uint64_t slots_used_tmp[2] = {0};

   /* We don't touch builtins so just copy the bitmask */
   slots_used_tmp[0] = *slots_used & BITFIELD64_RANGE(0, VARYING_SLOT_VAR0);

   nir_foreach_variable_with_modes(var, shader, mode) {
      assert(var->data.location >= 0);

      /* Only remap things that aren't built-ins */
      if (var->data.location >= VARYING_SLOT_VAR0 &&
          var->data.location - VARYING_SLOT_VAR0 < MAX_VARYINGS_INCL_PATCH) {

         const struct glsl_type *type = var->type;
         if (nir_is_arrayed_io(var, stage) || var->data.per_view) {
            assert(glsl_type_is_array(type));
            type = glsl_get_array_element(type);
         }

         unsigned num_slots = glsl_count_attribute_slots(type, false);
         bool used_across_stages = false;
         bool outputs_read = false;

         unsigned location = var->data.location - VARYING_SLOT_VAR0;
         struct varying_loc *new_loc = &remap[location][var->data.location_frac];

         unsigned loc_offset = var->data.patch ? VARYING_SLOT_PATCH0 : 0;
         uint64_t used = var->data.patch ? *p_slots_used : *slots_used;
         uint64_t outs_used =
            var->data.patch ? *p_out_slots_read : *out_slots_read;
         uint64_t slots =
            BITFIELD64_RANGE(var->data.location - loc_offset, num_slots);

         if (slots & used)
            used_across_stages = true;

         if (slots & outs_used)
            outputs_read = true;

         if (new_loc->location) {
            var->data.location = new_loc->location;
            var->data.location_frac = new_loc->component;
         }

         if (var->data.always_active_io) {
            /* We can't apply link time optimisations (specifically array
             * splitting) to these so we need to copy the existing mask
             * otherwise we will mess up the mask for things like partially
             * marked arrays.
             */
            if (used_across_stages)
               mark_all_used_slots(var, slots_used_tmp, used, num_slots);

            if (outputs_read) {
               mark_all_used_slots(var, out_slots_read_tmp, outs_used,
                                   num_slots);
            }
         } else {
            for (unsigned i = 0; i < num_slots; i++) {
               if (used_across_stages)
                  mark_used_slot(var, slots_used_tmp, i);

               if (outputs_read)
                  mark_used_slot(var, out_slots_read_tmp, i);
            }
         }
      }
   }

   *slots_used = slots_used_tmp[0];
   *out_slots_read = out_slots_read_tmp[0];
   *p_slots_used = slots_used_tmp[1];
   *p_out_slots_read = out_slots_read_tmp[1];
}

struct varying_component {
   nir_variable *var;
   uint8_t interp_type;
   uint8_t interp_loc;
   bool is_32bit;
   bool is_patch;
   bool is_mediump;
   bool is_intra_stage_only;
   bool initialised;
};

static int
cmp_varying_component(const void *comp1_v, const void *comp2_v)
{
   struct varying_component *comp1 = (struct varying_component *) comp1_v;
   struct varying_component *comp2 = (struct varying_component *) comp2_v;

   /* We want patches to be ordered at the end of the array */
   if (comp1->is_patch != comp2->is_patch)
      return comp1->is_patch ? 1 : -1;

   /* We want to try to group together TCS outputs that are only read by other
    * TCS invocations and not consumed by the following stage.
    */
   if (comp1->is_intra_stage_only != comp2->is_intra_stage_only)
      return comp1->is_intra_stage_only ? 1 : -1;

   /* Group mediump varyings together. */
   if (comp1->is_mediump != comp2->is_mediump)
      return comp1->is_mediump ? 1 : -1;

   /* We can only pack varyings with matching interpolation types so group
    * them together.
    */
   if (comp1->interp_type != comp2->interp_type)
      return comp1->interp_type - comp2->interp_type;

   /* Interpolation loc must match also. */
   if (comp1->interp_loc != comp2->interp_loc)
      return comp1->interp_loc - comp2->interp_loc;

   /* If everything else matches just use the original location to sort */
   const struct nir_variable_data *const data1 = &comp1->var->data;
   const struct nir_variable_data *const data2 = &comp2->var->data;
   if (data1->location != data2->location)
      return data1->location - data2->location;
   return (int)data1->location_frac - (int)data2->location_frac;
}

static void
gather_varying_component_info(nir_shader *producer, nir_shader *consumer,
                              struct varying_component **varying_comp_info,
                              unsigned *varying_comp_info_size,
                              bool default_to_smooth_interp)
{
   unsigned store_varying_info_idx[MAX_VARYINGS_INCL_PATCH][4] = {{0}};
   unsigned num_of_comps_to_pack = 0;

   /* Count the number of varyings that can be packed and create a mapping
    * of those varyings to the array we will pass to qsort.
    */
   nir_foreach_shader_out_variable(var, producer) {

      /* Only remap things that aren't builtins. */
      if (var->data.location >= VARYING_SLOT_VAR0 &&
          var->data.location - VARYING_SLOT_VAR0 < MAX_VARYINGS_INCL_PATCH) {

         /* We can't repack xfb varyings. */
         if (var->data.always_active_io)
            continue;

         const struct glsl_type *type = var->type;
         if (nir_is_arrayed_io(var, producer->info.stage) || var->data.per_view) {
            assert(glsl_type_is_array(type));
            type = glsl_get_array_element(type);
         }

         if (!is_packing_supported_for_type(type))
            continue;

         unsigned loc = var->data.location - VARYING_SLOT_VAR0;
         store_varying_info_idx[loc][var->data.location_frac] =
            ++num_of_comps_to_pack;
      }
   }

   *varying_comp_info_size = num_of_comps_to_pack;
   *varying_comp_info = rzalloc_array(NULL, struct varying_component,
                                      num_of_comps_to_pack);

   nir_function_impl *impl = nir_shader_get_entrypoint(consumer);

   /* Walk over the shader and populate the varying component info array */
   nir_foreach_block(block, impl) {
      nir_foreach_instr(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_load_deref &&
             intr->intrinsic != nir_intrinsic_interp_deref_at_centroid &&
             intr->intrinsic != nir_intrinsic_interp_deref_at_sample &&
             intr->intrinsic != nir_intrinsic_interp_deref_at_offset &&
             intr->intrinsic != nir_intrinsic_interp_deref_at_vertex)
            continue;

         nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
         if (!nir_deref_mode_is(deref, nir_var_shader_in))
            continue;

         /* We only remap things that aren't builtins. */
         nir_variable *in_var = nir_deref_instr_get_variable(deref);
         if (in_var->data.location < VARYING_SLOT_VAR0)
            continue;

         unsigned location = in_var->data.location - VARYING_SLOT_VAR0;
         if (location >= MAX_VARYINGS_INCL_PATCH)
            continue;

         unsigned var_info_idx =
            store_varying_info_idx[location][in_var->data.location_frac];
         if (!var_info_idx)
            continue;

         struct varying_component *vc_info =
            &(*varying_comp_info)[var_info_idx-1];

         if (!vc_info->initialised) {
            const struct glsl_type *type = in_var->type;
            if (nir_is_arrayed_io(in_var, consumer->info.stage) ||
                in_var->data.per_view) {
               assert(glsl_type_is_array(type));
               type = glsl_get_array_element(type);
            }

            vc_info->var = in_var;
            vc_info->interp_type =
               get_interp_type(in_var, type, default_to_smooth_interp);
            vc_info->interp_loc = get_interp_loc(in_var);
            vc_info->is_32bit = glsl_type_is_32bit(type);
            vc_info->is_patch = in_var->data.patch;
            vc_info->is_mediump = !producer->options->linker_ignore_precision &&
               (in_var->data.precision == GLSL_PRECISION_MEDIUM ||
                in_var->data.precision == GLSL_PRECISION_LOW);
            vc_info->is_intra_stage_only = false;
            vc_info->initialised = true;
         }
      }
   }

   /* Walk over the shader and populate the varying component info array
    * for varyings which are read by other TCS instances but are not consumed
    * by the TES.
    */
   if (producer->info.stage == MESA_SHADER_TESS_CTRL) {
      impl = nir_shader_get_entrypoint(producer);

      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
            if (intr->intrinsic != nir_intrinsic_load_deref)
               continue;

            nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
            if (!nir_deref_mode_is(deref, nir_var_shader_out))
               continue;

            /* We only remap things that aren't builtins. */
            nir_variable *out_var = nir_deref_instr_get_variable(deref);
            if (out_var->data.location < VARYING_SLOT_VAR0)
               continue;

            unsigned location = out_var->data.location - VARYING_SLOT_VAR0;
            if (location >= MAX_VARYINGS_INCL_PATCH)
               continue;

            unsigned var_info_idx =
               store_varying_info_idx[location][out_var->data.location_frac];
            if (!var_info_idx) {
               /* Something went wrong, the shader interfaces didn't match, so
                * abandon packing. This can happen for example when the
                * inputs are scalars but the outputs are struct members.
                */
               *varying_comp_info_size = 0;
               break;
            }

            struct varying_component *vc_info =
               &(*varying_comp_info)[var_info_idx-1];

            if (!vc_info->initialised) {
               const struct glsl_type *type = out_var->type;
               if (nir_is_arrayed_io(out_var, producer->info.stage)) {
                  assert(glsl_type_is_array(type));
                  type = glsl_get_array_element(type);
               }

               vc_info->var = out_var;
               vc_info->interp_type =
                  get_interp_type(out_var, type, default_to_smooth_interp);
               vc_info->interp_loc = get_interp_loc(out_var);
               vc_info->is_32bit = glsl_type_is_32bit(type);
               vc_info->is_patch = out_var->data.patch;
               vc_info->is_mediump = !producer->options->linker_ignore_precision &&
                  (out_var->data.precision == GLSL_PRECISION_MEDIUM ||
                   out_var->data.precision == GLSL_PRECISION_LOW);
               vc_info->is_intra_stage_only = true;
               vc_info->initialised = true;
            }
         }
      }
   }

   for (unsigned i = 0; i < *varying_comp_info_size; i++) {
      struct varying_component *vc_info = &(*varying_comp_info)[i];
      if (!vc_info->initialised) {
         /* Something went wrong, the shader interfaces didn't match, so
          * abandon packing. This can happen for example when the outputs are
          * scalars but the inputs are struct members.
          */
         *varying_comp_info_size = 0;
         break;
      }
   }
}

static void
assign_remap_locations(struct varying_loc (*remap)[4],
                       struct assigned_comps *assigned_comps,
                       struct varying_component *info,
                       unsigned *cursor, unsigned *comp,
                       unsigned max_location)
{
   unsigned tmp_cursor = *cursor;
   unsigned tmp_comp = *comp;

   for (; tmp_cursor < max_location; tmp_cursor++) {

      if (assigned_comps[tmp_cursor].comps) {
         /* We can only pack varyings with matching interpolation types,
          * interpolation loc must match also.
          * TODO: i965 can handle interpolation locations that don't match,
          * but the radeonsi nir backend handles everything as vec4s and so
          * expects this to be the same for all components. We could make this
          * check driver specific or drop it if NIR ever becomes the only
          * radeonsi backend.
          * TODO2: The radeonsi comment above is not true. Only "flat" is per
          * vec4 (128-bit granularity), all other interpolation qualifiers are
          * per component (16-bit granularity for float16, 32-bit granularity
          * otherwise). Each vec4 (128 bits) must be either vec4 or f16vec8.
          */
         if (assigned_comps[tmp_cursor].interp_type != info->interp_type ||
             assigned_comps[tmp_cursor].interp_loc != info->interp_loc ||
             assigned_comps[tmp_cursor].is_mediump != info->is_mediump) {
            tmp_comp = 0;
            continue;
         }

         /* We can only pack varyings with matching types, and the current
          * algorithm only supports packing 32-bit.
          */
         if (!assigned_comps[tmp_cursor].is_32bit) {
            tmp_comp = 0;
            continue;
         }

         while (tmp_comp < 4 &&
                (assigned_comps[tmp_cursor].comps & (1 << tmp_comp))) {
            tmp_comp++;
         }
      }

      if (tmp_comp == 4) {
         tmp_comp = 0;
         continue;
      }

      unsigned location = info->var->data.location - VARYING_SLOT_VAR0;

      /* Once we have assigned a location mark it as used */
      assigned_comps[tmp_cursor].comps |= (1 << tmp_comp);
      assigned_comps[tmp_cursor].interp_type = info->interp_type;
      assigned_comps[tmp_cursor].interp_loc = info->interp_loc;
      assigned_comps[tmp_cursor].is_32bit = info->is_32bit;
      assigned_comps[tmp_cursor].is_mediump = info->is_mediump;

      /* Assign remap location */
      remap[location][info->var->data.location_frac].component = tmp_comp++;
      remap[location][info->var->data.location_frac].location =
         tmp_cursor + VARYING_SLOT_VAR0;

      break;
   }

   *cursor = tmp_cursor;
   *comp = tmp_comp;
}
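
/* Worked example: with cursor == 0 and comp == 0, if slot 0 already has
 * comps == 0b0011 with a matching interpolation type and loc, the while
 * loop advances tmp_comp to 2 and the varying lands in slot 0, component 2.
 * If instead comps == 0b1111, tmp_comp reaches 4, is reset to 0, and the
 * scan moves on to slot 1.
 */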

/* If there are empty components in the slot compact the remaining components
 * as close to component 0 as possible. This will make it easier to fill the
 * empty components with components from a different slot in a following pass.
 */
static void
compact_components(nir_shader *producer, nir_shader *consumer,
                   struct assigned_comps *assigned_comps,
                   bool default_to_smooth_interp)
{
   struct varying_loc remap[MAX_VARYINGS_INCL_PATCH][4] = {{{0}, {0}}};
   struct varying_component *varying_comp_info;
   unsigned varying_comp_info_size;

   /* Gather varying component info */
   gather_varying_component_info(producer, consumer, &varying_comp_info,
                                 &varying_comp_info_size,
                                 default_to_smooth_interp);

   /* Sort varying components. */
   qsort(varying_comp_info, varying_comp_info_size,
         sizeof(struct varying_component), cmp_varying_component);

   unsigned cursor = 0;
   unsigned comp = 0;

   /* Set the remap array based on the sorted components */
   for (unsigned i = 0; i < varying_comp_info_size; i++) {
      struct varying_component *info = &varying_comp_info[i];

      assert(info->is_patch || cursor < MAX_VARYING);
      if (info->is_patch) {
         /* The list should be sorted with all non-patch inputs first followed
          * by patch inputs. When we hit our first patch input, we need to
          * reset the cursor to MAX_VARYING so we put them in the right slot.
          */
         if (cursor < MAX_VARYING) {
            cursor = MAX_VARYING;
            comp = 0;
         }

         assign_remap_locations(remap, assigned_comps, info,
                                &cursor, &comp, MAX_VARYINGS_INCL_PATCH);
      } else {
         assign_remap_locations(remap, assigned_comps, info,
                                &cursor, &comp, MAX_VARYING);

         /* Check if we failed to assign a remap location. This can happen
          * if for example there are a bunch of unmovable components with
          * mismatching interpolation types causing us to skip over locations
          * that would have been useful for packing later components.
          * The solution is to iterate over the locations again (this should
          * happen very rarely in practice).
          */
         if (cursor == MAX_VARYING) {
            cursor = 0;
            comp = 0;
            assign_remap_locations(remap, assigned_comps, info,
                                   &cursor, &comp, MAX_VARYING);
         }
      }
   }

   ralloc_free(varying_comp_info);

   uint64_t zero = 0;
   uint32_t zero32 = 0;
   remap_slots_and_components(consumer, nir_var_shader_in, remap,
                              &consumer->info.inputs_read, &zero,
                              &consumer->info.patch_inputs_read, &zero32);
   remap_slots_and_components(producer, nir_var_shader_out, remap,
                              &producer->info.outputs_written,
                              &producer->info.outputs_read,
                              &producer->info.patch_outputs_written,
                              &producer->info.patch_outputs_read);
}

/* We assume that this has been called more-or-less directly after
 * remove_unused_varyings. At this point, all of the varyings that we
 * aren't going to be using have been completely removed and the
 * inputs_read and outputs_written fields in nir_shader_info reflect
 * this. Therefore, the total set of valid slots is the OR of the two
 * sets of varyings; this accounts for varyings which one side may need
 * to read/write even if the other doesn't. This can happen if, for
 * instance, an array is used indirectly from one side causing it to be
 * unsplittable but directly from the other.
 */
void
nir_compact_varyings(nir_shader *producer, nir_shader *consumer,
                     bool default_to_smooth_interp)
{
   assert(producer->info.stage != MESA_SHADER_FRAGMENT);
   assert(consumer->info.stage != MESA_SHADER_VERTEX);

   struct assigned_comps assigned_comps[MAX_VARYINGS_INCL_PATCH] = {{0}};

   get_unmoveable_components_masks(producer, nir_var_shader_out,
                                   assigned_comps,
                                   producer->info.stage,
                                   default_to_smooth_interp);
   get_unmoveable_components_masks(consumer, nir_var_shader_in,
                                   assigned_comps,
                                   consumer->info.stage,
                                   default_to_smooth_interp);

   compact_components(producer, consumer, assigned_comps,
                      default_to_smooth_interp);
}
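
/* Usage sketch (hypothetical caller): per the comment above, the expected
 * call order in a linker is roughly
 *
 *    nir_remove_unused_varyings(producer, consumer);
 *    ... DCE that deletes the demoted globals ...
 *    nir_compact_varyings(producer, consumer, default_to_smooth_interp);
 */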

/*
 * Mark XFB varyings as always_active_io in the consumer so the linking opts
 * don't touch them.
 */
void
nir_link_xfb_varyings(nir_shader *producer, nir_shader *consumer)
{
   nir_variable *input_vars[MAX_VARYING] = { 0 };

   nir_foreach_shader_in_variable(var, consumer) {
      if (var->data.location >= VARYING_SLOT_VAR0 &&
          var->data.location - VARYING_SLOT_VAR0 < MAX_VARYING) {

         unsigned location = var->data.location - VARYING_SLOT_VAR0;
         input_vars[location] = var;
      }
   }

   nir_foreach_shader_out_variable(var, producer) {
      if (var->data.location >= VARYING_SLOT_VAR0 &&
          var->data.location - VARYING_SLOT_VAR0 < MAX_VARYING) {

         if (!var->data.always_active_io)
            continue;

         unsigned location = var->data.location - VARYING_SLOT_VAR0;
         if (input_vars[location]) {
            input_vars[location]->data.always_active_io = true;
         }
      }
   }
}

static bool
does_varying_match(nir_variable *out_var, nir_variable *in_var)
{
   return in_var->data.location == out_var->data.location &&
          in_var->data.location_frac == out_var->data.location_frac;
}

static nir_variable *
get_matching_input_var(nir_shader *consumer, nir_variable *out_var)
{
   nir_foreach_shader_in_variable(var, consumer) {
      if (does_varying_match(out_var, var))
         return var;
   }

   return NULL;
}

static bool
can_replace_varying(nir_variable *out_var)
{
   /* Skip types that require more complex handling.
    * TODO: add support for these types.
    */
   if (glsl_type_is_array(out_var->type) ||
       glsl_type_is_dual_slot(out_var->type) ||
       glsl_type_is_matrix(out_var->type) ||
       glsl_type_is_struct_or_ifc(out_var->type))
      return false;

   /* Limit this pass to scalars for now to keep things simple. Most varyings
    * should have been lowered to scalars at this point anyway.
    */
   if (!glsl_type_is_scalar(out_var->type))
      return false;

   if (out_var->data.location < VARYING_SLOT_VAR0 ||
       out_var->data.location - VARYING_SLOT_VAR0 >= MAX_VARYING)
      return false;

   return true;
}

static bool
replace_constant_input(nir_shader *shader, nir_intrinsic_instr *store_intr)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(shader);

   nir_builder b;
   nir_builder_init(&b, impl);

   nir_variable *out_var =
      nir_deref_instr_get_variable(nir_src_as_deref(store_intr->src[0]));

   bool progress = false;
   nir_foreach_block(block, impl) {
      nir_foreach_instr(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_load_deref)
            continue;

         nir_deref_instr *in_deref = nir_src_as_deref(intr->src[0]);
         if (!nir_deref_mode_is(in_deref, nir_var_shader_in))
            continue;

         nir_variable *in_var = nir_deref_instr_get_variable(in_deref);

         if (!does_varying_match(out_var, in_var))
            continue;

         b.cursor = nir_before_instr(instr);

         nir_load_const_instr *out_const =
            nir_instr_as_load_const(store_intr->src[1].ssa->parent_instr);

         /* Add new const to replace the input */
         nir_ssa_def *nconst = nir_build_imm(&b, store_intr->num_components,
                                             intr->dest.ssa.bit_size,
                                             out_const->value);

         nir_ssa_def_rewrite_uses(&intr->dest.ssa, nconst);

         progress = true;
      }
   }

   return progress;
}

static bool
replace_duplicate_input(nir_shader *shader, nir_variable *input_var,
                        nir_intrinsic_instr *dup_store_intr)
{
   assert(input_var);

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);

   nir_builder b;
   nir_builder_init(&b, impl);

   nir_variable *dup_out_var =
      nir_deref_instr_get_variable(nir_src_as_deref(dup_store_intr->src[0]));

   bool progress = false;
   nir_foreach_block(block, impl) {
      nir_foreach_instr(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_load_deref)
            continue;

         nir_deref_instr *in_deref = nir_src_as_deref(intr->src[0]);
         if (!nir_deref_mode_is(in_deref, nir_var_shader_in))
            continue;

         nir_variable *in_var = nir_deref_instr_get_variable(in_deref);

         if (!does_varying_match(dup_out_var, in_var) ||
             in_var->data.interpolation != input_var->data.interpolation ||
             get_interp_loc(in_var) != get_interp_loc(input_var))
            continue;

         b.cursor = nir_before_instr(instr);

         nir_ssa_def *load = nir_load_var(&b, input_var);
         nir_ssa_def_rewrite_uses(&intr->dest.ssa, load);

         progress = true;
      }
   }

   return progress;
}

/* The GLSL ES 3.20 spec says:
 *
 * "The precision of a vertex output does not need to match the precision of
 * the corresponding fragment input. The minimum precision at which vertex
 * outputs are interpolated is the minimum of the vertex output precision and
 * the fragment input precision, with the exception that for highp,
 * implementations do not have to support full IEEE 754 precision." (9.1 "Input
 * Output Matching by Name in Linked Programs")
 *
 * To implement this, when linking shaders we will take the minimum precision
 * qualifier (allowing drivers to interpolate at lower precision). For
 * input/output between non-fragment stages (e.g. VERTEX to GEOMETRY), the spec
 * requires we use the *last* specified precision if there is a conflict.
 *
 * Precisions are ordered as (NONE, HIGH, MEDIUM, LOW). If either precision is
 * NONE, we'll return the other precision, since there is no conflict.
 * Otherwise for fragment interpolation, we'll pick the smallest of (HIGH,
 * MEDIUM, LOW) by picking the maximum of the raw values - note the ordering is
 * "backwards". For non-fragment stages, we'll pick the latter precision to
 * comply with the spec. (Note that the order matters.)
 *
 * For streamout, "Variables declared with lowp or mediump precision are
 * promoted to highp before being written." (12.2 "Transform Feedback", p. 341
 * of OpenGL ES 3.2 specification). So drivers should promote them for the
 * transform feedback memory store, but not the output store.
 */
static unsigned
nir_link_precision(unsigned producer, unsigned consumer, bool fs)
{
   if (producer == GLSL_PRECISION_NONE)
      return consumer;
   else if (consumer == GLSL_PRECISION_NONE)
      return producer;
   else
      return fs ? MAX2(producer, consumer) : consumer;
}
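
/* Worked example: linking a highp vertex output with a mediump fragment
 * input gives
 *
 *    nir_link_precision(GLSL_PRECISION_HIGH, GLSL_PRECISION_MEDIUM, true)
 *       == GLSL_PRECISION_MEDIUM
 *
 * because MEDIUM has the larger raw value in the "backwards" ordering
 * described above. With fs == false the consumer's (i.e. the *last*
 * specified) precision wins.
 */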

void
nir_link_varying_precision(nir_shader *producer, nir_shader *consumer)
{
   bool frag = consumer->info.stage == MESA_SHADER_FRAGMENT;

   nir_foreach_shader_out_variable(producer_var, producer) {
      /* Skip if the slot is not assigned */
      if (producer_var->data.location < 0)
         continue;

      nir_variable *consumer_var = nir_find_variable_with_location(consumer,
            nir_var_shader_in, producer_var->data.location);

      /* Skip if the variable will be eliminated */
      if (!consumer_var)
         continue;

      /* Now we have a pair of variables. Let's pick the smaller precision. */
      unsigned precision_1 = producer_var->data.precision;
      unsigned precision_2 = consumer_var->data.precision;
      unsigned minimum = nir_link_precision(precision_1, precision_2, frag);

      /* Propagate the new precision */
      producer_var->data.precision = consumer_var->data.precision = minimum;
   }
}

bool
nir_link_opt_varyings(nir_shader *producer, nir_shader *consumer)
{
   /* TODO: Add support for more shader stage combinations */
   if (consumer->info.stage != MESA_SHADER_FRAGMENT ||
       (producer->info.stage != MESA_SHADER_VERTEX &&
        producer->info.stage != MESA_SHADER_TESS_EVAL))
      return false;

   bool progress = false;

   nir_function_impl *impl = nir_shader_get_entrypoint(producer);

   struct hash_table *varying_values = _mesa_pointer_hash_table_create(NULL);

   /* If we find a store in the last block of the producer we can be sure this
    * is the only possible value for this output.
    */
   nir_block *last_block = nir_impl_last_block(impl);
   nir_foreach_instr_reverse(instr, last_block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      if (intr->intrinsic != nir_intrinsic_store_deref)
         continue;

      nir_deref_instr *out_deref = nir_src_as_deref(intr->src[0]);
      if (!nir_deref_mode_is(out_deref, nir_var_shader_out))
         continue;

      nir_variable *out_var = nir_deref_instr_get_variable(out_deref);
      if (!can_replace_varying(out_var))
         continue;

      if (intr->src[1].ssa->parent_instr->type == nir_instr_type_load_const) {
         progress |= replace_constant_input(consumer, intr);
      } else {
         struct hash_entry *entry =
            _mesa_hash_table_search(varying_values, intr->src[1].ssa);
         if (entry) {
            progress |= replace_duplicate_input(consumer,
                                                (nir_variable *) entry->data,
                                                intr);
         } else {
            nir_variable *in_var = get_matching_input_var(consumer, out_var);
            if (in_var) {
               _mesa_hash_table_insert(varying_values, intr->src[1].ssa,
                                       in_var);
            }
         }
      }
   }

   _mesa_hash_table_destroy(varying_values, NULL);

   return progress;
}
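
/* Usage sketch (hypothetical caller): replacing a constant or duplicated
 * input can leave other varyings dead, so this pass is best run before the
 * removal/compaction passes, e.g.:
 *
 *    if (nir_link_opt_varyings(vs, fs))
 *       ... re-run DCE on fs ...
 *    nir_remove_unused_varyings(vs, fs);
 */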

/* TODO any better helper somewhere to sort a list? */
static void
insert_sorted(struct exec_list *var_list, nir_variable *new_var)
{
   nir_foreach_variable_in_list(var, var_list) {
      if (var->data.location > new_var->data.location) {
         exec_node_insert_node_before(&var->node, &new_var->node);
         return;
      }
   }
   exec_list_push_tail(var_list, &new_var->node);
}

static void
sort_varyings(nir_shader *shader, nir_variable_mode mode,
              struct exec_list *sorted_list)
{
   exec_list_make_empty(sorted_list);
   nir_foreach_variable_with_modes_safe(var, shader, mode) {
      exec_node_remove(&var->node);
      insert_sorted(sorted_list, var);
   }
}

void
nir_assign_io_var_locations(nir_shader *shader, nir_variable_mode mode,
                            unsigned *size, gl_shader_stage stage)
{
   unsigned location = 0;
   unsigned assigned_locations[VARYING_SLOT_TESS_MAX];
   uint64_t processed_locs[2] = {0};

   struct exec_list io_vars;
   sort_varyings(shader, mode, &io_vars);

   int UNUSED last_loc = 0;
   bool last_partial = false;
   nir_foreach_variable_in_list(var, &io_vars) {
      const struct glsl_type *type = var->type;
      if (nir_is_arrayed_io(var, stage)) {
         assert(glsl_type_is_array(type));
         type = glsl_get_array_element(type);
      }

      int base;
      if (var->data.mode == nir_var_shader_in && stage == MESA_SHADER_VERTEX)
         base = VERT_ATTRIB_GENERIC0;
      else if (var->data.mode == nir_var_shader_out &&
               stage == MESA_SHADER_FRAGMENT)
         base = FRAG_RESULT_DATA0;
      else
         base = VARYING_SLOT_VAR0;

      unsigned var_size, driver_size;
      if (var->data.compact) {
         /* If we are inside a partial compact,
          * don't allow another compact to be in this slot
          * if it starts at component 0.
          */
         if (last_partial && var->data.location_frac == 0) {
            location++;
         }

         /* compact variables must be arrays of scalars */
         assert(!var->data.per_view);
         assert(glsl_type_is_array(type));
         assert(glsl_type_is_scalar(glsl_get_array_element(type)));
         unsigned start = 4 * location + var->data.location_frac;
         unsigned end = start + glsl_get_length(type);
         var_size = driver_size = end / 4 - location;
         last_partial = end % 4 != 0;
      } else {
         /* Compact variables bypass the normal varying compacting pass,
          * which means they cannot be in the same vec4 slot as a normal
          * variable. If part of the current slot is taken up by a compact
          * variable, we need to go to the next one.
          */
         if (last_partial) {
            location++;
            last_partial = false;
         }

         /* per-view variables have an extra array dimension, which is ignored
          * when counting user-facing slots (var->data.location), but *not*
          * with driver slots (var->data.driver_location). That is, each user
          * slot maps to multiple driver slots.
          */
         driver_size = glsl_count_attribute_slots(type, false);
         if (var->data.per_view) {
            assert(glsl_type_is_array(type));
            var_size =
               glsl_count_attribute_slots(glsl_get_array_element(type), false);
         } else {
            var_size = driver_size;
         }
      }

      /* Builtins don't allow component packing so we only need to worry about
       * user defined varyings sharing the same location.
       */
      bool processed = false;
      if (var->data.location >= base) {
         unsigned glsl_location = var->data.location - base;

         for (unsigned i = 0; i < var_size; i++) {
            if (processed_locs[var->data.index] &
                ((uint64_t)1 << (glsl_location + i)))
               processed = true;
            else
               processed_locs[var->data.index] |=
                  ((uint64_t)1 << (glsl_location + i));
         }
      }

      /* Because component packing allows varyings to share the same location
       * we may already have processed this location.
       */
      if (processed) {
         /* TODO handle overlapping per-view variables */
         assert(!var->data.per_view);
         unsigned driver_location = assigned_locations[var->data.location];
         var->data.driver_location = driver_location;

         /* An array may be packed such that it crosses multiple other arrays
          * or variables; we need to make sure we have allocated the elements
          * consecutively if the previously processed var was shorter than
          * the current array we are processing.
          *
          * NOTE: The code below assumes the var list is ordered in ascending
          * location order.
          */
         assert(last_loc <= var->data.location);
         last_loc = var->data.location;
         unsigned last_slot_location = driver_location + var_size;
         if (last_slot_location > location) {
            unsigned num_unallocated_slots = last_slot_location - location;
            unsigned first_unallocated_slot = var_size - num_unallocated_slots;
            for (unsigned i = first_unallocated_slot; i < var_size; i++) {
               assigned_locations[var->data.location + i] = location;
               location++;
            }
         }
         continue;
      }

      for (unsigned i = 0; i < var_size; i++) {
         assigned_locations[var->data.location + i] = location + i;
      }

      var->data.driver_location = location;
      location += driver_size;
   }

   if (last_partial)
      location++;

   exec_list_append(&shader->variables, &io_vars);
   *size = location;
}
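
/* Usage sketch (hypothetical driver code): drivers that consume NIR
 * directly typically call this once per mode, e.g.
 *
 *    unsigned num_inputs = 0;
 *    nir_assign_io_var_locations(nir, nir_var_shader_in, &num_inputs,
 *                                nir->info.stage);
 *
 * and then use num_inputs to size the stage's input state.
 */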

static uint64_t
get_linked_variable_location(unsigned location, bool patch)
{
   if (!patch)
      return location;

   /* Reserve locations 0...3 for special patch variables
    * like tess factors and bounding boxes, and the generic patch
    * variables will come after them.
    */
   if (location >= VARYING_SLOT_PATCH0)
      return location - VARYING_SLOT_PATCH0 + 4;
   else if (location >= VARYING_SLOT_TESS_LEVEL_OUTER &&
            location <= VARYING_SLOT_BOUNDING_BOX1)
      return location - VARYING_SLOT_TESS_LEVEL_OUTER;
   else
      unreachable("Unsupported variable in get_linked_variable_location.");
}
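
/* For illustration: with this mapping VARYING_SLOT_TESS_LEVEL_OUTER becomes
 * linked location 0, VARYING_SLOT_TESS_LEVEL_INNER becomes 1, the bounding
 * box slots become 2 and 3, and VARYING_SLOT_PATCH0 + n becomes 4 + n.
 */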

static uint64_t
get_linked_variable_io_mask(nir_variable *variable, gl_shader_stage stage)
{
   const struct glsl_type *type = variable->type;

   if (nir_is_arrayed_io(variable, stage)) {
      assert(glsl_type_is_array(type));
      type = glsl_get_array_element(type);
   }

   unsigned slots = glsl_count_attribute_slots(type, false);
   if (variable->data.compact) {
      unsigned component_count = variable->data.location_frac + glsl_get_length(type);
      slots = DIV_ROUND_UP(component_count, 4);
   }

   uint64_t mask = u_bit_consecutive64(0, slots);
   return mask;
}

nir_linked_io_var_info
nir_assign_linked_io_var_locations(nir_shader *producer, nir_shader *consumer)
{
   assert(producer);
   assert(consumer);

   uint64_t producer_output_mask = 0;
   uint64_t producer_patch_output_mask = 0;

   nir_foreach_shader_out_variable(variable, producer) {
      uint64_t mask = get_linked_variable_io_mask(variable, producer->info.stage);
      uint64_t loc = get_linked_variable_location(variable->data.location, variable->data.patch);

      if (variable->data.patch)
         producer_patch_output_mask |= mask << loc;
      else
         producer_output_mask |= mask << loc;
   }

   uint64_t consumer_input_mask = 0;
   uint64_t consumer_patch_input_mask = 0;

   nir_foreach_shader_in_variable(variable, consumer) {
      uint64_t mask = get_linked_variable_io_mask(variable, consumer->info.stage);
      uint64_t loc = get_linked_variable_location(variable->data.location, variable->data.patch);

      if (variable->data.patch)
         consumer_patch_input_mask |= mask << loc;
      else
         consumer_input_mask |= mask << loc;
   }

   uint64_t io_mask = producer_output_mask | consumer_input_mask;
   uint64_t patch_io_mask = producer_patch_output_mask | consumer_patch_input_mask;

   nir_foreach_shader_out_variable(variable, producer) {
      uint64_t loc = get_linked_variable_location(variable->data.location, variable->data.patch);

      if (variable->data.patch)
         variable->data.driver_location = util_bitcount64(patch_io_mask & u_bit_consecutive64(0, loc));
      else
         variable->data.driver_location = util_bitcount64(io_mask & u_bit_consecutive64(0, loc));
   }

   nir_foreach_shader_in_variable(variable, consumer) {
      uint64_t loc = get_linked_variable_location(variable->data.location, variable->data.patch);

      if (variable->data.patch)
         variable->data.driver_location = util_bitcount64(patch_io_mask & u_bit_consecutive64(0, loc));
      else
         variable->data.driver_location = util_bitcount64(io_mask & u_bit_consecutive64(0, loc));
   }

   nir_linked_io_var_info result = {
      .num_linked_io_vars = util_bitcount64(io_mask),
      .num_linked_patch_io_vars = util_bitcount64(patch_io_mask),
   };

   return result;
}
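
/* Usage sketch (hypothetical caller): for a TCS/TES pair a driver might do
 *
 *    nir_linked_io_var_info io =
 *       nir_assign_linked_io_var_locations(tcs, tes);
 *
 * and use io.num_linked_io_vars / io.num_linked_patch_io_vars to size the
 * per-vertex and per-patch slot spaces shared by the two stages.
 */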