Path: blob/21.2-virgl/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
4574 views
/*1* Copyright (C) 2014 Rob Clark <[email protected]>2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,19* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE20* SOFTWARE.21*22* Authors:23* Rob Clark <[email protected]>24*/2526#include "pipe/p_screen.h"27#include "pipe/p_state.h"28#include "tgsi/tgsi_dump.h"29#include "tgsi/tgsi_parse.h"30#include "util/format/u_format.h"31#include "util/u_inlines.h"32#include "util/u_memory.h"33#include "util/u_string.h"3435#include "nir/tgsi_to_nir.h"3637#include "freedreno_context.h"38#include "freedreno_util.h"3940#include "ir3/ir3_cache.h"41#include "ir3/ir3_compiler.h"42#include "ir3/ir3_gallium.h"43#include "ir3/ir3_nir.h"44#include "ir3/ir3_shader.h"4546/**47* The hardware cso for shader state48*49* Initially just a container for the ir3_shader, but this is where we'll50* plumb in async compile.51*/52struct ir3_shader_state {53struct ir3_shader *shader;5455/* Fence signalled when async compile is completed: */56struct util_queue_fence ready;57};5859/**60* Should initial variants be compiled synchronously?61*62* The only case where pipe_debug_message() is used in the initial-variants63* path is with FD_MESA_DEBUG=shaderdb. So if either debug is disabled (ie.64* debug.debug_message==NULL), or shaderdb stats are not enabled, we can65* compile the initial shader variant asynchronously.66*/67static bool68initial_variants_synchronous(struct fd_context *ctx)69{70return unlikely(ctx->debug.debug_message) || FD_DBG(SHADERDB) ||71FD_DBG(SERIALC);72}7374static void75dump_shader_info(struct ir3_shader_variant *v,76struct pipe_debug_callback *debug)77{78if (!FD_DBG(SHADERDB))79return;8081pipe_debug_message(82debug, SHADER_INFO,83"%s shader: %u inst, %u nops, %u non-nops, %u mov, %u cov, "84"%u dwords, %u last-baryf, %u half, %u full, %u constlen, "85"%u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7, "86"%u sstall, %u (ss), %u (sy), %d waves, %d max_sun, %d loops\n",87ir3_shader_stage(v), v->info.instrs_count, v->info.nops_count,88v->info.instrs_count - v->info.nops_count, v->info.mov_count,89v->info.cov_count, v->info.sizedwords, v->info.last_baryf,90v->info.max_half_reg + 1, v->info.max_reg + 1, v->constlen,91v->info.instrs_per_cat[0], v->info.instrs_per_cat[1],92v->info.instrs_per_cat[2], v->info.instrs_per_cat[3],93v->info.instrs_per_cat[4], v->info.instrs_per_cat[5],94v->info.instrs_per_cat[6], v->info.instrs_per_cat[7], v->info.sstall,95v->info.ss, v->info.sy, v->info.max_waves, v->max_sun, v->loops);96}9798static void99upload_shader_variant(struct ir3_shader_variant *v)100{101struct shader_info *info = &v->shader->nir->info;102struct ir3_compiler *compiler = v->shader->compiler;103104assert(!v->bo);105106v->bo =107fd_bo_new(compiler->dev, v->info.size, 0,108"%s:%s", ir3_shader_stage(v), info->name);109110/* Always include shaders in kernel crash dumps. */111fd_bo_mark_for_dump(v->bo);112113memcpy(fd_bo_map(v->bo), v->bin, v->info.size);114}115116struct ir3_shader_variant *117ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key,118bool binning_pass, struct pipe_debug_callback *debug)119{120struct ir3_shader_variant *v;121bool created = false;122123/* Some shader key values may not be used by a given ir3_shader (for124* example, fragment shader saturates in the vertex shader), so clean out125* those flags to avoid recompiling.126*/127ir3_key_clear_unused(&key, shader);128129v = ir3_shader_get_variant(shader, &key, binning_pass, false, &created);130131if (created) {132if (shader->initial_variants_done) {133pipe_debug_message(debug, SHADER_INFO,134"%s shader: recompiling at draw time: global "135"0x%08x, vfsamples %x/%x, astc %x/%x\n",136ir3_shader_stage(v), key.global, key.vsamples,137key.fsamples, key.vastc_srgb, key.fastc_srgb);138}139140dump_shader_info(v, debug);141upload_shader_variant(v);142143if (v->binning) {144upload_shader_variant(v->binning);145dump_shader_info(v->binning, debug);146}147}148149return v;150}151152static void153copy_stream_out(struct ir3_stream_output_info *i,154const struct pipe_stream_output_info *p)155{156STATIC_ASSERT(ARRAY_SIZE(i->stride) == ARRAY_SIZE(p->stride));157STATIC_ASSERT(ARRAY_SIZE(i->output) == ARRAY_SIZE(p->output));158159i->num_outputs = p->num_outputs;160for (int n = 0; n < ARRAY_SIZE(i->stride); n++)161i->stride[n] = p->stride[n];162163for (int n = 0; n < ARRAY_SIZE(i->output); n++) {164i->output[n].register_index = p->output[n].register_index;165i->output[n].start_component = p->output[n].start_component;166i->output[n].num_components = p->output[n].num_components;167i->output[n].output_buffer = p->output[n].output_buffer;168i->output[n].dst_offset = p->output[n].dst_offset;169i->output[n].stream = p->output[n].stream;170}171}172173static void174create_initial_variants(struct ir3_shader_state *hwcso,175struct pipe_debug_callback *debug)176{177struct ir3_shader *shader = hwcso->shader;178struct ir3_compiler *compiler = shader->compiler;179nir_shader *nir = shader->nir;180181/* Compile standard variants immediately to try to avoid draw-time stalls182* to run the compiler.183*/184struct ir3_shader_key key = {185.tessellation = IR3_TESS_NONE,186.ucp_enables = MASK(nir->info.clip_distance_array_size),187.msaa = true,188};189190switch (nir->info.stage) {191case MESA_SHADER_TESS_EVAL:192key.tessellation = ir3_tess_mode(nir->info.tess.primitive_mode);193break;194195case MESA_SHADER_TESS_CTRL:196/* The primitive_mode field, while it exists for TCS, is not197* populated (since separable shaders between TCS/TES are legal,198* so TCS wouldn't have access to TES's declaration). Make a199* guess so that we shader-db something plausible for TCS.200*/201if (nir->info.outputs_written & VARYING_BIT_TESS_LEVEL_INNER)202key.tessellation = IR3_TESS_TRIANGLES;203else204key.tessellation = IR3_TESS_ISOLINES;205break;206207case MESA_SHADER_GEOMETRY:208key.has_gs = true;209break;210211default:212break;213}214215key.safe_constlen = false;216struct ir3_shader_variant *v = ir3_shader_variant(shader, key, false, debug);217if (!v)218return;219220if (v->constlen > compiler->max_const_safe) {221key.safe_constlen = true;222ir3_shader_variant(shader, key, false, debug);223}224225/* For vertex shaders, also compile initial binning pass shader: */226if (nir->info.stage == MESA_SHADER_VERTEX) {227key.safe_constlen = false;228v = ir3_shader_variant(shader, key, true, debug);229if (!v)230return;231232if (v->constlen > compiler->max_const_safe) {233key.safe_constlen = true;234ir3_shader_variant(shader, key, true, debug);235}236}237238shader->initial_variants_done = true;239}240241static void242create_initial_variants_async(void *job, void *gdata, int thread_index)243{244struct ir3_shader_state *hwcso = job;245struct pipe_debug_callback debug = {};246247create_initial_variants(hwcso, &debug);248}249250static void251create_initial_compute_variants_async(void *job, void *gdata, int thread_index)252{253struct ir3_shader_state *hwcso = job;254struct ir3_shader *shader = hwcso->shader;255struct pipe_debug_callback debug = {};256static struct ir3_shader_key key; /* static is implicitly zeroed */257258ir3_shader_variant(shader, key, false, &debug);259shader->initial_variants_done = true;260}261262/* a bit annoying that compute-shader and normal shader state objects263* aren't a bit more aligned.264*/265void *266ir3_shader_compute_state_create(struct pipe_context *pctx,267const struct pipe_compute_state *cso)268{269struct fd_context *ctx = fd_context(pctx);270271/* req_input_mem will only be non-zero for cl kernels (ie. clover).272* This isn't a perfect test because I guess it is possible (but273* uncommon) for none for the kernel parameters to be a global,274* but ctx->set_global_bindings() can't fail, so this is the next275* best place to fail if we need a newer version of kernel driver:276*/277if ((cso->req_input_mem > 0) &&278fd_device_version(ctx->dev) < FD_VERSION_BO_IOVA) {279return NULL;280}281282struct ir3_compiler *compiler = ctx->screen->compiler;283nir_shader *nir;284285if (cso->ir_type == PIPE_SHADER_IR_NIR) {286/* we take ownership of the reference: */287nir = (nir_shader *)cso->prog;288} else {289debug_assert(cso->ir_type == PIPE_SHADER_IR_TGSI);290if (ir3_shader_debug & IR3_DBG_DISASM) {291tgsi_dump(cso->prog, 0);292}293nir = tgsi_to_nir(cso->prog, pctx->screen, false);294}295296struct ir3_shader *shader = ir3_shader_from_nir(compiler, nir, 0, NULL);297struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));298299util_queue_fence_init(&hwcso->ready);300hwcso->shader = shader;301302/* Immediately compile a standard variant. We have so few variants in our303* shaders, that doing so almost eliminates draw-time recompiles. (This304* is also how we get data from shader-db's ./run)305*/306307if (initial_variants_synchronous(ctx)) {308static struct ir3_shader_key key; /* static is implicitly zeroed */309ir3_shader_variant(shader, key, false, &ctx->debug);310shader->initial_variants_done = true;311} else {312struct fd_screen *screen = ctx->screen;313util_queue_add_job(&screen->compile_queue, hwcso, &hwcso->ready,314create_initial_compute_variants_async, NULL, 0);315}316317return hwcso;318}319320void *321ir3_shader_state_create(struct pipe_context *pctx,322const struct pipe_shader_state *cso)323{324struct fd_context *ctx = fd_context(pctx);325struct ir3_compiler *compiler = ctx->screen->compiler;326struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));327328/*329* Convert to nir (if necessary):330*/331332nir_shader *nir;333if (cso->type == PIPE_SHADER_IR_NIR) {334/* we take ownership of the reference: */335nir = cso->ir.nir;336} else {337debug_assert(cso->type == PIPE_SHADER_IR_TGSI);338if (ir3_shader_debug & IR3_DBG_DISASM) {339tgsi_dump(cso->tokens, 0);340}341nir = tgsi_to_nir(cso->tokens, pctx->screen, false);342}343344/*345* Create ir3_shader:346*347* This part is cheap, it doesn't compile initial variants348*/349350struct ir3_stream_output_info stream_output = {};351copy_stream_out(&stream_output, &cso->stream_output);352353hwcso->shader = ir3_shader_from_nir(compiler, nir, 0, &stream_output);354355/*356* Create initial variants to avoid draw-time stalls. This is357* normally done asynchronously, unless debug is enabled (which358* will be the case for shader-db)359*/360361util_queue_fence_init(&hwcso->ready);362363if (initial_variants_synchronous(ctx)) {364create_initial_variants(hwcso, &ctx->debug);365} else {366util_queue_add_job(&ctx->screen->compile_queue, hwcso, &hwcso->ready,367create_initial_variants_async, NULL, 0);368}369370return hwcso;371}372373void374ir3_shader_state_delete(struct pipe_context *pctx, void *_hwcso)375{376struct fd_context *ctx = fd_context(pctx);377struct fd_screen *screen = ctx->screen;378struct ir3_shader_state *hwcso = _hwcso;379struct ir3_shader *so = hwcso->shader;380381ir3_cache_invalidate(ctx->shader_cache, hwcso);382383/* util_queue_drop_job() guarantees that either:384* 1) job did not execute385* 2) job completed386*387* In either case the fence is signaled388*/389util_queue_drop_job(&screen->compile_queue, &hwcso->ready);390391/* free the uploaded shaders, since this is handled outside of the392* shared ir3 code (ie. not used by turnip):393*/394for (struct ir3_shader_variant *v = so->variants; v; v = v->next) {395fd_bo_del(v->bo);396v->bo = NULL;397398if (v->binning && v->binning->bo) {399fd_bo_del(v->binning->bo);400v->binning->bo = NULL;401}402}403404ir3_shader_destroy(so);405util_queue_fence_destroy(&hwcso->ready);406free(hwcso);407}408409struct ir3_shader *410ir3_get_shader(struct ir3_shader_state *hwcso)411{412if (!hwcso)413return NULL;414415struct ir3_shader *shader = hwcso->shader;416perf_time (1000, "waited for %s:%s:%s variants",417_mesa_shader_stage_to_abbrev(shader->type),418shader->nir->info.name,419shader->nir->info.label) {420/* wait for initial variants to compile: */421util_queue_fence_wait(&hwcso->ready);422}423424return shader;425}426427struct shader_info *428ir3_get_shader_info(struct ir3_shader_state *hwcso)429{430if (!hwcso)431return NULL;432return &hwcso->shader->nir->info;433}434435/* fixup dirty shader state in case some "unrelated" (from the state-436* tracker's perspective) state change causes us to switch to a437* different variant.438*/439void440ir3_fixup_shader_state(struct pipe_context *pctx, struct ir3_shader_key *key)441{442struct fd_context *ctx = fd_context(pctx);443444if (!ir3_shader_key_equal(ctx->last.key, key)) {445if (ir3_shader_key_changes_fs(ctx->last.key, key)) {446fd_context_dirty_shader(ctx, PIPE_SHADER_FRAGMENT,447FD_DIRTY_SHADER_PROG);448}449450if (ir3_shader_key_changes_vs(ctx->last.key, key)) {451fd_context_dirty_shader(ctx, PIPE_SHADER_VERTEX, FD_DIRTY_SHADER_PROG);452}453454/* NOTE: currently only a6xx has gs/tess, but needs no455* gs/tess specific lowering.456*/457458*ctx->last.key = *key;459}460}461462static void463ir3_screen_finalize_nir(struct pipe_screen *pscreen, void *nir, bool optimize)464{465struct fd_screen *screen = fd_screen(pscreen);466467ir3_nir_lower_io_to_temporaries(nir);468ir3_finalize_nir(screen->compiler, nir);469}470471static void472ir3_set_max_shader_compiler_threads(struct pipe_screen *pscreen,473unsigned max_threads)474{475struct fd_screen *screen = fd_screen(pscreen);476477/* This function doesn't allow a greater number of threads than478* the queue had at its creation.479*/480util_queue_adjust_num_threads(&screen->compile_queue, max_threads);481}482483static bool484ir3_is_parallel_shader_compilation_finished(struct pipe_screen *pscreen,485void *shader,486enum pipe_shader_type shader_type)487{488struct ir3_shader_state *hwcso = (struct ir3_shader_state *)shader;489490return util_queue_fence_is_signalled(&hwcso->ready);491}492493void494ir3_prog_init(struct pipe_context *pctx)495{496pctx->create_vs_state = ir3_shader_state_create;497pctx->delete_vs_state = ir3_shader_state_delete;498499pctx->create_tcs_state = ir3_shader_state_create;500pctx->delete_tcs_state = ir3_shader_state_delete;501502pctx->create_tes_state = ir3_shader_state_create;503pctx->delete_tes_state = ir3_shader_state_delete;504505pctx->create_gs_state = ir3_shader_state_create;506pctx->delete_gs_state = ir3_shader_state_delete;507508pctx->create_fs_state = ir3_shader_state_create;509pctx->delete_fs_state = ir3_shader_state_delete;510}511512void513ir3_screen_init(struct pipe_screen *pscreen)514{515struct fd_screen *screen = fd_screen(pscreen);516517screen->compiler = ir3_compiler_create(screen->dev, screen->gpu_id, false);518519/* TODO do we want to limit things to # of fast cores, or just limit520* based on total # of both big and little cores. The little cores521* tend to be in-order and probably much slower for compiling than522* big cores. OTOH if they are sitting idle, maybe it is useful to523* use them?524*/525unsigned num_threads = sysconf(_SC_NPROCESSORS_ONLN) - 1;526527util_queue_init(&screen->compile_queue, "ir3q", 64, num_threads,528UTIL_QUEUE_INIT_RESIZE_IF_FULL |529UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL);530531pscreen->finalize_nir = ir3_screen_finalize_nir;532pscreen->set_max_shader_compiler_threads =533ir3_set_max_shader_compiler_threads;534pscreen->is_parallel_shader_compilation_finished =535ir3_is_parallel_shader_compilation_finished;536}537538void539ir3_screen_fini(struct pipe_screen *pscreen)540{541struct fd_screen *screen = fd_screen(pscreen);542543util_queue_destroy(&screen->compile_queue);544ir3_compiler_destroy(screen->compiler);545screen->compiler = NULL;546}547548void549ir3_update_max_tf_vtx(struct fd_context *ctx,550const struct ir3_shader_variant *v)551{552struct fd_streamout_stateobj *so = &ctx->streamout;553struct ir3_stream_output_info *info = &v->shader->stream_output;554uint32_t maxvtxcnt = 0x7fffffff;555556if (v->shader->stream_output.num_outputs == 0)557ctx->streamout.max_tf_vtx = 0;558if (so->num_targets == 0)559ctx->streamout.max_tf_vtx = 0;560561/* offset to write to is:562*563* total_vtxcnt = vtxcnt + offsets[i]564* offset = total_vtxcnt * stride[i]565*566* offset = vtxcnt * stride[i] ; calculated in shader567* + offsets[i] * stride[i] ; calculated at emit_tfbos()568*569* assuming for each vtx, each target buffer will have data written570* up to 'offset + stride[i]', that leaves maxvtxcnt as:571*572* buffer_size = (maxvtxcnt * stride[i]) + stride[i]573* maxvtxcnt = (buffer_size - stride[i]) / stride[i]574*575* but shader is actually doing a less-than (rather than less-than-576* equal) check, so we can drop the -stride[i].577*578* TODO is assumption about `offset + stride[i]` legit?579*/580for (unsigned i = 0; i < so->num_targets; i++) {581struct pipe_stream_output_target *target = so->targets[i];582unsigned stride = info->stride[i] * 4; /* convert dwords->bytes */583if (target) {584uint32_t max = target->buffer_size / stride;585maxvtxcnt = MIN2(maxvtxcnt, max);586}587}588589ctx->streamout.max_tf_vtx = maxvtxcnt;590}591592593