/* Path: blob/21.2-virgl/src/gallium/drivers/nouveau/nv50/nv50_program.c */
/* 4574 views */
/*1* Copyright 2010 Christoph Bumiller2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice shall be included in11* all copies or substantial portions of the Software.12*13* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR14* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,15* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL16* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR17* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,18* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR19* OTHER DEALINGS IN THE SOFTWARE.20*/2122#include "pipe/p_defines.h"2324#include "compiler/nir/nir.h"2526#include "nv50/nv50_context.h"27#include "nv50/nv50_program.h"2829#include "codegen/nv50_ir_driver.h"3031static inline unsigned32bitcount4(const uint32_t val)33{34static const uint8_t cnt[16]35= { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };36return cnt[val & 0xf];37}3839static int40nv50_vertprog_assign_slots(struct nv50_ir_prog_info_out *info)41{42struct nv50_program *prog = (struct nv50_program *)info->driverPriv;43unsigned i, n, c;4445n = 0;46for (i = 0; i < info->numInputs; ++i) {47prog->in[i].id = i;48prog->in[i].sn = info->in[i].sn;49prog->in[i].si = info->in[i].si;50prog->in[i].hw = n;51prog->in[i].mask = info->in[i].mask;5253prog->vp.attrs[(4 * i) / 32] |= info->in[i].mask << ((4 * i) % 32);5455for (c = 0; c < 4; ++c)56if (info->in[i].mask & (1 << c))57info->in[i].slot[c] = n++;5859if 
(info->in[i].sn == TGSI_SEMANTIC_PRIMID)60prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID;61}62prog->in_nr = info->numInputs;6364for (i = 0; i < info->numSysVals; ++i) {65switch (info->sv[i].sn) {66case TGSI_SEMANTIC_INSTANCEID:67prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_INSTANCE_ID;68continue;69case TGSI_SEMANTIC_VERTEXID:70prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID;71prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID_DRAW_ARRAYS_ADD_START;72continue;73default:74break;75}76}7778/*79* Corner case: VP has no inputs, but we will still need to submit data to80* draw it. HW will shout at us and won't draw anything if we don't enable81* any input, so let's just pretend it's the first one.82*/83if (prog->vp.attrs[0] == 0 &&84prog->vp.attrs[1] == 0 &&85prog->vp.attrs[2] == 0)86prog->vp.attrs[0] |= 0xf;8788/* VertexID before InstanceID */89if (info->io.vertexId < info->numSysVals)90info->sv[info->io.vertexId].slot[0] = n++;91if (info->io.instanceId < info->numSysVals)92info->sv[info->io.instanceId].slot[0] = n++;9394n = 0;95for (i = 0; i < info->numOutputs; ++i) {96switch (info->out[i].sn) {97case TGSI_SEMANTIC_PSIZE:98prog->vp.psiz = i;99break;100case TGSI_SEMANTIC_CLIPDIST:101prog->vp.clpd[info->out[i].si] = n;102break;103case TGSI_SEMANTIC_EDGEFLAG:104prog->vp.edgeflag = i;105break;106case TGSI_SEMANTIC_BCOLOR:107prog->vp.bfc[info->out[i].si] = i;108break;109case TGSI_SEMANTIC_LAYER:110prog->gp.has_layer = true;111prog->gp.layerid = n;112break;113case TGSI_SEMANTIC_VIEWPORT_INDEX:114prog->gp.has_viewport = true;115prog->gp.viewportid = n;116break;117default:118break;119}120prog->out[i].id = i;121prog->out[i].sn = info->out[i].sn;122prog->out[i].si = info->out[i].si;123prog->out[i].hw = n;124prog->out[i].mask = info->out[i].mask;125126for (c = 0; c < 4; ++c)127if (info->out[i].mask & (1 << c))128info->out[i].slot[c] = n++;129}130prog->out_nr = info->numOutputs;131prog->max_out = n;132if (!prog->max_out)133prog->max_out 
= 1;134135if (prog->vp.psiz < info->numOutputs)136prog->vp.psiz = prog->out[prog->vp.psiz].hw;137138return 0;139}140141static int142nv50_fragprog_assign_slots(struct nv50_ir_prog_info_out *info)143{144struct nv50_program *prog = (struct nv50_program *)info->driverPriv;145unsigned i, n, m, c;146unsigned nvary;147unsigned nflat;148unsigned nintp = 0;149150/* count recorded non-flat inputs */151for (m = 0, i = 0; i < info->numInputs; ++i) {152switch (info->in[i].sn) {153case TGSI_SEMANTIC_POSITION:154continue;155default:156m += info->in[i].flat ? 0 : 1;157break;158}159}160/* careful: id may be != i in info->in[prog->in[i].id] */161162/* Fill prog->in[] so that non-flat inputs are first and163* kick out special inputs that don't use the RESULT_MAP.164*/165for (n = 0, i = 0; i < info->numInputs; ++i) {166if (info->in[i].sn == TGSI_SEMANTIC_POSITION) {167prog->fp.interp |= info->in[i].mask << 24;168for (c = 0; c < 4; ++c)169if (info->in[i].mask & (1 << c))170info->in[i].slot[c] = nintp++;171} else {172unsigned j = info->in[i].flat ? m++ : n++;173174if (info->in[i].sn == TGSI_SEMANTIC_COLOR)175prog->vp.bfc[info->in[i].si] = j;176else if (info->in[i].sn == TGSI_SEMANTIC_PRIMID)177prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID;178179prog->in[j].id = i;180prog->in[j].mask = info->in[i].mask;181prog->in[j].sn = info->in[i].sn;182prog->in[j].si = info->in[i].si;183prog->in[j].linear = info->in[i].linear;184185prog->in_nr++;186}187}188if (!(prog->fp.interp & (8 << 24))) {189++nintp;190prog->fp.interp |= 8 << 24;191}192193for (i = 0; i < prog->in_nr; ++i) {194int j = prog->in[i].id;195196prog->in[i].hw = nintp;197for (c = 0; c < 4; ++c)198if (prog->in[i].mask & (1 << c))199info->in[j].slot[c] = nintp++;200}201/* (n == m) if m never increased, i.e. no flat inputs */202nflat = (n < m) ? 
(nintp - prog->in[n].hw) : 0;203nintp -= bitcount4(prog->fp.interp >> 24); /* subtract position inputs */204nvary = nintp - nflat;205206prog->fp.interp |= nvary << NV50_3D_FP_INTERPOLANT_CTRL_COUNT_NONFLAT__SHIFT;207prog->fp.interp |= nintp << NV50_3D_FP_INTERPOLANT_CTRL_COUNT__SHIFT;208209/* put front/back colors right after HPOS */210prog->fp.colors = 4 << NV50_3D_SEMANTIC_COLOR_FFC0_ID__SHIFT;211for (i = 0; i < 2; ++i)212if (prog->vp.bfc[i] < 0xff)213prog->fp.colors += bitcount4(prog->in[prog->vp.bfc[i]].mask) << 16;214215/* FP outputs */216217if (info->prop.fp.numColourResults > 1)218prog->fp.flags[0] |= NV50_3D_FP_CONTROL_MULTIPLE_RESULTS;219220for (i = 0; i < info->numOutputs; ++i) {221prog->out[i].id = i;222prog->out[i].sn = info->out[i].sn;223prog->out[i].si = info->out[i].si;224prog->out[i].mask = info->out[i].mask;225226if (i == info->io.fragDepth || i == info->io.sampleMask)227continue;228prog->out[i].hw = info->out[i].si * 4;229230for (c = 0; c < 4; ++c)231info->out[i].slot[c] = prog->out[i].hw + c;232233prog->max_out = MAX2(prog->max_out, prog->out[i].hw + 4);234}235236if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS) {237info->out[info->io.sampleMask].slot[0] = prog->max_out++;238prog->fp.has_samplemask = 1;239}240241if (info->io.fragDepth < PIPE_MAX_SHADER_OUTPUTS)242info->out[info->io.fragDepth].slot[2] = prog->max_out++;243244if (!prog->max_out)245prog->max_out = 4;246247return 0;248}249250static int251nv50_program_assign_varying_slots(struct nv50_ir_prog_info_out *info)252{253switch (info->type) {254case PIPE_SHADER_VERTEX:255return nv50_vertprog_assign_slots(info);256case PIPE_SHADER_GEOMETRY:257return nv50_vertprog_assign_slots(info);258case PIPE_SHADER_FRAGMENT:259return nv50_fragprog_assign_slots(info);260case PIPE_SHADER_COMPUTE:261return 0;262default:263return -1;264}265}266267static struct nv50_stream_output_state *268nv50_program_create_strmout_state(const struct nv50_ir_prog_info_out *info,269const struct pipe_stream_output_info 
*pso)270{271struct nv50_stream_output_state *so;272unsigned b, i, c;273unsigned base[4];274275so = MALLOC_STRUCT(nv50_stream_output_state);276if (!so)277return NULL;278memset(so->map, 0xff, sizeof(so->map));279280for (b = 0; b < 4; ++b)281so->num_attribs[b] = 0;282for (i = 0; i < pso->num_outputs; ++i) {283unsigned end = pso->output[i].dst_offset + pso->output[i].num_components;284b = pso->output[i].output_buffer;285assert(b < 4);286so->num_attribs[b] = MAX2(so->num_attribs[b], end);287}288289so->ctrl = NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED;290291so->stride[0] = pso->stride[0] * 4;292base[0] = 0;293for (b = 1; b < 4; ++b) {294assert(!so->num_attribs[b] || so->num_attribs[b] == pso->stride[b]);295so->stride[b] = so->num_attribs[b] * 4;296if (so->num_attribs[b])297so->ctrl = (b + 1) << NV50_3D_STRMOUT_BUFFERS_CTRL_SEPARATE__SHIFT;298base[b] = align(base[b - 1] + so->num_attribs[b - 1], 4);299}300if (so->ctrl & NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED) {301assert(so->stride[0] < NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__MAX);302so->ctrl |= so->stride[0] << NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__SHIFT;303}304305so->map_size = base[3] + so->num_attribs[3];306307for (i = 0; i < pso->num_outputs; ++i) {308const unsigned s = pso->output[i].start_component;309const unsigned p = pso->output[i].dst_offset;310const unsigned r = pso->output[i].register_index;311b = pso->output[i].output_buffer;312313if (r >= info->numOutputs)314continue;315316for (c = 0; c < pso->output[i].num_components; ++c)317so->map[base[b] + p + c] = info->out[r].slot[s + c];318}319320return so;321}322323bool324nv50_program_translate(struct nv50_program *prog, uint16_t chipset,325struct pipe_debug_callback *debug)326{327struct nv50_ir_prog_info *info;328struct nv50_ir_prog_info_out info_out = {};329int i, ret;330const uint8_t map_undef = (prog->type == PIPE_SHADER_VERTEX) ? 
0x40 : 0x80;331332info = CALLOC_STRUCT(nv50_ir_prog_info);333if (!info)334return false;335336info->type = prog->type;337info->target = chipset;338339info->bin.sourceRep = prog->pipe.type;340switch (prog->pipe.type) {341case PIPE_SHADER_IR_TGSI:342info->bin.source = (void *)prog->pipe.tokens;343break;344case PIPE_SHADER_IR_NIR:345info->bin.source = (void *)nir_shader_clone(NULL, prog->pipe.ir.nir);346break;347default:348assert(!"unsupported IR!");349free(info);350return false;351}352353info->bin.smemSize = prog->cp.smem_size;354info->io.auxCBSlot = 15;355info->io.ucpBase = NV50_CB_AUX_UCP_OFFSET;356info->io.genUserClip = prog->vp.clpd_nr;357if (prog->fp.alphatest)358info->io.alphaRefBase = NV50_CB_AUX_ALPHATEST_OFFSET;359360info->io.suInfoBase = NV50_CB_AUX_TEX_MS_OFFSET;361info->io.bufInfoBase = NV50_CB_AUX_BUF_INFO(0);362info->io.sampleInfoBase = NV50_CB_AUX_SAMPLE_OFFSET;363info->io.msInfoCBSlot = 15;364info->io.msInfoBase = NV50_CB_AUX_MS_OFFSET;365366info->io.membarOffset = NV50_CB_AUX_MEMBAR_OFFSET;367info->io.gmemMembar = 15;368369info->assignSlots = nv50_program_assign_varying_slots;370371prog->vp.bfc[0] = 0xff;372prog->vp.bfc[1] = 0xff;373prog->vp.edgeflag = 0xff;374prog->vp.clpd[0] = map_undef;375prog->vp.clpd[1] = map_undef;376prog->vp.psiz = map_undef;377prog->gp.has_layer = 0;378prog->gp.has_viewport = 0;379380if (prog->type == PIPE_SHADER_COMPUTE)381info->prop.cp.inputOffset = 0x14;382383info_out.driverPriv = prog;384385#ifndef NDEBUG386info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3);387info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0);388info->omitLineNum = debug_get_num_option("NV50_PROG_DEBUG_OMIT_LINENUM", 0);389#else390info->optLevel = 3;391#endif392393ret = nv50_ir_generate_code(info, &info_out);394if (ret) {395NOUVEAU_ERR("shader translation failed: %i\n", ret);396goto out;397}398399prog->code = info_out.bin.code;400prog->code_size = info_out.bin.codeSize;401prog->fixups = info_out.bin.relocData;402prog->interps = 
info_out.bin.fixupData;403prog->max_gpr = MAX2(4, (info_out.bin.maxGPR >> 1) + 1);404prog->tls_space = info_out.bin.tlsSpace;405prog->cp.smem_size = info_out.bin.smemSize;406prog->mul_zero_wins = info->io.mul_zero_wins;407prog->vp.need_vertex_id = info_out.io.vertexId < PIPE_MAX_SHADER_INPUTS;408409prog->vp.clip_enable = (1 << info_out.io.clipDistances) - 1;410prog->vp.cull_enable =411((1 << info_out.io.cullDistances) - 1) << info_out.io.clipDistances;412prog->vp.clip_mode = 0;413for (i = 0; i < info_out.io.cullDistances; ++i)414prog->vp.clip_mode |= 1 << ((info_out.io.clipDistances + i) * 4);415416if (prog->type == PIPE_SHADER_FRAGMENT) {417if (info_out.prop.fp.writesDepth) {418prog->fp.flags[0] |= NV50_3D_FP_CONTROL_EXPORTS_Z;419prog->fp.flags[1] = 0x11;420}421if (info_out.prop.fp.usesDiscard)422prog->fp.flags[0] |= NV50_3D_FP_CONTROL_USES_KIL;423} else424if (prog->type == PIPE_SHADER_GEOMETRY) {425switch (info_out.prop.gp.outputPrim) {426case PIPE_PRIM_LINE_STRIP:427prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_LINE_STRIP;428break;429case PIPE_PRIM_TRIANGLE_STRIP:430prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_TRIANGLE_STRIP;431break;432case PIPE_PRIM_POINTS:433default:434assert(info_out.prop.gp.outputPrim == PIPE_PRIM_POINTS);435prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_POINTS;436break;437}438prog->gp.vert_count = CLAMP(info_out.prop.gp.maxVertices, 1, 1024);439} else440if (prog->type == PIPE_SHADER_COMPUTE) {441for (i = 0; i < NV50_MAX_GLOBALS; i++) {442prog->cp.gmem[i] = (struct nv50_gmem_state){443.valid = info_out.prop.cp.gmem[i].valid,444.image = info_out.prop.cp.gmem[i].image,445.slot = info_out.prop.cp.gmem[i].slot446};447}448}449450if (prog->pipe.stream_output.num_outputs)451prog->so = nv50_program_create_strmout_state(&info_out,452&prog->pipe.stream_output);453454pipe_debug_message(debug, SHADER_INFO,455"type: %d, local: %d, shared: %d, gpr: %d, inst: %d, bytes: %d",456prog->type, info_out.bin.tlsSpace, 
info_out.bin.smemSize,457prog->max_gpr, info_out.bin.instructions,458info_out.bin.codeSize);459460out:461if (info->bin.sourceRep == PIPE_SHADER_IR_NIR)462ralloc_free((void *)info->bin.source);463FREE(info);464return !ret;465}466467bool468nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)469{470struct nouveau_heap *heap;471int ret;472uint32_t size = align(prog->code_size, 0x40);473uint8_t prog_type;474475switch (prog->type) {476case PIPE_SHADER_VERTEX: heap = nv50->screen->vp_code_heap; break;477case PIPE_SHADER_GEOMETRY: heap = nv50->screen->gp_code_heap; break;478case PIPE_SHADER_FRAGMENT: heap = nv50->screen->fp_code_heap; break;479case PIPE_SHADER_COMPUTE: heap = nv50->screen->fp_code_heap; break;480default:481assert(!"invalid program type");482return false;483}484485ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);486if (ret) {487/* Out of space: evict everything to compactify the code segment, hoping488* the working set is much smaller and drifts slowly. Improve me !489*/490while (heap->next) {491struct nv50_program *evict = heap->next->priv;492if (evict)493nouveau_heap_free(&evict->mem);494}495debug_printf("WARNING: out of code space, evicting all shaders.\n");496ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);497if (ret) {498NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size);499return false;500}501}502503if (prog->type == PIPE_SHADER_COMPUTE) {504/* CP code must be uploaded in FP code segment. 
*/505prog_type = 1;506} else {507prog->code_base = prog->mem->start;508prog_type = prog->type;509}510511ret = nv50_tls_realloc(nv50->screen, prog->tls_space);512if (ret < 0) {513nouveau_heap_free(&prog->mem);514return false;515}516if (ret > 0)517nv50->state.new_tls_space = true;518519if (prog->fixups)520nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0);521if (prog->interps)522nv50_ir_apply_fixups(prog->interps, prog->code,523prog->fp.force_persample_interp,524false /* flatshade */,525prog->fp.alphatest - 1,526false /* msaa */);527528nv50_sifc_linear_u8(&nv50->base, nv50->screen->code,529(prog_type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base,530NOUVEAU_BO_VRAM, prog->code_size, prog->code);531532BEGIN_NV04(nv50->base.pushbuf, NV50_3D(CODE_CB_FLUSH), 1);533PUSH_DATA (nv50->base.pushbuf, 0);534535return true;536}537538void539nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)540{541const struct pipe_shader_state pipe = p->pipe;542const ubyte type = p->type;543544if (p->mem)545nouveau_heap_free(&p->mem);546547FREE(p->code);548549FREE(p->fixups);550FREE(p->interps);551FREE(p->so);552553memset(p, 0, sizeof(*p));554555p->pipe = pipe;556p->type = type;557}558559560