/* Path: blob/21.2-virgl/src/gallium/drivers/nouveau/nvc0/nvc0_program.c (4574 views) */
/*1* Copyright 2010 Christoph Bumiller2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice shall be included in11* all copies or substantial portions of the Software.12*13* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR14* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,15* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL16* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR17* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,18* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR19* OTHER DEALINGS IN THE SOFTWARE.20*/2122#include "pipe/p_defines.h"2324#include "compiler/nir/nir.h"25#include "tgsi/tgsi_ureg.h"26#include "util/blob.h"2728#include "nvc0/nvc0_context.h"2930#include "codegen/nv50_ir_driver.h"31#include "nvc0/nve4_compute.h"3233/* NOTE: Using a[0x270] in FP may cause an error even if we're using less than34* 124 scalar varying values.35*/36static uint32_t37nvc0_shader_input_address(unsigned sn, unsigned si)38{39switch (sn) {40case TGSI_SEMANTIC_TESSOUTER: return 0x000 + si * 0x4;41case TGSI_SEMANTIC_TESSINNER: return 0x010 + si * 0x4;42case TGSI_SEMANTIC_PATCH: return 0x020 + si * 0x10;43case TGSI_SEMANTIC_PRIMID: return 0x060;44case TGSI_SEMANTIC_LAYER: return 0x064;45case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;46case TGSI_SEMANTIC_PSIZE: return 0x06c;47case TGSI_SEMANTIC_POSITION: return 0x070;48case TGSI_SEMANTIC_GENERIC: return 0x080 + si * 0x10;49case TGSI_SEMANTIC_FOG: 
return 0x2e8;50case TGSI_SEMANTIC_COLOR: return 0x280 + si * 0x10;51case TGSI_SEMANTIC_BCOLOR: return 0x2a0 + si * 0x10;52case TGSI_SEMANTIC_CLIPDIST: return 0x2c0 + si * 0x10;53case TGSI_SEMANTIC_CLIPVERTEX: return 0x270;54case TGSI_SEMANTIC_PCOORD: return 0x2e0;55case TGSI_SEMANTIC_TESSCOORD: return 0x2f0;56case TGSI_SEMANTIC_INSTANCEID: return 0x2f8;57case TGSI_SEMANTIC_VERTEXID: return 0x2fc;58case TGSI_SEMANTIC_TEXCOORD: return 0x300 + si * 0x10;59default:60assert(!"invalid TGSI input semantic");61return ~0;62}63}6465static uint32_t66nvc0_shader_output_address(unsigned sn, unsigned si)67{68switch (sn) {69case TGSI_SEMANTIC_TESSOUTER: return 0x000 + si * 0x4;70case TGSI_SEMANTIC_TESSINNER: return 0x010 + si * 0x4;71case TGSI_SEMANTIC_PATCH: return 0x020 + si * 0x10;72case TGSI_SEMANTIC_PRIMID: return 0x060;73case TGSI_SEMANTIC_LAYER: return 0x064;74case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;75case TGSI_SEMANTIC_PSIZE: return 0x06c;76case TGSI_SEMANTIC_POSITION: return 0x070;77case TGSI_SEMANTIC_GENERIC: return 0x080 + si * 0x10;78case TGSI_SEMANTIC_FOG: return 0x2e8;79case TGSI_SEMANTIC_COLOR: return 0x280 + si * 0x10;80case TGSI_SEMANTIC_BCOLOR: return 0x2a0 + si * 0x10;81case TGSI_SEMANTIC_CLIPDIST: return 0x2c0 + si * 0x10;82case TGSI_SEMANTIC_CLIPVERTEX: return 0x270;83case TGSI_SEMANTIC_TEXCOORD: return 0x300 + si * 0x10;84case TGSI_SEMANTIC_VIEWPORT_MASK: return 0x3a0;85case TGSI_SEMANTIC_EDGEFLAG: return ~0;86default:87assert(!"invalid TGSI output semantic");88return ~0;89}90}9192static int93nvc0_vp_assign_input_slots(struct nv50_ir_prog_info_out *info)94{95unsigned i, c, n;9697for (n = 0, i = 0; i < info->numInputs; ++i) {98switch (info->in[i].sn) {99case TGSI_SEMANTIC_INSTANCEID: /* for SM4 only, in TGSI they're SVs */100case TGSI_SEMANTIC_VERTEXID:101info->in[i].mask = 0x1;102info->in[i].slot[0] =103nvc0_shader_input_address(info->in[i].sn, 0) / 4;104continue;105default:106break;107}108for (c = 0; c < 4; ++c)109info->in[i].slot[c] = (0x80 + n * 
0x10 + c * 0x4) / 4;110++n;111}112113return 0;114}115116static int117nvc0_sp_assign_input_slots(struct nv50_ir_prog_info_out *info)118{119unsigned offset;120unsigned i, c;121122for (i = 0; i < info->numInputs; ++i) {123offset = nvc0_shader_input_address(info->in[i].sn, info->in[i].si);124125for (c = 0; c < 4; ++c)126info->in[i].slot[c] = (offset + c * 0x4) / 4;127}128129return 0;130}131132static int133nvc0_fp_assign_output_slots(struct nv50_ir_prog_info_out *info)134{135unsigned count = info->prop.fp.numColourResults * 4;136unsigned i, c;137138/* Compute the relative position of each color output, since skipped MRT139* positions will not have registers allocated to them.140*/141unsigned colors[8] = {0};142for (i = 0; i < info->numOutputs; ++i)143if (info->out[i].sn == TGSI_SEMANTIC_COLOR)144colors[info->out[i].si] = 1;145for (i = 0, c = 0; i < 8; i++)146if (colors[i])147colors[i] = c++;148for (i = 0; i < info->numOutputs; ++i)149if (info->out[i].sn == TGSI_SEMANTIC_COLOR)150for (c = 0; c < 4; ++c)151info->out[i].slot[c] = colors[info->out[i].si] * 4 + c;152153if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS)154info->out[info->io.sampleMask].slot[0] = count++;155else156if (info->target >= 0xe0)157count++; /* on Kepler, depth is always last colour reg + 2 */158159if (info->io.fragDepth < PIPE_MAX_SHADER_OUTPUTS)160info->out[info->io.fragDepth].slot[2] = count;161162return 0;163}164165static int166nvc0_sp_assign_output_slots(struct nv50_ir_prog_info_out *info)167{168unsigned offset;169unsigned i, c;170171for (i = 0; i < info->numOutputs; ++i) {172offset = nvc0_shader_output_address(info->out[i].sn, info->out[i].si);173174for (c = 0; c < 4; ++c)175info->out[i].slot[c] = (offset + c * 0x4) / 4;176}177178return 0;179}180181static int182nvc0_program_assign_varying_slots(struct nv50_ir_prog_info_out *info)183{184int ret;185186if (info->type == PIPE_SHADER_VERTEX)187ret = nvc0_vp_assign_input_slots(info);188else189ret = nvc0_sp_assign_input_slots(info);190if (ret)191return 
ret;192193if (info->type == PIPE_SHADER_FRAGMENT)194ret = nvc0_fp_assign_output_slots(info);195else196ret = nvc0_sp_assign_output_slots(info);197return ret;198}199200static inline void201nvc0_vtgp_hdr_update_oread(struct nvc0_program *vp, uint8_t slot)202{203uint8_t min = (vp->hdr[4] >> 12) & 0xff;204uint8_t max = (vp->hdr[4] >> 24);205206min = MIN2(min, slot);207max = MAX2(max, slot);208209vp->hdr[4] = (max << 24) | (min << 12);210}211212/* Common part of header generation for VP, TCP, TEP and GP. */213static int214nvc0_vtgp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info_out *info)215{216unsigned i, c, a;217218for (i = 0; i < info->numInputs; ++i) {219if (info->in[i].patch)220continue;221for (c = 0; c < 4; ++c) {222a = info->in[i].slot[c];223if (info->in[i].mask & (1 << c))224vp->hdr[5 + a / 32] |= 1 << (a % 32);225}226}227228for (i = 0; i < info->numOutputs; ++i) {229if (info->out[i].patch)230continue;231for (c = 0; c < 4; ++c) {232if (!(info->out[i].mask & (1 << c)))233continue;234assert(info->out[i].slot[c] >= 0x40 / 4);235a = info->out[i].slot[c] - 0x40 / 4;236vp->hdr[13 + a / 32] |= 1 << (a % 32);237if (info->out[i].oread)238nvc0_vtgp_hdr_update_oread(vp, info->out[i].slot[c]);239}240}241242for (i = 0; i < info->numSysVals; ++i) {243switch (info->sv[i].sn) {244case TGSI_SEMANTIC_PRIMID:245vp->hdr[5] |= 1 << 24;246break;247case TGSI_SEMANTIC_INSTANCEID:248vp->hdr[10] |= 1 << 30;249break;250case TGSI_SEMANTIC_VERTEXID:251vp->hdr[10] |= 1 << 31;252break;253case TGSI_SEMANTIC_TESSCOORD:254/* We don't have the mask, nor the slots populated. 
While this could255* be achieved, the vast majority of the time if either of the coords256* are read, then both will be read.257*/258nvc0_vtgp_hdr_update_oread(vp, 0x2f0 / 4);259nvc0_vtgp_hdr_update_oread(vp, 0x2f4 / 4);260break;261default:262break;263}264}265266vp->vp.clip_enable = (1 << info->io.clipDistances) - 1;267vp->vp.cull_enable =268((1 << info->io.cullDistances) - 1) << info->io.clipDistances;269for (i = 0; i < info->io.cullDistances; ++i)270vp->vp.clip_mode |= 1 << ((info->io.clipDistances + i) * 4);271272if (info->io.genUserClip < 0)273vp->vp.num_ucps = PIPE_MAX_CLIP_PLANES + 1; /* prevent rebuilding */274275vp->vp.layer_viewport_relative = info->io.layer_viewport_relative;276277return 0;278}279280static int281nvc0_vp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info_out *info)282{283vp->hdr[0] = 0x20061 | (1 << 10);284vp->hdr[4] = 0xff000;285286return nvc0_vtgp_gen_header(vp, info);287}288289static void290nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info_out *info)291{292if (info->prop.tp.outputPrim == PIPE_PRIM_MAX) {293tp->tp.tess_mode = ~0;294return;295}296switch (info->prop.tp.domain) {297case PIPE_PRIM_LINES:298tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_ISOLINES;299break;300case PIPE_PRIM_TRIANGLES:301tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_TRIANGLES;302break;303case PIPE_PRIM_QUADS:304tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_QUADS;305break;306default:307tp->tp.tess_mode = ~0;308return;309}310311/* It seems like lines want the "CW" bit to indicate they're connected, and312* spit out errors in dmesg when the "CONNECTED" bit is set.313*/314if (info->prop.tp.outputPrim != PIPE_PRIM_POINTS) {315if (info->prop.tp.domain == PIPE_PRIM_LINES)316tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW;317else318tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CONNECTED;319}320321/* Winding only matters for triangles/quads, not lines. 
*/322if (info->prop.tp.domain != PIPE_PRIM_LINES &&323info->prop.tp.outputPrim != PIPE_PRIM_POINTS &&324info->prop.tp.winding > 0)325tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW;326327switch (info->prop.tp.partitioning) {328case PIPE_TESS_SPACING_EQUAL:329tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_EQUAL;330break;331case PIPE_TESS_SPACING_FRACTIONAL_ODD:332tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_FRACTIONAL_ODD;333break;334case PIPE_TESS_SPACING_FRACTIONAL_EVEN:335tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_FRACTIONAL_EVEN;336break;337default:338assert(!"invalid tessellator partitioning");339break;340}341}342343static int344nvc0_tcp_gen_header(struct nvc0_program *tcp, struct nv50_ir_prog_info_out *info)345{346unsigned opcs = 6; /* output patch constants (at least the TessFactors) */347348if (info->numPatchConstants)349opcs = 8 + info->numPatchConstants * 4;350351tcp->hdr[0] = 0x20061 | (2 << 10);352353tcp->hdr[1] = opcs << 24;354tcp->hdr[2] = info->prop.tp.outputPatchSize << 24;355356tcp->hdr[4] = 0xff000; /* initial min/max parallel output read address */357358nvc0_vtgp_gen_header(tcp, info);359360if (info->target >= NVISA_GM107_CHIPSET) {361/* On GM107+, the number of output patch components has moved in the TCP362* header, but it seems like blob still also uses the old position.363* Also, the high 8-bits are located in between the min/max parallel364* field and has to be set after updating the outputs. */365tcp->hdr[3] = (opcs & 0x0f) << 28;366tcp->hdr[4] |= (opcs & 0xf0) << 16;367}368369nvc0_tp_get_tess_mode(tcp, info);370371return 0;372}373374static int375nvc0_tep_gen_header(struct nvc0_program *tep, struct nv50_ir_prog_info_out *info)376{377tep->hdr[0] = 0x20061 | (3 << 10);378tep->hdr[4] = 0xff000;379380nvc0_vtgp_gen_header(tep, info);381382nvc0_tp_get_tess_mode(tep, info);383384tep->hdr[18] |= 0x3 << 12; /* ? 
*/385386return 0;387}388389static int390nvc0_gp_gen_header(struct nvc0_program *gp, struct nv50_ir_prog_info_out *info)391{392gp->hdr[0] = 0x20061 | (4 << 10);393394gp->hdr[2] = MIN2(info->prop.gp.instanceCount, 32) << 24;395396switch (info->prop.gp.outputPrim) {397case PIPE_PRIM_POINTS:398gp->hdr[3] = 0x01000000;399gp->hdr[0] |= 0xf0000000;400break;401case PIPE_PRIM_LINE_STRIP:402gp->hdr[3] = 0x06000000;403gp->hdr[0] |= 0x10000000;404break;405case PIPE_PRIM_TRIANGLE_STRIP:406gp->hdr[3] = 0x07000000;407gp->hdr[0] |= 0x10000000;408break;409default:410assert(0);411break;412}413414gp->hdr[4] = CLAMP(info->prop.gp.maxVertices, 1, 1024);415416return nvc0_vtgp_gen_header(gp, info);417}418419#define NVC0_INTERP_FLAT (1 << 0)420#define NVC0_INTERP_PERSPECTIVE (2 << 0)421#define NVC0_INTERP_LINEAR (3 << 0)422#define NVC0_INTERP_CENTROID (1 << 2)423424static uint8_t425nvc0_hdr_interp_mode(const struct nv50_ir_varying *var)426{427if (var->linear)428return NVC0_INTERP_LINEAR;429if (var->flat)430return NVC0_INTERP_FLAT;431return NVC0_INTERP_PERSPECTIVE;432}433434static int435nvc0_fp_gen_header(struct nvc0_program *fp, struct nv50_ir_prog_info_out *info)436{437unsigned i, c, a, m;438439/* just 00062 on Kepler */440fp->hdr[0] = 0x20062 | (5 << 10);441fp->hdr[5] = 0x80000000; /* getting a trap if FRAG_COORD_UMASK.w = 0 */442443if (info->prop.fp.usesDiscard)444fp->hdr[0] |= 0x8000;445if (!info->prop.fp.separateFragData)446fp->hdr[0] |= 0x4000;447if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS)448fp->hdr[19] |= 0x1;449if (info->prop.fp.writesDepth) {450fp->hdr[19] |= 0x2;451fp->flags[0] = 0x11; /* deactivate ZCULL */452}453454for (i = 0; i < info->numInputs; ++i) {455m = nvc0_hdr_interp_mode(&info->in[i]);456if (info->in[i].sn == TGSI_SEMANTIC_COLOR) {457fp->fp.colors |= 1 << info->in[i].si;458if (info->in[i].sc)459fp->fp.color_interp[info->in[i].si] = m | (info->in[i].mask << 4);460}461for (c = 0; c < 4; ++c) {462if (!(info->in[i].mask & (1 << c)))463continue;464a = 
info->in[i].slot[c];465if (info->in[i].slot[0] >= (0x060 / 4) &&466info->in[i].slot[0] <= (0x07c / 4)) {467fp->hdr[5] |= 1 << (24 + (a - 0x060 / 4));468} else469if (info->in[i].slot[0] >= (0x2c0 / 4) &&470info->in[i].slot[0] <= (0x2fc / 4)) {471fp->hdr[14] |= (1 << (a - 0x280 / 4)) & 0x07ff0000;472} else {473if (info->in[i].slot[c] < (0x040 / 4) ||474info->in[i].slot[c] > (0x380 / 4))475continue;476a *= 2;477if (info->in[i].slot[0] >= (0x300 / 4))478a -= 32;479fp->hdr[4 + a / 32] |= m << (a % 32);480}481}482}483/* GM20x+ needs TGSI_SEMANTIC_POSITION to access sample locations */484if (info->prop.fp.readsSampleLocations && info->target >= NVISA_GM200_CHIPSET)485fp->hdr[5] |= 0x30000000;486487for (i = 0; i < info->numOutputs; ++i) {488if (info->out[i].sn == TGSI_SEMANTIC_COLOR)489fp->hdr[18] |= 0xf << (4 * info->out[i].si);490}491492/* There are no "regular" attachments, but the shader still needs to be493* executed. It seems like it wants to think that it has some color494* outputs in order to actually run.495*/496if (info->prop.fp.numColourResults == 0 && !info->prop.fp.writesDepth)497fp->hdr[18] |= 0xf;498499fp->fp.early_z = info->prop.fp.earlyFragTests;500fp->fp.sample_mask_in = info->prop.fp.usesSampleMaskIn;501fp->fp.reads_framebuffer = info->prop.fp.readsFramebuffer;502fp->fp.post_depth_coverage = info->prop.fp.postDepthCoverage;503504/* Mark position xy and layer as read */505if (fp->fp.reads_framebuffer)506fp->hdr[5] |= 0x32000000;507508return 0;509}510511static struct nvc0_transform_feedback_state *512nvc0_program_create_tfb_state(const struct nv50_ir_prog_info_out *info,513const struct pipe_stream_output_info *pso)514{515struct nvc0_transform_feedback_state *tfb;516unsigned b, i, c;517518tfb = MALLOC_STRUCT(nvc0_transform_feedback_state);519if (!tfb)520return NULL;521for (b = 0; b < 4; ++b) {522tfb->stride[b] = pso->stride[b] * 4;523tfb->varying_count[b] = 0;524}525memset(tfb->varying_index, 0xff, sizeof(tfb->varying_index)); /* = skip */526527for (i = 0; 
i < pso->num_outputs; ++i) {528unsigned s = pso->output[i].start_component;529unsigned p = pso->output[i].dst_offset;530const unsigned r = pso->output[i].register_index;531b = pso->output[i].output_buffer;532533if (r >= info->numOutputs)534continue;535536for (c = 0; c < pso->output[i].num_components; ++c)537tfb->varying_index[b][p++] = info->out[r].slot[s + c];538539tfb->varying_count[b] = MAX2(tfb->varying_count[b], p);540tfb->stream[b] = pso->output[i].stream;541}542for (b = 0; b < 4; ++b) // zero unused indices (looks nicer)543for (c = tfb->varying_count[b]; c & 3; ++c)544tfb->varying_index[b][c] = 0;545546return tfb;547}548549#ifndef NDEBUG550static void551nvc0_program_dump(struct nvc0_program *prog)552{553unsigned pos;554555if (prog->type != PIPE_SHADER_COMPUTE) {556_debug_printf("dumping HDR for type %i\n", prog->type);557for (pos = 0; pos < ARRAY_SIZE(prog->hdr); ++pos)558_debug_printf("HDR[%02"PRIxPTR"] = 0x%08x\n",559pos * sizeof(prog->hdr[0]), prog->hdr[pos]);560}561_debug_printf("shader binary code (0x%x bytes):", prog->code_size);562for (pos = 0; pos < prog->code_size / 4; ++pos) {563if ((pos % 8) == 0)564_debug_printf("\n");565_debug_printf("%08x ", prog->code[pos]);566}567_debug_printf("\n");568}569#endif570571bool572nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,573struct disk_cache *disk_shader_cache,574struct pipe_debug_callback *debug)575{576struct blob blob;577size_t cache_size;578struct nv50_ir_prog_info *info;579struct nv50_ir_prog_info_out info_out = {};580581int ret = 0;582cache_key key;583bool shader_loaded = false;584585info = CALLOC_STRUCT(nv50_ir_prog_info);586if (!info)587return false;588589info->type = prog->type;590info->target = chipset;591592info->bin.sourceRep = prog->pipe.type;593switch (prog->pipe.type) {594case PIPE_SHADER_IR_TGSI:595info->bin.source = (void *)prog->pipe.tokens;596break;597case PIPE_SHADER_IR_NIR:598info->bin.source = (void *)nir_shader_clone(NULL, 
prog->pipe.ir.nir);599break;600default:601assert(!"unsupported IR!");602free(info);603return false;604}605606#ifndef NDEBUG607info->target = debug_get_num_option("NV50_PROG_CHIPSET", chipset);608info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3);609info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0);610info->omitLineNum = debug_get_num_option("NV50_PROG_DEBUG_OMIT_LINENUM", 0);611#else612info->optLevel = 3;613#endif614615info->bin.smemSize = prog->cp.smem_size;616info->io.genUserClip = prog->vp.num_ucps;617info->io.auxCBSlot = 15;618info->io.msInfoCBSlot = 15;619info->io.ucpBase = NVC0_CB_AUX_UCP_INFO;620info->io.drawInfoBase = NVC0_CB_AUX_DRAW_INFO;621info->io.msInfoBase = NVC0_CB_AUX_MS_INFO;622info->io.bufInfoBase = NVC0_CB_AUX_BUF_INFO(0);623info->io.suInfoBase = NVC0_CB_AUX_SU_INFO(0);624if (info->target >= NVISA_GK104_CHIPSET) {625info->io.texBindBase = NVC0_CB_AUX_TEX_INFO(0);626info->io.fbtexBindBase = NVC0_CB_AUX_FB_TEX_INFO;627info->io.bindlessBase = NVC0_CB_AUX_BINDLESS_INFO(0);628}629630if (prog->type == PIPE_SHADER_COMPUTE) {631if (info->target >= NVISA_GK104_CHIPSET) {632info->io.auxCBSlot = 7;633info->io.msInfoCBSlot = 7;634info->io.uboInfoBase = NVC0_CB_AUX_UBO_INFO(0);635}636info->prop.cp.gridInfoBase = NVC0_CB_AUX_GRID_INFO(0);637} else {638info->io.sampleInfoBase = NVC0_CB_AUX_SAMPLE_INFO;639}640641info->assignSlots = nvc0_program_assign_varying_slots;642643blob_init(&blob);644645if (disk_shader_cache) {646if (nv50_ir_prog_info_serialize(&blob, info)) {647void *cached_data = NULL;648649disk_cache_compute_key(disk_shader_cache, blob.data, blob.size, key);650cached_data = disk_cache_get(disk_shader_cache, key, &cache_size);651652if (cached_data && cache_size >= blob.size) { // blob.size is the size of serialized "info"653/* Blob contains only "info". 
In disk cache, "info_out" comes right after it */654size_t offset = blob.size;655if (nv50_ir_prog_info_out_deserialize(cached_data, cache_size, offset, &info_out))656shader_loaded = true;657else658debug_printf("WARNING: Couldn't deserialize shaders");659}660free(cached_data);661} else {662debug_printf("WARNING: Couldn't serialize input shaders");663}664}665if (!shader_loaded) {666cache_size = 0;667ret = nv50_ir_generate_code(info, &info_out);668if (ret) {669NOUVEAU_ERR("shader translation failed: %i\n", ret);670goto out;671}672if (disk_shader_cache) {673if (nv50_ir_prog_info_out_serialize(&blob, &info_out)) {674disk_cache_put(disk_shader_cache, key, blob.data, blob.size, NULL);675cache_size = blob.size;676} else {677debug_printf("WARNING: Couldn't serialize shaders");678}679}680}681blob_finish(&blob);682683prog->code = info_out.bin.code;684prog->code_size = info_out.bin.codeSize;685prog->relocs = info_out.bin.relocData;686prog->fixups = info_out.bin.fixupData;687if (info_out.target >= NVISA_GV100_CHIPSET)688prog->num_gprs = MIN2(info_out.bin.maxGPR + 5, 256); //XXX: why?689else690prog->num_gprs = MAX2(4, (info_out.bin.maxGPR + 1));691prog->cp.smem_size = info_out.bin.smemSize;692prog->num_barriers = info_out.numBarriers;693694prog->vp.need_vertex_id = info_out.io.vertexId < PIPE_MAX_SHADER_INPUTS;695prog->vp.need_draw_parameters = info_out.prop.vp.usesDrawParameters;696697if (info_out.io.edgeFlagOut < PIPE_MAX_ATTRIBS)698info_out.out[info_out.io.edgeFlagOut].mask = 0; /* for headergen */699prog->vp.edgeflag = info_out.io.edgeFlagIn;700701switch (prog->type) {702case PIPE_SHADER_VERTEX:703ret = nvc0_vp_gen_header(prog, &info_out);704break;705case PIPE_SHADER_TESS_CTRL:706ret = nvc0_tcp_gen_header(prog, &info_out);707break;708case PIPE_SHADER_TESS_EVAL:709ret = nvc0_tep_gen_header(prog, &info_out);710break;711case PIPE_SHADER_GEOMETRY:712ret = nvc0_gp_gen_header(prog, &info_out);713break;714case PIPE_SHADER_FRAGMENT:715ret = nvc0_fp_gen_header(prog, 
&info_out);716break;717case PIPE_SHADER_COMPUTE:718break;719default:720ret = -1;721NOUVEAU_ERR("unknown program type: %u\n", prog->type);722break;723}724if (ret)725goto out;726727if (info_out.bin.tlsSpace) {728assert(info_out.bin.tlsSpace < (1 << 24));729prog->hdr[0] |= 1 << 26;730prog->hdr[1] |= align(info_out.bin.tlsSpace, 0x10); /* l[] size */731prog->need_tls = true;732}733/* TODO: factor 2 only needed where joinat/precont is used,734* and we only have to count non-uniform branches735*/736/*737if ((info->maxCFDepth * 2) > 16) {738prog->hdr[2] |= (((info->maxCFDepth * 2) + 47) / 48) * 0x200;739prog->need_tls = true;740}741*/742if (info_out.io.globalAccess)743prog->hdr[0] |= 1 << 26;744if (info_out.io.globalAccess & 0x2)745prog->hdr[0] |= 1 << 16;746if (info_out.io.fp64)747prog->hdr[0] |= 1 << 27;748749if (prog->pipe.stream_output.num_outputs)750prog->tfb = nvc0_program_create_tfb_state(&info_out,751&prog->pipe.stream_output);752753pipe_debug_message(debug, SHADER_INFO,754"type: %d, local: %d, shared: %d, gpr: %d, inst: %d, bytes: %d, cached: %zd",755prog->type, info_out.bin.tlsSpace, info_out.bin.smemSize,756prog->num_gprs, info_out.bin.instructions,757info_out.bin.codeSize, cache_size);758759#ifndef NDEBUG760if (debug_get_option("NV50_PROG_CHIPSET", NULL) && info->dbgFlags)761nvc0_program_dump(prog);762#endif763764out:765if (info->bin.sourceRep == PIPE_SHADER_IR_NIR)766ralloc_free((void *)info->bin.source);767FREE(info);768return !ret;769}770771static inline int772nvc0_program_alloc_code(struct nvc0_context *nvc0, struct nvc0_program *prog)773{774struct nvc0_screen *screen = nvc0->screen;775const bool is_cp = prog->type == PIPE_SHADER_COMPUTE;776int ret;777uint32_t size = prog->code_size;778779if (!is_cp) {780if (screen->eng3d->oclass < TU102_3D_CLASS)781size += GF100_SHADER_HEADER_SIZE;782else783size += TU102_SHADER_HEADER_SIZE;784}785786/* On Fermi, SP_START_ID must be aligned to 0x40.787* On Kepler, the first instruction must be aligned to 0x80 because788* 
latency information is expected only at certain positions.789*/790if (screen->base.class_3d >= NVE4_3D_CLASS)791size = size + (is_cp ? 0x40 : 0x70);792size = align(size, 0x40);793794ret = nouveau_heap_alloc(screen->text_heap, size, prog, &prog->mem);795if (ret)796return ret;797prog->code_base = prog->mem->start;798799if (!is_cp) {800if (screen->base.class_3d >= NVE4_3D_CLASS &&801screen->base.class_3d < TU102_3D_CLASS) {802switch (prog->mem->start & 0xff) {803case 0x40: prog->code_base += 0x70; break;804case 0x80: prog->code_base += 0x30; break;805case 0xc0: prog->code_base += 0x70; break;806default:807prog->code_base += 0x30;808assert((prog->mem->start & 0xff) == 0x00);809break;810}811}812} else {813if (screen->base.class_3d >= NVE4_3D_CLASS) {814if (prog->mem->start & 0x40)815prog->code_base += 0x40;816assert((prog->code_base & 0x7f) == 0x00);817}818}819820return 0;821}822823static inline void824nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)825{826struct nvc0_screen *screen = nvc0->screen;827const bool is_cp = prog->type == PIPE_SHADER_COMPUTE;828uint32_t code_pos = prog->code_base;829uint32_t size_sph = 0;830831if (!is_cp) {832if (screen->eng3d->oclass < TU102_3D_CLASS)833size_sph = GF100_SHADER_HEADER_SIZE;834else835size_sph = TU102_SHADER_HEADER_SIZE;836}837code_pos += size_sph;838839if (prog->relocs)840nv50_ir_relocate_code(prog->relocs, prog->code, code_pos,841screen->lib_code->start, 0);842if (prog->fixups) {843nv50_ir_apply_fixups(prog->fixups, prog->code,844prog->fp.force_persample_interp,845prog->fp.flatshade,8460 /* alphatest */,847prog->fp.msaa);848for (int i = 0; i < 2; i++) {849unsigned mask = prog->fp.color_interp[i] >> 4;850unsigned interp = prog->fp.color_interp[i] & 3;851if (!mask)852continue;853prog->hdr[14] &= ~(0xff << (8 * i));854if (prog->fp.flatshade)855interp = NVC0_INTERP_FLAT;856for (int c = 0; c < 4; c++)857if (mask & (1 << c))858prog->hdr[14] |= interp << (2 * (4 * i + c));859}860}861862if 
(!is_cp)863nvc0->base.push_data(&nvc0->base, screen->text, prog->code_base,864NV_VRAM_DOMAIN(&screen->base), size_sph, prog->hdr);865866nvc0->base.push_data(&nvc0->base, screen->text, code_pos,867NV_VRAM_DOMAIN(&screen->base), prog->code_size,868prog->code);869}870871bool872nvc0_program_upload(struct nvc0_context *nvc0, struct nvc0_program *prog)873{874struct nvc0_screen *screen = nvc0->screen;875const bool is_cp = prog->type == PIPE_SHADER_COMPUTE;876int ret;877uint32_t size = prog->code_size;878879if (!is_cp) {880if (screen->eng3d->oclass < TU102_3D_CLASS)881size += GF100_SHADER_HEADER_SIZE;882else883size += TU102_SHADER_HEADER_SIZE;884}885886ret = nvc0_program_alloc_code(nvc0, prog);887if (ret) {888struct nouveau_heap *heap = screen->text_heap;889struct nvc0_program *progs[] = { /* Sorted accordingly to SP_START_ID */890nvc0->compprog, nvc0->vertprog, nvc0->tctlprog,891nvc0->tevlprog, nvc0->gmtyprog, nvc0->fragprog892};893894/* Note that the code library, which is allocated before anything else,895* does not have a priv pointer. We can stop once we hit it.896*/897while (heap->next && heap->next->priv) {898struct nvc0_program *evict = heap->next->priv;899nouveau_heap_free(&evict->mem);900}901debug_printf("WARNING: out of code space, evicting all shaders.\n");902903/* Make sure to synchronize before deleting the code segment. */904IMMED_NVC0(nvc0->base.pushbuf, NVC0_3D(SERIALIZE), 0);905906if ((screen->text->size << 1) <= (1 << 23)) {907ret = nvc0_screen_resize_text_area(screen, screen->text->size << 1);908if (ret) {909NOUVEAU_ERR("Error allocating TEXT area: %d\n", ret);910return false;911}912913/* Re-upload the builtin function into the new code segment. */914nvc0_program_library_upload(nvc0);915}916917ret = nvc0_program_alloc_code(nvc0, prog);918if (ret) {919NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size);920return false;921}922923/* All currently bound shaders have to be reuploaded. 
*/924for (int i = 0; i < ARRAY_SIZE(progs); i++) {925if (!progs[i] || progs[i] == prog)926continue;927928ret = nvc0_program_alloc_code(nvc0, progs[i]);929if (ret) {930NOUVEAU_ERR("failed to re-upload a shader after code eviction.\n");931return false;932}933nvc0_program_upload_code(nvc0, progs[i]);934935if (progs[i]->type == PIPE_SHADER_COMPUTE) {936/* Caches have to be invalidated but the CP_START_ID will be937* updated in the launch_grid functions. */938BEGIN_NVC0(nvc0->base.pushbuf, NVC0_CP(FLUSH), 1);939PUSH_DATA (nvc0->base.pushbuf, NVC0_COMPUTE_FLUSH_CODE);940} else {941nvc0_program_sp_start_id(nvc0, i, progs[i]);942}943}944}945946nvc0_program_upload_code(nvc0, prog);947948#ifndef NDEBUG949if (debug_get_bool_option("NV50_PROG_DEBUG", false))950nvc0_program_dump(prog);951#endif952953BEGIN_NVC0(nvc0->base.pushbuf, NVC0_3D(MEM_BARRIER), 1);954PUSH_DATA (nvc0->base.pushbuf, 0x1011);955956return true;957}958959/* Upload code for builtin functions like integer division emulation. */960void961nvc0_program_library_upload(struct nvc0_context *nvc0)962{963struct nvc0_screen *screen = nvc0->screen;964int ret;965uint32_t size;966const uint32_t *code;967968if (screen->lib_code)969return;970971nv50_ir_get_target_library(screen->base.device->chipset, &code, &size);972if (!size)973return;974975ret = nouveau_heap_alloc(screen->text_heap, align(size, 0x100), NULL,976&screen->lib_code);977if (ret)978return;979980nvc0->base.push_data(&nvc0->base,981screen->text, screen->lib_code->start, NV_VRAM_DOMAIN(&screen->base),982size, code);983/* no need for a memory barrier, will be emitted with first program */984}985986void987nvc0_program_destroy(struct nvc0_context *nvc0, struct nvc0_program *prog)988{989const struct pipe_shader_state pipe = prog->pipe;990const ubyte type = prog->type;991992if (prog->mem)993nouveau_heap_free(&prog->mem);994FREE(prog->code); /* may be 0 for hardcoded shaders */995FREE(prog->relocs);996FREE(prog->fixups);997if (prog->tfb) {998if (nvc0->state.tfb == 
prog->tfb)999nvc0->state.tfb = NULL;1000FREE(prog->tfb);1001}10021003memset(prog, 0, sizeof(*prog));10041005prog->pipe = pipe;1006prog->type = type;1007}10081009void1010nvc0_program_init_tcp_empty(struct nvc0_context *nvc0)1011{1012struct ureg_program *ureg;10131014ureg = ureg_create(PIPE_SHADER_TESS_CTRL);1015if (!ureg)1016return;10171018ureg_property(ureg, TGSI_PROPERTY_TCS_VERTICES_OUT, 1);1019ureg_END(ureg);10201021nvc0->tcp_empty = ureg_create_shader_and_destroy(ureg, &nvc0->base.pipe);1022}102310241025