/* Path: blob/21.2-virgl/src/gallium/drivers/vc4/vc4_draw.c (4570 views) */
/*1* Copyright (c) 2014 Scott Mansell2* Copyright © 2014 Broadcom3*4* Permission is hereby granted, free of charge, to any person obtaining a5* copy of this software and associated documentation files (the "Software"),6* to deal in the Software without restriction, including without limitation7* the rights to use, copy, modify, merge, publish, distribute, sublicense,8* and/or sell copies of the Software, and to permit persons to whom the9* Software is furnished to do so, subject to the following conditions:10*11* The above copyright notice and this permission notice (including the next12* paragraph) shall be included in all copies or substantial portions of the13* Software.14*15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS21* IN THE SOFTWARE.22*/2324#include "util/u_blitter.h"25#include "util/u_draw.h"26#include "util/u_prim.h"27#include "util/format/u_format.h"28#include "util/u_pack_color.h"29#include "util/u_split_draw.h"30#include "util/u_upload_mgr.h"31#include "indices/u_primconvert.h"3233#include "vc4_context.h"34#include "vc4_resource.h"3536#define VC4_HW_2116_COUNT 0x1ef03738static void39vc4_get_draw_cl_space(struct vc4_job *job, int vert_count)40{41/* The SW-5891 workaround may cause us to emit multiple shader recs42* and draw packets.43*/44int num_draws = DIV_ROUND_UP(vert_count, 65535 - 2) + 1;4546/* Binner gets our packet state -- vc4_emit.c contents,47* and the primitive itself.48*/49cl_ensure_space(&job->bcl,50256 + (VC4_PACKET_GL_ARRAY_PRIMITIVE_SIZE +51VC4_PACKET_GL_SHADER_STATE_SIZE) * num_draws);5253/* Nothing for rcl -- that's covered by 
vc4_context.c */5455/* shader_rec gets up to 12 dwords of reloc handles plus a maximally56* sized shader_rec (104 bytes base for 8 vattrs plus 32 bytes of57* vattr stride).58*/59cl_ensure_space(&job->shader_rec,60(12 * sizeof(uint32_t) + 104 + 8 * 32) * num_draws);6162/* Uniforms are covered by vc4_write_uniforms(). */6364/* There could be up to 16 textures per stage, plus misc other65* pointers.66*/67cl_ensure_space(&job->bo_handles, (2 * 16 + 20) * sizeof(uint32_t));68cl_ensure_space(&job->bo_pointers,69(2 * 16 + 20) * sizeof(struct vc4_bo *));70}7172/**73* Does the initial bining command list setup for drawing to a given FBO.74*/75static void76vc4_start_draw(struct vc4_context *vc4)77{78struct vc4_job *job = vc4->job;7980if (job->needs_flush)81return;8283vc4_get_draw_cl_space(job, 0);8485cl_emit(&job->bcl, TILE_BINNING_MODE_CONFIGURATION, bin) {86bin.width_in_tiles = job->draw_tiles_x;87bin.height_in_tiles = job->draw_tiles_y;88bin.multisample_mode_4x = job->msaa;89}9091/* START_TILE_BINNING resets the statechange counters in the hardware,92* which are what is used when a primitive is binned to a tile to93* figure out what new state packets need to be written to that tile's94* command list.95*/96cl_emit(&job->bcl, START_TILE_BINNING, start);9798/* Reset the current compressed primitives format. 
This gets modified99* by VC4_PACKET_GL_INDEXED_PRIMITIVE and100* VC4_PACKET_GL_ARRAY_PRIMITIVE, so it needs to be reset at the start101* of every tile.102*/103cl_emit(&job->bcl, PRIMITIVE_LIST_FORMAT, list) {104list.data_type = _16_BIT_INDEX;105list.primitive_type = TRIANGLES_LIST;106}107108job->needs_flush = true;109job->draw_width = vc4->framebuffer.width;110job->draw_height = vc4->framebuffer.height;111}112113static void114vc4_predraw_check_textures(struct pipe_context *pctx,115struct vc4_texture_stateobj *stage_tex)116{117struct vc4_context *vc4 = vc4_context(pctx);118119for (int i = 0; i < stage_tex->num_textures; i++) {120struct vc4_sampler_view *view =121vc4_sampler_view(stage_tex->textures[i]);122if (!view)123continue;124125if (view->texture != view->base.texture)126vc4_update_shadow_baselevel_texture(pctx, &view->base);127128vc4_flush_jobs_writing_resource(vc4, view->texture);129}130}131132static void133vc4_emit_gl_shader_state(struct vc4_context *vc4,134const struct pipe_draw_info *info,135const struct pipe_draw_start_count_bias *draws,136uint32_t extra_index_bias)137{138struct vc4_job *job = vc4->job;139/* VC4_DIRTY_VTXSTATE */140struct vc4_vertex_stateobj *vtx = vc4->vtx;141/* VC4_DIRTY_VTXBUF */142struct vc4_vertexbuf_stateobj *vertexbuf = &vc4->vertexbuf;143144/* The simulator throws a fit if VS or CS don't read an attribute, so145* we emit a dummy read.146*/147uint32_t num_elements_emit = MAX2(vtx->num_elements, 1);148149/* Emit the shader record. 
*/150cl_start_shader_reloc(&job->shader_rec, 3 + num_elements_emit);151152cl_emit(&job->shader_rec, SHADER_RECORD, rec) {153rec.enable_clipping = true;154155/* VC4_DIRTY_COMPILED_FS */156rec.fragment_shader_is_single_threaded =157!vc4->prog.fs->fs_threaded;158159/* VC4_DIRTY_PRIM_MODE | VC4_DIRTY_RASTERIZER */160rec.point_size_included_in_shaded_vertex_data =161(info->mode == PIPE_PRIM_POINTS &&162vc4->rasterizer->base.point_size_per_vertex);163164/* VC4_DIRTY_COMPILED_FS */165rec.fragment_shader_number_of_varyings =166vc4->prog.fs->num_inputs;167rec.fragment_shader_code_address =168cl_address(vc4->prog.fs->bo, 0);169170rec.coordinate_shader_attribute_array_select_bits =171vc4->prog.cs->vattrs_live;172rec.coordinate_shader_total_attributes_size =173vc4->prog.cs->vattr_offsets[8];174rec.coordinate_shader_code_address =175cl_address(vc4->prog.cs->bo, 0);176177rec.vertex_shader_attribute_array_select_bits =178vc4->prog.vs->vattrs_live;179rec.vertex_shader_total_attributes_size =180vc4->prog.vs->vattr_offsets[8];181rec.vertex_shader_code_address =182cl_address(vc4->prog.vs->bo, 0);183};184185uint32_t max_index = 0xffff;186unsigned index_bias = info->index_size ? 
draws->index_bias : 0;187for (int i = 0; i < vtx->num_elements; i++) {188struct pipe_vertex_element *elem = &vtx->pipe[i];189struct pipe_vertex_buffer *vb =190&vertexbuf->vb[elem->vertex_buffer_index];191struct vc4_resource *rsc = vc4_resource(vb->buffer.resource);192/* not vc4->dirty tracked: vc4->last_index_bias */193uint32_t offset = (vb->buffer_offset +194elem->src_offset +195vb->stride * (index_bias +196extra_index_bias));197uint32_t vb_size = rsc->bo->size - offset;198uint32_t elem_size =199util_format_get_blocksize(elem->src_format);200201cl_emit(&job->shader_rec, ATTRIBUTE_RECORD, attr) {202attr.address = cl_address(rsc->bo, offset);203attr.number_of_bytes_minus_1 = elem_size - 1;204attr.stride = vb->stride;205attr.coordinate_shader_vpm_offset =206vc4->prog.cs->vattr_offsets[i];207attr.vertex_shader_vpm_offset =208vc4->prog.vs->vattr_offsets[i];209}210211if (vb->stride > 0) {212max_index = MIN2(max_index,213(vb_size - elem_size) / vb->stride);214}215}216217if (vtx->num_elements == 0) {218assert(num_elements_emit == 1);219struct vc4_bo *bo = vc4_bo_alloc(vc4->screen, 4096, "scratch VBO");220221cl_emit(&job->shader_rec, ATTRIBUTE_RECORD, attr) {222attr.address = cl_address(bo, 0);223attr.number_of_bytes_minus_1 = 16 - 1;224attr.stride = 0;225attr.coordinate_shader_vpm_offset = 0;226attr.vertex_shader_vpm_offset = 0;227}228229vc4_bo_unreference(&bo);230}231232cl_emit(&job->bcl, GL_SHADER_STATE, shader_state) {233/* Note that number of attributes == 0 in the packet means 8234* attributes. 
This field also contains the offset into235* shader_rec.236*/237assert(vtx->num_elements <= 8);238shader_state.number_of_attribute_arrays =239num_elements_emit & 0x7;240}241242vc4_write_uniforms(vc4, vc4->prog.fs,243&vc4->constbuf[PIPE_SHADER_FRAGMENT],244&vc4->fragtex);245vc4_write_uniforms(vc4, vc4->prog.vs,246&vc4->constbuf[PIPE_SHADER_VERTEX],247&vc4->verttex);248vc4_write_uniforms(vc4, vc4->prog.cs,249&vc4->constbuf[PIPE_SHADER_VERTEX],250&vc4->verttex);251252vc4->last_index_bias = index_bias + extra_index_bias;253vc4->max_index = max_index;254job->shader_rec_count++;255}256257/**258* HW-2116 workaround: Flush the batch before triggering the hardware state259* counter wraparound behavior.260*261* State updates are tracked by a global counter which increments at the first262* state update after a draw or a START_BINNING. Tiles can then have their263* state updated at draw time with a set of cheap checks for whether the264* state's copy of the global counter matches the global counter the last time265* that state was written to the tile.266*267* The state counters are relatively small and wrap around quickly, so you268* could get false negatives for needing to update a particular state in the269* tile. To avoid this, the hardware attempts to write all of the state in270* the tile at wraparound time. This apparently is broken, so we just flush271* everything before that behavior is triggered. 
A batch flush is sufficient272* to get our current contents drawn and reset the counters to 0.273*274* Note that we can't just use VC4_PACKET_FLUSH_ALL, because that caps the275* tiles with VC4_PACKET_RETURN_FROM_LIST.276*/277static void278vc4_hw_2116_workaround(struct pipe_context *pctx, int vert_count)279{280struct vc4_context *vc4 = vc4_context(pctx);281struct vc4_job *job = vc4_get_job_for_fbo(vc4);282283if (job->draw_calls_queued + vert_count / 65535 >= VC4_HW_2116_COUNT) {284perf_debug("Flushing batch due to HW-2116 workaround "285"(too many draw calls per scene\n");286vc4_job_submit(vc4, job);287}288}289290static void291vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,292unsigned drawid_offset,293const struct pipe_draw_indirect_info *indirect,294const struct pipe_draw_start_count_bias *draws,295unsigned num_draws)296{297if (num_draws > 1) {298util_draw_multi(pctx, info, drawid_offset, indirect, draws, num_draws);299return;300}301302if (!indirect && (!draws[0].count || !info->instance_count))303return;304305struct vc4_context *vc4 = vc4_context(pctx);306struct pipe_draw_info local_info;307308if (!indirect &&309!info->primitive_restart &&310!u_trim_pipe_prim(info->mode, (unsigned*)&draws[0].count))311return;312313if (info->mode >= PIPE_PRIM_QUADS) {314if (info->mode == PIPE_PRIM_QUADS &&315draws[0].count == 4 &&316!vc4->rasterizer->base.flatshade) {317local_info = *info;318local_info.mode = PIPE_PRIM_TRIANGLE_FAN;319info = &local_info;320} else {321util_primconvert_save_rasterizer_state(vc4->primconvert, &vc4->rasterizer->base);322util_primconvert_draw_vbo(vc4->primconvert, info, drawid_offset, indirect, draws, num_draws);323perf_debug("Fallback conversion for %d %s vertices\n",324draws[0].count, u_prim_name(info->mode));325return;326}327}328329/* Before setting up the draw, do any fixup blits necessary. 
*/330vc4_predraw_check_textures(pctx, &vc4->verttex);331vc4_predraw_check_textures(pctx, &vc4->fragtex);332333vc4_hw_2116_workaround(pctx, draws[0].count);334335struct vc4_job *job = vc4_get_job_for_fbo(vc4);336337/* Make sure that the raster order flags haven't changed, which can338* only be set at job granularity.339*/340if (job->flags != vc4->rasterizer->tile_raster_order_flags) {341vc4_job_submit(vc4, job);342job = vc4_get_job_for_fbo(vc4);343}344345vc4_get_draw_cl_space(job, draws[0].count);346347if (vc4->prim_mode != info->mode) {348vc4->prim_mode = info->mode;349vc4->dirty |= VC4_DIRTY_PRIM_MODE;350}351352vc4_start_draw(vc4);353if (!vc4_update_compiled_shaders(vc4, info->mode)) {354debug_warn_once("shader compile failed, skipping draw call.\n");355return;356}357358vc4_emit_state(pctx);359360bool needs_drawarrays_shader_state = false;361362unsigned index_bias = info->index_size ? draws->index_bias : 0;363if ((vc4->dirty & (VC4_DIRTY_VTXBUF |364VC4_DIRTY_VTXSTATE |365VC4_DIRTY_PRIM_MODE |366VC4_DIRTY_RASTERIZER |367VC4_DIRTY_COMPILED_CS |368VC4_DIRTY_COMPILED_VS |369VC4_DIRTY_COMPILED_FS |370vc4->prog.cs->uniform_dirty_bits |371vc4->prog.vs->uniform_dirty_bits |372vc4->prog.fs->uniform_dirty_bits)) ||373vc4->last_index_bias != index_bias) {374if (info->index_size)375vc4_emit_gl_shader_state(vc4, info, draws, 0);376else377needs_drawarrays_shader_state = true;378}379380vc4->dirty = 0;381382/* Note that the primitive type fields match with OpenGL/gallium383* definitions, up to but not including QUADS.384*/385if (info->index_size) {386uint32_t index_size = info->index_size;387uint32_t offset = draws[0].start * index_size;388struct pipe_resource *prsc;389if (info->index_size == 4) {390prsc = vc4_get_shadow_index_buffer(pctx, info,391offset,392draws[0].count, &offset);393index_size = 2;394} else {395if (info->has_user_indices) {396unsigned start_offset = draws[0].start * info->index_size;397prsc = NULL;398u_upload_data(vc4->uploader, start_offset,399draws[0].count * 
index_size, 4,400(char*)info->index.user + start_offset,401&offset, &prsc);402} else {403prsc = info->index.resource;404}405}406struct vc4_resource *rsc = vc4_resource(prsc);407408struct vc4_cl_out *bcl = cl_start(&job->bcl);409410/* The original design for the VC4 kernel UABI had multiple411* packets that used relocations in the BCL (some of which412* needed two BOs), but later modifications eliminated all but413* this one usage. We have an arbitrary 32-bit offset value,414* and need to also supply an arbitrary 32-bit index buffer415* GEM handle, so we have this fake packet we emit in our BCL416* to be validated, which the kernel uses at validation time417* to perform the relocation in the IB packet (without418* emitting to the actual HW).419*/420uint32_t hindex = vc4_gem_hindex(job, rsc->bo);421if (job->last_gem_handle_hindex != hindex) {422cl_u8(&bcl, VC4_PACKET_GEM_HANDLES);423cl_u32(&bcl, hindex);424cl_u32(&bcl, 0);425job->last_gem_handle_hindex = hindex;426}427428cl_u8(&bcl, VC4_PACKET_GL_INDEXED_PRIMITIVE);429cl_u8(&bcl,430info->mode |431(index_size == 2 ?432VC4_INDEX_BUFFER_U16:433VC4_INDEX_BUFFER_U8));434cl_u32(&bcl, draws[0].count);435cl_u32(&bcl, offset);436cl_u32(&bcl, vc4->max_index);437438cl_end(&job->bcl, bcl);439job->draw_calls_queued++;440441if (info->index_size == 4 || info->has_user_indices)442pipe_resource_reference(&prsc, NULL);443} else {444uint32_t count = draws[0].count;445uint32_t start = draws[0].start;446uint32_t extra_index_bias = 0;447static const uint32_t max_verts = 65535;448449/* GFXH-515 / SW-5891: The binner emits 16 bit indices for450* drawarrays, which means that if start + count > 64k it451* would truncate the top bits. 
Work around this by emitting452* a limited number of primitives at a time and reemitting the453* shader state pointing farther down the vertex attribute454* arrays.455*456* To do this properly for line loops or trifans, we'd need to457* make a new VB containing the first vertex plus whatever458* remainder.459*/460if (start + count > max_verts) {461extra_index_bias = start;462start = 0;463needs_drawarrays_shader_state = true;464}465466while (count) {467uint32_t this_count = count;468uint32_t step;469470if (needs_drawarrays_shader_state) {471vc4_emit_gl_shader_state(vc4, info, draws,472extra_index_bias);473}474475u_split_draw(info, max_verts, &this_count, &step);476477cl_emit(&job->bcl, VERTEX_ARRAY_PRIMITIVES, array) {478array.primitive_mode = info->mode;479array.length = this_count;480array.index_of_first_vertex = start;481}482job->draw_calls_queued++;483484count -= step;485extra_index_bias += start + step;486start = 0;487needs_drawarrays_shader_state = true;488}489}490491/* We shouldn't have tripped the HW_2116 bug with the GFXH-515492* workaround.493*/494assert(job->draw_calls_queued <= VC4_HW_2116_COUNT);495496if (vc4->zsa && vc4->framebuffer.zsbuf) {497struct vc4_resource *rsc =498vc4_resource(vc4->framebuffer.zsbuf->texture);499500if (vc4->zsa->base.depth_enabled) {501job->resolve |= PIPE_CLEAR_DEPTH;502rsc->initialized_buffers = PIPE_CLEAR_DEPTH;503}504505if (vc4->zsa->base.stencil[0].enabled) {506job->resolve |= PIPE_CLEAR_STENCIL;507rsc->initialized_buffers |= PIPE_CLEAR_STENCIL;508}509}510511job->resolve |= PIPE_CLEAR_COLOR0;512513/* If we've used half of the presumably 256MB CMA area, flush the job514* so that we don't accumulate a job that will end up not being515* executable.516*/517if (job->bo_space > 128 * 1024 * 1024)518vc4_flush(pctx);519520if (vc4_debug & VC4_DEBUG_ALWAYS_FLUSH)521vc4_flush(pctx);522}523524static uint32_t525pack_rgba(enum pipe_format format, const float *rgba)526{527union util_color uc;528util_pack_color(rgba, format, &uc);529if 
(util_format_get_blocksize(format) == 2)530return uc.us;531else532return uc.ui[0];533}534535static void536vc4_clear(struct pipe_context *pctx, unsigned buffers, const struct pipe_scissor_state *scissor_state,537const union pipe_color_union *color, double depth, unsigned stencil)538{539struct vc4_context *vc4 = vc4_context(pctx);540struct vc4_job *job = vc4_get_job_for_fbo(vc4);541542if (buffers & PIPE_CLEAR_DEPTHSTENCIL) {543struct vc4_resource *rsc =544vc4_resource(vc4->framebuffer.zsbuf->texture);545unsigned zsclear = buffers & PIPE_CLEAR_DEPTHSTENCIL;546547/* Clearing ZS will clear both Z and stencil, so if we're548* trying to clear just one then we need to draw a quad to do549* it instead. We need to do this before setting up550* tile-based clears in vc4->job, because the blitter may551* submit the current job.552*/553if ((zsclear == PIPE_CLEAR_DEPTH ||554zsclear == PIPE_CLEAR_STENCIL) &&555(rsc->initialized_buffers & ~(zsclear | job->cleared)) &&556util_format_is_depth_and_stencil(vc4->framebuffer.zsbuf->format)) {557static const union pipe_color_union dummy_color = {};558559perf_debug("Partial clear of Z+stencil buffer, "560"drawing a quad instead of fast clearing\n");561vc4_blitter_save(vc4);562util_blitter_clear(vc4->blitter,563vc4->framebuffer.width,564vc4->framebuffer.height,5651,566zsclear,567&dummy_color, depth, stencil,568false);569buffers &= ~zsclear;570if (!buffers)571return;572job = vc4_get_job_for_fbo(vc4);573}574}575576/* We can't flag new buffers for clearing once we've queued draws. 
We577* could avoid this by using the 3d engine to clear.578*/579if (job->draw_calls_queued) {580perf_debug("Flushing rendering to process new clear.\n");581vc4_job_submit(vc4, job);582job = vc4_get_job_for_fbo(vc4);583}584585if (buffers & PIPE_CLEAR_COLOR0) {586struct vc4_resource *rsc =587vc4_resource(vc4->framebuffer.cbufs[0]->texture);588uint32_t clear_color;589590if (vc4_rt_format_is_565(vc4->framebuffer.cbufs[0]->format)) {591/* In 565 mode, the hardware will be packing our color592* for us.593*/594clear_color = pack_rgba(PIPE_FORMAT_R8G8B8A8_UNORM,595color->f);596} else {597/* Otherwise, we need to do this packing because we598* support multiple swizzlings of RGBA8888.599*/600clear_color =601pack_rgba(vc4->framebuffer.cbufs[0]->format,602color->f);603}604job->clear_color[0] = job->clear_color[1] = clear_color;605rsc->initialized_buffers |= (buffers & PIPE_CLEAR_COLOR0);606}607608if (buffers & PIPE_CLEAR_DEPTHSTENCIL) {609struct vc4_resource *rsc =610vc4_resource(vc4->framebuffer.zsbuf->texture);611612/* Though the depth buffer is stored with Z in the high 24,613* for this field we just need to store it in the low 24.614*/615if (buffers & PIPE_CLEAR_DEPTH) {616job->clear_depth = util_pack_z(PIPE_FORMAT_Z24X8_UNORM,617depth);618}619if (buffers & PIPE_CLEAR_STENCIL)620job->clear_stencil = stencil;621622rsc->initialized_buffers |= (buffers & PIPE_CLEAR_DEPTHSTENCIL);623}624625job->draw_min_x = 0;626job->draw_min_y = 0;627job->draw_max_x = vc4->framebuffer.width;628job->draw_max_y = vc4->framebuffer.height;629job->cleared |= buffers;630job->resolve |= buffers;631632vc4_start_draw(vc4);633}634635static void636vc4_clear_render_target(struct pipe_context *pctx, struct pipe_surface *ps,637const union pipe_color_union *color,638unsigned x, unsigned y, unsigned w, unsigned h,639bool render_condition_enabled)640{641fprintf(stderr, "unimpl: clear RT\n");642}643644static void645vc4_clear_depth_stencil(struct pipe_context *pctx, struct pipe_surface *ps,646unsigned buffers, 
double depth, unsigned stencil,647unsigned x, unsigned y, unsigned w, unsigned h,648bool render_condition_enabled)649{650fprintf(stderr, "unimpl: clear DS\n");651}652653void654vc4_draw_init(struct pipe_context *pctx)655{656pctx->draw_vbo = vc4_draw_vbo;657pctx->clear = vc4_clear;658pctx->clear_render_target = vc4_clear_render_target;659pctx->clear_depth_stencil = vc4_clear_depth_stencil;660}661662663