Path: blob/21.2-virgl/src/gallium/drivers/panfrost/pan_cmdstream.c
4570 views
/*1* Copyright (C) 2018 Alyssa Rosenzweig2* Copyright (C) 2020 Collabora Ltd.3* Copyright © 2017 Intel Corporation4*5* Permission is hereby granted, free of charge, to any person obtaining a6* copy of this software and associated documentation files (the "Software"),7* to deal in the Software without restriction, including without limitation8* the rights to use, copy, modify, merge, publish, distribute, sublicense,9* and/or sell copies of the Software, and to permit persons to whom the10* Software is furnished to do so, subject to the following conditions:11*12* The above copyright notice and this permission notice (including the next13* paragraph) shall be included in all copies or substantial portions of the14* Software.15*16* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR17* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,18* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL19* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER20* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,21* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE22* SOFTWARE.23*/2425#include "util/macros.h"26#include "util/u_prim.h"27#include "util/u_vbuf.h"28#include "util/u_helpers.h"29#include "util/u_draw.h"30#include "util/u_memory.h"31#include "pipe/p_defines.h"32#include "pipe/p_state.h"33#include "indices/u_primconvert.h"34#include "gallium/auxiliary/util/u_blend.h"3536#include "panfrost-quirks.h"3738#include "pan_pool.h"39#include "pan_bo.h"40#include "pan_context.h"41#include "pan_job.h"42#include "pan_shader.h"43#include "pan_texture.h"44#include "pan_util.h"45#include "pan_indirect_draw.h"46#include "pan_indirect_dispatch.h"47#include "pan_blitter.h"4849#include "midgard_pack.h"5051/* Statically assert that PIPE_* enums match the hardware enums.52* (As long as they match, we don't need to translate them.)53*/54UNUSED static void55pan_pipe_asserts()56{57#define PIPE_ASSERT(x) STATIC_ASSERT((int)x)5859/* Compare functions are natural in both Gallium and Mali */60PIPE_ASSERT(PIPE_FUNC_NEVER == MALI_FUNC_NEVER);61PIPE_ASSERT(PIPE_FUNC_LESS == MALI_FUNC_LESS);62PIPE_ASSERT(PIPE_FUNC_EQUAL == MALI_FUNC_EQUAL);63PIPE_ASSERT(PIPE_FUNC_LEQUAL == MALI_FUNC_LEQUAL);64PIPE_ASSERT(PIPE_FUNC_GREATER == MALI_FUNC_GREATER);65PIPE_ASSERT(PIPE_FUNC_NOTEQUAL == MALI_FUNC_NOT_EQUAL);66PIPE_ASSERT(PIPE_FUNC_GEQUAL == MALI_FUNC_GEQUAL);67PIPE_ASSERT(PIPE_FUNC_ALWAYS == MALI_FUNC_ALWAYS);68}6970static inline enum mali_sample_pattern71panfrost_sample_pattern(unsigned samples)72{73switch (samples) {74case 1: return MALI_SAMPLE_PATTERN_SINGLE_SAMPLED;75case 4: return MALI_SAMPLE_PATTERN_ROTATED_4X_GRID;76case 8: return MALI_SAMPLE_PATTERN_D3D_8X_GRID;77case 16: return MALI_SAMPLE_PATTERN_D3D_16X_GRID;78default: unreachable("Unsupported sample count");79}80}8182/* Gets a GPU address for the associated index buffer. Only gauranteed to be83* good for the duration of the draw (transient), could last longer. Also get84* the bounds on the index buffer for the range accessed by the draw. We do85* these operations together because there are natural optimizations which86* require them to be together. */8788static mali_ptr89panfrost_get_index_buffer_bounded(struct panfrost_batch *batch,90const struct pipe_draw_info *info,91const struct pipe_draw_start_count_bias *draw,92unsigned *min_index, unsigned *max_index)93{94struct panfrost_resource *rsrc = pan_resource(info->index.resource);95struct panfrost_context *ctx = batch->ctx;96off_t offset = draw->start * info->index_size;97bool needs_indices = true;98mali_ptr out = 0;99100if (info->index_bounds_valid) {101*min_index = info->min_index;102*max_index = info->max_index;103needs_indices = false;104}105106if (!info->has_user_indices) {107/* Only resources can be directly mapped */108panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);109out = rsrc->image.data.bo->ptr.gpu + offset;110111/* Check the cache */112needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,113draw->start,114draw->count,115min_index,116max_index);117} else {118/* Otherwise, we need to upload to transient memory */119const uint8_t *ibuf8 = (const uint8_t *) info->index.user;120struct panfrost_ptr T =121pan_pool_alloc_aligned(&batch->pool.base,122draw->count *123info->index_size,124info->index_size);125126memcpy(T.cpu, ibuf8 + offset, draw->count * info->index_size);127out = T.gpu;128}129130if (needs_indices) {131/* Fallback */132u_vbuf_get_minmax_index(&ctx->base, info, draw, min_index, max_index);133134if (!info->has_user_indices)135panfrost_minmax_cache_add(rsrc->index_cache,136draw->start, draw->count,137*min_index, *max_index);138}139140return out;141}142143static unsigned144translate_tex_wrap(enum pipe_tex_wrap w, bool supports_clamp, bool using_nearest)145{146/* Bifrost doesn't support the GL_CLAMP wrap mode, so instead use147* CLAMP_TO_EDGE and CLAMP_TO_BORDER. On Midgard, CLAMP is broken for148* nearest filtering, so use CLAMP_TO_EDGE in that case. */149150switch (w) {151case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;152case PIPE_TEX_WRAP_CLAMP:153return using_nearest ? MALI_WRAP_MODE_CLAMP_TO_EDGE :154(supports_clamp ? MALI_WRAP_MODE_CLAMP :155MALI_WRAP_MODE_CLAMP_TO_BORDER);156case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;157case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;158case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;159case PIPE_TEX_WRAP_MIRROR_CLAMP:160return using_nearest ? MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE :161(supports_clamp ? MALI_WRAP_MODE_MIRRORED_CLAMP :162MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER);163case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;164case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;165default: unreachable("Invalid wrap");166}167}168169/* The hardware compares in the wrong order order, so we have to flip before170* encoding. Yes, really. */171172static enum mali_func173panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)174{175return !cso->compare_mode ? MALI_FUNC_NEVER :176panfrost_flip_compare_func((enum mali_func) cso->compare_func);177}178179static enum mali_mipmap_mode180pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)181{182switch (f) {183case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;184case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;185case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;186default: unreachable("Invalid");187}188}189190static void191panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,192struct mali_midgard_sampler_packed *hw)193{194bool using_nearest = cso->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST;195196pan_pack(hw, MIDGARD_SAMPLER, cfg) {197cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;198cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;199cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?200MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;201cfg.normalized_coordinates = cso->normalized_coords;202203cfg.lod_bias = FIXED_16(cso->lod_bias, true);204205cfg.minimum_lod = FIXED_16(cso->min_lod, false);206207/* If necessary, we disable mipmapping in the sampler descriptor by208* clamping the LOD as tight as possible (from 0 to epsilon,209* essentially -- remember these are fixed point numbers, so210* epsilon=1/256) */211212cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?213cfg.minimum_lod + 1 :214FIXED_16(cso->max_lod, false);215216cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s, true, using_nearest);217cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t, true, using_nearest);218cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r, true, using_nearest);219220cfg.compare_function = panfrost_sampler_compare_func(cso);221cfg.seamless_cube_map = cso->seamless_cube_map;222223cfg.border_color_r = cso->border_color.ui[0];224cfg.border_color_g = cso->border_color.ui[1];225cfg.border_color_b = cso->border_color.ui[2];226cfg.border_color_a = cso->border_color.ui[3];227}228}229230static void231panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,232struct mali_bifrost_sampler_packed *hw)233{234bool using_nearest = cso->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST;235236pan_pack(hw, BIFROST_SAMPLER, cfg) {237cfg.point_sample_magnify = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;238cfg.point_sample_minify = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;239cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);240cfg.normalized_coordinates = cso->normalized_coords;241242cfg.lod_bias = FIXED_16(cso->lod_bias, true);243cfg.minimum_lod = FIXED_16(cso->min_lod, false);244cfg.maximum_lod = FIXED_16(cso->max_lod, false);245246if (cso->max_anisotropy > 1) {247cfg.maximum_anisotropy = cso->max_anisotropy;248cfg.lod_algorithm = MALI_LOD_ALGORITHM_ANISOTROPIC;249}250251cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s, false, using_nearest);252cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t, false, using_nearest);253cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r, false, using_nearest);254255cfg.compare_function = panfrost_sampler_compare_func(cso);256cfg.seamless_cube_map = cso->seamless_cube_map;257258cfg.border_color_r = cso->border_color.ui[0];259cfg.border_color_g = cso->border_color.ui[1];260cfg.border_color_b = cso->border_color.ui[2];261cfg.border_color_a = cso->border_color.ui[3];262}263}264265static void *266panfrost_create_sampler_state(267struct pipe_context *pctx,268const struct pipe_sampler_state *cso)269{270struct panfrost_sampler_state *so = CALLOC_STRUCT(panfrost_sampler_state);271struct panfrost_device *device = pan_device(pctx->screen);272273so->base = *cso;274275if (pan_is_bifrost(device))276panfrost_sampler_desc_init_bifrost(cso, (struct mali_bifrost_sampler_packed *) &so->hw);277else278panfrost_sampler_desc_init(cso, &so->hw);279280return so;281}282283static bool284panfrost_fs_required(285struct panfrost_shader_state *fs,286struct panfrost_blend_state *blend,287struct pipe_framebuffer_state *state,288const struct panfrost_zsa_state *zsa)289{290/* If we generally have side effects. This inclues use of discard,291* which can affect the results of an occlusion query. */292if (fs->info.fs.sidefx)293return true;294295/* Using an empty FS requires early-z to be enabled, but alpha test296* needs it disabled */297if ((enum mali_func) zsa->base.alpha_func != MALI_FUNC_ALWAYS)298return true;299300/* If colour is written we need to execute */301for (unsigned i = 0; i < state->nr_cbufs; ++i) {302if (state->cbufs[i] && !blend->info[i].no_colour)303return true;304}305306/* If depth is written and not implied we need to execute.307* TODO: Predicate on Z/S writes being enabled */308return (fs->info.fs.writes_depth || fs->info.fs.writes_stencil);309}310311static void312panfrost_emit_bifrost_blend(struct panfrost_batch *batch,313mali_ptr *blend_shaders, void *rts)314{315unsigned rt_count = batch->key.nr_cbufs;316struct panfrost_context *ctx = batch->ctx;317const struct panfrost_blend_state *so = ctx->blend;318const struct panfrost_device *dev = pan_device(ctx->base.screen);319struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);320321/* Always have at least one render target for depth-only passes */322for (unsigned i = 0; i < MAX2(rt_count, 1); ++i) {323/* Disable blending for unbacked render targets */324if (rt_count == 0 || !batch->key.cbufs[i] || so->info[i].no_colour) {325pan_pack(rts + i * MALI_BLEND_LENGTH, BLEND, cfg) {326cfg.enable = false;327cfg.bifrost.internal.mode = MALI_BIFROST_BLEND_MODE_OFF;328}329330continue;331}332333struct pan_blend_info info = so->info[i];334enum pipe_format format = batch->key.cbufs[i]->format;335const struct util_format_description *format_desc;336unsigned chan_size = 0;337338format_desc = util_format_description(format);339340for (unsigned i = 0; i < format_desc->nr_channels; i++)341chan_size = MAX2(format_desc->channel[0].size, chan_size);342343/* Fixed point constant */344float constant_f = pan_blend_get_constant(345info.constant_mask,346ctx->blend_color.color);347348u16 constant = constant_f * ((1 << chan_size) - 1);349constant <<= 16 - chan_size;350351struct mali_blend_packed *packed = rts + (i * MALI_BLEND_LENGTH);352353/* Word 0: Flags and constant */354pan_pack(packed, BLEND, cfg) {355cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);356cfg.load_destination = info.load_dest;357cfg.round_to_fb_precision = !ctx->blend->base.dither;358cfg.alpha_to_one = ctx->blend->base.alpha_to_one;359cfg.bifrost.constant = constant;360}361362if (!blend_shaders[i]) {363/* Word 1: Blend Equation */364STATIC_ASSERT(MALI_BLEND_EQUATION_LENGTH == 4);365packed->opaque[1] = so->equation[i];366}367368/* Words 2 and 3: Internal blend */369if (blend_shaders[i]) {370/* The blend shader's address needs to be at371* the same top 32 bit as the fragment shader.372* TODO: Ensure that's always the case.373*/374assert(!fs->bin.bo ||375(blend_shaders[i] & (0xffffffffull << 32)) ==376(fs->bin.gpu & (0xffffffffull << 32)));377378unsigned ret_offset = fs->info.bifrost.blend[i].return_offset;379assert(!(ret_offset & 0x7));380381pan_pack(&packed->opaque[2], BIFROST_INTERNAL_BLEND, cfg) {382cfg.mode = MALI_BIFROST_BLEND_MODE_SHADER;383cfg.shader.pc = (u32) blend_shaders[i];384cfg.shader.return_value = ret_offset ?385fs->bin.gpu + ret_offset : 0;386}387} else {388pan_pack(&packed->opaque[2], BIFROST_INTERNAL_BLEND, cfg) {389cfg.mode = info.opaque ?390MALI_BIFROST_BLEND_MODE_OPAQUE :391MALI_BIFROST_BLEND_MODE_FIXED_FUNCTION;392393/* If we want the conversion to work properly,394* num_comps must be set to 4395*/396cfg.fixed_function.num_comps = 4;397cfg.fixed_function.conversion.memory_format =398panfrost_format_to_bifrost_blend(dev, format);399cfg.fixed_function.conversion.register_format =400fs->info.bifrost.blend[i].format;401cfg.fixed_function.rt = i;402}403}404}405}406407static void408panfrost_emit_midgard_blend(struct panfrost_batch *batch,409mali_ptr *blend_shaders, void *rts)410{411unsigned rt_count = batch->key.nr_cbufs;412struct panfrost_context *ctx = batch->ctx;413const struct panfrost_blend_state *so = ctx->blend;414415/* Always have at least one render target for depth-only passes */416for (unsigned i = 0; i < MAX2(rt_count, 1); ++i) {417struct mali_blend_packed *packed = rts + (i * MALI_BLEND_LENGTH);418419/* Disable blending for unbacked render targets */420if (rt_count == 0 || !batch->key.cbufs[i] || so->info[i].no_colour) {421pan_pack(packed, BLEND, cfg) {422cfg.enable = false;423}424425continue;426}427428pan_pack(packed, BLEND, cfg) {429struct pan_blend_info info = so->info[i];430431cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);432cfg.load_destination = info.load_dest;433cfg.round_to_fb_precision = !ctx->blend->base.dither;434cfg.alpha_to_one = ctx->blend->base.alpha_to_one;435cfg.midgard.blend_shader = (blend_shaders[i] != 0);436if (blend_shaders[i]) {437cfg.midgard.shader_pc = blend_shaders[i];438} else {439cfg.midgard.constant = pan_blend_get_constant(440info.constant_mask,441ctx->blend_color.color);442}443}444445if (!blend_shaders[i]) {446/* Word 2: Blend Equation */447STATIC_ASSERT(MALI_BLEND_EQUATION_LENGTH == 4);448packed->opaque[2] = so->equation[i];449}450}451}452453static void454panfrost_emit_blend(struct panfrost_batch *batch, void *rts, mali_ptr *blend_shaders)455{456const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);457struct panfrost_blend_state *so = batch->ctx->blend;458459if (pan_is_bifrost(dev))460panfrost_emit_bifrost_blend(batch, blend_shaders, rts);461else462panfrost_emit_midgard_blend(batch, blend_shaders, rts);463464for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) {465if (!so->info[i].no_colour && batch->key.cbufs[i]) {466batch->draws |= (PIPE_CLEAR_COLOR0 << i);467batch->resolve |= (PIPE_CLEAR_COLOR0 << i);468}469}470}471472/* Construct a partial RSD corresponding to no executed fragment shader, and473* merge with the existing partial RSD. This depends only on the architecture,474* so packing separately allows the packs to be constant folded away. */475476static void477pan_merge_empty_fs(struct mali_renderer_state_packed *rsd, bool is_bifrost)478{479struct mali_renderer_state_packed empty_rsd;480481if (is_bifrost) {482pan_pack(&empty_rsd, RENDERER_STATE, cfg) {483cfg.properties.bifrost.shader_modifies_coverage = true;484cfg.properties.bifrost.allow_forward_pixel_to_kill = true;485cfg.properties.bifrost.allow_forward_pixel_to_be_killed = true;486cfg.properties.bifrost.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY;487}488} else {489pan_pack(&empty_rsd, RENDERER_STATE, cfg) {490cfg.shader.shader = 0x1;491cfg.properties.midgard.work_register_count = 1;492cfg.properties.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;493cfg.properties.midgard.force_early_z = true;494}495}496497pan_merge((*rsd), empty_rsd, RENDERER_STATE);498}499500/* Get the last blend shader, for an erratum workaround */501502static mali_ptr503panfrost_last_nonnull(mali_ptr *ptrs, unsigned count)504{505for (signed i = ((signed) count - 1); i >= 0; --i) {506if (ptrs[i])507return ptrs[i];508}509510return 0;511}512513static void514panfrost_prepare_fs_state(struct panfrost_context *ctx,515mali_ptr *blend_shaders,516struct mali_renderer_state_packed *rsd)517{518const struct panfrost_device *dev = pan_device(ctx->base.screen);519struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;520const struct panfrost_zsa_state *zsa = ctx->depth_stencil;521struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);522struct panfrost_blend_state *so = ctx->blend;523bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;524bool msaa = rast->multisample;525526pan_pack(rsd, RENDERER_STATE, cfg) {527if (pan_is_bifrost(dev) && panfrost_fs_required(fs, so, &ctx->pipe_framebuffer, zsa)) {528/* Track if any colour buffer is reused across draws, either529* from reading it directly, or from failing to write it */530unsigned rt_mask = ctx->fb_rt_mask;531uint64_t rt_written = (fs->info.outputs_written >> FRAG_RESULT_DATA0);532bool blend_reads_dest = (so->load_dest_mask & rt_mask);533534cfg.properties.bifrost.allow_forward_pixel_to_kill =535fs->info.fs.can_fpk &&536!(rt_mask & ~rt_written) &&537!alpha_to_coverage &&538!blend_reads_dest;539} else if (!pan_is_bifrost(dev)) {540unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;541542if (panfrost_fs_required(fs, ctx->blend, &ctx->pipe_framebuffer, zsa)) {543cfg.properties.midgard.force_early_z =544fs->info.fs.can_early_z && !alpha_to_coverage &&545((enum mali_func) zsa->base.alpha_func == MALI_FUNC_ALWAYS);546547bool has_blend_shader = false;548549for (unsigned c = 0; c < rt_count; ++c)550has_blend_shader |= (blend_shaders[c] != 0);551552/* TODO: Reduce this limit? */553if (has_blend_shader)554cfg.properties.midgard.work_register_count = MAX2(fs->info.work_reg_count, 8);555else556cfg.properties.midgard.work_register_count = fs->info.work_reg_count;557558/* Hardware quirks around early-zs forcing559* without a depth buffer. Note this breaks560* occlusion queries. */561bool has_oq = ctx->occlusion_query && ctx->active_queries;562bool force_ez_with_discard = !zsa->enabled && !has_oq;563564cfg.properties.midgard.shader_reads_tilebuffer =565force_ez_with_discard && fs->info.fs.can_discard;566cfg.properties.midgard.shader_contains_discard =567!force_ez_with_discard && fs->info.fs.can_discard;568}569570if (dev->quirks & MIDGARD_SFBD && rt_count > 0) {571cfg.multisample_misc.sfbd_load_destination = so->info[0].load_dest;572cfg.multisample_misc.sfbd_blend_shader = (blend_shaders[0] != 0);573cfg.stencil_mask_misc.sfbd_write_enable = !so->info[0].no_colour;574cfg.stencil_mask_misc.sfbd_srgb = util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format);575cfg.stencil_mask_misc.sfbd_dither_disable = !so->base.dither;576cfg.stencil_mask_misc.sfbd_alpha_to_one = so->base.alpha_to_one;577578if (blend_shaders[0]) {579cfg.sfbd_blend_shader = blend_shaders[0];580} else {581cfg.sfbd_blend_constant = pan_blend_get_constant(582so->info[0].constant_mask,583ctx->blend_color.color);584}585} else if (dev->quirks & MIDGARD_SFBD) {586/* If there is no colour buffer, leaving fields default is587* fine, except for blending which is nonnullable */588cfg.sfbd_blend_equation.color_mask = 0xf;589cfg.sfbd_blend_equation.rgb.a = MALI_BLEND_OPERAND_A_SRC;590cfg.sfbd_blend_equation.rgb.b = MALI_BLEND_OPERAND_B_SRC;591cfg.sfbd_blend_equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO;592cfg.sfbd_blend_equation.alpha.a = MALI_BLEND_OPERAND_A_SRC;593cfg.sfbd_blend_equation.alpha.b = MALI_BLEND_OPERAND_B_SRC;594cfg.sfbd_blend_equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO;595} else {596/* Workaround on v5 */597cfg.sfbd_blend_shader = panfrost_last_nonnull(blend_shaders, rt_count);598}599}600601cfg.multisample_misc.sample_mask = msaa ? ctx->sample_mask : 0xFFFF;602603cfg.multisample_misc.evaluate_per_sample =604msaa && (ctx->min_samples > 1);605606cfg.stencil_mask_misc.alpha_to_coverage = alpha_to_coverage;607cfg.depth_units = rast->offset_units * 2.0f;608cfg.depth_factor = rast->offset_scale;609610bool back_enab = zsa->base.stencil[1].enabled;611cfg.stencil_front.reference_value = ctx->stencil_ref.ref_value[0];612cfg.stencil_back.reference_value = ctx->stencil_ref.ref_value[back_enab ? 1 : 0];613614/* v6+ fits register preload here, no alpha testing */615if (dev->arch <= 5)616cfg.alpha_reference = zsa->base.alpha_ref_value;617}618}619620static void621panfrost_emit_frag_shader(struct panfrost_context *ctx,622struct mali_renderer_state_packed *fragmeta,623mali_ptr *blend_shaders)624{625struct panfrost_device *dev = pan_device(ctx->base.screen);626const struct panfrost_zsa_state *zsa = ctx->depth_stencil;627const struct panfrost_rasterizer *rast = ctx->rasterizer;628struct panfrost_shader_state *fs =629panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);630631/* We need to merge several several partial renderer state descriptors,632* so stage to temporary storage rather than reading back write-combine633* memory, which will trash performance. */634struct mali_renderer_state_packed rsd;635panfrost_prepare_fs_state(ctx, blend_shaders, &rsd);636637if ((dev->quirks & MIDGARD_SFBD)638&& ctx->pipe_framebuffer.nr_cbufs > 0639&& !blend_shaders[0]) {640641/* Word 14: SFBD Blend Equation */642STATIC_ASSERT(MALI_BLEND_EQUATION_LENGTH == 4);643rsd.opaque[14] = ctx->blend->equation[0];644}645646/* Merge with CSO state and upload */647if (panfrost_fs_required(fs, ctx->blend, &ctx->pipe_framebuffer, zsa))648pan_merge(rsd, fs->partial_rsd, RENDERER_STATE);649else650pan_merge_empty_fs(&rsd, pan_is_bifrost(dev));651652/* Word 8, 9 Misc state */653rsd.opaque[8] |= zsa->rsd_depth.opaque[0]654| rast->multisample.opaque[0];655656rsd.opaque[9] |= zsa->rsd_stencil.opaque[0]657| rast->stencil_misc.opaque[0];658659/* Word 10, 11 Stencil Front and Back */660rsd.opaque[10] |= zsa->stencil_front.opaque[0];661rsd.opaque[11] |= zsa->stencil_back.opaque[0];662663memcpy(fragmeta, &rsd, sizeof(rsd));664}665666static mali_ptr667panfrost_emit_compute_shader_meta(struct panfrost_batch *batch, enum pipe_shader_type stage)668{669struct panfrost_shader_state *ss = panfrost_get_shader_state(batch->ctx, stage);670671panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_VERTEX);672panfrost_batch_add_bo(batch, ss->state.bo, PIPE_SHADER_VERTEX);673674return ss->state.gpu;675}676677static mali_ptr678panfrost_emit_frag_shader_meta(struct panfrost_batch *batch)679{680struct panfrost_context *ctx = batch->ctx;681struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);682683panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_FRAGMENT);684685struct panfrost_device *dev = pan_device(ctx->base.screen);686unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);687struct panfrost_ptr xfer;688689if (dev->quirks & MIDGARD_SFBD) {690xfer = pan_pool_alloc_desc(&batch->pool.base, RENDERER_STATE);691} else {692xfer = pan_pool_alloc_desc_aggregate(&batch->pool.base,693PAN_DESC(RENDERER_STATE),694PAN_DESC_ARRAY(rt_count, BLEND));695}696697mali_ptr blend_shaders[PIPE_MAX_COLOR_BUFS];698unsigned shader_offset = 0;699struct panfrost_bo *shader_bo = NULL;700701for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c) {702if (ctx->pipe_framebuffer.cbufs[c]) {703blend_shaders[c] = panfrost_get_blend(batch,704c, &shader_bo, &shader_offset);705}706}707708panfrost_emit_frag_shader(ctx, (struct mali_renderer_state_packed *) xfer.cpu, blend_shaders);709710if (!(dev->quirks & MIDGARD_SFBD))711panfrost_emit_blend(batch, xfer.cpu + MALI_RENDERER_STATE_LENGTH, blend_shaders);712else {713batch->draws |= PIPE_CLEAR_COLOR0;714batch->resolve |= PIPE_CLEAR_COLOR0;715}716717if (ctx->depth_stencil->base.depth_enabled)718batch->read |= PIPE_CLEAR_DEPTH;719720if (ctx->depth_stencil->base.stencil[0].enabled)721batch->read |= PIPE_CLEAR_STENCIL;722723return xfer.gpu;724}725726static mali_ptr727panfrost_emit_viewport(struct panfrost_batch *batch)728{729struct panfrost_context *ctx = batch->ctx;730const struct pipe_viewport_state *vp = &ctx->pipe_viewport;731const struct pipe_scissor_state *ss = &ctx->scissor;732const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;733734/* Derive min/max from translate/scale. Note since |x| >= 0 by735* definition, we have that -|x| <= |x| hence translate - |scale| <=736* translate + |scale|, so the ordering is correct here. */737float vp_minx = vp->translate[0] - fabsf(vp->scale[0]);738float vp_maxx = vp->translate[0] + fabsf(vp->scale[0]);739float vp_miny = vp->translate[1] - fabsf(vp->scale[1]);740float vp_maxy = vp->translate[1] + fabsf(vp->scale[1]);741float minz = (vp->translate[2] - fabsf(vp->scale[2]));742float maxz = (vp->translate[2] + fabsf(vp->scale[2]));743744/* Scissor to the intersection of viewport and to the scissor, clamped745* to the framebuffer */746747unsigned minx = MIN2(batch->key.width, MAX2((int) vp_minx, 0));748unsigned maxx = MIN2(batch->key.width, MAX2((int) vp_maxx, 0));749unsigned miny = MIN2(batch->key.height, MAX2((int) vp_miny, 0));750unsigned maxy = MIN2(batch->key.height, MAX2((int) vp_maxy, 0));751752if (ss && rast->scissor) {753minx = MAX2(ss->minx, minx);754miny = MAX2(ss->miny, miny);755maxx = MIN2(ss->maxx, maxx);756maxy = MIN2(ss->maxy, maxy);757}758759/* Set the range to [1, 1) so max values don't wrap round */760if (maxx == 0 || maxy == 0)761maxx = maxy = minx = miny = 1;762763struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, VIEWPORT);764765pan_pack(T.cpu, VIEWPORT, cfg) {766/* [minx, maxx) and [miny, maxy) are exclusive ranges, but767* these are inclusive */768cfg.scissor_minimum_x = minx;769cfg.scissor_minimum_y = miny;770cfg.scissor_maximum_x = maxx - 1;771cfg.scissor_maximum_y = maxy - 1;772773cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;774cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;775}776777panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);778batch->scissor_culls_everything = (minx >= maxx || miny >= maxy);779780return T.gpu;781}782783static mali_ptr784panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,785enum pipe_shader_type st,786struct panfrost_constant_buffer *buf,787unsigned index)788{789struct pipe_constant_buffer *cb = &buf->cb[index];790struct panfrost_resource *rsrc = pan_resource(cb->buffer);791792if (rsrc) {793panfrost_batch_read_rsrc(batch, rsrc, st);794795/* Alignment gauranteed by796* PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */797return rsrc->image.data.bo->ptr.gpu + cb->buffer_offset;798} else if (cb->user_buffer) {799return pan_pool_upload_aligned(&batch->pool.base,800cb->user_buffer +801cb->buffer_offset,802cb->buffer_size, 16);803} else {804unreachable("No constant buffer");805}806}807808struct sysval_uniform {809union {810float f[4];811int32_t i[4];812uint32_t u[4];813uint64_t du[2];814};815};816817static void818panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,819struct sysval_uniform *uniform)820{821struct panfrost_context *ctx = batch->ctx;822const struct pipe_viewport_state *vp = &ctx->pipe_viewport;823824uniform->f[0] = vp->scale[0];825uniform->f[1] = vp->scale[1];826uniform->f[2] = vp->scale[2];827}828829static void830panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,831struct sysval_uniform *uniform)832{833struct panfrost_context *ctx = batch->ctx;834const struct pipe_viewport_state *vp = &ctx->pipe_viewport;835836uniform->f[0] = vp->translate[0];837uniform->f[1] = vp->translate[1];838uniform->f[2] = vp->translate[2];839}840841static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,842enum pipe_shader_type st,843unsigned int sysvalid,844struct sysval_uniform *uniform)845{846struct panfrost_context *ctx = batch->ctx;847unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);848unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);849bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);850struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;851852assert(dim);853854if (tex->target == PIPE_BUFFER) {855assert(dim == 1);856uniform->i[0] =857tex->u.buf.size / util_format_get_blocksize(tex->format);858return;859}860861uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);862863if (dim > 1)864uniform->i[1] = u_minify(tex->texture->height0,865tex->u.tex.first_level);866867if (dim > 2)868uniform->i[2] = u_minify(tex->texture->depth0,869tex->u.tex.first_level);870871if (is_array)872uniform->i[dim] = tex->texture->array_size;873}874875static void panfrost_upload_image_size_sysval(struct panfrost_batch *batch,876enum pipe_shader_type st,877unsigned int sysvalid,878struct sysval_uniform *uniform)879{880struct panfrost_context *ctx = batch->ctx;881unsigned idx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);882unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);883unsigned is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);884885assert(dim && dim < 4);886887struct pipe_image_view *image = &ctx->images[st][idx];888889if (image->resource->target == PIPE_BUFFER) {890unsigned blocksize = util_format_get_blocksize(image->format);891uniform->i[0] = image->resource->width0 / blocksize;892return;893}894895uniform->i[0] = u_minify(image->resource->width0,896image->u.tex.level);897898if (dim > 1)899uniform->i[1] = u_minify(image->resource->height0,900image->u.tex.level);901902if (dim > 2)903uniform->i[2] = u_minify(image->resource->depth0,904image->u.tex.level);905906if (is_array)907uniform->i[dim] = image->resource->array_size;908}909910static void911panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,912enum pipe_shader_type st,913unsigned ssbo_id,914struct sysval_uniform *uniform)915{916struct panfrost_context *ctx = batch->ctx;917918assert(ctx->ssbo_mask[st] & (1 << ssbo_id));919struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];920921/* Compute address */922struct panfrost_resource *rsrc = pan_resource(sb.buffer);923struct panfrost_bo *bo = rsrc->image.data.bo;924925panfrost_batch_write_rsrc(batch, rsrc, st);926927util_range_add(&rsrc->base, &rsrc->valid_buffer_range,928sb.buffer_offset, sb.buffer_size);929930/* Upload address and size as sysval */931uniform->du[0] = bo->ptr.gpu + sb.buffer_offset;932uniform->u[2] = sb.buffer_size;933}934935static void936panfrost_upload_sampler_sysval(struct panfrost_batch *batch,937enum pipe_shader_type st,938unsigned samp_idx,939struct sysval_uniform *uniform)940{941struct panfrost_context *ctx = batch->ctx;942struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;943944uniform->f[0] = sampl->min_lod;945uniform->f[1] = sampl->max_lod;946uniform->f[2] = sampl->lod_bias;947948/* Even without any errata, Midgard represents "no mipmapping" as949* fixing the LOD with the clamps; keep behaviour consistent. c.f.950* panfrost_create_sampler_state which also explains our choice of951* epsilon value (again to keep behaviour consistent) */952953if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)954uniform->f[1] = uniform->f[0] + (1.0/256.0);955}956957static void958panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,959struct sysval_uniform *uniform)960{961struct panfrost_context *ctx = batch->ctx;962963uniform->u[0] = ctx->compute_grid->grid[0];964uniform->u[1] = ctx->compute_grid->grid[1];965uniform->u[2] = ctx->compute_grid->grid[2];966}967968static void969panfrost_upload_local_group_size_sysval(struct panfrost_batch *batch,970struct sysval_uniform *uniform)971{972struct panfrost_context *ctx = batch->ctx;973974uniform->u[0] = ctx->compute_grid->block[0];975uniform->u[1] = ctx->compute_grid->block[1];976uniform->u[2] = ctx->compute_grid->block[2];977}978979static void980panfrost_upload_work_dim_sysval(struct panfrost_batch *batch,981struct sysval_uniform *uniform)982{983struct panfrost_context *ctx = batch->ctx;984985uniform->u[0] = ctx->compute_grid->work_dim;986}987988/* Sample positions are pushed in a Bifrost specific format on Bifrost. On989* Midgard, we emulate the Bifrost path with some extra arithmetic in the990* shader, to keep the code as unified as possible. */991992static void993panfrost_upload_sample_positions_sysval(struct panfrost_batch *batch,994struct sysval_uniform *uniform)995{996struct panfrost_context *ctx = batch->ctx;997struct panfrost_device *dev = pan_device(ctx->base.screen);998999unsigned samples = util_framebuffer_get_num_samples(&batch->key);1000uniform->du[0] = panfrost_sample_positions(dev, panfrost_sample_pattern(samples));1001}10021003static void1004panfrost_upload_multisampled_sysval(struct panfrost_batch *batch,1005struct sysval_uniform *uniform)1006{1007unsigned samples = util_framebuffer_get_num_samples(&batch->key);1008uniform->u[0] = samples > 1;1009}10101011static void1012panfrost_upload_rt_conversion_sysval(struct panfrost_batch *batch,1013unsigned size_and_rt, struct sysval_uniform *uniform)1014{1015struct panfrost_context *ctx = batch->ctx;1016struct panfrost_device *dev = pan_device(ctx->base.screen);1017unsigned rt = size_and_rt & 0xF;1018unsigned size = size_and_rt >> 4;10191020if (rt < batch->key.nr_cbufs && batch->key.cbufs[rt]) {1021enum pipe_format format = batch->key.cbufs[rt]->format;1022uniform->u[0] =1023pan_blend_get_bifrost_desc(dev, format, rt, size) >> 32;1024} else {1025pan_pack(&uniform->u[0], BIFROST_INTERNAL_CONVERSION, cfg)1026cfg.memory_format = dev->formats[PIPE_FORMAT_NONE].hw;1027}1028}10291030void1031panfrost_analyze_sysvals(struct panfrost_shader_state *ss)1032{1033unsigned dirty = 0;1034unsigned dirty_shader =1035PAN_DIRTY_STAGE_RENDERER | PAN_DIRTY_STAGE_CONST;10361037for (unsigned i = 0; i < ss->info.sysvals.sysval_count; ++i) {1038switch (PAN_SYSVAL_TYPE(ss->info.sysvals.sysvals[i])) {1039case PAN_SYSVAL_VIEWPORT_SCALE:1040case PAN_SYSVAL_VIEWPORT_OFFSET:1041dirty |= PAN_DIRTY_VIEWPORT;1042break;10431044case PAN_SYSVAL_TEXTURE_SIZE:1045dirty_shader |= PAN_DIRTY_STAGE_TEXTURE;1046break;10471048case PAN_SYSVAL_SSBO:1049dirty_shader |= PAN_DIRTY_STAGE_SSBO;1050break;10511052case PAN_SYSVAL_SAMPLER:1053dirty_shader |= PAN_DIRTY_STAGE_SAMPLER;1054break;10551056case PAN_SYSVAL_IMAGE_SIZE:1057dirty_shader |= PAN_DIRTY_STAGE_IMAGE;1058break;10591060case PAN_SYSVAL_NUM_WORK_GROUPS:1061case PAN_SYSVAL_LOCAL_GROUP_SIZE:1062case PAN_SYSVAL_WORK_DIM:1063case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS:1064dirty |= PAN_DIRTY_PARAMS;1065break;10661067case PAN_SYSVAL_DRAWID:1068dirty |= PAN_DIRTY_DRAWID;1069break;10701071case PAN_SYSVAL_SAMPLE_POSITIONS:1072case PAN_SYSVAL_MULTISAMPLED:1073case PAN_SYSVAL_RT_CONVERSION:1074/* Nothing beyond the batch itself */1075break;1076default:1077unreachable("Invalid sysval");1078}1079}10801081ss->dirty_3d = dirty;1082ss->dirty_shader = dirty_shader;1083}10841085static void1086panfrost_upload_sysvals(struct panfrost_batch *batch,1087const struct panfrost_ptr *ptr,1088struct panfrost_shader_state *ss,1089enum pipe_shader_type st)1090{1091struct sysval_uniform *uniforms = ptr->cpu;10921093for (unsigned i = 0; i < ss->info.sysvals.sysval_count; ++i) {1094int sysval = ss->info.sysvals.sysvals[i];10951096switch (PAN_SYSVAL_TYPE(sysval)) {1097case PAN_SYSVAL_VIEWPORT_SCALE:1098panfrost_upload_viewport_scale_sysval(batch,1099&uniforms[i]);1100break;1101case PAN_SYSVAL_VIEWPORT_OFFSET:1102panfrost_upload_viewport_offset_sysval(batch,1103&uniforms[i]);1104break;1105case PAN_SYSVAL_TEXTURE_SIZE:1106panfrost_upload_txs_sysval(batch, st,1107PAN_SYSVAL_ID(sysval),1108&uniforms[i]);1109break;1110case PAN_SYSVAL_SSBO:1111panfrost_upload_ssbo_sysval(batch, st,1112PAN_SYSVAL_ID(sysval),1113&uniforms[i]);1114break;1115case PAN_SYSVAL_NUM_WORK_GROUPS:1116for (unsigned j = 0; j < 3; j++) {1117batch->num_wg_sysval[j] =1118ptr->gpu + (i * sizeof(*uniforms)) + (j * 4);1119}1120panfrost_upload_num_work_groups_sysval(batch,1121&uniforms[i]);1122break;1123case PAN_SYSVAL_LOCAL_GROUP_SIZE:1124panfrost_upload_local_group_size_sysval(batch,1125&uniforms[i]);1126break;1127case PAN_SYSVAL_WORK_DIM:1128panfrost_upload_work_dim_sysval(batch,1129&uniforms[i]);1130break;1131case PAN_SYSVAL_SAMPLER:1132panfrost_upload_sampler_sysval(batch, st,1133PAN_SYSVAL_ID(sysval),1134&uniforms[i]);1135break;1136case PAN_SYSVAL_IMAGE_SIZE:1137panfrost_upload_image_size_sysval(batch, st,1138PAN_SYSVAL_ID(sysval),1139&uniforms[i]);1140break;1141case PAN_SYSVAL_SAMPLE_POSITIONS:1142panfrost_upload_sample_positions_sysval(batch,1143&uniforms[i]);1144break;1145case PAN_SYSVAL_MULTISAMPLED:1146panfrost_upload_multisampled_sysval(batch,1147&uniforms[i]);1148break;1149case PAN_SYSVAL_RT_CONVERSION:1150panfrost_upload_rt_conversion_sysval(batch,1151PAN_SYSVAL_ID(sysval), &uniforms[i]);1152break;1153case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS:1154batch->ctx->first_vertex_sysval_ptr =1155ptr->gpu + (i * sizeof(*uniforms));1156batch->ctx->base_vertex_sysval_ptr =1157batch->ctx->first_vertex_sysval_ptr + 4;1158batch->ctx->base_instance_sysval_ptr =1159batch->ctx->first_vertex_sysval_ptr + 8;11601161uniforms[i].u[0] = batch->ctx->offset_start;1162uniforms[i].u[1] = batch->ctx->base_vertex;1163uniforms[i].u[2] = batch->ctx->base_instance;1164break;1165case PAN_SYSVAL_DRAWID:1166uniforms[i].u[0] = batch->ctx->drawid;1167break;1168default:1169assert(0);1170}1171}1172}11731174static const void *1175panfrost_map_constant_buffer_cpu(struct panfrost_context *ctx,1176struct panfrost_constant_buffer *buf,1177unsigned index)1178{1179struct pipe_constant_buffer *cb = &buf->cb[index];1180struct panfrost_resource *rsrc = pan_resource(cb->buffer);11811182if (rsrc) {1183panfrost_bo_mmap(rsrc->image.data.bo);1184panfrost_flush_writer(ctx, rsrc);1185panfrost_bo_wait(rsrc->image.data.bo, INT64_MAX, false);11861187return rsrc->image.data.bo->ptr.cpu + cb->buffer_offset;1188} else if (cb->user_buffer) {1189return cb->user_buffer + cb->buffer_offset;1190} else1191unreachable("No constant buffer");1192}11931194static mali_ptr1195panfrost_emit_const_buf(struct panfrost_batch *batch,1196enum pipe_shader_type stage,1197mali_ptr *push_constants)1198{1199struct panfrost_context *ctx = batch->ctx;1200struct panfrost_shader_variants *all = ctx->shader[stage];12011202if (!all)1203return 0;12041205struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];1206struct panfrost_shader_state *ss = &all->variants[all->active_variant];12071208/* Allocate room for the sysval and the uniforms */1209size_t sys_size = sizeof(float) * 4 * ss->info.sysvals.sysval_count;1210struct panfrost_ptr transfer =1211pan_pool_alloc_aligned(&batch->pool.base, sys_size, 16);12121213/* Upload sysvals requested by the shader */1214panfrost_upload_sysvals(batch, &transfer, ss, stage);12151216/* Next up, attach UBOs. UBO count includes gaps but no sysval UBO */1217struct panfrost_shader_state *shader = panfrost_get_shader_state(ctx, stage);1218unsigned ubo_count = shader->info.ubo_count - (sys_size ? 1 : 0);1219unsigned sysval_ubo = sys_size ? ubo_count : ~0;12201221struct panfrost_ptr ubos =1222pan_pool_alloc_desc_array(&batch->pool.base,1223ubo_count + 1,1224UNIFORM_BUFFER);12251226uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;12271228/* Upload sysval as a final UBO */12291230if (sys_size) {1231pan_pack(ubo_ptr + ubo_count, UNIFORM_BUFFER, cfg) {1232cfg.entries = DIV_ROUND_UP(sys_size, 16);1233cfg.pointer = transfer.gpu;1234}1235}12361237/* The rest are honest-to-goodness UBOs */12381239u_foreach_bit(ubo, ss->info.ubo_mask & buf->enabled_mask) {1240size_t usz = buf->cb[ubo].buffer_size;12411242if (usz == 0) {1243ubo_ptr[ubo] = 0;1244continue;1245}12461247/* Issue (57) for the ARB_uniform_buffer_object spec says that1248* the buffer can be larger than the uniform data inside it,1249* so clamp ubo size to what hardware supports. */12501251pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {1252cfg.entries = MIN2(DIV_ROUND_UP(usz, 16), 1 << 12);1253cfg.pointer = panfrost_map_constant_buffer_gpu(batch,1254stage, buf, ubo);1255}1256}12571258if (ss->info.push.count == 0)1259return ubos.gpu;12601261/* Copy push constants required by the shader */1262struct panfrost_ptr push_transfer =1263pan_pool_alloc_aligned(&batch->pool.base,1264ss->info.push.count * 4, 16);12651266uint32_t *push_cpu = (uint32_t *) push_transfer.cpu;1267*push_constants = push_transfer.gpu;12681269for (unsigned i = 0; i < ss->info.push.count; ++i) {1270struct panfrost_ubo_word src = ss->info.push.words[i];12711272if (src.ubo == sysval_ubo) {1273unsigned sysval_idx = src.offset / 16;1274unsigned sysval_comp = (src.offset % 16) / 4;1275unsigned sysval_type = PAN_SYSVAL_TYPE(ss->info.sysvals.sysvals[sysval_idx]);1276mali_ptr ptr = push_transfer.gpu + (4 * i);12771278switch (sysval_type) {1279case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS:1280switch (sysval_comp) {1281case 0:1282batch->ctx->first_vertex_sysval_ptr = ptr;1283break;1284case 1:1285batch->ctx->base_vertex_sysval_ptr = ptr;1286break;1287case 2:1288batch->ctx->base_instance_sysval_ptr = ptr;1289break;1290case 3:1291/* Spurious (Midgard doesn't pack) */1292break;1293default:1294unreachable("Invalid vertex/instance offset component\n");1295}1296break;12971298case PAN_SYSVAL_NUM_WORK_GROUPS:1299batch->num_wg_sysval[sysval_comp] = ptr;1300break;13011302default:1303break;1304}1305}1306/* Map the UBO, this should be cheap. However this is reading1307* from write-combine memory which is _very_ slow. It might pay1308* off to upload sysvals to a staging buffer on the CPU on the1309* assumption sysvals will get pushed (TODO) */13101311const void *mapped_ubo = (src.ubo == sysval_ubo) ? transfer.cpu :1312panfrost_map_constant_buffer_cpu(ctx, buf, src.ubo);13131314/* TODO: Is there any benefit to combining ranges */1315memcpy(push_cpu + i, (uint8_t *) mapped_ubo + src.offset, 4);1316}13171318return ubos.gpu;1319}13201321static mali_ptr1322panfrost_emit_shared_memory(struct panfrost_batch *batch,1323const struct pipe_grid_info *info)1324{1325struct panfrost_context *ctx = batch->ctx;1326struct panfrost_device *dev = pan_device(ctx->base.screen);1327struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];1328struct panfrost_shader_state *ss = &all->variants[all->active_variant];1329struct panfrost_ptr t =1330pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);13311332pan_pack(t.cpu, LOCAL_STORAGE, ls) {1333unsigned wls_single_size =1334util_next_power_of_two(MAX2(ss->info.wls_size, 128));13351336if (ss->info.wls_size) {1337ls.wls_instances =1338util_next_power_of_two(info->grid[0]) *1339util_next_power_of_two(info->grid[1]) *1340util_next_power_of_two(info->grid[2]);13411342ls.wls_size_scale = util_logbase2(wls_single_size) + 1;13431344unsigned wls_size = wls_single_size * ls.wls_instances * dev->core_count;13451346ls.wls_base_pointer =1347(panfrost_batch_get_shared_memory(batch,1348wls_size,13491))->ptr.gpu;1350} else {1351ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM;1352}13531354if (ss->info.tls_size) {1355unsigned shift =1356panfrost_get_stack_shift(ss->info.tls_size);1357struct panfrost_bo *bo =1358panfrost_batch_get_scratchpad(batch,1359ss->info.tls_size,1360dev->thread_tls_alloc,1361dev->core_count);13621363ls.tls_size = shift;1364ls.tls_base_pointer = bo->ptr.gpu;1365}1366};13671368return t.gpu;1369}13701371static mali_ptr1372panfrost_get_tex_desc(struct panfrost_batch *batch,1373enum pipe_shader_type st,1374struct panfrost_sampler_view *view)1375{1376if (!view)1377return (mali_ptr) 0;13781379struct pipe_sampler_view *pview = &view->base;1380struct panfrost_resource *rsrc = pan_resource(pview->texture);13811382panfrost_batch_read_rsrc(batch, rsrc, st);1383panfrost_batch_add_bo(batch, view->state.bo, st);13841385return view->state.gpu;1386}13871388static void1389panfrost_update_sampler_view(struct panfrost_sampler_view *view,1390struct pipe_context *pctx)1391{1392struct panfrost_resource *rsrc = pan_resource(view->base.texture);1393if (view->texture_bo != rsrc->image.data.bo->ptr.gpu ||1394view->modifier != rsrc->image.layout.modifier) {1395panfrost_bo_unreference(view->state.bo);1396panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);1397}1398}13991400static mali_ptr1401panfrost_emit_texture_descriptors(struct panfrost_batch *batch,1402enum pipe_shader_type stage)1403{1404struct panfrost_context *ctx = batch->ctx;1405struct panfrost_device *device = pan_device(ctx->base.screen);14061407if (!ctx->sampler_view_count[stage])1408return 0;14091410if (pan_is_bifrost(device)) {1411struct panfrost_ptr T =1412pan_pool_alloc_desc_array(&batch->pool.base,1413ctx->sampler_view_count[stage],1414BIFROST_TEXTURE);1415struct mali_bifrost_texture_packed *out =1416(struct mali_bifrost_texture_packed *) T.cpu;14171418for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {1419struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];1420struct pipe_sampler_view *pview = &view->base;1421struct panfrost_resource *rsrc = pan_resource(pview->texture);14221423panfrost_update_sampler_view(view, &ctx->base);1424out[i] = view->bifrost_descriptor;14251426panfrost_batch_read_rsrc(batch, rsrc, stage);1427panfrost_batch_add_bo(batch, view->state.bo, stage);1428}14291430return T.gpu;1431} else {1432uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];14331434for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {1435struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];14361437panfrost_update_sampler_view(view, &ctx->base);14381439trampolines[i] = panfrost_get_tex_desc(batch, stage, view);1440}14411442return pan_pool_upload_aligned(&batch->pool.base,1443trampolines,1444sizeof(uint64_t) *1445ctx->sampler_view_count[stage],1446sizeof(uint64_t));1447}1448}14491450static mali_ptr1451panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,1452enum pipe_shader_type stage)1453{1454struct panfrost_context *ctx = batch->ctx;14551456if (!ctx->sampler_count[stage])1457return 0;14581459assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);1460assert(MALI_BIFROST_SAMPLER_ALIGN == MALI_MIDGARD_SAMPLER_ALIGN);14611462struct panfrost_ptr T =1463pan_pool_alloc_desc_array(&batch->pool.base,1464ctx->sampler_count[stage],1465MIDGARD_SAMPLER);1466struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;14671468for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)1469out[i] = ctx->samplers[stage][i]->hw;14701471return T.gpu;1472}14731474/* Packs all image attribute descs and attribute buffer descs.1475* `first_image_buf_index` must be the index of the first image attribute buffer descriptor.1476*/1477static void1478emit_image_attribs(struct panfrost_context *ctx, enum pipe_shader_type shader,1479struct mali_attribute_packed *attribs, unsigned first_buf)1480{1481struct panfrost_device *dev = pan_device(ctx->base.screen);1482unsigned last_bit = util_last_bit(ctx->image_mask[shader]);14831484for (unsigned i = 0; i < last_bit; ++i) {1485enum pipe_format format = ctx->images[shader][i].format;14861487pan_pack(attribs + i, ATTRIBUTE, cfg) {1488/* Continuation record means 2 buffers per image */1489cfg.buffer_index = first_buf + (i * 2);1490cfg.offset_enable = !pan_is_bifrost(dev);1491cfg.format = dev->formats[format].hw;1492}1493}1494}14951496static enum mali_attribute_type1497pan_modifier_to_attr_type(uint64_t modifier)1498{1499switch (modifier) {1500case DRM_FORMAT_MOD_LINEAR:1501return MALI_ATTRIBUTE_TYPE_3D_LINEAR;1502case DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED:1503return MALI_ATTRIBUTE_TYPE_3D_INTERLEAVED;1504default:1505unreachable("Invalid modifier for attribute record");1506}1507}15081509static void1510emit_image_bufs(struct panfrost_batch *batch, enum pipe_shader_type shader,1511struct mali_attribute_buffer_packed *bufs,1512unsigned first_image_buf_index)1513{1514struct panfrost_context *ctx = batch->ctx;1515unsigned last_bit = util_last_bit(ctx->image_mask[shader]);15161517for (unsigned i = 0; i < last_bit; ++i) {1518struct pipe_image_view *image = &ctx->images[shader][i];15191520/* TODO: understand how v3d/freedreno does it */1521if (!(ctx->image_mask[shader] & (1 << i)) ||1522!(image->shader_access & PIPE_IMAGE_ACCESS_READ_WRITE)) {1523/* Unused image bindings */1524pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg);1525pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER, cfg);1526continue;1527}15281529struct panfrost_resource *rsrc = pan_resource(image->resource);15301531/* TODO: MSAA */1532assert(image->resource->nr_samples <= 1 && "MSAA'd images not supported");15331534bool is_3d = rsrc->base.target == PIPE_TEXTURE_3D;1535bool is_buffer = rsrc->base.target == PIPE_BUFFER;15361537unsigned offset = is_buffer ? image->u.buf.offset :1538panfrost_texture_offset(&rsrc->image.layout,1539image->u.tex.level,1540is_3d ? 0 : image->u.tex.first_layer,1541is_3d ? image->u.tex.first_layer : 0);15421543if (image->shader_access & PIPE_IMAGE_ACCESS_WRITE) {1544panfrost_batch_write_rsrc(batch, rsrc, shader);15451546unsigned level = is_buffer ? 0 : image->u.tex.level;1547BITSET_SET(rsrc->valid.data, level);15481549if (is_buffer) {1550util_range_add(&rsrc->base, &rsrc->valid_buffer_range,15510, rsrc->base.width0);1552}1553} else {1554panfrost_batch_read_rsrc(batch, rsrc, shader);1555}15561557pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg) {1558cfg.type = pan_modifier_to_attr_type(rsrc->image.layout.modifier);1559cfg.pointer = rsrc->image.data.bo->ptr.gpu + offset;1560cfg.stride = util_format_get_blocksize(image->format);1561cfg.size = rsrc->image.data.bo->size - offset;1562}15631564if (is_buffer) {1565pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER_CONTINUATION_3D, cfg) {1566cfg.s_dimension = rsrc->base.width0 /1567util_format_get_blocksize(image->format);1568cfg.t_dimension = cfg.r_dimension = 1;1569}15701571continue;1572}15731574pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER_CONTINUATION_3D, cfg) {1575unsigned level = image->u.tex.level;15761577cfg.s_dimension = u_minify(rsrc->base.width0, level);1578cfg.t_dimension = u_minify(rsrc->base.height0, level);1579cfg.r_dimension = is_3d ?1580u_minify(rsrc->base.depth0, level) :1581image->u.tex.last_layer - image->u.tex.first_layer + 1;15821583cfg.row_stride =1584rsrc->image.layout.slices[level].row_stride;15851586if (rsrc->base.target != PIPE_TEXTURE_2D) {1587cfg.slice_stride =1588panfrost_get_layer_stride(&rsrc->image.layout,1589level);1590}1591}1592}1593}15941595static mali_ptr1596panfrost_emit_image_attribs(struct panfrost_batch *batch,1597mali_ptr *buffers,1598enum pipe_shader_type type)1599{1600struct panfrost_context *ctx = batch->ctx;1601struct panfrost_shader_state *shader = panfrost_get_shader_state(ctx, type);16021603if (!shader->info.attribute_count) {1604*buffers = 0;1605return 0;1606}16071608struct panfrost_device *dev = pan_device(ctx->base.screen);16091610/* Images always need a MALI_ATTRIBUTE_BUFFER_CONTINUATION_3D */1611unsigned attr_count = shader->info.attribute_count;1612unsigned buf_count = (attr_count * 2) + (pan_is_bifrost(dev) ? 1 : 0);16131614struct panfrost_ptr bufs =1615pan_pool_alloc_desc_array(&batch->pool.base, buf_count, ATTRIBUTE_BUFFER);16161617struct panfrost_ptr attribs =1618pan_pool_alloc_desc_array(&batch->pool.base, attr_count, ATTRIBUTE);16191620emit_image_attribs(ctx, type, attribs.cpu, 0);1621emit_image_bufs(batch, type, bufs.cpu, 0);16221623/* We need an empty attrib buf to stop the prefetching on Bifrost */1624if (pan_is_bifrost(dev)) {1625pan_pack(bufs.cpu +1626((buf_count - 1) * MALI_ATTRIBUTE_BUFFER_LENGTH),1627ATTRIBUTE_BUFFER, cfg);1628}16291630*buffers = bufs.gpu;1631return attribs.gpu;1632}16331634static mali_ptr1635panfrost_emit_vertex_data(struct panfrost_batch *batch,1636mali_ptr *buffers)1637{1638struct panfrost_context *ctx = batch->ctx;1639struct panfrost_device *dev = pan_device(ctx->base.screen);1640struct panfrost_vertex_state *so = ctx->vertex;1641struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);1642bool instanced = ctx->indirect_draw || ctx->instance_count > 1;1643uint32_t image_mask = ctx->image_mask[PIPE_SHADER_VERTEX];1644unsigned nr_images = util_last_bit(image_mask);16451646/* Worst case: everything is NPOT, which is only possible if instancing1647* is enabled. Otherwise single record is gauranteed.1648* Also, we allocate more memory than what's needed here if either instancing1649* is enabled or images are present, this can be improved. */1650unsigned bufs_per_attrib = (instanced || nr_images > 0) ? 2 : 1;1651unsigned nr_bufs = ((so->nr_bufs + nr_images) * bufs_per_attrib) +1652(pan_is_bifrost(dev) ? 1 : 0);16531654/* Midgard needs vertexid/instanceid handled specially */1655bool special_vbufs = dev->arch < 6 && vs->info.attribute_count >= PAN_VERTEX_ID;16561657if (special_vbufs)1658nr_bufs += 2;16591660if (!nr_bufs) {1661*buffers = 0;1662return 0;1663}16641665struct panfrost_ptr S =1666pan_pool_alloc_desc_array(&batch->pool.base, nr_bufs,1667ATTRIBUTE_BUFFER);1668struct panfrost_ptr T =1669pan_pool_alloc_desc_array(&batch->pool.base,1670vs->info.attribute_count,1671ATTRIBUTE);16721673struct mali_attribute_buffer_packed *bufs =1674(struct mali_attribute_buffer_packed *) S.cpu;16751676struct mali_attribute_packed *out =1677(struct mali_attribute_packed *) T.cpu;16781679unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };1680unsigned k = 0;16811682for (unsigned i = 0; i < so->nr_bufs; ++i) {1683unsigned vbi = so->buffers[i].vbi;1684unsigned divisor = so->buffers[i].divisor;1685attrib_to_buffer[i] = k;16861687if (!(ctx->vb_mask & (1 << vbi)))1688continue;16891690struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];1691struct panfrost_resource *rsrc;16921693rsrc = pan_resource(buf->buffer.resource);1694if (!rsrc)1695continue;16961697panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);16981699/* Mask off lower bits, see offset fixup below */1700mali_ptr raw_addr = rsrc->image.data.bo->ptr.gpu + buf->buffer_offset;1701mali_ptr addr = raw_addr & ~63;17021703/* Since we advanced the base pointer, we shrink the buffer1704* size, but add the offset we subtracted */1705unsigned size = rsrc->base.width0 + (raw_addr - addr)1706- buf->buffer_offset;17071708/* When there is a divisor, the hardware-level divisor is1709* the product of the instance divisor and the padded count */1710unsigned stride = buf->stride;17111712if (ctx->indirect_draw) {1713/* We allocated 2 records for each attribute buffer */1714assert((k & 1) == 0);17151716/* With indirect draws we can't guess the vertex_count.1717* Pre-set the address, stride and size fields, the1718* compute shader do the rest.1719*/1720pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {1721cfg.type = MALI_ATTRIBUTE_TYPE_1D;1722cfg.pointer = addr;1723cfg.stride = stride;1724cfg.size = size;1725}17261727/* We store the unmodified divisor in the continuation1728* slot so the compute shader can retrieve it.1729*/1730pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {1731cfg.divisor = divisor;1732}17331734k += 2;1735continue;1736}17371738unsigned hw_divisor = ctx->padded_count * divisor;17391740if (ctx->instance_count <= 1) {1741/* Per-instance would be every attribute equal */1742if (divisor)1743stride = 0;17441745pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {1746cfg.pointer = addr;1747cfg.stride = stride;1748cfg.size = size;1749}1750} else if (!divisor) {1751pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {1752cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;1753cfg.pointer = addr;1754cfg.stride = stride;1755cfg.size = size;1756cfg.divisor = ctx->padded_count;1757}1758} else if (util_is_power_of_two_or_zero(hw_divisor)) {1759pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {1760cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;1761cfg.pointer = addr;1762cfg.stride = stride;1763cfg.size = size;1764cfg.divisor_r = __builtin_ctz(hw_divisor);1765}17661767} else {1768unsigned shift = 0, extra_flags = 0;17691770unsigned magic_divisor =1771panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);17721773/* Records with continuations must be aligned */1774k = ALIGN_POT(k, 2);1775attrib_to_buffer[i] = k;17761777pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {1778cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;1779cfg.pointer = addr;1780cfg.stride = stride;1781cfg.size = size;17821783cfg.divisor_r = shift;1784cfg.divisor_e = extra_flags;1785}17861787pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {1788cfg.divisor_numerator = magic_divisor;1789cfg.divisor = divisor;1790}17911792++k;1793}17941795++k;1796}17971798/* Add special gl_VertexID/gl_InstanceID buffers */1799if (unlikely(special_vbufs)) {1800panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);18011802pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {1803cfg.buffer_index = k++;1804cfg.format = so->formats[PAN_VERTEX_ID];1805}18061807panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);18081809pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {1810cfg.buffer_index = k++;1811cfg.format = so->formats[PAN_INSTANCE_ID];1812}1813}18141815k = ALIGN_POT(k, 2);1816emit_image_attribs(ctx, PIPE_SHADER_VERTEX, out + so->num_elements, k);1817emit_image_bufs(batch, PIPE_SHADER_VERTEX, bufs + k, k);1818k += (util_last_bit(ctx->image_mask[PIPE_SHADER_VERTEX]) * 2);18191820/* We need an empty attrib buf to stop the prefetching on Bifrost */1821if (pan_is_bifrost(dev))1822pan_pack(&bufs[k], ATTRIBUTE_BUFFER, cfg);18231824/* Attribute addresses require 64-byte alignment, so let:1825*1826* base' = base & ~63 = base - (base & 63)1827* offset' = offset + (base & 63)1828*1829* Since base' + offset' = base + offset, these are equivalent1830* addressing modes and now base is 64 aligned.1831*/18321833for (unsigned i = 0; i < so->num_elements; ++i) {1834unsigned vbi = so->pipe[i].vertex_buffer_index;1835struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];18361837/* BOs are aligned; just fixup for buffer_offset */1838signed src_offset = so->pipe[i].src_offset;1839src_offset += (buf->buffer_offset & 63);18401841/* Base instance offset */1842if (ctx->base_instance && so->pipe[i].instance_divisor) {1843src_offset += (ctx->base_instance * buf->stride) /1844so->pipe[i].instance_divisor;1845}18461847/* Also, somewhat obscurely per-instance data needs to be1848* offset in response to a delayed start in an indexed draw */18491850if (so->pipe[i].instance_divisor && ctx->instance_count > 1)1851src_offset -= buf->stride * ctx->offset_start;18521853pan_pack(out + i, ATTRIBUTE, cfg) {1854cfg.buffer_index = attrib_to_buffer[so->element_buffer[i]];1855cfg.format = so->formats[i];1856cfg.offset = src_offset;1857}1858}18591860*buffers = S.gpu;1861return T.gpu;1862}18631864static mali_ptr1865panfrost_emit_varyings(struct panfrost_batch *batch,1866struct mali_attribute_buffer_packed *slot,1867unsigned stride, unsigned count)1868{1869unsigned size = stride * count;1870mali_ptr ptr =1871batch->ctx->indirect_draw ? 0 :1872pan_pool_alloc_aligned(&batch->invisible_pool.base, size, 64).gpu;18731874pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {1875cfg.stride = stride;1876cfg.size = size;1877cfg.pointer = ptr;1878}18791880return ptr;1881}18821883static unsigned1884panfrost_xfb_offset(unsigned stride, struct pipe_stream_output_target *target)1885{1886return target->buffer_offset + (pan_so_target(target)->offset * stride);1887}18881889static void1890panfrost_emit_streamout(struct panfrost_batch *batch,1891struct mali_attribute_buffer_packed *slot,1892unsigned stride, unsigned count,1893struct pipe_stream_output_target *target)1894{1895unsigned max_size = target->buffer_size;1896unsigned expected_size = stride * count;18971898/* Grab the BO and bind it to the batch */1899struct panfrost_resource *rsrc = pan_resource(target->buffer);1900struct panfrost_bo *bo = rsrc->image.data.bo;19011902panfrost_batch_write_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);1903panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_FRAGMENT);19041905unsigned offset = panfrost_xfb_offset(stride, target);19061907pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {1908cfg.pointer = bo->ptr.gpu + (offset & ~63);1909cfg.stride = stride;1910cfg.size = MIN2(max_size, expected_size) + (offset & 63);19111912util_range_add(&rsrc->base, &rsrc->valid_buffer_range,1913offset, cfg.size);1914}1915}19161917/* Helpers for manipulating stream out information so we can pack varyings1918* accordingly. Compute the src_offset for a given captured varying */19191920static struct pipe_stream_output *1921pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)1922{1923for (unsigned i = 0; i < info->num_outputs; ++i) {1924if (info->output[i].register_index == loc)1925return &info->output[i];1926}19271928unreachable("Varying not captured");1929}19301931/* Given a varying, figure out which index it corresponds to */19321933static inline unsigned1934pan_varying_index(unsigned present, enum pan_special_varying v)1935{1936return util_bitcount(present & BITFIELD_MASK(v));1937}19381939/* Get the base offset for XFB buffers, which by convention come after1940* everything else. Wrapper function for semantic reasons; by construction this1941* is just popcount. */19421943static inline unsigned1944pan_xfb_base(unsigned present)1945{1946return util_bitcount(present);1947}19481949/* Determines which varying buffers are required */19501951static inline unsigned1952pan_varying_present(const struct panfrost_device *dev,1953struct pan_shader_info *producer,1954struct pan_shader_info *consumer,1955uint16_t point_coord_mask)1956{1957/* At the moment we always emit general and position buffers. Not1958* strictly necessary but usually harmless */19591960unsigned present = BITFIELD_BIT(PAN_VARY_GENERAL) | BITFIELD_BIT(PAN_VARY_POSITION);19611962/* Enable special buffers by the shader info */19631964if (producer->vs.writes_point_size)1965present |= BITFIELD_BIT(PAN_VARY_PSIZ);19661967/* On Bifrost, special fragment varyings are replaced by LD_VAR_SPECIAL */1968if (pan_is_bifrost(dev))1969return present;19701971/* On Midgard, these exist as real varyings */1972if (consumer->fs.reads_point_coord)1973present |= BITFIELD_BIT(PAN_VARY_PNTCOORD);19741975if (consumer->fs.reads_face)1976present |= BITFIELD_BIT(PAN_VARY_FACE);19771978if (consumer->fs.reads_frag_coord)1979present |= BITFIELD_BIT(PAN_VARY_FRAGCOORD);19801981/* Also, if we have a point sprite, we need a point coord buffer */19821983for (unsigned i = 0; i < consumer->varyings.input_count; i++) {1984gl_varying_slot loc = consumer->varyings.input[i].location;19851986if (util_varying_is_point_coord(loc, point_coord_mask))1987present |= BITFIELD_BIT(PAN_VARY_PNTCOORD);1988}19891990return present;1991}19921993/* Emitters for varying records */19941995static void1996pan_emit_vary(const struct panfrost_device *dev,1997struct mali_attribute_packed *out,1998unsigned buffer_index,1999mali_pixel_format format, unsigned offset)2000{2001pan_pack(out, ATTRIBUTE, cfg) {2002cfg.buffer_index = buffer_index;2003cfg.offset_enable = !pan_is_bifrost(dev);2004cfg.format = format;2005cfg.offset = offset;2006}2007}20082009/* Special records */20102011static const struct {2012unsigned components;2013enum mali_format format;2014} pan_varying_formats[PAN_VARY_MAX] = {2015[PAN_VARY_POSITION] = { 4, MALI_SNAP_4 },2016[PAN_VARY_PSIZ] = { 1, MALI_R16F },2017[PAN_VARY_PNTCOORD] = { 1, MALI_R16F },2018[PAN_VARY_FACE] = { 1, MALI_R32I },2019[PAN_VARY_FRAGCOORD] = { 4, MALI_RGBA32F },2020};20212022static mali_pixel_format2023pan_special_format(const struct panfrost_device *dev,2024enum pan_special_varying buf)2025{2026assert(buf < PAN_VARY_MAX);2027mali_pixel_format format = (pan_varying_formats[buf].format << 12);20282029if (dev->quirks & HAS_SWIZZLES) {2030unsigned nr = pan_varying_formats[buf].components;2031format |= panfrost_get_default_swizzle(nr);2032}20332034return format;2035}20362037static void2038pan_emit_vary_special(const struct panfrost_device *dev,2039struct mali_attribute_packed *out,2040unsigned present, enum pan_special_varying buf)2041{2042pan_emit_vary(dev, out, pan_varying_index(present, buf),2043pan_special_format(dev, buf), 0);2044}20452046/* Negative indicates a varying is not found */20472048static signed2049pan_find_vary(const struct pan_shader_varying *vary,2050unsigned vary_count, unsigned loc)2051{2052for (unsigned i = 0; i < vary_count; ++i) {2053if (vary[i].location == loc)2054return i;2055}20562057return -1;2058}20592060/* Assign varying locations for the general buffer. Returns the calculated2061* per-vertex stride, and outputs offsets into the passed array. Negative2062* offset indicates a varying is not used. */20632064static unsigned2065pan_assign_varyings(const struct panfrost_device *dev,2066struct pan_shader_info *producer,2067struct pan_shader_info *consumer,2068signed *offsets)2069{2070unsigned producer_count = producer->varyings.output_count;2071unsigned consumer_count = consumer->varyings.input_count;20722073const struct pan_shader_varying *producer_vars = producer->varyings.output;2074const struct pan_shader_varying *consumer_vars = consumer->varyings.input;20752076unsigned stride = 0;20772078for (unsigned i = 0; i < producer_count; ++i) {2079signed loc = pan_find_vary(consumer_vars, consumer_count,2080producer_vars[i].location);20812082if (loc >= 0) {2083offsets[i] = stride;20842085enum pipe_format format = consumer_vars[loc].format;2086stride += util_format_get_blocksize(format);2087} else {2088offsets[i] = -1;2089}2090}20912092return stride;2093}20942095/* Emitter for a single varying (attribute) descriptor */20962097static void2098panfrost_emit_varying(const struct panfrost_device *dev,2099struct mali_attribute_packed *out,2100const struct pan_shader_varying varying,2101enum pipe_format pipe_format,2102unsigned present,2103uint16_t point_sprite_mask,2104struct pipe_stream_output_info *xfb,2105uint64_t xfb_loc_mask,2106unsigned max_xfb,2107unsigned *xfb_offsets,2108signed offset,2109enum pan_special_varying pos_varying)2110{2111/* Note: varying.format != pipe_format in some obscure cases due to a2112* limitation of the NIR linker. This should be fixed in the future to2113* eliminate the additional lookups. See:2114* dEQP-GLES3.functional.shaders.conditionals.if.sequence_statements_vertex2115*/2116gl_varying_slot loc = varying.location;2117mali_pixel_format format = dev->formats[pipe_format].hw;21182119struct pipe_stream_output *o = (xfb_loc_mask & BITFIELD64_BIT(loc)) ?2120pan_get_so(xfb, loc) : NULL;21212122if (util_varying_is_point_coord(loc, point_sprite_mask)) {2123pan_emit_vary_special(dev, out, present, PAN_VARY_PNTCOORD);2124} else if (o && o->output_buffer < max_xfb) {2125unsigned fixup_offset = xfb_offsets[o->output_buffer] & 63;21262127pan_emit_vary(dev, out,2128pan_xfb_base(present) + o->output_buffer,2129format, (o->dst_offset * 4) + fixup_offset);2130} else if (loc == VARYING_SLOT_POS) {2131pan_emit_vary_special(dev, out, present, pos_varying);2132} else if (loc == VARYING_SLOT_PSIZ) {2133pan_emit_vary_special(dev, out, present, PAN_VARY_PSIZ);2134} else if (loc == VARYING_SLOT_FACE) {2135pan_emit_vary_special(dev, out, present, PAN_VARY_FACE);2136} else if (offset < 0) {2137pan_emit_vary(dev, out, 0, (MALI_CONSTANT << 12), 0);2138} else {2139STATIC_ASSERT(PAN_VARY_GENERAL == 0);2140pan_emit_vary(dev, out, 0, format, offset);2141}2142}21432144/* Links varyings and uploads ATTRIBUTE descriptors. Can execute at link time,2145* rather than draw time (under good conditions). */21462147static void2148panfrost_emit_varying_descs(2149struct panfrost_pool *pool,2150struct panfrost_shader_state *producer,2151struct panfrost_shader_state *consumer,2152struct panfrost_streamout *xfb,2153uint16_t point_coord_mask,2154struct pan_linkage *out)2155{2156struct panfrost_device *dev = pool->base.dev;2157struct pipe_stream_output_info *xfb_info = &producer->stream_output;2158unsigned producer_count = producer->info.varyings.output_count;2159unsigned consumer_count = consumer->info.varyings.input_count;21602161/* Offsets within the general varying buffer, indexed by location */2162signed offsets[PIPE_MAX_ATTRIBS];2163assert(producer_count < ARRAY_SIZE(offsets));2164assert(consumer_count < ARRAY_SIZE(offsets));21652166/* Allocate enough descriptors for both shader stages */2167struct panfrost_ptr T =2168pan_pool_alloc_desc_array(&pool->base,2169producer_count + consumer_count,2170ATTRIBUTE);21712172/* Take a reference if we're being put on the CSO */2173if (!pool->owned) {2174out->bo = pool->transient_bo;2175panfrost_bo_reference(out->bo);2176}21772178struct mali_attribute_packed *descs = T.cpu;2179out->producer = producer_count ? T.gpu : 0;2180out->consumer = consumer_count ? T.gpu +2181(MALI_ATTRIBUTE_LENGTH * producer_count) : 0;21822183/* Lay out the varyings. Must use producer to lay out, in order to2184* respect transform feedback precisions. */2185out->present = pan_varying_present(dev, &producer->info,2186&consumer->info, point_coord_mask);21872188out->stride = pan_assign_varyings(dev, &producer->info,2189&consumer->info, offsets);21902191unsigned xfb_offsets[PIPE_MAX_SO_BUFFERS];21922193for (unsigned i = 0; i < xfb->num_targets; ++i) {2194xfb_offsets[i] = panfrost_xfb_offset(xfb_info->stride[i] * 4,2195xfb->targets[i]);2196}21972198for (unsigned i = 0; i < producer_count; ++i) {2199signed j = pan_find_vary(consumer->info.varyings.input,2200consumer->info.varyings.input_count,2201producer->info.varyings.output[i].location);22022203enum pipe_format format = (j >= 0) ?2204consumer->info.varyings.input[j].format :2205producer->info.varyings.output[i].format;22062207panfrost_emit_varying(dev, descs + i,2208producer->info.varyings.output[i], format,2209out->present, 0, &producer->stream_output,2210producer->so_mask, xfb->num_targets,2211xfb_offsets, offsets[i], PAN_VARY_POSITION);2212}22132214for (unsigned i = 0; i < consumer_count; ++i) {2215signed j = pan_find_vary(producer->info.varyings.output,2216producer->info.varyings.output_count,2217consumer->info.varyings.input[i].location);22182219signed offset = (j >= 0) ? offsets[j] : -1;22202221panfrost_emit_varying(dev, descs + producer_count + i,2222consumer->info.varyings.input[i],2223consumer->info.varyings.input[i].format,2224out->present, point_coord_mask,2225&producer->stream_output, producer->so_mask,2226xfb->num_targets, xfb_offsets, offset,2227PAN_VARY_FRAGCOORD);2228}2229}22302231static void2232pan_emit_special_input(struct mali_attribute_buffer_packed *out,2233unsigned present,2234enum pan_special_varying v,2235unsigned special)2236{2237if (present & BITFIELD_BIT(v)) {2238unsigned idx = pan_varying_index(present, v);22392240pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {2241cfg.special = special;2242cfg.type = 0;2243}2244}2245}22462247static void2248panfrost_emit_varying_descriptor(struct panfrost_batch *batch,2249unsigned vertex_count,2250mali_ptr *vs_attribs,2251mali_ptr *fs_attribs,2252mali_ptr *buffers,2253unsigned *buffer_count,2254mali_ptr *position,2255mali_ptr *psiz,2256bool point_coord_replace)2257{2258/* Load the shaders */2259struct panfrost_context *ctx = batch->ctx;2260struct panfrost_device *dev = pan_device(ctx->base.screen);2261struct panfrost_shader_state *vs, *fs;22622263vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);2264fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);22652266uint16_t point_coord_mask = ctx->rasterizer->base.sprite_coord_enable;22672268/* TODO: point sprites need lowering on Bifrost */2269if (!point_coord_replace || pan_is_bifrost(dev))2270point_coord_mask = 0;22712272/* In good conditions, we only need to link varyings once */2273bool prelink =2274(point_coord_mask == 0) &&2275(ctx->streamout.num_targets == 0) &&2276!vs->info.separable &&2277!fs->info.separable;22782279/* Try to reduce copies */2280struct pan_linkage _linkage;2281struct pan_linkage *linkage = prelink ? &vs->linkage : &_linkage;22822283/* Emit ATTRIBUTE descriptors if needed */2284if (!prelink || vs->linkage.bo == NULL) {2285struct panfrost_pool *pool =2286prelink ? &ctx->descs : &batch->pool;22872288panfrost_emit_varying_descs(pool, vs, fs, &ctx->streamout, point_coord_mask, linkage);2289}22902291struct pipe_stream_output_info *so = &vs->stream_output;2292unsigned present = linkage->present, stride = linkage->stride;2293unsigned xfb_base = pan_xfb_base(present);2294struct panfrost_ptr T =2295pan_pool_alloc_desc_array(&batch->pool.base,2296xfb_base +2297ctx->streamout.num_targets + 1,2298ATTRIBUTE_BUFFER);2299struct mali_attribute_buffer_packed *varyings =2300(struct mali_attribute_buffer_packed *) T.cpu;23012302if (buffer_count)2303*buffer_count = xfb_base + ctx->streamout.num_targets;23042305/* Suppress prefetch on Bifrost */2306memset(varyings + (xfb_base * ctx->streamout.num_targets), 0, sizeof(*varyings));23072308/* Emit the stream out buffers. We need enough room for all the2309* vertices we emit across all instances */23102311unsigned out_count = ctx->instance_count *2312u_stream_outputs_for_vertices(ctx->active_prim, ctx->vertex_count);23132314for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {2315panfrost_emit_streamout(batch, &varyings[xfb_base + i],2316so->stride[i] * 4,2317out_count,2318ctx->streamout.targets[i]);2319}23202321if (stride) {2322panfrost_emit_varyings(batch,2323&varyings[pan_varying_index(present, PAN_VARY_GENERAL)],2324stride, vertex_count);2325}23262327/* fp32 vec4 gl_Position */2328*position = panfrost_emit_varyings(batch,2329&varyings[pan_varying_index(present, PAN_VARY_POSITION)],2330sizeof(float) * 4, vertex_count);23312332if (present & BITFIELD_BIT(PAN_VARY_PSIZ)) {2333*psiz = panfrost_emit_varyings(batch,2334&varyings[pan_varying_index(present, PAN_VARY_PSIZ)],23352, vertex_count);2336}23372338pan_emit_special_input(varyings, present,2339PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);2340pan_emit_special_input(varyings, present, PAN_VARY_FACE,2341MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);2342pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD,2343MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);23442345*buffers = T.gpu;2346*vs_attribs = linkage->producer;2347*fs_attribs = linkage->consumer;2348}23492350static void2351panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,2352const struct panfrost_ptr *vertex_job,2353const struct panfrost_ptr *tiler_job)2354{2355struct panfrost_context *ctx = batch->ctx;23562357/* If rasterizer discard is enable, only submit the vertex. XXX - set2358* job_barrier in case buffers get ping-ponged and we need to enforce2359* ordering, this has a perf hit! See2360* KHR-GLES31.core.vertex_attrib_binding.advanced-iterations */23612362unsigned vertex = panfrost_add_job(&batch->pool.base, &batch->scoreboard,2363MALI_JOB_TYPE_VERTEX, true, false,2364ctx->indirect_draw ?2365batch->indirect_draw_job_id : 0,23660, vertex_job, false);23672368if (ctx->rasterizer->base.rasterizer_discard || batch->scissor_culls_everything)2369return;23702371panfrost_add_job(&batch->pool.base, &batch->scoreboard,2372MALI_JOB_TYPE_TILER, false, false,2373vertex, 0, tiler_job, false);2374}23752376static void2377emit_tls(struct panfrost_batch *batch)2378{2379struct panfrost_device *dev = pan_device(batch->ctx->base.screen);23802381/* Emitted with the FB descriptor on Midgard. */2382if (!pan_is_bifrost(dev) && batch->framebuffer.gpu)2383return;23842385struct panfrost_bo *tls_bo =2386batch->stack_size ?2387panfrost_batch_get_scratchpad(batch,2388batch->stack_size,2389dev->thread_tls_alloc,2390dev->core_count):2391NULL;2392struct pan_tls_info tls = {2393.tls = {2394.ptr = tls_bo ? tls_bo->ptr.gpu : 0,2395.size = batch->stack_size,2396},2397};23982399assert(batch->tls.cpu);2400pan_emit_tls(dev, &tls, batch->tls.cpu);2401}24022403static void2404emit_fbd(struct panfrost_batch *batch, const struct pan_fb_info *fb)2405{2406struct panfrost_device *dev = pan_device(batch->ctx->base.screen);2407struct panfrost_bo *tls_bo =2408batch->stack_size ?2409panfrost_batch_get_scratchpad(batch,2410batch->stack_size,2411dev->thread_tls_alloc,2412dev->core_count):2413NULL;2414struct pan_tls_info tls = {2415.tls = {2416.ptr = tls_bo ? tls_bo->ptr.gpu : 0,2417.size = batch->stack_size,2418},2419};24202421batch->framebuffer.gpu |=2422pan_emit_fbd(dev, fb, &tls, &batch->tiler_ctx,2423batch->framebuffer.cpu);2424}24252426/* Mark a surface as written */24272428static void2429panfrost_initialize_surface(struct panfrost_batch *batch,2430struct pipe_surface *surf)2431{2432if (surf) {2433struct panfrost_resource *rsrc = pan_resource(surf->texture);2434BITSET_SET(rsrc->valid.data, surf->u.tex.level);2435}2436}24372438/* Generate a fragment job. This should be called once per frame. (According to2439* presentations, this is supposed to correspond to eglSwapBuffers) */24402441static mali_ptr2442emit_fragment_job(struct panfrost_batch *batch, const struct pan_fb_info *pfb)2443{2444struct panfrost_device *dev = pan_device(batch->ctx->base.screen);24452446/* Mark the affected buffers as initialized, since we're writing to it.2447* Also, add the surfaces we're writing to to the batch */24482449struct pipe_framebuffer_state *fb = &batch->key;24502451for (unsigned i = 0; i < fb->nr_cbufs; ++i)2452panfrost_initialize_surface(batch, fb->cbufs[i]);24532454panfrost_initialize_surface(batch, fb->zsbuf);24552456/* The passed tile coords can be out of range in some cases, so we need2457* to clamp them to the framebuffer size to avoid a TILE_RANGE_FAULT.2458* Theoretically we also need to clamp the coordinates positive, but we2459* avoid that edge case as all four values are unsigned. Also,2460* theoretically we could clamp the minima, but if that has to happen2461* the asserts would fail anyway (since the maxima would get clamped2462* and then be smaller than the minima). An edge case of sorts occurs2463* when no scissors are added to draw, so by default min=~0 and max=0.2464* But that can't happen if any actual drawing occurs (beyond a2465* wallpaper reload), so this is again irrelevant in practice. */24662467batch->maxx = MIN2(batch->maxx, fb->width);2468batch->maxy = MIN2(batch->maxy, fb->height);24692470/* Rendering region must be at least 1x1; otherwise, there is nothing2471* to do and the whole job chain should have been discarded. */24722473assert(batch->maxx > batch->minx);2474assert(batch->maxy > batch->miny);24752476struct panfrost_ptr transfer =2477pan_pool_alloc_desc(&batch->pool.base, FRAGMENT_JOB);24782479pan_emit_fragment_job(dev, pfb, batch->framebuffer.gpu,2480transfer.cpu);24812482return transfer.gpu;2483}24842485#define DEFINE_CASE(c) case PIPE_PRIM_##c: return MALI_DRAW_MODE_##c;24862487static uint8_t2488pan_draw_mode(enum pipe_prim_type mode)2489{2490switch (mode) {2491DEFINE_CASE(POINTS);2492DEFINE_CASE(LINES);2493DEFINE_CASE(LINE_LOOP);2494DEFINE_CASE(LINE_STRIP);2495DEFINE_CASE(TRIANGLES);2496DEFINE_CASE(TRIANGLE_STRIP);2497DEFINE_CASE(TRIANGLE_FAN);2498DEFINE_CASE(QUADS);2499DEFINE_CASE(QUAD_STRIP);2500DEFINE_CASE(POLYGON);25012502default:2503unreachable("Invalid draw mode");2504}2505}25062507#undef DEFINE_CASE25082509/* Count generated primitives (when there is no geom/tess shaders) for2510* transform feedback */25112512static void2513panfrost_statistics_record(2514struct panfrost_context *ctx,2515const struct pipe_draw_info *info,2516const struct pipe_draw_start_count_bias *draw)2517{2518if (!ctx->active_queries)2519return;25202521uint32_t prims = u_prims_for_vertices(info->mode, draw->count);2522ctx->prims_generated += prims;25232524if (!ctx->streamout.num_targets)2525return;25262527ctx->tf_prims_generated += prims;2528}25292530static void2531panfrost_update_streamout_offsets(struct panfrost_context *ctx)2532{2533for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {2534unsigned count;25352536count = u_stream_outputs_for_vertices(ctx->active_prim,2537ctx->vertex_count);2538pan_so_target(ctx->streamout.targets[i])->offset += count;2539}2540}25412542static inline void2543pan_emit_draw_descs(struct panfrost_batch *batch,2544struct MALI_DRAW *d, enum pipe_shader_type st)2545{2546d->offset_start = batch->ctx->offset_start;2547d->instance_size = batch->ctx->instance_count > 1 ?2548batch->ctx->padded_count : 1;25492550d->uniform_buffers = batch->uniform_buffers[st];2551d->push_uniforms = batch->push_uniforms[st];2552d->textures = batch->textures[st];2553d->samplers = batch->samplers[st];2554}25552556static inline enum mali_index_type2557panfrost_translate_index_size(unsigned size)2558{2559STATIC_ASSERT(MALI_INDEX_TYPE_NONE == 0);2560STATIC_ASSERT(MALI_INDEX_TYPE_UINT8 == 1);2561STATIC_ASSERT(MALI_INDEX_TYPE_UINT16 == 2);25622563return (size == 4) ? MALI_INDEX_TYPE_UINT32 : size;2564}25652566static void2567panfrost_draw_emit_vertex(struct panfrost_batch *batch,2568const struct pipe_draw_info *info,2569void *invocation_template,2570mali_ptr vs_vary, mali_ptr varyings,2571mali_ptr attribs, mali_ptr attrib_bufs,2572void *job)2573{2574struct panfrost_context *ctx = batch->ctx;2575struct panfrost_device *device = pan_device(ctx->base.screen);25762577void *section =2578pan_section_ptr(job, COMPUTE_JOB, INVOCATION);2579memcpy(section, invocation_template, MALI_INVOCATION_LENGTH);25802581pan_section_pack(job, COMPUTE_JOB, PARAMETERS, cfg) {2582cfg.job_task_split = 5;2583}25842585pan_section_pack(job, COMPUTE_JOB, DRAW, cfg) {2586cfg.draw_descriptor_is_64b = true;2587if (!pan_is_bifrost(device))2588cfg.texture_descriptor_is_64b = true;2589cfg.state = batch->rsd[PIPE_SHADER_VERTEX];2590cfg.attributes = attribs;2591cfg.attribute_buffers = attrib_bufs;2592cfg.varyings = vs_vary;2593cfg.varying_buffers = vs_vary ? varyings : 0;2594cfg.thread_storage = batch->tls.gpu;2595pan_emit_draw_descs(batch, &cfg, PIPE_SHADER_VERTEX);2596}25972598pan_section_pack(job, COMPUTE_JOB, DRAW_PADDING, cfg);2599}26002601static void2602panfrost_emit_primitive_size(struct panfrost_context *ctx,2603bool points, mali_ptr size_array,2604void *prim_size)2605{2606struct panfrost_rasterizer *rast = ctx->rasterizer;26072608pan_pack(prim_size, PRIMITIVE_SIZE, cfg) {2609if (panfrost_writes_point_size(ctx)) {2610cfg.size_array = size_array;2611} else {2612cfg.constant = points ?2613rast->base.point_size :2614rast->base.line_width;2615}2616}2617}26182619static bool2620panfrost_is_implicit_prim_restart(const struct pipe_draw_info *info)2621{2622unsigned implicit_index = (1 << (info->index_size * 8)) - 1;2623bool implicit = info->restart_index == implicit_index;2624return info->primitive_restart && implicit;2625}26262627static inline void2628panfrost_update_state_tex(struct panfrost_batch *batch,2629enum pipe_shader_type st)2630{2631struct panfrost_context *ctx = batch->ctx;2632struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);26332634unsigned dirty_3d = ctx->dirty;2635unsigned dirty = ctx->dirty_shader[st];26362637if (dirty & PAN_DIRTY_STAGE_TEXTURE) {2638batch->textures[st] =2639panfrost_emit_texture_descriptors(batch, st);2640}26412642if (dirty & PAN_DIRTY_STAGE_SAMPLER) {2643batch->samplers[st] =2644panfrost_emit_sampler_descriptors(batch, st);2645}26462647if ((dirty & ss->dirty_shader) || (dirty_3d & ss->dirty_3d)) {2648batch->uniform_buffers[st] = panfrost_emit_const_buf(batch, st,2649&batch->push_uniforms[st]);2650}2651}26522653static inline void2654panfrost_update_state_3d(struct panfrost_batch *batch)2655{2656unsigned dirty = batch->ctx->dirty;26572658if (dirty & (PAN_DIRTY_VIEWPORT | PAN_DIRTY_SCISSOR))2659batch->viewport = panfrost_emit_viewport(batch);26602661if (dirty & PAN_DIRTY_TLS_SIZE)2662panfrost_batch_adjust_stack_size(batch);2663}26642665static void2666panfrost_update_state_vs(struct panfrost_batch *batch)2667{2668enum pipe_shader_type st = PIPE_SHADER_VERTEX;2669unsigned dirty = batch->ctx->dirty_shader[st];26702671if (dirty & PAN_DIRTY_STAGE_RENDERER)2672batch->rsd[st] = panfrost_emit_compute_shader_meta(batch, st);26732674panfrost_update_state_tex(batch, st);2675}26762677static void2678panfrost_update_state_fs(struct panfrost_batch *batch)2679{2680enum pipe_shader_type st = PIPE_SHADER_FRAGMENT;2681unsigned dirty = batch->ctx->dirty_shader[st];26822683if (dirty & PAN_DIRTY_STAGE_RENDERER)2684batch->rsd[st] = panfrost_emit_frag_shader_meta(batch);26852686if (dirty & PAN_DIRTY_STAGE_IMAGE) {2687batch->attribs[st] = panfrost_emit_image_attribs(batch,2688&batch->attrib_bufs[st], st);2689}26902691panfrost_update_state_tex(batch, st);2692}26932694static void2695panfrost_draw_emit_tiler(struct panfrost_batch *batch,2696const struct pipe_draw_info *info,2697const struct pipe_draw_start_count_bias *draw,2698void *invocation_template,2699mali_ptr indices, mali_ptr fs_vary, mali_ptr varyings,2700mali_ptr pos, mali_ptr psiz, void *job)2701{2702struct panfrost_context *ctx = batch->ctx;2703struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;2704struct panfrost_device *device = pan_device(ctx->base.screen);27052706void *section = pan_is_bifrost(device) ?2707pan_section_ptr(job, BIFROST_TILER_JOB, INVOCATION) :2708pan_section_ptr(job, MIDGARD_TILER_JOB, INVOCATION);2709memcpy(section, invocation_template, MALI_INVOCATION_LENGTH);27102711section = pan_is_bifrost(device) ?2712pan_section_ptr(job, BIFROST_TILER_JOB, PRIMITIVE) :2713pan_section_ptr(job, MIDGARD_TILER_JOB, PRIMITIVE);2714pan_pack(section, PRIMITIVE, cfg) {2715cfg.draw_mode = pan_draw_mode(info->mode);2716if (panfrost_writes_point_size(ctx))2717cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16;27182719/* For line primitives, PRIMITIVE.first_provoking_vertex must2720* be set to true and the provoking vertex is selected with2721* DRAW.flat_shading_vertex.2722*/2723if (info->mode == PIPE_PRIM_LINES ||2724info->mode == PIPE_PRIM_LINE_LOOP ||2725info->mode == PIPE_PRIM_LINE_STRIP)2726cfg.first_provoking_vertex = true;2727else2728cfg.first_provoking_vertex = rast->flatshade_first;27292730if (panfrost_is_implicit_prim_restart(info)) {2731cfg.primitive_restart = MALI_PRIMITIVE_RESTART_IMPLICIT;2732} else if (info->primitive_restart) {2733cfg.primitive_restart = MALI_PRIMITIVE_RESTART_EXPLICIT;2734cfg.primitive_restart_index = info->restart_index;2735}27362737cfg.job_task_split = 6;27382739cfg.index_count = ctx->indirect_draw ? 1 : draw->count;2740cfg.index_type = panfrost_translate_index_size(info->index_size);27412742if (cfg.index_type) {2743cfg.indices = indices;2744cfg.base_vertex_offset = draw->index_bias - ctx->offset_start;2745}2746}27472748bool points = info->mode == PIPE_PRIM_POINTS;2749void *prim_size = pan_is_bifrost(device) ?2750pan_section_ptr(job, BIFROST_TILER_JOB, PRIMITIVE_SIZE) :2751pan_section_ptr(job, MIDGARD_TILER_JOB, PRIMITIVE_SIZE);27522753if (pan_is_bifrost(device)) {2754panfrost_emit_primitive_size(ctx, points, psiz, prim_size);2755pan_section_pack(job, BIFROST_TILER_JOB, TILER, cfg) {2756cfg.address = panfrost_batch_get_bifrost_tiler(batch, ~0);2757}2758pan_section_pack(job, BIFROST_TILER_JOB, PADDING, padding) {}2759}27602761section = pan_is_bifrost(device) ?2762pan_section_ptr(job, BIFROST_TILER_JOB, DRAW) :2763pan_section_ptr(job, MIDGARD_TILER_JOB, DRAW);2764pan_pack(section, DRAW, cfg) {2765cfg.four_components_per_vertex = true;2766cfg.draw_descriptor_is_64b = true;2767if (!pan_is_bifrost(device))2768cfg.texture_descriptor_is_64b = true;2769cfg.front_face_ccw = rast->front_ccw;2770cfg.cull_front_face = rast->cull_face & PIPE_FACE_FRONT;2771cfg.cull_back_face = rast->cull_face & PIPE_FACE_BACK;2772cfg.position = pos;2773cfg.state = batch->rsd[PIPE_SHADER_FRAGMENT];2774cfg.attributes = batch->attribs[PIPE_SHADER_FRAGMENT];2775cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_FRAGMENT];2776cfg.viewport = batch->viewport;2777cfg.varyings = fs_vary;2778cfg.varying_buffers = fs_vary ? varyings : 0;2779cfg.thread_storage = batch->tls.gpu;27802781/* For all primitives but lines DRAW.flat_shading_vertex must2782* be set to 0 and the provoking vertex is selected with the2783* PRIMITIVE.first_provoking_vertex field.2784*/2785if (info->mode == PIPE_PRIM_LINES ||2786info->mode == PIPE_PRIM_LINE_LOOP ||2787info->mode == PIPE_PRIM_LINE_STRIP) {2788/* The logic is inverted on bifrost. */2789cfg.flat_shading_vertex =2790pan_is_bifrost(device) ?2791rast->flatshade_first : !rast->flatshade_first;2792}27932794pan_emit_draw_descs(batch, &cfg, PIPE_SHADER_FRAGMENT);27952796if (ctx->occlusion_query && ctx->active_queries) {2797if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER)2798cfg.occlusion_query = MALI_OCCLUSION_MODE_COUNTER;2799else2800cfg.occlusion_query = MALI_OCCLUSION_MODE_PREDICATE;28012802struct panfrost_resource *rsrc = pan_resource(ctx->occlusion_query->rsrc);2803cfg.occlusion = rsrc->image.data.bo->ptr.gpu;2804panfrost_batch_write_rsrc(ctx->batch, rsrc,2805PIPE_SHADER_FRAGMENT);2806}2807}28082809if (!pan_is_bifrost(device))2810panfrost_emit_primitive_size(ctx, points, psiz, prim_size);2811else2812pan_section_pack(job, BIFROST_TILER_JOB, DRAW_PADDING, cfg);2813}28142815static void2816panfrost_direct_draw(struct panfrost_batch *batch,2817const struct pipe_draw_info *info,2818unsigned drawid_offset,2819const struct pipe_draw_start_count_bias *draw)2820{2821if (!draw->count || !info->instance_count)2822return;28232824struct panfrost_context *ctx = batch->ctx;2825struct panfrost_device *device = pan_device(ctx->base.screen);28262827/* Fallback for unsupported modes */2828if (!(ctx->draw_modes & BITFIELD_BIT(info->mode))) {2829if (draw->count < 4) {2830/* Degenerate case? */2831return;2832}28332834util_primconvert_save_rasterizer_state(ctx->primconvert, &ctx->rasterizer->base);2835util_primconvert_draw_vbo(ctx->primconvert, info, drawid_offset, NULL, draw, 1);2836return;2837}28382839/* Take into account a negative bias */2840ctx->indirect_draw = false;2841ctx->vertex_count = draw->count + (info->index_size ? abs(draw->index_bias) : 0);2842ctx->instance_count = info->instance_count;2843ctx->base_vertex = info->index_size ? draw->index_bias : 0;2844ctx->base_instance = info->start_instance;2845ctx->active_prim = info->mode;2846ctx->drawid = drawid_offset;28472848struct panfrost_ptr tiler =2849pan_is_bifrost(device) ?2850pan_pool_alloc_desc(&batch->pool.base, BIFROST_TILER_JOB) :2851pan_pool_alloc_desc(&batch->pool.base, MIDGARD_TILER_JOB);2852struct panfrost_ptr vertex =2853pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);28542855unsigned vertex_count = ctx->vertex_count;28562857unsigned min_index = 0, max_index = 0;2858mali_ptr indices = 0;28592860if (info->index_size) {2861indices = panfrost_get_index_buffer_bounded(batch, info, draw,2862&min_index,2863&max_index);28642865/* Use the corresponding values */2866vertex_count = max_index - min_index + 1;2867ctx->offset_start = min_index + draw->index_bias;2868} else {2869ctx->offset_start = draw->start;2870}28712872if (info->instance_count > 1)2873ctx->padded_count = panfrost_padded_vertex_count(vertex_count);2874else2875ctx->padded_count = vertex_count;28762877panfrost_statistics_record(ctx, info, draw);28782879struct mali_invocation_packed invocation;2880if (info->instance_count > 1) {2881panfrost_pack_work_groups_compute(&invocation,28821, vertex_count, info->instance_count,28831, 1, 1, true, false);2884} else {2885pan_pack(&invocation, INVOCATION, cfg) {2886cfg.invocations = MALI_POSITIVE(vertex_count);2887cfg.size_y_shift = 0;2888cfg.size_z_shift = 0;2889cfg.workgroups_x_shift = 0;2890cfg.workgroups_y_shift = 0;2891cfg.workgroups_z_shift = 32;2892cfg.thread_group_split = MALI_SPLIT_MIN_EFFICIENT;2893}2894}28952896/* Emit all sort of descriptors. */2897mali_ptr varyings = 0, vs_vary = 0, fs_vary = 0, pos = 0, psiz = 0;28982899panfrost_emit_varying_descriptor(batch,2900ctx->padded_count *2901ctx->instance_count,2902&vs_vary, &fs_vary, &varyings,2903NULL, &pos, &psiz,2904info->mode == PIPE_PRIM_POINTS);29052906mali_ptr attribs, attrib_bufs;2907attribs = panfrost_emit_vertex_data(batch, &attrib_bufs);29082909panfrost_update_state_3d(batch);2910panfrost_update_state_vs(batch);2911panfrost_update_state_fs(batch);2912panfrost_clean_state_3d(ctx);29132914/* Fire off the draw itself */2915panfrost_draw_emit_vertex(batch, info, &invocation,2916vs_vary, varyings, attribs, attrib_bufs, vertex.cpu);2917panfrost_draw_emit_tiler(batch, info, draw, &invocation, indices,2918fs_vary, varyings, pos, psiz, tiler.cpu);2919panfrost_emit_vertex_tiler_jobs(batch, &vertex, &tiler);29202921/* Increment transform feedback offsets */2922panfrost_update_streamout_offsets(ctx);2923}29242925static void2926panfrost_indirect_draw(struct panfrost_batch *batch,2927const struct pipe_draw_info *info,2928unsigned drawid_offset,2929const struct pipe_draw_indirect_info *indirect,2930const struct pipe_draw_start_count_bias *draw)2931{2932/* Indirect draw count and multi-draw not supported. */2933assert(indirect->draw_count == 1 && !indirect->indirect_draw_count);29342935struct panfrost_context *ctx = batch->ctx;2936struct panfrost_device *dev = pan_device(ctx->base.screen);29372938/* TODO: update statistics (see panfrost_statistics_record()) */2939/* TODO: Increment transform feedback offsets */2940assert(ctx->streamout.num_targets == 0);29412942assert(ctx->draw_modes & (1 << info->mode));2943ctx->active_prim = info->mode;2944ctx->drawid = drawid_offset;2945ctx->indirect_draw = true;29462947struct panfrost_ptr tiler =2948pan_pool_alloc_aligned(&batch->pool.base,2949pan_is_bifrost(dev) ?2950MALI_BIFROST_TILER_JOB_LENGTH :2951MALI_MIDGARD_TILER_JOB_LENGTH,295264);2953struct panfrost_ptr vertex =2954pan_pool_alloc_aligned(&batch->pool.base,2955MALI_COMPUTE_JOB_LENGTH,295664);29572958struct panfrost_shader_state *vs =2959panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);29602961struct panfrost_bo *index_buf = NULL;29622963if (info->index_size) {2964assert(!info->has_user_indices);2965struct panfrost_resource *rsrc = pan_resource(info->index.resource);2966index_buf = rsrc->image.data.bo;2967panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);2968}29692970mali_ptr varyings = 0, vs_vary = 0, fs_vary = 0, pos = 0, psiz = 0;2971unsigned varying_buf_count;29722973/* We want to create templates, set all count fields to 0 to reflect2974* that.2975*/2976ctx->instance_count = ctx->vertex_count = ctx->padded_count = 0;2977ctx->offset_start = 0;29782979/* Set the {first,base}_vertex sysvals to NULL. Will be updated if the2980* vertex shader uses gl_VertexID or gl_BaseVertex.2981*/2982ctx->first_vertex_sysval_ptr = 0;2983ctx->base_vertex_sysval_ptr = 0;2984ctx->base_instance_sysval_ptr = 0;29852986panfrost_update_state_3d(batch);2987panfrost_update_state_vs(batch);2988panfrost_update_state_fs(batch);2989panfrost_clean_state_3d(ctx);29902991bool point_coord_replace = (info->mode == PIPE_PRIM_POINTS);29922993panfrost_emit_varying_descriptor(batch, 0,2994&vs_vary, &fs_vary, &varyings,2995&varying_buf_count, &pos, &psiz,2996point_coord_replace);29972998mali_ptr attribs, attrib_bufs;2999attribs = panfrost_emit_vertex_data(batch, &attrib_bufs);30003001/* Zero-ed invocation, the compute job will update it. */3002static struct mali_invocation_packed invocation;30033004/* Fire off the draw itself */3005panfrost_draw_emit_vertex(batch, info, &invocation, vs_vary, varyings,3006attribs, attrib_bufs, vertex.cpu);3007panfrost_draw_emit_tiler(batch, info, draw, &invocation,3008index_buf ? index_buf->ptr.gpu : 0,3009fs_vary, varyings, pos, psiz, tiler.cpu);30103011/* Add the varying heap BO to the batch if we're allocating varyings. */3012if (varyings) {3013panfrost_batch_add_bo(batch,3014dev->indirect_draw_shaders.varying_heap,3015PIPE_SHADER_VERTEX);3016}30173018assert(indirect->buffer);30193020struct panfrost_resource *draw_buf = pan_resource(indirect->buffer);30213022/* Don't count images: those attributes don't need to be patched. */3023unsigned attrib_count =3024vs->info.attribute_count -3025util_bitcount(ctx->image_mask[PIPE_SHADER_VERTEX]);30263027panfrost_batch_read_rsrc(batch, draw_buf, PIPE_SHADER_VERTEX);30283029struct pan_indirect_draw_info draw_info = {3030.last_indirect_draw = batch->indirect_draw_job_id,3031.draw_buf = draw_buf->image.data.bo->ptr.gpu + indirect->offset,3032.index_buf = index_buf ? index_buf->ptr.gpu : 0,3033.first_vertex_sysval = ctx->first_vertex_sysval_ptr,3034.base_vertex_sysval = ctx->base_vertex_sysval_ptr,3035.base_instance_sysval = ctx->base_instance_sysval_ptr,3036.vertex_job = vertex.gpu,3037.tiler_job = tiler.gpu,3038.attrib_bufs = attrib_bufs,3039.attribs = attribs,3040.attrib_count = attrib_count,3041.varying_bufs = varyings,3042.index_size = info->index_size,3043};30443045if (panfrost_writes_point_size(ctx))3046draw_info.flags |= PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE;30473048if (vs->info.vs.writes_point_size)3049draw_info.flags |= PAN_INDIRECT_DRAW_HAS_PSIZ;305030513052if (info->primitive_restart) {3053draw_info.restart_index = info->restart_index;3054draw_info.flags |= PAN_INDIRECT_DRAW_PRIMITIVE_RESTART;3055}30563057batch->indirect_draw_job_id =3058panfrost_emit_indirect_draw(&batch->pool.base,3059&batch->scoreboard,3060&draw_info,3061&batch->indirect_draw_ctx);30623063panfrost_emit_vertex_tiler_jobs(batch, &vertex, &tiler);3064}30653066static void3067panfrost_draw_vbo(struct pipe_context *pipe,3068const struct pipe_draw_info *info,3069unsigned drawid_offset,3070const struct pipe_draw_indirect_info *indirect,3071const struct pipe_draw_start_count_bias *draws,3072unsigned num_draws)3073{3074struct panfrost_context *ctx = pan_context(pipe);3075struct panfrost_device *dev = pan_device(pipe->screen);30763077if (!panfrost_render_condition_check(ctx))3078return;30793080/* Emulate indirect draws when debugging */3081if (dev->debug & PAN_DBG_NOINDIRECT && indirect && indirect->buffer) {3082assert(num_draws == 1);3083util_draw_indirect(pipe, info, indirect);3084return;3085}30863087/* Do some common setup */3088struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);30893090/* Don't add too many jobs to a single batch. Hardware has a hard limit3091* of 65536 jobs, but we choose a smaller soft limit (arbitrary) to3092* avoid the risk of timeouts. This might not be a good idea. */3093if (unlikely(batch->scoreboard.job_index > 10000))3094batch = panfrost_get_fresh_batch_for_fbo(ctx);30953096unsigned zs_draws = ctx->depth_stencil->draws;3097batch->draws |= zs_draws;3098batch->resolve |= zs_draws;30993100/* Mark everything dirty when debugging */3101if (unlikely(dev->debug & PAN_DBG_DIRTY))3102panfrost_dirty_state_all(ctx);31033104/* Conservatively assume draw parameters always change */3105ctx->dirty |= PAN_DIRTY_PARAMS | PAN_DIRTY_DRAWID;31063107if (indirect) {3108assert(num_draws == 1);31093110if (indirect->count_from_stream_output) {3111struct pipe_draw_start_count_bias tmp_draw = *draws;3112struct panfrost_streamout_target *so =3113pan_so_target(indirect->count_from_stream_output);31143115tmp_draw.start = 0;3116tmp_draw.count = so->offset;3117tmp_draw.index_bias = 0;3118panfrost_direct_draw(batch, info, drawid_offset, &tmp_draw);3119return;3120}31213122panfrost_indirect_draw(batch, info, drawid_offset, indirect, &draws[0]);3123return;3124}31253126struct pipe_draw_info tmp_info = *info;3127unsigned drawid = drawid_offset;31283129for (unsigned i = 0; i < num_draws; i++) {3130panfrost_direct_draw(batch, &tmp_info, drawid, &draws[i]);31313132if (tmp_info.increment_draw_id) {3133ctx->dirty |= PAN_DIRTY_DRAWID;3134drawid++;3135}3136}31373138}31393140/* Launch grid is the compute equivalent of draw_vbo, so in this routine, we3141* construct the COMPUTE job and some of its payload.3142*/31433144static void3145panfrost_launch_grid(struct pipe_context *pipe,3146const struct pipe_grid_info *info)3147{3148struct panfrost_context *ctx = pan_context(pipe);3149struct panfrost_device *dev = pan_device(pipe->screen);31503151/* XXX - shouldn't be necessary with working memory barriers. Affected3152* test: KHR-GLES31.core.compute_shader.pipeline-post-xfb */3153panfrost_flush_all_batches(ctx);31543155struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);31563157struct panfrost_shader_state *cs =3158&ctx->shader[PIPE_SHADER_COMPUTE]->variants[0];31593160/* Indirect dispatch can't handle workgroup local storage since that3161* would require dynamic memory allocation. Bail in this case. */3162if (info->indirect && !cs->info.wls_size) {3163struct pipe_transfer *transfer;3164uint32_t *params = pipe_buffer_map_range(pipe, info->indirect,3165info->indirect_offset,31663 * sizeof(uint32_t),3167PIPE_MAP_READ,3168&transfer);31693170struct pipe_grid_info direct = *info;3171direct.indirect = NULL;3172direct.grid[0] = params[0];3173direct.grid[1] = params[1];3174direct.grid[2] = params[2];3175pipe_buffer_unmap(pipe, transfer);31763177if (params[0] && params[1] && params[2])3178panfrost_launch_grid(pipe, &direct);31793180return;3181}31823183ctx->compute_grid = info;31843185struct panfrost_ptr t =3186pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);31873188/* We implement OpenCL inputs as uniforms (or a UBO -- same thing), so3189* reuse the graphics path for this by lowering to Gallium */31903191struct pipe_constant_buffer ubuf = {3192.buffer = NULL,3193.buffer_offset = 0,3194.buffer_size = ctx->shader[PIPE_SHADER_COMPUTE]->cbase.req_input_mem,3195.user_buffer = info->input3196};31973198if (info->input)3199pipe->set_constant_buffer(pipe, PIPE_SHADER_COMPUTE, 0, false, &ubuf);32003201/* Invoke according to the grid info */32023203void *invocation =3204pan_section_ptr(t.cpu, COMPUTE_JOB, INVOCATION);3205unsigned num_wg[3] = { info->grid[0], info->grid[1], info->grid[2] };32063207if (info->indirect)3208num_wg[0] = num_wg[1] = num_wg[2] = 1;32093210panfrost_pack_work_groups_compute(invocation,3211num_wg[0], num_wg[1], num_wg[2],3212info->block[0], info->block[1],3213info->block[2],3214false, info->indirect != NULL);32153216pan_section_pack(t.cpu, COMPUTE_JOB, PARAMETERS, cfg) {3217cfg.job_task_split =3218util_logbase2_ceil(info->block[0] + 1) +3219util_logbase2_ceil(info->block[1] + 1) +3220util_logbase2_ceil(info->block[2] + 1);3221}32223223pan_section_pack(t.cpu, COMPUTE_JOB, DRAW, cfg) {3224cfg.draw_descriptor_is_64b = true;3225if (!pan_is_bifrost(dev))3226cfg.texture_descriptor_is_64b = true;3227cfg.state = panfrost_emit_compute_shader_meta(batch, PIPE_SHADER_COMPUTE);3228cfg.attributes = panfrost_emit_image_attribs(batch, &cfg.attribute_buffers, PIPE_SHADER_COMPUTE);3229cfg.thread_storage = panfrost_emit_shared_memory(batch, info);3230cfg.uniform_buffers = panfrost_emit_const_buf(batch,3231PIPE_SHADER_COMPUTE, &cfg.push_uniforms);3232cfg.textures = panfrost_emit_texture_descriptors(batch,3233PIPE_SHADER_COMPUTE);3234cfg.samplers = panfrost_emit_sampler_descriptors(batch,3235PIPE_SHADER_COMPUTE);3236}32373238pan_section_pack(t.cpu, COMPUTE_JOB, DRAW_PADDING, cfg);32393240unsigned indirect_dep = 0;3241if (info->indirect) {3242struct pan_indirect_dispatch_info indirect = {3243.job = t.gpu,3244.indirect_dim = pan_resource(info->indirect)->image.data.bo->ptr.gpu +3245info->indirect_offset,3246.num_wg_sysval = {3247batch->num_wg_sysval[0],3248batch->num_wg_sysval[1],3249batch->num_wg_sysval[2],3250},3251};32523253indirect_dep = pan_indirect_dispatch_emit(&batch->pool.base,3254&batch->scoreboard,3255&indirect);3256}32573258panfrost_add_job(&batch->pool.base, &batch->scoreboard,3259MALI_JOB_TYPE_COMPUTE, true, false,3260indirect_dep, 0, &t, false);3261panfrost_flush_all_batches(ctx);3262}32633264static void *3265panfrost_create_rasterizer_state(3266struct pipe_context *pctx,3267const struct pipe_rasterizer_state *cso)3268{3269struct panfrost_rasterizer *so = CALLOC_STRUCT(panfrost_rasterizer);32703271so->base = *cso;32723273/* Gauranteed with the core GL call, so don't expose ARB_polygon_offset */3274assert(cso->offset_clamp == 0.0);32753276pan_pack(&so->multisample, MULTISAMPLE_MISC, cfg) {3277cfg.multisample_enable = cso->multisample;3278cfg.fixed_function_near_discard = cso->depth_clip_near;3279cfg.fixed_function_far_discard = cso->depth_clip_far;3280cfg.shader_depth_range_fixed = true;3281}32823283pan_pack(&so->stencil_misc, STENCIL_MASK_MISC, cfg) {3284cfg.depth_range_1 = cso->offset_tri;3285cfg.depth_range_2 = cso->offset_tri;3286cfg.single_sampled_lines = !cso->multisample;3287}32883289return so;3290}32913292/* Assigns a vertex buffer for a given (index, divisor) tuple */32933294static unsigned3295pan_assign_vertex_buffer(struct pan_vertex_buffer *buffers,3296unsigned *nr_bufs,3297unsigned vbi,3298unsigned divisor)3299{3300/* Look up the buffer */3301for (unsigned i = 0; i < (*nr_bufs); ++i) {3302if (buffers[i].vbi == vbi && buffers[i].divisor == divisor)3303return i;3304}33053306/* Else, create a new buffer */3307unsigned idx = (*nr_bufs)++;33083309buffers[idx] = (struct pan_vertex_buffer) {3310.vbi = vbi,3311.divisor = divisor3312};33133314return idx;3315}33163317static void *3318panfrost_create_vertex_elements_state(3319struct pipe_context *pctx,3320unsigned num_elements,3321const struct pipe_vertex_element *elements)3322{3323struct panfrost_vertex_state *so = CALLOC_STRUCT(panfrost_vertex_state);3324struct panfrost_device *dev = pan_device(pctx->screen);33253326so->num_elements = num_elements;3327memcpy(so->pipe, elements, sizeof(*elements) * num_elements);33283329/* Assign attribute buffers corresponding to the vertex buffers, keyed3330* for a particular divisor since that's how instancing works on Mali */3331for (unsigned i = 0; i < num_elements; ++i) {3332so->element_buffer[i] = pan_assign_vertex_buffer(3333so->buffers, &so->nr_bufs,3334elements[i].vertex_buffer_index,3335elements[i].instance_divisor);3336}33373338for (int i = 0; i < num_elements; ++i) {3339enum pipe_format fmt = elements[i].src_format;3340const struct util_format_description *desc = util_format_description(fmt);3341so->formats[i] = dev->formats[desc->format].hw;3342assert(so->formats[i]);3343}33443345/* Let's also prepare vertex builtins */3346so->formats[PAN_VERTEX_ID] = dev->formats[PIPE_FORMAT_R32_UINT].hw;3347so->formats[PAN_INSTANCE_ID] = dev->formats[PIPE_FORMAT_R32_UINT].hw;33483349return so;3350}33513352static inline unsigned3353pan_pipe_to_stencil_op(enum pipe_stencil_op in)3354{3355switch (in) {3356case PIPE_STENCIL_OP_KEEP: return MALI_STENCIL_OP_KEEP;3357case PIPE_STENCIL_OP_ZERO: return MALI_STENCIL_OP_ZERO;3358case PIPE_STENCIL_OP_REPLACE: return MALI_STENCIL_OP_REPLACE;3359case PIPE_STENCIL_OP_INCR: return MALI_STENCIL_OP_INCR_SAT;3360case PIPE_STENCIL_OP_DECR: return MALI_STENCIL_OP_DECR_SAT;3361case PIPE_STENCIL_OP_INCR_WRAP: return MALI_STENCIL_OP_INCR_WRAP;3362case PIPE_STENCIL_OP_DECR_WRAP: return MALI_STENCIL_OP_DECR_WRAP;3363case PIPE_STENCIL_OP_INVERT: return MALI_STENCIL_OP_INVERT;3364default: unreachable("Invalid stencil op");3365}3366}33673368static inline void3369pan_pipe_to_stencil(const struct pipe_stencil_state *in,3370struct mali_stencil_packed *out)3371{3372pan_pack(out, STENCIL, s) {3373s.mask = in->valuemask;3374s.compare_function = (enum mali_func) in->func;3375s.stencil_fail = pan_pipe_to_stencil_op(in->fail_op);3376s.depth_fail = pan_pipe_to_stencil_op(in->zfail_op);3377s.depth_pass = pan_pipe_to_stencil_op(in->zpass_op);3378}3379}33803381static void *3382panfrost_create_depth_stencil_state(struct pipe_context *pipe,3383const struct pipe_depth_stencil_alpha_state *zsa)3384{3385struct panfrost_device *dev = pan_device(pipe->screen);3386struct panfrost_zsa_state *so = CALLOC_STRUCT(panfrost_zsa_state);3387so->base = *zsa;33883389/* Normalize (there's no separate enable) */3390if (!zsa->alpha_enabled)3391so->base.alpha_func = MALI_FUNC_ALWAYS;33923393/* Prepack relevant parts of the Renderer State Descriptor. They will3394* be ORed in at draw-time */3395pan_pack(&so->rsd_depth, MULTISAMPLE_MISC, cfg) {3396cfg.depth_function = zsa->depth_enabled ?3397(enum mali_func) zsa->depth_func : MALI_FUNC_ALWAYS;33983399cfg.depth_write_mask = zsa->depth_writemask;3400}34013402pan_pack(&so->rsd_stencil, STENCIL_MASK_MISC, cfg) {3403cfg.stencil_enable = zsa->stencil[0].enabled;34043405cfg.stencil_mask_front = zsa->stencil[0].writemask;3406cfg.stencil_mask_back = zsa->stencil[1].enabled ?3407zsa->stencil[1].writemask : zsa->stencil[0].writemask;34083409if (dev->arch < 6) {3410cfg.alpha_test_compare_function =3411(enum mali_func) so->base.alpha_func;3412}3413}34143415/* Stencil tests have their own words in the RSD */3416pan_pipe_to_stencil(&zsa->stencil[0], &so->stencil_front);34173418if (zsa->stencil[1].enabled)3419pan_pipe_to_stencil(&zsa->stencil[1], &so->stencil_back);3420else3421so->stencil_back = so->stencil_front;34223423so->enabled = zsa->stencil[0].enabled ||3424(zsa->depth_enabled && zsa->depth_func != PIPE_FUNC_ALWAYS);34253426/* Write masks need tracking together */3427if (zsa->depth_writemask)3428so->draws |= PIPE_CLEAR_DEPTH;34293430if (zsa->stencil[0].enabled)3431so->draws |= PIPE_CLEAR_STENCIL;34323433/* TODO: Bounds test should be easy */3434assert(!zsa->depth_bounds_test);34353436return so;3437}34383439void3440panfrost_create_sampler_view_bo(struct panfrost_sampler_view *so,3441struct pipe_context *pctx,3442struct pipe_resource *texture)3443{3444struct panfrost_device *device = pan_device(pctx->screen);3445struct panfrost_context *ctx = pan_context(pctx);3446struct panfrost_resource *prsrc = (struct panfrost_resource *)texture;3447enum pipe_format format = so->base.format;3448assert(prsrc->image.data.bo);34493450/* Format to access the stencil portion of a Z32_S8 texture */3451if (format == PIPE_FORMAT_X32_S8X24_UINT) {3452assert(prsrc->separate_stencil);3453texture = &prsrc->separate_stencil->base;3454prsrc = (struct panfrost_resource *)texture;3455format = texture->format;3456}34573458const struct util_format_description *desc = util_format_description(format);34593460bool fake_rgtc = !panfrost_supports_compressed_format(device, MALI_BC4_UNORM);34613462if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC && fake_rgtc) {3463if (desc->is_snorm)3464format = PIPE_FORMAT_R8G8B8A8_SNORM;3465else3466format = PIPE_FORMAT_R8G8B8A8_UNORM;3467desc = util_format_description(format);3468}34693470so->texture_bo = prsrc->image.data.bo->ptr.gpu;3471so->modifier = prsrc->image.layout.modifier;34723473/* MSAA only supported for 2D textures */34743475assert(texture->nr_samples <= 1 ||3476so->base.target == PIPE_TEXTURE_2D ||3477so->base.target == PIPE_TEXTURE_2D_ARRAY);34783479enum mali_texture_dimension type =3480panfrost_translate_texture_dimension(so->base.target);34813482bool is_buffer = (so->base.target == PIPE_BUFFER);34833484unsigned first_level = is_buffer ? 0 : so->base.u.tex.first_level;3485unsigned last_level = is_buffer ? 0 : so->base.u.tex.last_level;3486unsigned first_layer = is_buffer ? 0 : so->base.u.tex.first_layer;3487unsigned last_layer = is_buffer ? 0 : so->base.u.tex.last_layer;3488unsigned buf_offset = is_buffer ? so->base.u.buf.offset : 0;3489unsigned buf_size = (is_buffer ? so->base.u.buf.size : 0) /3490util_format_get_blocksize(format);34913492if (so->base.target == PIPE_TEXTURE_3D) {3493first_layer /= prsrc->image.layout.depth;3494last_layer /= prsrc->image.layout.depth;3495assert(!first_layer && !last_layer);3496}34973498struct pan_image_view iview = {3499.format = format,3500.dim = type,3501.first_level = first_level,3502.last_level = last_level,3503.first_layer = first_layer,3504.last_layer = last_layer,3505.swizzle = {3506so->base.swizzle_r,3507so->base.swizzle_g,3508so->base.swizzle_b,3509so->base.swizzle_a,3510},3511.image = &prsrc->image,35123513.buf.offset = buf_offset,3514.buf.size = buf_size,3515};35163517unsigned size =3518(pan_is_bifrost(device) ? 0 : MALI_MIDGARD_TEXTURE_LENGTH) +3519panfrost_estimate_texture_payload_size(device, &iview);35203521struct panfrost_ptr payload = pan_pool_alloc_aligned(&ctx->descs.base, size, 64);3522so->state = panfrost_pool_take_ref(&ctx->descs, payload.gpu);35233524void *tex = pan_is_bifrost(device) ?3525&so->bifrost_descriptor : payload.cpu;35263527if (!pan_is_bifrost(device)) {3528payload.cpu += MALI_MIDGARD_TEXTURE_LENGTH;3529payload.gpu += MALI_MIDGARD_TEXTURE_LENGTH;3530}35313532panfrost_new_texture(device, &iview, tex, &payload);3533}35343535static struct pipe_sampler_view *3536panfrost_create_sampler_view(3537struct pipe_context *pctx,3538struct pipe_resource *texture,3539const struct pipe_sampler_view *template)3540{3541struct panfrost_sampler_view *so = rzalloc(pctx, struct panfrost_sampler_view);35423543pipe_reference(NULL, &texture->reference);35443545so->base = *template;3546so->base.texture = texture;3547so->base.reference.count = 1;3548so->base.context = pctx;35493550panfrost_create_sampler_view_bo(so, pctx, texture);35513552return (struct pipe_sampler_view *) so;3553}35543555/* A given Gallium blend state can be encoded to the hardware in numerous,3556* dramatically divergent ways due to the interactions of blending with3557* framebuffer formats. Conceptually, there are two modes:3558*3559* - Fixed-function blending (for suitable framebuffer formats, suitable blend3560* state, and suitable blend constant)3561*3562* - Blend shaders (for everything else)3563*3564* A given Gallium blend configuration will compile to exactly one3565* fixed-function blend state, if it compiles to any, although the constant3566* will vary across runs as that is tracked outside of the Gallium CSO.3567*3568* However, that same blend configuration will compile to many different blend3569* shaders, depending on the framebuffer formats active. The rationale is that3570* blend shaders override not just fixed-function blending but also3571* fixed-function format conversion, so blend shaders are keyed to a particular3572* framebuffer format. As an example, the tilebuffer format is identical for3573* RG16F and RG16UI -- both are simply 32-bit raw pixels -- so both require3574* blend shaders.3575*3576* All of this state is encapsulated in the panfrost_blend_state struct3577* (our subclass of pipe_blend_state).3578*/35793580/* Create a blend CSO. Essentially, try to compile a fixed-function3581* expression and initialize blend shaders */35823583static void *3584panfrost_create_blend_state(struct pipe_context *pipe,3585const struct pipe_blend_state *blend)3586{3587struct panfrost_device *dev = pan_device(pipe->screen);3588struct panfrost_blend_state *so = CALLOC_STRUCT(panfrost_blend_state);3589so->base = *blend;35903591so->pan.logicop_enable = blend->logicop_enable;3592so->pan.logicop_func = blend->logicop_func;3593so->pan.rt_count = blend->max_rt + 1;35943595for (unsigned c = 0; c < so->pan.rt_count; ++c) {3596unsigned g = blend->independent_blend_enable ? c : 0;3597const struct pipe_rt_blend_state pipe = blend->rt[g];3598struct pan_blend_equation equation = {0};35993600equation.color_mask = pipe.colormask;3601equation.blend_enable = pipe.blend_enable;36023603if (pipe.blend_enable) {3604equation.rgb_func = util_blend_func_to_shader(pipe.rgb_func);3605equation.rgb_src_factor = util_blend_factor_to_shader(pipe.rgb_src_factor);3606equation.rgb_invert_src_factor = util_blend_factor_is_inverted(pipe.rgb_src_factor);3607equation.rgb_dst_factor = util_blend_factor_to_shader(pipe.rgb_dst_factor);3608equation.rgb_invert_dst_factor = util_blend_factor_is_inverted(pipe.rgb_dst_factor);3609equation.alpha_func = util_blend_func_to_shader(pipe.alpha_func);3610equation.alpha_src_factor = util_blend_factor_to_shader(pipe.alpha_src_factor);3611equation.alpha_invert_src_factor = util_blend_factor_is_inverted(pipe.alpha_src_factor);3612equation.alpha_dst_factor = util_blend_factor_to_shader(pipe.alpha_dst_factor);3613equation.alpha_invert_dst_factor = util_blend_factor_is_inverted(pipe.alpha_dst_factor);3614}36153616/* Determine some common properties */3617unsigned constant_mask = pan_blend_constant_mask(equation);3618so->info[c] = (struct pan_blend_info) {3619.no_colour = (equation.color_mask == 0),3620.opaque = pan_blend_is_opaque(equation),3621.constant_mask = constant_mask,36223623/* TODO: check the dest for the logicop */3624.load_dest = blend->logicop_enable ||3625pan_blend_reads_dest(equation),36263627/* Could this possibly be fixed-function? */3628.fixed_function = !blend->logicop_enable &&3629pan_blend_can_fixed_function(equation) &&3630(!constant_mask ||3631pan_blend_supports_constant(dev->arch, c))3632};36333634so->pan.rts[c].equation = equation;36353636/* Bifrost needs to know if any render target loads its3637* destination in the hot draw path, so precompute this */3638if (so->info[c].load_dest)3639so->load_dest_mask |= BITFIELD_BIT(c);36403641/* Converting equations to Mali style is expensive, do it at3642* CSO create time instead of draw-time */3643if (so->info[c].fixed_function) {3644pan_pack(&so->equation[c], BLEND_EQUATION, cfg)3645pan_blend_to_fixed_function_equation(equation, &cfg);3646}3647}36483649return so;3650}36513652static void3653prepare_rsd(struct panfrost_device *dev,3654struct panfrost_shader_state *state,3655struct panfrost_pool *pool, bool upload)3656{3657struct mali_renderer_state_packed *out = &state->partial_rsd;36583659if (upload) {3660struct panfrost_ptr ptr =3661pan_pool_alloc_desc(&pool->base, RENDERER_STATE);36623663state->state = panfrost_pool_take_ref(pool, ptr.gpu);3664out = ptr.cpu;3665}36663667pan_pack(out, RENDERER_STATE, cfg) {3668pan_shader_prepare_rsd(dev, &state->info, state->bin.gpu,3669&cfg);3670}3671}36723673static void3674panfrost_get_sample_position(struct pipe_context *context,3675unsigned sample_count,3676unsigned sample_index,3677float *out_value)3678{3679panfrost_query_sample_position(3680panfrost_sample_pattern(sample_count),3681sample_index,3682out_value);3683}36843685static void3686screen_destroy(struct pipe_screen *pscreen)3687{3688struct panfrost_device *dev = pan_device(pscreen);3689pan_blitter_cleanup(dev);3690}36913692static void3693preload(struct panfrost_batch *batch, struct pan_fb_info *fb)3694{3695struct panfrost_device *dev = pan_device(batch->ctx->base.screen);36963697pan_preload_fb(&batch->pool.base, &batch->scoreboard, fb, batch->tls.gpu,3698pan_is_bifrost(dev) ? batch->tiler_ctx.bifrost : 0);3699}37003701void3702panfrost_cmdstream_screen_init(struct panfrost_screen *screen)3703{3704struct panfrost_device *dev = &screen->dev;37053706screen->vtbl.prepare_rsd = prepare_rsd;3707screen->vtbl.emit_tls = emit_tls;3708screen->vtbl.emit_fbd = emit_fbd;3709screen->vtbl.emit_fragment_job = emit_fragment_job;3710screen->vtbl.screen_destroy = screen_destroy;3711screen->vtbl.preload = preload;37123713pan_blitter_init(dev, &screen->blitter.bin_pool.base,3714&screen->blitter.desc_pool.base);3715}37163717void3718panfrost_cmdstream_context_init(struct pipe_context *pipe)3719{3720pipe->draw_vbo = panfrost_draw_vbo;3721pipe->launch_grid = panfrost_launch_grid;37223723pipe->create_vertex_elements_state = panfrost_create_vertex_elements_state;3724pipe->create_rasterizer_state = panfrost_create_rasterizer_state;3725pipe->create_depth_stencil_alpha_state = panfrost_create_depth_stencil_state;3726pipe->create_sampler_view = panfrost_create_sampler_view;3727pipe->create_sampler_state = panfrost_create_sampler_state;3728pipe->create_blend_state = panfrost_create_blend_state;37293730pipe->get_sample_position = panfrost_get_sample_position;3731}373237333734