Path: blob/21.2-virgl/src/panfrost/lib/pan_indirect_draw.c
4560 views
/*1* Copyright (C) 2021 Collabora, Ltd.2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,19* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE20* SOFTWARE.21*22*/2324#include <stdio.h>25#include "pan_bo.h"26#include "pan_shader.h"27#include "pan_scoreboard.h"28#include "pan_encoder.h"29#include "pan_indirect_draw.h"30#include "pan_pool.h"31#include "pan_util.h"32#include "panfrost-quirks.h"33#include "compiler/nir/nir_builder.h"34#include "util/u_memory.h"35#include "util/macros.h"3637#define WORD(x) ((x) * 4)3839#define LOOP \40for (nir_loop *l = nir_push_loop(b); l != NULL; \41nir_pop_loop(b, l), l = NULL)42#define BREAK nir_jump(b, nir_jump_break)43#define CONTINUE nir_jump(b, nir_jump_continue)4445#define IF(cond) nir_push_if(b, cond);46#define ELSE nir_push_else(b, NULL);47#define ENDIF nir_pop_if(b, NULL);4849#define MIN_MAX_JOBS 1285051struct draw_data {52nir_ssa_def *draw_buf;53nir_ssa_def *draw_buf_stride;54nir_ssa_def *index_buf;55nir_ssa_def *restart_index;56nir_ssa_def *vertex_count;57nir_ssa_def *start_instance;58nir_ssa_def *instance_count;59nir_ssa_def *vertex_start;60nir_ssa_def *index_bias;61nir_ssa_def *draw_ctx;62nir_ssa_def *min_max_ctx;63};6465struct instance_size {66nir_ssa_def *raw;67nir_ssa_def *padded;68nir_ssa_def *packed;69};7071struct jobs_data {72nir_ssa_def *vertex_job;73nir_ssa_def *tiler_job;74nir_ssa_def *base_vertex_offset;75nir_ssa_def *first_vertex_sysval;76nir_ssa_def *base_vertex_sysval;77nir_ssa_def *base_instance_sysval;78nir_ssa_def *offset_start;79nir_ssa_def *invocation;80};8182struct varyings_data {83nir_ssa_def *varying_bufs;84nir_ssa_def *pos_ptr;85nir_ssa_def *psiz_ptr;86nir_variable *mem_ptr;87};8889struct attribs_data {90nir_ssa_def *attrib_count;91nir_ssa_def *attrib_bufs;92nir_ssa_def *attribs;93};9495struct indirect_draw_shader_builder {96nir_builder b;97const struct panfrost_device *dev;98unsigned flags;99bool index_min_max_search;100unsigned index_size;101struct draw_data draw;102struct instance_size instance_size;103struct jobs_data jobs;104struct varyings_data varyings;105struct attribs_data attribs;106};107108/* Describes an indirect draw (see glDrawArraysIndirect()) */109110struct indirect_draw_info {111uint32_t count;112uint32_t instance_count;113uint32_t start;114uint32_t start_instance;115};116117struct indirect_indexed_draw_info {118uint32_t count;119uint32_t instance_count;120uint32_t start;121int32_t index_bias;122uint32_t start_instance;123};124125/* Store the min/max index in a separate context. This is not supported yet, but126* the DDK seems to put all min/max search jobs at the beginning of the job chain127* when multiple indirect draws are issued to avoid the serialization caused by128* the draw patching jobs which have the suppress_prefetch flag set. Merging the129* min/max and draw contexts would prevent such optimizations (draw contexts are130* shared by all indirect draw in a batch).131*/132133struct min_max_context {134uint32_t min;135uint32_t max;136};137138/* Per-batch context shared by all indirect draws queued to a given batch. */139140struct indirect_draw_context {141/* Pointer to the top of the varying heap. */142mali_ptr varying_mem;143};144145/* Indirect draw shader inputs. Those are stored in a UBO. */146147struct indirect_draw_inputs {148/* indirect_draw_context pointer */149mali_ptr draw_ctx;150151/* min_max_context pointer */152mali_ptr min_max_ctx;153154/* Pointer to an array of indirect_draw_info objects */155mali_ptr draw_buf;156157/* Pointer to an uint32_t containing the number of draws to issue */158mali_ptr draw_count_ptr;159160/* index buffer */161mali_ptr index_buf;162163/* {base,first}_{vertex,instance} sysvals */164mali_ptr first_vertex_sysval;165mali_ptr base_vertex_sysval;166mali_ptr base_instance_sysval;167168/* Pointers to various cmdstream structs that need to be patched */169mali_ptr vertex_job;170mali_ptr tiler_job;171mali_ptr attrib_bufs;172mali_ptr attribs;173mali_ptr varying_bufs;174uint32_t draw_count;175uint32_t draw_buf_stride;176uint32_t restart_index;177uint32_t attrib_count;178};179180static nir_ssa_def *181get_input_data(nir_builder *b, unsigned offset, unsigned size)182{183assert(!(offset & 0x3));184assert(size && !(size & 0x3));185186return nir_load_ubo(b, 1, size,187nir_imm_int(b, 0),188nir_imm_int(b, offset),189.align_mul = 4,190.align_offset = 0,191.range_base = 0,192.range = ~0);193}194195#define get_input_field(b, name) \196get_input_data(b, offsetof(struct indirect_draw_inputs, name), \197sizeof(((struct indirect_draw_inputs *)0)->name) * 8)198199static nir_ssa_def *200get_address(nir_builder *b, nir_ssa_def *base, nir_ssa_def *offset)201{202return nir_iadd(b, base, nir_u2u64(b, offset));203}204205static nir_ssa_def *206get_address_imm(nir_builder *b, nir_ssa_def *base, unsigned offset)207{208return get_address(b, base, nir_imm_int(b, offset));209}210211static nir_ssa_def *212load_global(nir_builder *b, nir_ssa_def *addr, unsigned ncomps, unsigned bit_size)213{214return nir_load_global(b, addr, 4, ncomps, bit_size);215}216217static void218store_global(nir_builder *b, nir_ssa_def *addr,219nir_ssa_def *value, unsigned ncomps)220{221nir_store_global(b, addr, 4, value, (1 << ncomps) - 1);222}223224static nir_ssa_def *225get_draw_ctx_data(struct indirect_draw_shader_builder *builder,226unsigned offset, unsigned size)227{228nir_builder *b = &builder->b;229return load_global(b,230get_address_imm(b, builder->draw.draw_ctx, offset),2311, size);232}233234static void235set_draw_ctx_data(struct indirect_draw_shader_builder *builder,236unsigned offset, nir_ssa_def *value, unsigned size)237{238nir_builder *b = &builder->b;239store_global(b,240get_address_imm(b, builder->draw.draw_ctx, offset),241value, 1);242}243244#define get_draw_ctx_field(builder, name) \245get_draw_ctx_data(builder, \246offsetof(struct indirect_draw_context, name), \247sizeof(((struct indirect_draw_context *)0)->name) * 8)248249#define set_draw_ctx_field(builder, name, val) \250set_draw_ctx_data(builder, \251offsetof(struct indirect_draw_context, name), \252val, \253sizeof(((struct indirect_draw_context *)0)->name) * 8)254255static nir_ssa_def *256get_min_max_ctx_data(struct indirect_draw_shader_builder *builder,257unsigned offset, unsigned size)258{259nir_builder *b = &builder->b;260return load_global(b,261get_address_imm(b, builder->draw.min_max_ctx, offset),2621, size);263}264265#define get_min_max_ctx_field(builder, name) \266get_min_max_ctx_data(builder, \267offsetof(struct min_max_context, name), \268sizeof(((struct min_max_context *)0)->name) * 8)269270static void271update_min(struct indirect_draw_shader_builder *builder, nir_ssa_def *val)272{273nir_builder *b = &builder->b;274nir_ssa_def *addr =275get_address_imm(b,276builder->draw.min_max_ctx,277offsetof(struct min_max_context, min));278nir_global_atomic_umin(b, 32, addr, val);279}280281static void282update_max(struct indirect_draw_shader_builder *builder, nir_ssa_def *val)283{284nir_builder *b = &builder->b;285nir_ssa_def *addr =286get_address_imm(b,287builder->draw.min_max_ctx,288offsetof(struct min_max_context, max));289nir_global_atomic_umax(b, 32, addr, val);290}291292#define get_draw_field(b, draw_ptr, field) \293load_global(b, \294get_address_imm(b, draw_ptr, \295offsetof(struct indirect_draw_info, field)), \2961, sizeof(((struct indirect_draw_info *)0)->field) * 8)297298#define get_indexed_draw_field(b, draw_ptr, field) \299load_global(b, \300get_address_imm(b, draw_ptr, \301offsetof(struct indirect_indexed_draw_info, field)), \3021, sizeof(((struct indirect_indexed_draw_info *)0)->field) * 8)303304static void305extract_inputs(struct indirect_draw_shader_builder *builder)306{307nir_builder *b = &builder->b;308309builder->draw.draw_ctx = get_input_field(b, draw_ctx);310builder->draw.draw_buf = get_input_field(b, draw_buf);311builder->draw.draw_buf_stride = get_input_field(b, draw_buf_stride);312313if (builder->index_size) {314builder->draw.index_buf = get_input_field(b, index_buf);315builder->draw.min_max_ctx = get_input_field(b, min_max_ctx);316if (builder->flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART) {317builder->draw.restart_index =318get_input_field(b, restart_index);319}320}321322if (builder->index_min_max_search)323return;324325builder->jobs.first_vertex_sysval = get_input_field(b, first_vertex_sysval);326builder->jobs.base_vertex_sysval = get_input_field(b, base_vertex_sysval);327builder->jobs.base_instance_sysval = get_input_field(b, base_instance_sysval);328builder->jobs.vertex_job = get_input_field(b, vertex_job);329builder->jobs.tiler_job = get_input_field(b, tiler_job);330builder->attribs.attrib_bufs = get_input_field(b, attrib_bufs);331builder->attribs.attribs = get_input_field(b, attribs);332builder->attribs.attrib_count = get_input_field(b, attrib_count);333builder->varyings.varying_bufs = get_input_field(b, varying_bufs);334builder->varyings.mem_ptr =335nir_local_variable_create(b->impl,336glsl_uint64_t_type(),337"var_mem_ptr");338nir_store_var(b, builder->varyings.mem_ptr,339get_draw_ctx_field(builder, varying_mem), 3);340}341342static void343init_shader_builder(struct indirect_draw_shader_builder *builder,344const struct panfrost_device *dev,345unsigned flags, unsigned index_size,346bool index_min_max_search)347{348memset(builder, 0, sizeof(*builder));349builder->dev = dev;350builder->flags = flags;351builder->index_size = index_size;352353builder->index_min_max_search = index_min_max_search;354355if (index_min_max_search) {356builder->b =357nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,358pan_shader_get_compiler_options(dev),359"indirect_draw_min_max_index(index_size=%d)",360builder->index_size);361} else {362builder->b =363nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,364pan_shader_get_compiler_options(dev),365"indirect_draw(index_size=%d%s%s%s)",366builder->index_size,367flags & PAN_INDIRECT_DRAW_HAS_PSIZ ?368",psiz" : "",369flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART ?370",primitive_restart" : "",371flags & PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE ?372",update_primitive_size" : "");373}374375nir_builder *b = &builder->b;376b->shader->info.internal = true;377nir_variable_create(b->shader, nir_var_mem_ubo,378glsl_uint_type(), "inputs");379b->shader->info.num_ubos++;380381extract_inputs(builder);382}383384static void385update_job(struct indirect_draw_shader_builder *builder, enum mali_job_type type)386{387nir_builder *b = &builder->b;388nir_ssa_def *job_ptr =389type == MALI_JOB_TYPE_VERTEX ?390builder->jobs.vertex_job : builder->jobs.tiler_job;391392/* Update the invocation words. */393store_global(b, get_address_imm(b, job_ptr, WORD(8)),394builder->jobs.invocation, 2);395396unsigned draw_offset =397type == MALI_JOB_TYPE_VERTEX ?398pan_section_offset(COMPUTE_JOB, DRAW) :399pan_is_bifrost(builder->dev) ?400pan_section_offset(BIFROST_TILER_JOB, DRAW) :401pan_section_offset(MIDGARD_TILER_JOB, DRAW);402unsigned prim_offset =403pan_is_bifrost(builder->dev) ?404pan_section_offset(BIFROST_TILER_JOB, PRIMITIVE) :405pan_section_offset(MIDGARD_TILER_JOB, PRIMITIVE);406unsigned psiz_offset =407pan_is_bifrost(builder->dev) ?408pan_section_offset(BIFROST_TILER_JOB, PRIMITIVE_SIZE) :409pan_section_offset(MIDGARD_TILER_JOB, PRIMITIVE_SIZE);410unsigned index_size = builder->index_size;411412if (type == MALI_JOB_TYPE_TILER) {413/* Update PRIMITIVE.{base_vertex_offset,count} */414store_global(b,415get_address_imm(b, job_ptr, prim_offset + WORD(1)),416builder->jobs.base_vertex_offset, 1);417store_global(b,418get_address_imm(b, job_ptr, prim_offset + WORD(3)),419nir_iadd_imm(b, builder->draw.vertex_count, -1), 1);420421if (index_size) {422nir_ssa_def *addr =423get_address_imm(b, job_ptr, prim_offset + WORD(4));424nir_ssa_def *indices = load_global(b, addr, 1, 64);425nir_ssa_def *offset =426nir_imul_imm(b, builder->draw.vertex_start, index_size);427428indices = get_address(b, indices, offset);429store_global(b, addr, indices, 2);430}431432/* Update PRIMITIVE_SIZE.size_array */433if ((builder->flags & PAN_INDIRECT_DRAW_HAS_PSIZ) &&434(builder->flags & PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE)) {435store_global(b,436get_address_imm(b, job_ptr, psiz_offset + WORD(0)),437builder->varyings.psiz_ptr, 2);438}439440/* Update DRAW.position */441store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(4)),442builder->varyings.pos_ptr, 2);443}444445nir_ssa_def *draw_w01 =446load_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)), 2, 32);447nir_ssa_def *draw_w0 = nir_channel(b, draw_w01, 0);448449/* Update DRAW.{instance_size,offset_start} */450nir_ssa_def *instance_size =451nir_bcsel(b,452nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2)),453nir_imm_int(b, 0), builder->instance_size.packed);454draw_w01 = nir_vec2(b,455nir_ior(b, nir_iand_imm(b, draw_w0, 0xffff),456nir_ishl(b, instance_size, nir_imm_int(b, 16))),457builder->jobs.offset_start);458store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)),459draw_w01, 2);460}461462static void463split_div(nir_builder *b, nir_ssa_def *div, nir_ssa_def **r_e, nir_ssa_def **d)464{465/* TODO: Lower this 64bit div to something GPU-friendly */466nir_ssa_def *r = nir_imax(b, nir_ufind_msb(b, div), nir_imm_int(b, 0));467nir_ssa_def *div64 = nir_u2u64(b, div);468nir_ssa_def *half_div64 = nir_u2u64(b, nir_ushr_imm(b, div, 1));469nir_ssa_def *f0 = nir_iadd(b,470nir_ishl(b, nir_imm_int64(b, 1),471nir_iadd_imm(b, r, 32)),472half_div64);473nir_ssa_def *fi = nir_idiv(b, f0, div64);474nir_ssa_def *ff = nir_isub(b, f0, nir_imul(b, fi, div64));475nir_ssa_def *e = nir_bcsel(b, nir_ult(b, half_div64, ff),476nir_imm_int(b, 1 << 5), nir_imm_int(b, 0));477*d = nir_iand_imm(b, nir_u2u32(b, fi), ~(1 << 31));478*r_e = nir_ior(b, r, e);479}480481static void482update_vertex_attrib_buf(struct indirect_draw_shader_builder *builder,483nir_ssa_def *attrib_buf_ptr,484enum mali_attribute_type type,485nir_ssa_def *div1,486nir_ssa_def *div2)487{488nir_builder *b = &builder->b;489unsigned type_mask = BITFIELD_MASK(6);490nir_ssa_def *w01 = load_global(b, attrib_buf_ptr, 2, 32);491nir_ssa_def *w0 = nir_channel(b, w01, 0);492nir_ssa_def *w1 = nir_channel(b, w01, 1);493494/* Word 0 and 1 of the attribute descriptor contain the type,495* pointer and the the divisor exponent.496*/497w0 = nir_iand_imm(b, nir_channel(b, w01, 0), ~type_mask);498w0 = nir_ior(b, w0, nir_imm_int(b, type));499w1 = nir_ior(b, w1, nir_ishl(b, div1, nir_imm_int(b, 24)));500501store_global(b, attrib_buf_ptr, nir_vec2(b, w0, w1), 2);502503if (type == MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR) {504/* If the divisor is not a power of two, the divisor numerator505* is passed in word 1 of the continuation attribute (word 5506* if we consider the attribute and its continuation as a507* single attribute).508*/509assert(div2);510store_global(b, get_address_imm(b, attrib_buf_ptr, WORD(5)),511div2, 1);512}513}514515static void516zero_attrib_buf_stride(struct indirect_draw_shader_builder *builder,517nir_ssa_def *attrib_buf_ptr)518{519/* Stride is an unadorned 32-bit uint at word 2 */520nir_builder *b = &builder->b;521store_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)),522nir_imm_int(b, 0), 1);523}524525static void526adjust_attrib_offset(struct indirect_draw_shader_builder *builder,527nir_ssa_def *attrib_ptr, nir_ssa_def *attrib_buf_ptr,528nir_ssa_def *instance_div)529{530nir_builder *b = &builder->b;531nir_ssa_def *zero = nir_imm_int(b, 0);532nir_ssa_def *two = nir_imm_int(b, 2);533nir_ssa_def *sub_cur_offset =534nir_iand(b, nir_ine(b, builder->jobs.offset_start, zero),535nir_uge(b, builder->draw.instance_count, two));536537nir_ssa_def *add_base_inst_offset =538nir_iand(b, nir_ine(b, builder->draw.start_instance, zero),539nir_ine(b, instance_div, zero));540541IF (nir_ior(b, sub_cur_offset, add_base_inst_offset)) {542nir_ssa_def *offset =543load_global(b, get_address_imm(b, attrib_ptr, WORD(1)), 1, 32);544nir_ssa_def *stride =545load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)), 1, 32);546547/* Per-instance data needs to be offset in response to a548* delayed start in an indexed draw.549*/550551IF (add_base_inst_offset) {552offset = nir_iadd(b, offset,553nir_idiv(b,554nir_imul(b, stride,555builder->draw.start_instance),556instance_div));557} ENDIF558559IF (sub_cur_offset) {560offset = nir_isub(b, offset,561nir_imul(b, stride,562builder->jobs.offset_start));563} ENDIF564565store_global(b, get_address_imm(b, attrib_ptr, WORD(1)),566offset, 1);567} ENDIF568}569570/* x is power of two or zero <===> x has 0 (zero) or 1 (POT) bits set */571572static nir_ssa_def *573nir_is_power_of_two_or_zero(nir_builder *b, nir_ssa_def *x)574{575return nir_ult(b, nir_bit_count(b, x), nir_imm_int(b, 2));576}577578/* Based on panfrost_emit_vertex_data() */579580static void581update_vertex_attribs(struct indirect_draw_shader_builder *builder)582{583nir_builder *b = &builder->b;584nir_variable *attrib_idx_var =585nir_local_variable_create(b->impl, glsl_uint_type(),586"attrib_idx");587nir_store_var(b, attrib_idx_var, nir_imm_int(b, 0), 1);588nir_ssa_def *single_instance =589nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2));590591LOOP {592nir_ssa_def *attrib_idx = nir_load_var(b, attrib_idx_var);593IF (nir_uge(b, attrib_idx, builder->attribs.attrib_count))594BREAK;595ENDIF596597nir_ssa_def *attrib_buf_ptr =598get_address(b, builder->attribs.attrib_bufs,599nir_imul_imm(b, attrib_idx,6002 * MALI_ATTRIBUTE_BUFFER_LENGTH));601nir_ssa_def *attrib_ptr =602get_address(b, builder->attribs.attribs,603nir_imul_imm(b, attrib_idx,604MALI_ATTRIBUTE_LENGTH));605606nir_ssa_def *r_e, *d;607608if (!pan_is_bifrost(builder->dev)) {609IF (nir_ieq_imm(b, attrib_idx, PAN_VERTEX_ID)) {610nir_ssa_def *r_p =611nir_bcsel(b, single_instance,612nir_imm_int(b, 0x9f),613builder->instance_size.packed);614615store_global(b,616get_address_imm(b, attrib_buf_ptr, WORD(4)),617nir_ishl(b, r_p, nir_imm_int(b, 24)), 1);618619nir_store_var(b, attrib_idx_var,620nir_iadd_imm(b, attrib_idx, 1), 1);621CONTINUE;622} ENDIF623624IF (nir_ieq_imm(b, attrib_idx, PAN_INSTANCE_ID)) {625split_div(b, builder->instance_size.padded,626&r_e, &d);627nir_ssa_def *default_div =628nir_ior(b, single_instance,629nir_ult(b,630builder->instance_size.padded,631nir_imm_int(b, 2)));632r_e = nir_bcsel(b, default_div,633nir_imm_int(b, 0x3f), r_e);634d = nir_bcsel(b, default_div,635nir_imm_int(b, (1u << 31) - 1), d);636store_global(b,637get_address_imm(b, attrib_buf_ptr, WORD(1)),638nir_vec2(b, nir_ishl(b, r_e, nir_imm_int(b, 24)), d),6392);640nir_store_var(b, attrib_idx_var,641nir_iadd_imm(b, attrib_idx, 1), 1);642CONTINUE;643} ENDIF644}645646nir_ssa_def *instance_div =647load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(7)), 1, 32);648649nir_ssa_def *div = nir_imul(b, instance_div, builder->instance_size.padded);650651nir_ssa_def *multi_instance =652nir_uge(b, builder->draw.instance_count, nir_imm_int(b, 2));653654IF (nir_ine(b, div, nir_imm_int(b, 0))) {655IF (multi_instance) {656IF (nir_is_power_of_two_or_zero(b, div)) {657nir_ssa_def *exp =658nir_imax(b, nir_ufind_msb(b, div),659nir_imm_int(b, 0));660update_vertex_attrib_buf(builder, attrib_buf_ptr,661MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR,662exp, NULL);663} ELSE {664split_div(b, div, &r_e, &d);665update_vertex_attrib_buf(builder, attrib_buf_ptr,666MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR,667r_e, d);668} ENDIF669} ELSE {670/* Single instance with a non-0 divisor: all671* accesses should point to attribute 0 */672zero_attrib_buf_stride(builder, attrib_buf_ptr);673} ENDIF674675adjust_attrib_offset(builder, attrib_ptr, attrib_buf_ptr, instance_div);676} ELSE IF (multi_instance) {677update_vertex_attrib_buf(builder, attrib_buf_ptr,678MALI_ATTRIBUTE_TYPE_1D_MODULUS,679builder->instance_size.packed, NULL);680} ENDIF ENDIF681682nir_store_var(b, attrib_idx_var, nir_iadd_imm(b, attrib_idx, 1), 1);683}684}685686static nir_ssa_def *687update_varying_buf(struct indirect_draw_shader_builder *builder,688nir_ssa_def *varying_buf_ptr,689nir_ssa_def *vertex_count)690{691nir_builder *b = &builder->b;692693nir_ssa_def *stride =694load_global(b, get_address_imm(b, varying_buf_ptr, WORD(2)), 1, 32);695nir_ssa_def *size = nir_imul(b, stride, vertex_count);696nir_ssa_def *aligned_size =697nir_iand_imm(b, nir_iadd_imm(b, size, 63), ~63);698nir_ssa_def *var_mem_ptr =699nir_load_var(b, builder->varyings.mem_ptr);700nir_ssa_def *w0 =701nir_ior(b, nir_unpack_64_2x32_split_x(b, var_mem_ptr),702nir_imm_int(b, MALI_ATTRIBUTE_TYPE_1D));703nir_ssa_def *w1 = nir_unpack_64_2x32_split_y(b, var_mem_ptr);704store_global(b, get_address_imm(b, varying_buf_ptr, WORD(0)),705nir_vec4(b, w0, w1, stride, size), 4);706707nir_store_var(b, builder->varyings.mem_ptr,708get_address(b, var_mem_ptr, aligned_size), 3);709710return var_mem_ptr;711}712713/* Based on panfrost_emit_varying_descriptor() */714715static void716update_varyings(struct indirect_draw_shader_builder *builder)717{718nir_builder *b = &builder->b;719nir_ssa_def *vertex_count =720nir_imul(b, builder->instance_size.padded,721builder->draw.instance_count);722nir_ssa_def *buf_ptr =723get_address_imm(b, builder->varyings.varying_bufs,724PAN_VARY_GENERAL *725MALI_ATTRIBUTE_BUFFER_LENGTH);726update_varying_buf(builder, buf_ptr, vertex_count);727728buf_ptr = get_address_imm(b, builder->varyings.varying_bufs,729PAN_VARY_POSITION *730MALI_ATTRIBUTE_BUFFER_LENGTH);731builder->varyings.pos_ptr =732update_varying_buf(builder, buf_ptr, vertex_count);733734if (builder->flags & PAN_INDIRECT_DRAW_HAS_PSIZ) {735buf_ptr = get_address_imm(b, builder->varyings.varying_bufs,736PAN_VARY_PSIZ *737MALI_ATTRIBUTE_BUFFER_LENGTH);738builder->varyings.psiz_ptr =739update_varying_buf(builder, buf_ptr, vertex_count);740}741742set_draw_ctx_field(builder, varying_mem,743nir_load_var(b, builder->varyings.mem_ptr));744}745746/* Based on panfrost_pack_work_groups_compute() */747748static void749get_invocation(struct indirect_draw_shader_builder *builder)750{751nir_builder *b = &builder->b;752nir_ssa_def *one = nir_imm_int(b, 1);753nir_ssa_def *max_vertex =754nir_usub_sat(b, builder->instance_size.raw, one);755nir_ssa_def *max_instance =756nir_usub_sat(b, builder->draw.instance_count, one);757nir_ssa_def *split =758nir_bcsel(b, nir_ieq_imm(b, max_instance, 0),759nir_imm_int(b, 32),760nir_iadd_imm(b, nir_ufind_msb(b, max_vertex), 1));761762builder->jobs.invocation =763nir_vec2(b,764nir_ior(b, max_vertex,765nir_ishl(b, max_instance, split)),766nir_ior(b, nir_ishl(b, split, nir_imm_int(b, 22)),767nir_imm_int(b, 2 << 28)));768}769770/* Based on panfrost_padded_vertex_count() */771772static nir_ssa_def *773get_padded_count(nir_builder *b, nir_ssa_def *val, nir_ssa_def **packed)774{775nir_ssa_def *one = nir_imm_int(b, 1);776nir_ssa_def *zero = nir_imm_int(b, 0);777nir_ssa_def *eleven = nir_imm_int(b, 11);778nir_ssa_def *four = nir_imm_int(b, 4);779780nir_ssa_def *exp =781nir_usub_sat(b, nir_imax(b, nir_ufind_msb(b, val), zero), four);782nir_ssa_def *base = nir_ushr(b, val, exp);783784base = nir_iadd(b, base,785nir_bcsel(b, nir_ine(b, val, nir_ishl(b, base, exp)), one, zero));786787nir_ssa_def *rshift = nir_imax(b, nir_find_lsb(b, base), zero);788exp = nir_iadd(b, exp, rshift);789base = nir_ushr(b, base, rshift);790base = nir_iadd(b, base, nir_bcsel(b, nir_uge(b, base, eleven), one, zero));791rshift = nir_imax(b, nir_find_lsb(b, base), zero);792exp = nir_iadd(b, exp, rshift);793base = nir_ushr(b, base, rshift);794795*packed = nir_ior(b, exp,796nir_ishl(b, nir_ushr_imm(b, base, 1), nir_imm_int(b, 5)));797return nir_ishl(b, base, exp);798}799800static void801update_jobs(struct indirect_draw_shader_builder *builder)802{803get_invocation(builder);804update_job(builder, MALI_JOB_TYPE_VERTEX);805update_job(builder, MALI_JOB_TYPE_TILER);806}807808static void809get_instance_size(struct indirect_draw_shader_builder *builder)810{811nir_builder *b = &builder->b;812813if (!builder->index_size) {814builder->jobs.base_vertex_offset = nir_imm_int(b, 0);815builder->jobs.offset_start = builder->draw.vertex_start;816builder->instance_size.raw = builder->draw.vertex_count;817return;818}819820unsigned index_size = builder->index_size;821nir_ssa_def *min = get_min_max_ctx_field(builder, min);822nir_ssa_def *max = get_min_max_ctx_field(builder, max);823824/* We handle unaligned indices here to avoid the extra complexity in825* the min/max search job.826*/827if (builder->index_size < 4) {828nir_variable *min_var =829nir_local_variable_create(b->impl, glsl_uint_type(), "min");830nir_store_var(b, min_var, min, 1);831nir_variable *max_var =832nir_local_variable_create(b->impl, glsl_uint_type(), "max");833nir_store_var(b, max_var, max, 1);834835nir_ssa_def *base =836get_address(b, builder->draw.index_buf,837nir_imul_imm(b, builder->draw.vertex_start, index_size));838nir_ssa_def *offset = nir_iand_imm(b, nir_unpack_64_2x32_split_x(b, base), 3);839nir_ssa_def *end =840nir_iadd(b, offset,841nir_imul_imm(b, builder->draw.vertex_count, index_size));842nir_ssa_def *aligned_end = nir_iand_imm(b, end, ~3);843unsigned shift = index_size * 8;844unsigned mask = (1 << shift) - 1;845846base = nir_iand(b, base, nir_imm_int64(b, ~3ULL));847848/* Unaligned start offset, we need to ignore any data that's849* outside the requested range. We also handle ranges that are850* covering less than 2 words here.851*/852IF (nir_ior(b, nir_ine(b, offset, nir_imm_int(b, 0)), nir_ieq(b, aligned_end, nir_imm_int(b, 0)))) {853min = nir_load_var(b, min_var);854max = nir_load_var(b, max_var);855856nir_ssa_def *val = load_global(b, base, 1, 32);857for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) {858nir_ssa_def *oob =859nir_ior(b,860nir_ult(b, nir_imm_int(b, i), offset),861nir_uge(b, nir_imm_int(b, i), end));862nir_ssa_def *data = nir_iand_imm(b, val, mask);863864min = nir_umin(b, min,865nir_bcsel(b, oob, nir_imm_int(b, UINT32_MAX), data));866max = nir_umax(b, max,867nir_bcsel(b, oob, nir_imm_int(b, 0), data));868val = nir_ushr_imm(b, val, shift);869}870871nir_store_var(b, min_var, min, 1);872nir_store_var(b, max_var, max, 1);873} ENDIF874875nir_ssa_def *remaining = nir_isub(b, end, aligned_end);876877/* The last word contains less than 4bytes of data, we need to878* discard anything falling outside the requested range.879*/880IF (nir_iand(b, nir_ine(b, end, aligned_end), nir_ine(b, aligned_end, nir_imm_int(b, 0)))) {881min = nir_load_var(b, min_var);882max = nir_load_var(b, max_var);883884nir_ssa_def *val = load_global(b, get_address(b, base, aligned_end), 1, 32);885for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) {886nir_ssa_def *oob = nir_uge(b, nir_imm_int(b, i), remaining);887nir_ssa_def *data = nir_iand_imm(b, val, mask);888889min = nir_umin(b, min,890nir_bcsel(b, oob, nir_imm_int(b, UINT32_MAX), data));891max = nir_umax(b, max,892nir_bcsel(b, oob, nir_imm_int(b, 0), data));893val = nir_ushr_imm(b, val, shift);894}895896nir_store_var(b, min_var, min, 1);897nir_store_var(b, max_var, max, 1);898} ENDIF899900min = nir_load_var(b, min_var);901max = nir_load_var(b, max_var);902}903904builder->jobs.base_vertex_offset = nir_ineg(b, min);905builder->jobs.offset_start = nir_iadd(b, min, builder->draw.index_bias);906builder->instance_size.raw = nir_iadd_imm(b, nir_usub_sat(b, max, min), 1);907}908909/* Patch a draw sequence */910911static void912patch(struct indirect_draw_shader_builder *builder)913{914unsigned index_size = builder->index_size;915nir_builder *b = &builder->b;916917nir_ssa_def *draw_ptr = builder->draw.draw_buf;918919if (index_size) {920builder->draw.vertex_count = get_indexed_draw_field(b, draw_ptr, count);921builder->draw.start_instance = get_indexed_draw_field(b, draw_ptr, start_instance);922builder->draw.instance_count =923get_indexed_draw_field(b, draw_ptr, instance_count);924builder->draw.vertex_start = get_indexed_draw_field(b, draw_ptr, start);925builder->draw.index_bias = get_indexed_draw_field(b, draw_ptr, index_bias);926} else {927builder->draw.vertex_count = get_draw_field(b, draw_ptr, count);928builder->draw.start_instance = get_draw_field(b, draw_ptr, start_instance);929builder->draw.instance_count = get_draw_field(b, draw_ptr, instance_count);930builder->draw.vertex_start = get_draw_field(b, draw_ptr, start);931}932933assert(builder->draw.vertex_count->num_components);934935get_instance_size(builder);936937builder->instance_size.padded =938get_padded_count(b, builder->instance_size.raw,939&builder->instance_size.packed);940941update_varyings(builder);942update_jobs(builder);943update_vertex_attribs(builder);944945IF (nir_ine(b, builder->jobs.first_vertex_sysval, nir_imm_int64(b, 0))) {946store_global(b, builder->jobs.first_vertex_sysval,947builder->jobs.offset_start, 1);948} ENDIF949950IF (nir_ine(b, builder->jobs.base_vertex_sysval, nir_imm_int64(b, 0))) {951store_global(b, builder->jobs.base_vertex_sysval,952index_size ?953builder->draw.index_bias :954nir_imm_int(b, 0),9551);956} ENDIF957958IF (nir_ine(b, builder->jobs.base_instance_sysval, nir_imm_int64(b, 0))) {959store_global(b, builder->jobs.base_instance_sysval,960builder->draw.start_instance, 1);961} ENDIF962963}964965/* Search the min/max index in the range covered by the indirect draw call */966967static void968get_index_min_max(struct indirect_draw_shader_builder *builder)969{970nir_ssa_def *restart_index = builder->draw.restart_index;971unsigned index_size = builder->index_size;972nir_builder *b = &builder->b;973974nir_ssa_def *draw_ptr = builder->draw.draw_buf;975976builder->draw.vertex_count = get_draw_field(b, draw_ptr, count);977builder->draw.vertex_start = get_draw_field(b, draw_ptr, start);978979nir_ssa_def *thread_id = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);980nir_variable *min_var =981nir_local_variable_create(b->impl, glsl_uint_type(), "min");982nir_store_var(b, min_var, nir_imm_int(b, UINT32_MAX), 1);983nir_variable *max_var =984nir_local_variable_create(b->impl, glsl_uint_type(), "max");985nir_store_var(b, max_var, nir_imm_int(b, 0), 1);986987nir_ssa_def *base =988get_address(b, builder->draw.index_buf,989nir_imul_imm(b, builder->draw.vertex_start, index_size));990991992nir_ssa_def *start = nir_iand_imm(b, nir_unpack_64_2x32_split_x(b, base), 3);993nir_ssa_def *end =994nir_iadd(b, start, nir_imul_imm(b, builder->draw.vertex_count, index_size));995996base = nir_iand(b, base, nir_imm_int64(b, ~3ULL));997998/* Align on 4 bytes, non-aligned indices are handled in the indirect draw job. */999start = nir_iand_imm(b, nir_iadd_imm(b, start, 3), ~3);1000end = nir_iand_imm(b, end, ~3);10011002/* Add the job offset. */1003start = nir_iadd(b, start, nir_imul_imm(b, thread_id, sizeof(uint32_t)));10041005nir_variable *offset_var =1006nir_local_variable_create(b->impl, glsl_uint_type(), "offset");1007nir_store_var(b, offset_var, start, 1);10081009LOOP {1010nir_ssa_def *offset = nir_load_var(b, offset_var);1011IF (nir_uge(b, offset, end))1012BREAK;1013ENDIF10141015nir_ssa_def *val = load_global(b, get_address(b, base, offset), 1, 32);1016nir_ssa_def *old_min = nir_load_var(b, min_var);1017nir_ssa_def *old_max = nir_load_var(b, max_var);1018nir_ssa_def *new_min;1019nir_ssa_def *new_max;10201021/* TODO: use 8/16 bit arithmetic when index_size < 4. */1022for (unsigned i = 0; i < 4; i += index_size) {1023nir_ssa_def *data = nir_ushr_imm(b, val, i * 8);1024data = nir_iand_imm(b, data, (1ULL << (index_size * 8)) - 1);1025new_min = nir_umin(b, old_min, data);1026new_max = nir_umax(b, old_max, data);1027if (restart_index) {1028new_min = nir_bcsel(b, nir_ine(b, restart_index, data), new_min, old_min);1029new_max = nir_bcsel(b, nir_ine(b, restart_index, data), new_max, old_max);1030}1031old_min = new_min;1032old_max = new_max;1033}10341035nir_store_var(b, min_var, new_min, 1);1036nir_store_var(b, max_var, new_max, 1);1037nir_store_var(b, offset_var,1038nir_iadd_imm(b, offset, MIN_MAX_JOBS * sizeof(uint32_t)), 1);1039}10401041IF (nir_ult(b, start, end))1042update_min(builder, nir_load_var(b, min_var));1043update_max(builder, nir_load_var(b, max_var));1044ENDIF1045}10461047static unsigned1048get_shader_id(unsigned flags, unsigned index_size, bool index_min_max_search)1049{1050if (!index_min_max_search) {1051flags &= PAN_INDIRECT_DRAW_FLAGS_MASK;1052flags &= ~PAN_INDIRECT_DRAW_INDEX_SIZE_MASK;1053if (index_size)1054flags |= (util_logbase2(index_size) + 1);1055return flags;1056}10571058return PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX +1059util_logbase2(index_size);1060}10611062static void1063create_indirect_draw_shader(struct panfrost_device *dev,1064unsigned flags, unsigned index_size,1065bool index_min_max_search)1066{1067assert(flags < PAN_INDIRECT_DRAW_NUM_SHADERS);1068struct indirect_draw_shader_builder builder;1069init_shader_builder(&builder, dev, flags, index_size, index_min_max_search);10701071nir_builder *b = &builder.b;10721073if (index_min_max_search)1074get_index_min_max(&builder);1075else1076patch(&builder);10771078struct panfrost_compile_inputs inputs = { .gpu_id = dev->gpu_id };1079struct pan_shader_info shader_info;1080struct util_dynarray binary;10811082util_dynarray_init(&binary, NULL);1083pan_shader_compile(dev, b->shader, &inputs, &binary, &shader_info);10841085assert(!shader_info.tls_size);1086assert(!shader_info.wls_size);1087assert(!shader_info.sysvals.sysval_count);10881089unsigned shader_id = get_shader_id(flags, index_size, index_min_max_search);1090struct pan_indirect_draw_shader *draw_shader =1091&dev->indirect_draw_shaders.shaders[shader_id];1092void *state = dev->indirect_draw_shaders.states->ptr.cpu +1093(shader_id * MALI_RENDERER_STATE_LENGTH);10941095pthread_mutex_lock(&dev->indirect_draw_shaders.lock);1096if (!draw_shader->rsd) {1097mali_ptr address =1098pan_pool_upload_aligned(dev->indirect_draw_shaders.bin_pool,1099binary.data, binary.size,1100pan_is_bifrost(dev) ? 128 : 64);1101if (!pan_is_bifrost(dev))1102address |= shader_info.midgard.first_tag;11031104util_dynarray_fini(&binary);11051106pan_pack(state, RENDERER_STATE, cfg) {1107pan_shader_prepare_rsd(dev, &shader_info, address, &cfg);1108}1109pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);11101111draw_shader->push = shader_info.push;1112draw_shader->rsd = dev->indirect_draw_shaders.states->ptr.gpu +1113(shader_id * MALI_RENDERER_STATE_LENGTH);1114}1115pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);11161117ralloc_free(b->shader);1118}11191120static mali_ptr1121get_renderer_state(struct panfrost_device *dev, unsigned flags,1122unsigned index_size, bool index_min_max_search)1123{1124unsigned shader_id = get_shader_id(flags, index_size, index_min_max_search);1125struct pan_indirect_draw_shader *info =1126&dev->indirect_draw_shaders.shaders[shader_id];11271128if (!info->rsd) {1129create_indirect_draw_shader(dev, flags, index_size,1130index_min_max_search);1131assert(info->rsd);1132}11331134return info->rsd;1135}11361137static mali_ptr1138get_tls(const struct panfrost_device *dev)1139{1140return dev->indirect_draw_shaders.states->ptr.gpu +1141(PAN_INDIRECT_DRAW_NUM_SHADERS * MALI_RENDERER_STATE_LENGTH);1142}11431144static mali_ptr1145get_ubos(struct pan_pool *pool,1146const struct indirect_draw_inputs *inputs)1147{1148struct panfrost_ptr inputs_buf =1149pan_pool_alloc_aligned(pool, sizeof(inputs), 16);11501151memcpy(inputs_buf.cpu, &inputs, sizeof(inputs));11521153struct panfrost_ptr ubos_buf =1154pan_pool_alloc_desc(pool, UNIFORM_BUFFER);11551156pan_pack(ubos_buf.cpu, UNIFORM_BUFFER, cfg) {1157cfg.entries = DIV_ROUND_UP(sizeof(inputs), 16);1158cfg.pointer = inputs_buf.gpu;1159}11601161return ubos_buf.gpu;1162}11631164static mali_ptr1165get_push_uniforms(struct pan_pool *pool,1166const struct pan_indirect_draw_shader *shader,1167const struct indirect_draw_inputs *inputs)1168{1169if (!shader->push.count)1170return 0;11711172struct panfrost_ptr push_consts_buf =1173pan_pool_alloc_aligned(pool, shader->push.count * 4, 16);1174uint32_t *out = push_consts_buf.cpu;1175uint8_t *in = (uint8_t *)inputs;11761177for (unsigned i = 0; i < shader->push.count; ++i)1178memcpy(out + i, in + shader->push.words[i].offset, 4);11791180return push_consts_buf.gpu;1181}11821183static void1184panfrost_indirect_draw_alloc_deps(struct panfrost_device *dev)1185{1186pthread_mutex_lock(&dev->indirect_draw_shaders.lock);1187if (dev->indirect_draw_shaders.states)1188goto out;11891190unsigned state_bo_size = (PAN_INDIRECT_DRAW_NUM_SHADERS *1191MALI_RENDERER_STATE_LENGTH) +1192MALI_LOCAL_STORAGE_LENGTH;11931194dev->indirect_draw_shaders.states =1195panfrost_bo_create(dev, state_bo_size, 0, "Indirect draw states");11961197/* Prepare the thread storage descriptor now since it's invariant. */1198void *tsd = dev->indirect_draw_shaders.states->ptr.cpu +1199(PAN_INDIRECT_DRAW_NUM_SHADERS * MALI_RENDERER_STATE_LENGTH);1200pan_pack(tsd, LOCAL_STORAGE, ls) {1201ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM;1202};12031204/* FIXME: Currently allocating 512M of growable memory, meaning that we1205* only allocate what we really use, the problem is:1206* - allocation happens 2M at a time, which might be more than we1207* actually need1208* - the memory is attached to the device to speed up subsequent1209* indirect draws, but that also means it's never shrinked1210*/1211dev->indirect_draw_shaders.varying_heap =1212panfrost_bo_create(dev, 512 * 1024 * 1024,1213PAN_BO_INVISIBLE | PAN_BO_GROWABLE,1214"Indirect draw varying heap");12151216out:1217pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);1218}12191220static unsigned1221panfrost_emit_index_min_max_search(struct pan_pool *pool,1222struct pan_scoreboard *scoreboard,1223const struct pan_indirect_draw_info *draw_info,1224const struct indirect_draw_inputs *inputs,1225struct indirect_draw_context *draw_ctx,1226mali_ptr ubos)1227{1228struct panfrost_device *dev = pool->dev;1229unsigned index_size = draw_info->index_size;12301231if (!index_size)1232return 0;12331234mali_ptr rsd =1235get_renderer_state(dev, draw_info->flags,1236draw_info->index_size, true);1237unsigned shader_id =1238get_shader_id(draw_info->flags, draw_info->index_size, true);1239const struct pan_indirect_draw_shader *shader =1240&dev->indirect_draw_shaders.shaders[shader_id];1241struct panfrost_ptr job =1242pan_pool_alloc_desc(pool, COMPUTE_JOB);1243void *invocation =1244pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);1245panfrost_pack_work_groups_compute(invocation,12461, 1, 1, MIN_MAX_JOBS, 1, 1,1247false, false);12481249pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {1250cfg.job_task_split = 7;1251}12521253pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {1254cfg.draw_descriptor_is_64b = true;1255cfg.texture_descriptor_is_64b = !pan_is_bifrost(dev);1256cfg.state = rsd;1257cfg.thread_storage = get_tls(pool->dev);1258cfg.uniform_buffers = ubos;1259cfg.push_uniforms = get_push_uniforms(pool, shader, inputs);1260}12611262pan_section_pack(job.cpu, COMPUTE_JOB, DRAW_PADDING, cfg);12631264return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE,1265false, false, 0, 0, &job, false);1266}12671268unsigned1269panfrost_emit_indirect_draw(struct pan_pool *pool,1270struct pan_scoreboard *scoreboard,1271const struct pan_indirect_draw_info *draw_info,1272struct panfrost_ptr *ctx)1273{1274struct panfrost_device *dev = pool->dev;12751276/* Currently only tested on Bifrost, but the logic should be the same1277* on Midgard.1278*/1279assert(pan_is_bifrost(dev));12801281panfrost_indirect_draw_alloc_deps(dev);12821283struct panfrost_ptr job =1284pan_pool_alloc_desc(pool, COMPUTE_JOB);1285mali_ptr rsd =1286get_renderer_state(dev, draw_info->flags,1287draw_info->index_size, false);12881289struct indirect_draw_context draw_ctx = {1290.varying_mem = dev->indirect_draw_shaders.varying_heap->ptr.gpu,1291};12921293struct panfrost_ptr draw_ctx_ptr = *ctx;1294if (!draw_ctx_ptr.cpu) {1295draw_ctx_ptr = pan_pool_alloc_aligned(pool,1296sizeof(draw_ctx),1297sizeof(mali_ptr));1298}12991300struct indirect_draw_inputs inputs = {1301.draw_ctx = draw_ctx_ptr.gpu,1302.draw_buf = draw_info->draw_buf,1303.index_buf = draw_info->index_buf,1304.first_vertex_sysval = draw_info->first_vertex_sysval,1305.base_vertex_sysval = draw_info->base_vertex_sysval,1306.base_instance_sysval = draw_info->base_instance_sysval,1307.vertex_job = draw_info->vertex_job,1308.tiler_job = draw_info->tiler_job,1309.attrib_bufs = draw_info->attrib_bufs,1310.attribs = draw_info->attribs,1311.varying_bufs = draw_info->varying_bufs,1312.attrib_count = draw_info->attrib_count,1313};13141315if (draw_info->index_size) {1316inputs.restart_index = draw_info->restart_index;13171318struct panfrost_ptr min_max_ctx_ptr =1319pan_pool_alloc_aligned(pool,1320sizeof(struct min_max_context),13214);1322struct min_max_context *ctx = min_max_ctx_ptr.cpu;13231324ctx->min = UINT32_MAX;1325ctx->max = 0;1326inputs.min_max_ctx = min_max_ctx_ptr.gpu;1327}13281329unsigned shader_id =1330get_shader_id(draw_info->flags, draw_info->index_size, false);1331const struct pan_indirect_draw_shader *shader =1332&dev->indirect_draw_shaders.shaders[shader_id];1333mali_ptr ubos = get_ubos(pool, &inputs);13341335void *invocation =1336pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);1337panfrost_pack_work_groups_compute(invocation,13381, 1, 1, 1, 1, 1,1339false, false);13401341pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {1342cfg.job_task_split = 2;1343}13441345pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {1346cfg.draw_descriptor_is_64b = true;1347cfg.texture_descriptor_is_64b = !pan_is_bifrost(dev);1348cfg.state = rsd;1349cfg.thread_storage = get_tls(pool->dev);1350cfg.uniform_buffers = ubos;1351cfg.push_uniforms = get_push_uniforms(pool, shader, &inputs);1352}13531354pan_section_pack(job.cpu, COMPUTE_JOB, DRAW_PADDING, cfg);13551356unsigned global_dep = draw_info->last_indirect_draw;1357unsigned local_dep =1358panfrost_emit_index_min_max_search(pool, scoreboard, draw_info,1359&inputs, &draw_ctx, ubos);13601361if (!ctx->cpu) {1362*ctx = draw_ctx_ptr;1363memcpy(ctx->cpu, &draw_ctx, sizeof(draw_ctx));1364}13651366return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE,1367false, true, local_dep, global_dep,1368&job, false);1369}13701371void1372panfrost_init_indirect_draw_shaders(struct panfrost_device *dev,1373struct pan_pool *bin_pool)1374{1375/* We allocate the states and varying_heap BO lazily to avoid1376* reserving memory when indirect draws are not used.1377*/1378pthread_mutex_init(&dev->indirect_draw_shaders.lock, NULL);1379dev->indirect_draw_shaders.bin_pool = bin_pool;1380}13811382void1383panfrost_cleanup_indirect_draw_shaders(struct panfrost_device *dev)1384{1385panfrost_bo_unreference(dev->indirect_draw_shaders.states);1386panfrost_bo_unreference(dev->indirect_draw_shaders.varying_heap);1387pthread_mutex_destroy(&dev->indirect_draw_shaders.lock);1388}138913901391