Path: blob/21.2-virgl/src/broadcom/vulkan/v3dv_cmd_buffer.c
/*
 * Copyright © 2019 Raspberry Pi
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3dv_private.h"
#include "util/u_pack_color.h"
#include "vk_format_info.h"
#include "vk_util.h"

const struct v3dv_dynamic_state default_dynamic_state = {
   .viewport = {
      .count = 0,
   },
   .scissor = {
      .count = 0,
   },
   .stencil_compare_mask =
   {
      .front = ~0u,
      .back = ~0u,
   },
   .stencil_write_mask =
   {
      .front = ~0u,
      .back = ~0u,
   },
   .stencil_reference =
   {
      .front = 0u,
      .back = 0u,
   },
   .blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f },
   .depth_bias = {
      .constant_factor = 0.0f,
      .depth_bias_clamp = 0.0f,
      .slope_factor = 0.0f,
   },
   .line_width = 1.0f,
};

void
v3dv_job_add_bo(struct v3dv_job *job, struct v3dv_bo *bo)
{
   if (!bo)
      return;

   if (job->bo_handle_mask & bo->handle_bit) {
      if (_mesa_set_search(job->bos, bo))
         return;
   }

   _mesa_set_add(job->bos, bo);
   job->bo_count++;
   job->bo_handle_mask |= bo->handle_bit;
}

void
v3dv_job_add_bo_unchecked(struct v3dv_job *job, struct v3dv_bo *bo)
{
   assert(bo);
   _mesa_set_add(job->bos, bo);
   job->bo_count++;
   job->bo_handle_mask |= bo->handle_bit;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_CreateCommandPool(VkDevice _device,
                       const VkCommandPoolCreateInfo *pCreateInfo,
                       const VkAllocationCallbacks *pAllocator,
                       VkCommandPool *pCmdPool)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   struct v3dv_cmd_pool *pool;

   /* We only support one queue */
   assert(pCreateInfo->queueFamilyIndex == 0);

   pool = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pool),
                           VK_OBJECT_TYPE_COMMAND_POOL);
   if (pool == NULL)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   if (pAllocator)
      pool->alloc = *pAllocator;
   else
      pool->alloc = device->vk.alloc;

   list_inithead(&pool->cmd_buffers);

   *pCmdPool = v3dv_cmd_pool_to_handle(pool);

   return VK_SUCCESS;
}

static void
cmd_buffer_init(struct v3dv_cmd_buffer *cmd_buffer,
                struct v3dv_device *device,
                struct v3dv_cmd_pool *pool,
                VkCommandBufferLevel level)
{
   /* Do not reset the base object! If we are calling this from a command
    * buffer reset, that would reset the loader's dispatch table for the
    * command buffer, as well as any other relevant info from vk_object_base.
    */
   const uint32_t base_size = sizeof(struct vk_object_base);
   uint8_t *cmd_buffer_driver_start = ((uint8_t *) cmd_buffer) + base_size;
   memset(cmd_buffer_driver_start, 0, sizeof(*cmd_buffer) - base_size);

   cmd_buffer->device = device;
   cmd_buffer->pool = pool;
   cmd_buffer->level = level;

   list_inithead(&cmd_buffer->private_objs);
   list_inithead(&cmd_buffer->jobs);
   list_inithead(&cmd_buffer->list_link);

   assert(pool);
   list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);

   cmd_buffer->state.subpass_idx = -1;
   cmd_buffer->state.meta.subpass_idx = -1;

   cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_INITIALIZED;
}

static VkResult
cmd_buffer_create(struct v3dv_device *device,
                  struct v3dv_cmd_pool *pool,
                  VkCommandBufferLevel level,
                  VkCommandBuffer *pCommandBuffer)
{
   struct v3dv_cmd_buffer *cmd_buffer;
   cmd_buffer = vk_object_zalloc(&device->vk,
                                 &pool->alloc,
                                 sizeof(*cmd_buffer),
                                 VK_OBJECT_TYPE_COMMAND_BUFFER);
   if (cmd_buffer == NULL)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   cmd_buffer_init(cmd_buffer, device, pool, level);

   *pCommandBuffer = v3dv_cmd_buffer_to_handle(cmd_buffer);

   return VK_SUCCESS;
}

static void
job_destroy_gpu_cl_resources(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_GPU_CL ||
          job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);

   v3dv_cl_destroy(&job->bcl);
   v3dv_cl_destroy(&job->rcl);
   v3dv_cl_destroy(&job->indirect);

   /* Since we don't ref BOs when we add them to the command buffer, don't
    * unref them here either. BOs will be freed when their corresponding API
    * objects are destroyed.
    */
   _mesa_set_destroy(job->bos, NULL);

   v3dv_bo_free(job->device, job->tile_alloc);
   v3dv_bo_free(job->device, job->tile_state);
}

static void
job_destroy_cloned_gpu_cl_resources(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_GPU_CL);

   list_for_each_entry_safe(struct v3dv_bo, bo, &job->bcl.bo_list, list_link) {
      list_del(&bo->list_link);
      vk_free(&job->device->vk.alloc, bo);
   }

   list_for_each_entry_safe(struct v3dv_bo, bo, &job->rcl.bo_list, list_link) {
      list_del(&bo->list_link);
      vk_free(&job->device->vk.alloc, bo);
   }

   list_for_each_entry_safe(struct v3dv_bo, bo, &job->indirect.bo_list, list_link) {
      list_del(&bo->list_link);
      vk_free(&job->device->vk.alloc, bo);
   }
}

static void
job_destroy_gpu_csd_resources(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
   assert(job->cmd_buffer);

   v3dv_cl_destroy(&job->indirect);

   _mesa_set_destroy(job->bos, NULL);

   if (job->csd.shared_memory)
      v3dv_bo_free(job->device, job->csd.shared_memory);
}

static void
job_destroy_cpu_wait_events_resources(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);
   assert(job->cmd_buffer);
   vk_free(&job->cmd_buffer->device->vk.alloc, job->cpu.event_wait.events);
}

static void
job_destroy_cpu_csd_indirect_resources(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT);
   assert(job->cmd_buffer);
   v3dv_job_destroy(job->cpu.csd_indirect.csd_job);
}

void
v3dv_job_destroy(struct v3dv_job *job)
{
   assert(job);

   list_del(&job->list_link);

   /* Cloned jobs don't make deep copies of the original jobs, so they don't
    * own any of their resources. However, they do allocate clones of BO
    * structs, so make sure we free those.
    */
   if (!job->is_clone) {
      switch (job->type) {
      case V3DV_JOB_TYPE_GPU_CL:
      case V3DV_JOB_TYPE_GPU_CL_SECONDARY:
         job_destroy_gpu_cl_resources(job);
         break;
      case V3DV_JOB_TYPE_GPU_CSD:
         job_destroy_gpu_csd_resources(job);
         break;
      case V3DV_JOB_TYPE_CPU_WAIT_EVENTS:
         job_destroy_cpu_wait_events_resources(job);
         break;
      case V3DV_JOB_TYPE_CPU_CSD_INDIRECT:
         job_destroy_cpu_csd_indirect_resources(job);
         break;
      default:
         break;
      }
   } else {
      /* Cloned jobs */
      if (job->type == V3DV_JOB_TYPE_GPU_CL)
         job_destroy_cloned_gpu_cl_resources(job);
   }

   vk_free(&job->device->vk.alloc, job);
}

void
v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
                                uint64_t obj,
                                v3dv_cmd_buffer_private_obj_destroy_cb destroy_cb)
{
   struct v3dv_cmd_buffer_private_obj *pobj =
      vk_alloc(&cmd_buffer->device->vk.alloc, sizeof(*pobj), 8,
               VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!pobj) {
      v3dv_flag_oom(cmd_buffer, NULL);
      return;
   }

   pobj->obj = obj;
   pobj->destroy_cb = destroy_cb;

   list_addtail(&pobj->list_link, &cmd_buffer->private_objs);
}

static void
cmd_buffer_destroy_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
                               struct v3dv_cmd_buffer_private_obj *pobj)
{
   assert(pobj && pobj->obj && pobj->destroy_cb);
   pobj->destroy_cb(v3dv_device_to_handle(cmd_buffer->device),
                    pobj->obj,
                    &cmd_buffer->device->vk.alloc);
   list_del(&pobj->list_link);
   vk_free(&cmd_buffer->device->vk.alloc, pobj);
}

static void
cmd_buffer_free_resources(struct v3dv_cmd_buffer *cmd_buffer)
{
   list_for_each_entry_safe(struct v3dv_job, job,
                            &cmd_buffer->jobs, list_link) {
      v3dv_job_destroy(job);
   }

   if (cmd_buffer->state.job)
      v3dv_job_destroy(cmd_buffer->state.job);

   if (cmd_buffer->state.attachments)
      vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);

   if (cmd_buffer->state.query.end.alloc_count > 0)
      vk_free(&cmd_buffer->device->vk.alloc, cmd_buffer->state.query.end.states);

   if (cmd_buffer->push_constants_resource.bo)
      v3dv_bo_free(cmd_buffer->device, cmd_buffer->push_constants_resource.bo);

   list_for_each_entry_safe(struct v3dv_cmd_buffer_private_obj, pobj,
                            &cmd_buffer->private_objs, list_link) {
      cmd_buffer_destroy_private_obj(cmd_buffer, pobj);
   }

   if (cmd_buffer->state.meta.attachments) {
      assert(cmd_buffer->state.meta.attachment_alloc_count > 0);
      vk_free(&cmd_buffer->device->vk.alloc, cmd_buffer->state.meta.attachments);
   }
}

static void
cmd_buffer_destroy(struct v3dv_cmd_buffer *cmd_buffer)
{
   list_del(&cmd_buffer->pool_link);
   cmd_buffer_free_resources(cmd_buffer);
   vk_object_free(&cmd_buffer->device->vk, &cmd_buffer->pool->alloc, cmd_buffer);
}

static bool
attachment_list_is_subset(struct v3dv_subpass_attachment *l1, uint32_t l1_count,
                          struct v3dv_subpass_attachment *l2, uint32_t l2_count)
{
   for (uint32_t i = 0; i < l1_count; i++) {
      uint32_t attachment_idx = l1[i].attachment;
      if (attachment_idx == VK_ATTACHMENT_UNUSED)
         continue;

      uint32_t j;
      for (j = 0; j < l2_count; j++) {
         if (l2[j].attachment == attachment_idx)
            break;
      }
      if (j == l2_count)
         return false;
   }

   return true;
}

static bool
cmd_buffer_can_merge_subpass(struct v3dv_cmd_buffer *cmd_buffer,
                             uint32_t subpass_idx)
{
   const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   assert(state->pass);

   const struct v3dv_physical_device *physical_device =
      &cmd_buffer->device->instance->physicalDevice;

   if (cmd_buffer->level != VK_COMMAND_BUFFER_LEVEL_PRIMARY)
      return false;

   if (!cmd_buffer->state.job)
      return false;

   if (cmd_buffer->state.job->always_flush)
      return false;

   if (!physical_device->options.merge_jobs)
      return false;

   /* Each render pass starts a new job */
   if (subpass_idx == 0)
      return false;

   /* Two subpasses can be merged in the same job if we can emit a single RCL
    * for them (since the RCL includes the END_OF_RENDERING command that
    * triggers the "render job finished" interrupt). We can do this so long
    * as both subpasses render against the same attachments.
    */
   assert(state->subpass_idx == subpass_idx - 1);
   struct v3dv_subpass *prev_subpass = &state->pass->subpasses[state->subpass_idx];
   struct v3dv_subpass *subpass = &state->pass->subpasses[subpass_idx];

   /* Because the list of subpass attachments can include VK_ATTACHMENT_UNUSED,
    * we need to check that for each subpass all its used attachments are
    * used by the other subpass.
    */
   bool compatible =
      attachment_list_is_subset(prev_subpass->color_attachments,
                                prev_subpass->color_count,
                                subpass->color_attachments,
                                subpass->color_count);
   if (!compatible)
      return false;

   compatible =
      attachment_list_is_subset(subpass->color_attachments,
                                subpass->color_count,
                                prev_subpass->color_attachments,
                                prev_subpass->color_count);
   if (!compatible)
      return false;

   if (subpass->ds_attachment.attachment !=
       prev_subpass->ds_attachment.attachment)
      return false;

   /* FIXME: Since some attachment formats can't be resolved using the TLB we
    * need to emit separate resolve jobs for them and that would not be
    * compatible with subpass merges. We could fix that by testing if any of
    * the attachments to resolve doesn't support TLB resolves.
    */
   if (prev_subpass->resolve_attachments || subpass->resolve_attachments)
      return false;

   return true;
}

/**
 * Computes and sets the job frame tiling information required to setup frame
 * binning and rendering.
 */
static struct v3dv_frame_tiling *
job_compute_frame_tiling(struct v3dv_job *job,
                         uint32_t width,
                         uint32_t height,
                         uint32_t layers,
                         uint32_t render_target_count,
                         uint8_t max_internal_bpp,
                         bool msaa)
{
   static const uint8_t tile_sizes[] = {
      64, 64,
      64, 32,
      32, 32,
      32, 16,
      16, 16,
      16, 8,
      8, 8
   };

   assert(job);
   struct v3dv_frame_tiling *tiling = &job->frame_tiling;

   tiling->width = width;
   tiling->height = height;
   tiling->layers = layers;
   tiling->render_target_count = render_target_count;
   tiling->msaa = msaa;

   uint32_t tile_size_index = 0;

   if (render_target_count > 2)
      tile_size_index += 2;
   else if (render_target_count > 1)
      tile_size_index += 1;

   if (msaa)
      tile_size_index += 2;

   tiling->internal_bpp = max_internal_bpp;
   tile_size_index += tiling->internal_bpp;
   assert(tile_size_index < ARRAY_SIZE(tile_sizes) / 2);

   tiling->tile_width = tile_sizes[tile_size_index * 2];
   tiling->tile_height = tile_sizes[tile_size_index * 2 + 1];
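
   /* Illustrative note (added, not in the original source): with a single
    * render target, MSAA enabled and a 32-bit internal format (assuming
    * V3D_INTERNAL_BPP_32 == 0), tile_size_index = 0 + 2 + 0 = 2, which
    * selects 32x32 tiles from the table above; each further increment of
    * the index halves one tile dimension.
    */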

   tiling->draw_tiles_x = DIV_ROUND_UP(width, tiling->tile_width);
   tiling->draw_tiles_y = DIV_ROUND_UP(height, tiling->tile_height);

   /* Size up our supertiles until we get under the limit */
   const uint32_t max_supertiles = 256;
   tiling->supertile_width = 1;
   tiling->supertile_height = 1;
   for (;;) {
      tiling->frame_width_in_supertiles =
         DIV_ROUND_UP(tiling->draw_tiles_x, tiling->supertile_width);
      tiling->frame_height_in_supertiles =
         DIV_ROUND_UP(tiling->draw_tiles_y, tiling->supertile_height);
      const uint32_t num_supertiles = tiling->frame_width_in_supertiles *
                                      tiling->frame_height_in_supertiles;
      if (num_supertiles < max_supertiles)
         break;

      if (tiling->supertile_width < tiling->supertile_height)
         tiling->supertile_width++;
      else
         tiling->supertile_height++;
   }

   return tiling;
}

void
v3dv_job_start_frame(struct v3dv_job *job,
                     uint32_t width,
                     uint32_t height,
                     uint32_t layers,
                     uint32_t render_target_count,
                     uint8_t max_internal_bpp,
                     bool msaa)
{
   assert(job);

   /* Start by computing frame tiling spec for this job */
   const struct v3dv_frame_tiling *tiling =
      job_compute_frame_tiling(job,
                               width, height, layers,
                               render_target_count, max_internal_bpp, msaa);

   v3dv_cl_ensure_space_with_branch(&job->bcl, 256);
   v3dv_return_if_oom(NULL, job);

   /* The PTB will request the tile alloc initial size per tile at start
    * of tile binning.
    */
   uint32_t tile_alloc_size = 64 * tiling->layers *
                              tiling->draw_tiles_x *
                              tiling->draw_tiles_y;

   /* The PTB allocates in aligned 4k chunks after the initial setup. */
   tile_alloc_size = align(tile_alloc_size, 4096);

   /* Include the first two chunk allocations that the PTB does so that
    * we definitely clear the OOM condition before triggering one (the HW
    * won't trigger OOM during the first allocations).
    */
   tile_alloc_size += 8192;

   /* For performance, allocate some extra initial memory after the PTB's
    * minimal allocations, so that we hopefully don't have to block the
    * GPU on the kernel handling an OOM signal.
    */
   tile_alloc_size += 512 * 1024;
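
   /* Illustrative example (added, not in the original source): a single
    * layer at 1920x1080 with 64x64 tiles gives 30 x 17 = 510 tiles, so the
    * initial size is 64 * 510 = 32640 bytes, aligned up to 32768, plus the
    * 8192 bytes for the first two PTB chunks and the 512 KB cushion above.
    */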

   job->tile_alloc = v3dv_bo_alloc(job->device, tile_alloc_size,
                                   "tile_alloc", true);
   if (!job->tile_alloc) {
      v3dv_flag_oom(NULL, job);
      return;
   }

   v3dv_job_add_bo_unchecked(job, job->tile_alloc);

   const uint32_t tsda_per_tile_size = 256;
   const uint32_t tile_state_size = tiling->layers *
                                    tiling->draw_tiles_x *
                                    tiling->draw_tiles_y *
                                    tsda_per_tile_size;
   job->tile_state = v3dv_bo_alloc(job->device, tile_state_size, "TSDA", true);
   if (!job->tile_state) {
      v3dv_flag_oom(NULL, job);
      return;
   }

   v3dv_job_add_bo_unchecked(job, job->tile_state);

   v3dv_X(job->device, job_emit_binning_prolog)(job, tiling, layers);

   job->ez_state = V3D_EZ_UNDECIDED;
   job->first_ez_state = V3D_EZ_UNDECIDED;
}

static void
cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer)
{
   assert(cmd_buffer->state.job);

   /* Typically, we have a single job for each subpass and we emit the job's RCL
    * here when we are ending the frame for the subpass. However, some commands
    * such as vkCmdClearAttachments need to run in their own separate job and
    * they emit their own RCL even if they execute inside a subpass. In this
    * scenario, we don't want to emit the subpass RCL when we end the frame for
    * those jobs, so we only emit the subpass RCL if the job has not recorded
    * any RCL commands of its own.
    */
   if (v3dv_cl_offset(&cmd_buffer->state.job->rcl) == 0)
      v3dv_X(cmd_buffer->device, cmd_buffer_emit_render_pass_rcl)(cmd_buffer);

   v3dv_X(cmd_buffer->device, job_emit_binning_flush)(cmd_buffer->state.job);
}

struct v3dv_job *
v3dv_cmd_buffer_create_cpu_job(struct v3dv_device *device,
                               enum v3dv_job_type type,
                               struct v3dv_cmd_buffer *cmd_buffer,
                               uint32_t subpass_idx)
{
   struct v3dv_job *job = vk_zalloc(&device->vk.alloc,
                                    sizeof(struct v3dv_job), 8,
                                    VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!job) {
      v3dv_flag_oom(cmd_buffer, NULL);
      return NULL;
   }

   v3dv_job_init(job, type, device, cmd_buffer, subpass_idx);
   return job;
}

static void
cmd_buffer_add_cpu_jobs_for_pending_state(struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;

   if (state->query.end.used_count > 0) {
      const uint32_t query_count = state->query.end.used_count;
      for (uint32_t i = 0; i < query_count; i++) {
         assert(i < state->query.end.used_count);
         struct v3dv_job *job =
            v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
                                           V3DV_JOB_TYPE_CPU_END_QUERY,
                                           cmd_buffer, -1);
         v3dv_return_if_oom(cmd_buffer, NULL);

         job->cpu.query_end = state->query.end.states[i];
         list_addtail(&job->list_link, &cmd_buffer->jobs);
      }
   }
}

void
v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_job *job = cmd_buffer->state.job;
   if (!job)
      return;

   if (cmd_buffer->state.oom) {
      v3dv_job_destroy(job);
      cmd_buffer->state.job = NULL;
      return;
   }

   /* If we have created a job for a command buffer then we should have
    * recorded something into it: if the job was started in a render pass, it
    * should at least have the start frame commands, otherwise, it should have
    * a transfer command. The only exception are secondary command buffers
    * inside a render pass.
    */
   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
          v3dv_cl_offset(&job->bcl) > 0);

   /* When we merge multiple subpasses into the same job we must only emit one
    * RCL, so we do that here, when we decided that we need to finish the job.
    * Any rendering that happens outside a render pass is never merged, so
    * the RCL should have been emitted by the time we got here.
    */
   assert(v3dv_cl_offset(&job->rcl) != 0 || cmd_buffer->state.pass);

   /* If we are finishing a job inside a render pass we have two scenarios:
    *
    * 1. It is a regular CL, in which case we will submit the job to the GPU,
    *    so we may need to generate an RCL and add a binning flush.
    *
    * 2. It is a partial CL recorded in a secondary command buffer, in which
    *    case we are not submitting it directly to the GPU but rather branch to
    *    it from a primary command buffer. In this case we just want to end
    *    the BCL with a RETURN_FROM_SUB_LIST; the RCL and binning flush will
    *    be in the primary job that branches to this CL.
    */
   if (cmd_buffer->state.pass) {
      if (job->type == V3DV_JOB_TYPE_GPU_CL) {
         cmd_buffer_end_render_pass_frame(cmd_buffer);
      } else {
         assert(job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);
         v3dv_X(cmd_buffer->device, cmd_buffer_end_render_pass_secondary)(cmd_buffer);
      }
   }

   list_addtail(&job->list_link, &cmd_buffer->jobs);
   cmd_buffer->state.job = NULL;

   /* If we have recorded any state with this last GPU job that requires to
    * emit CPU jobs after the job is completed, add them now. The only
    * exception is secondary command buffers inside a render pass, because in
    * that case we want to defer this until we finish recording the primary
    * job into which we execute the secondary.
    */
   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY ||
       !cmd_buffer->state.pass) {
      cmd_buffer_add_cpu_jobs_for_pending_state(cmd_buffer);
   }
}

static bool
job_type_is_gpu(struct v3dv_job *job)
{
   switch (job->type) {
   case V3DV_JOB_TYPE_GPU_CL:
   case V3DV_JOB_TYPE_GPU_CL_SECONDARY:
   case V3DV_JOB_TYPE_GPU_TFU:
   case V3DV_JOB_TYPE_GPU_CSD:
      return true;
   default:
      return false;
   }
}

static void
cmd_buffer_serialize_job_if_needed(struct v3dv_cmd_buffer *cmd_buffer,
                                   struct v3dv_job *job)
{
   assert(cmd_buffer && job);

   if (!cmd_buffer->state.has_barrier)
      return;

   /* Serialization only affects GPU jobs, CPU jobs are always automatically
    * serialized.
    */
   if (!job_type_is_gpu(job))
      return;

   job->serialize = true;
   if (cmd_buffer->state.has_bcl_barrier &&
       (job->type == V3DV_JOB_TYPE_GPU_CL ||
        job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY)) {
      job->needs_bcl_sync = true;
   }

   cmd_buffer->state.has_barrier = false;
   cmd_buffer->state.has_bcl_barrier = false;
}

void
v3dv_job_init(struct v3dv_job *job,
              enum v3dv_job_type type,
              struct v3dv_device *device,
              struct v3dv_cmd_buffer *cmd_buffer,
              int32_t subpass_idx)
{
   assert(job);

   /* Make sure we haven't made this new job current before calling here */
   assert(!cmd_buffer || cmd_buffer->state.job != job);

   job->type = type;

   job->device = device;
   job->cmd_buffer = cmd_buffer;

   list_inithead(&job->list_link);
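
   /* Note added for clarity (not in the original source): the BCL and RCL
    * initialized below are the binning and render control lists submitted to
    * the GPU, while the indirect CL is used for data referenced from those
    * lists; CPU job types don't allocate any of them.
    */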

   if (type == V3DV_JOB_TYPE_GPU_CL ||
       type == V3DV_JOB_TYPE_GPU_CL_SECONDARY ||
       type == V3DV_JOB_TYPE_GPU_CSD) {
      job->bos =
         _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
      job->bo_count = 0;

      v3dv_cl_init(job, &job->indirect);

      if (V3D_DEBUG & V3D_DEBUG_ALWAYS_FLUSH)
         job->always_flush = true;
   }

   if (type == V3DV_JOB_TYPE_GPU_CL ||
       type == V3DV_JOB_TYPE_GPU_CL_SECONDARY) {
      v3dv_cl_init(job, &job->bcl);
      v3dv_cl_init(job, &job->rcl);
   }

   if (cmd_buffer) {
      /* Flag all state as dirty. Generally, we need to re-emit state for each
       * new job.
       *
       * FIXME: there may be some exceptions, in which case we could skip some
       * bits.
       */
      cmd_buffer->state.dirty = ~0;
      cmd_buffer->state.dirty_descriptor_stages = ~0;

      /* Honor inheritance of occlusion queries in secondaries if requested */
      if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
          cmd_buffer->state.inheritance.occlusion_query_enable) {
         cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_OCCLUSION_QUERY;
      }

      /* Keep track of the first subpass that we are recording in this new job.
       * We will use this when we emit the RCL to decide how to emit our loads
       * and stores.
       */
      if (cmd_buffer->state.pass)
         job->first_subpass = subpass_idx;

      cmd_buffer_serialize_job_if_needed(cmd_buffer, job);
   }
}

struct v3dv_job *
v3dv_cmd_buffer_start_job(struct v3dv_cmd_buffer *cmd_buffer,
                          int32_t subpass_idx,
                          enum v3dv_job_type type)
{
   /* Don't create a new job if we can merge the current subpass into
    * the current job.
    */
   if (cmd_buffer->state.pass &&
       subpass_idx != -1 &&
       cmd_buffer_can_merge_subpass(cmd_buffer, subpass_idx)) {
      cmd_buffer->state.job->is_subpass_finish = false;
      return cmd_buffer->state.job;
   }

   /* Ensure we are not starting a new job without finishing a previous one */
   if (cmd_buffer->state.job != NULL)
      v3dv_cmd_buffer_finish_job(cmd_buffer);

   assert(cmd_buffer->state.job == NULL);
   struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc,
                                    sizeof(struct v3dv_job), 8,
                                    VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);

   if (!job) {
      fprintf(stderr, "Error: failed to allocate CPU memory for job\n");
      v3dv_flag_oom(cmd_buffer, NULL);
      return NULL;
   }

   v3dv_job_init(job, type, cmd_buffer->device, cmd_buffer, subpass_idx);
   cmd_buffer->state.job = job;

   return job;
}

static VkResult
cmd_buffer_reset(struct v3dv_cmd_buffer *cmd_buffer,
                 VkCommandBufferResetFlags flags)
{
   if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_INITIALIZED) {
      struct v3dv_device *device = cmd_buffer->device;
      struct v3dv_cmd_pool *pool = cmd_buffer->pool;
      VkCommandBufferLevel level = cmd_buffer->level;

      /* cmd_buffer_init below will re-add the command buffer to the pool
       * so remove it here so we don't end up adding it again.
       */
      list_del(&cmd_buffer->pool_link);

      /* FIXME: For now we always free all resources as if
       * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT was set.
       */
      if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_NEW)
         cmd_buffer_free_resources(cmd_buffer);

      cmd_buffer_init(cmd_buffer, device, pool, level);
   }

   assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_INITIALIZED);
   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_AllocateCommandBuffers(VkDevice _device,
                            const VkCommandBufferAllocateInfo *pAllocateInfo,
                            VkCommandBuffer *pCommandBuffers)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_cmd_pool, pool, pAllocateInfo->commandPool);

   VkResult result = VK_SUCCESS;
   uint32_t i;

   for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
      result = cmd_buffer_create(device, pool, pAllocateInfo->level,
                                 &pCommandBuffers[i]);
      if (result != VK_SUCCESS)
         break;
   }

   if (result != VK_SUCCESS) {
      v3dv_FreeCommandBuffers(_device, pAllocateInfo->commandPool,
                              i, pCommandBuffers);
      for (i = 0; i < pAllocateInfo->commandBufferCount; i++)
         pCommandBuffers[i] = VK_NULL_HANDLE;
   }

   return result;
}

VKAPI_ATTR void VKAPI_CALL
v3dv_FreeCommandBuffers(VkDevice device,
                        VkCommandPool commandPool,
                        uint32_t commandBufferCount,
                        const VkCommandBuffer *pCommandBuffers)
{
   for (uint32_t i = 0; i < commandBufferCount; i++) {
      V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, pCommandBuffers[i]);

      if (!cmd_buffer)
         continue;

      cmd_buffer_destroy(cmd_buffer);
   }
}

VKAPI_ATTR void VKAPI_CALL
v3dv_DestroyCommandPool(VkDevice _device,
                        VkCommandPool commandPool,
                        const VkAllocationCallbacks *pAllocator)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_cmd_pool, pool, commandPool);

   if (!pool)
      return;

   list_for_each_entry_safe(struct v3dv_cmd_buffer, cmd_buffer,
                            &pool->cmd_buffers, pool_link) {
      cmd_buffer_destroy(cmd_buffer);
   }

   vk_object_free(&device->vk, pAllocator, pool);
}

VKAPI_ATTR void VKAPI_CALL
v3dv_TrimCommandPool(VkDevice device,
                     VkCommandPool commandPool,
                     VkCommandPoolTrimFlags flags)
{
   /* We don't need to do anything here, our command pools never hold on to
    * any resources from command buffers that are freed or reset.
    */
}

static void
cmd_buffer_subpass_handle_pending_resolves(struct v3dv_cmd_buffer *cmd_buffer)
{
   assert(cmd_buffer->state.subpass_idx < cmd_buffer->state.pass->subpass_count);
   const struct v3dv_render_pass *pass = cmd_buffer->state.pass;
   const struct v3dv_subpass *subpass =
      &pass->subpasses[cmd_buffer->state.subpass_idx];

   if (!subpass->resolve_attachments)
      return;

   struct v3dv_framebuffer *fb = cmd_buffer->state.framebuffer;

   /* At this point we have already ended the current subpass and now we are
    * about to emit vkCmdResolveImage calls to get the resolves we can't
    * handle in the subpass RCL.
    *
    * vkCmdResolveImage is not supposed to be called inside a render pass so
    * before we call that we need to make sure our command buffer state reflects
    * that we are no longer in a subpass by finishing the current job and
    * resetting the framebuffer and render pass state temporarily and then
    * restoring it after we are done with the resolves.
    */
   if (cmd_buffer->state.job)
      v3dv_cmd_buffer_finish_job(cmd_buffer);
   struct v3dv_framebuffer *restore_fb = cmd_buffer->state.framebuffer;
   struct v3dv_render_pass *restore_pass = cmd_buffer->state.pass;
   uint32_t restore_subpass_idx = cmd_buffer->state.subpass_idx;
   cmd_buffer->state.framebuffer = NULL;
   cmd_buffer->state.pass = NULL;
   cmd_buffer->state.subpass_idx = -1;

   VkCommandBuffer cmd_buffer_handle = v3dv_cmd_buffer_to_handle(cmd_buffer);
   for (uint32_t i = 0; i < subpass->color_count; i++) {
      const uint32_t src_attachment_idx =
         subpass->color_attachments[i].attachment;
      if (src_attachment_idx == VK_ATTACHMENT_UNUSED)
         continue;

      if (pass->attachments[src_attachment_idx].use_tlb_resolve)
         continue;

      const uint32_t dst_attachment_idx =
         subpass->resolve_attachments[i].attachment;
      if (dst_attachment_idx == VK_ATTACHMENT_UNUSED)
         continue;

      struct v3dv_image_view *src_iview = fb->attachments[src_attachment_idx];
      struct v3dv_image_view *dst_iview = fb->attachments[dst_attachment_idx];

      VkImageResolve2KHR region = {
         .sType = VK_STRUCTURE_TYPE_IMAGE_RESOLVE_2_KHR,
         .srcSubresource = {
            VK_IMAGE_ASPECT_COLOR_BIT,
            src_iview->base_level,
            src_iview->first_layer,
            src_iview->last_layer - src_iview->first_layer + 1,
         },
         .srcOffset = { 0, 0, 0 },
         .dstSubresource = {
            VK_IMAGE_ASPECT_COLOR_BIT,
            dst_iview->base_level,
            dst_iview->first_layer,
            dst_iview->last_layer - dst_iview->first_layer + 1,
         },
         .dstOffset = { 0, 0, 0 },
         .extent = src_iview->image->extent,
      };

      VkResolveImageInfo2KHR resolve_info = {
         .sType = VK_STRUCTURE_TYPE_RESOLVE_IMAGE_INFO_2_KHR,
         .srcImage = v3dv_image_to_handle(src_iview->image),
         .srcImageLayout = VK_IMAGE_LAYOUT_GENERAL,
         .dstImage = v3dv_image_to_handle(dst_iview->image),
         .dstImageLayout = VK_IMAGE_LAYOUT_GENERAL,
         .regionCount = 1,
         .pRegions = &region,
      };
      v3dv_CmdResolveImage2KHR(cmd_buffer_handle, &resolve_info);
   }

   cmd_buffer->state.framebuffer = restore_fb;
   cmd_buffer->state.pass = restore_pass;
   cmd_buffer->state.subpass_idx = restore_subpass_idx;
}

static VkResult
cmd_buffer_begin_render_pass_secondary(
   struct v3dv_cmd_buffer *cmd_buffer,
   const VkCommandBufferInheritanceInfo *inheritance_info)
{
   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
   assert(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT);
   assert(inheritance_info);

   cmd_buffer->state.pass =
      v3dv_render_pass_from_handle(inheritance_info->renderPass);
   assert(cmd_buffer->state.pass);

   cmd_buffer->state.framebuffer =
      v3dv_framebuffer_from_handle(inheritance_info->framebuffer);

   assert(inheritance_info->subpass < cmd_buffer->state.pass->subpass_count);
   cmd_buffer->state.subpass_idx = inheritance_info->subpass;

   cmd_buffer->state.inheritance.occlusion_query_enable =
      inheritance_info->occlusionQueryEnable;

   /* Secondaries that execute inside a render pass won't start subpasses
    * so we want to create a job for them here.
    */
   struct v3dv_job *job =
      v3dv_cmd_buffer_start_job(cmd_buffer, inheritance_info->subpass,
                                V3DV_JOB_TYPE_GPU_CL_SECONDARY);
   if (!job) {
      v3dv_flag_oom(cmd_buffer, NULL);
      return VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   /* Secondary command buffers don't know about the render area, but our
    * scissor setup accounts for it, so let's make sure we make it large
    * enough that it doesn't actually constrain any rendering. This should
    * be fine, since the Vulkan spec states:
    *
    *    "The application must ensure (using scissor if necessary) that all
    *     rendering is contained within the render area."
    *
    * FIXME: setup constants for the max framebuffer dimensions and use them
    * here and when filling in VkPhysicalDeviceLimits.
    */
   const struct v3dv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
   cmd_buffer->state.render_area.offset.x = 0;
   cmd_buffer->state.render_area.offset.y = 0;
   cmd_buffer->state.render_area.extent.width =
      framebuffer ? framebuffer->width : 4096;
   cmd_buffer->state.render_area.extent.height =
      framebuffer ? framebuffer->height : 4096;

   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer,
                        const VkCommandBufferBeginInfo *pBeginInfo)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);

   /* If this is the first vkBeginCommandBuffer, we must initialize the
    * command buffer's state. Otherwise, we must reset its state. In both
    * cases we reset it.
    */
   VkResult result = cmd_buffer_reset(cmd_buffer, 0);
   if (result != VK_SUCCESS)
      return result;

   assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_INITIALIZED);

   cmd_buffer->usage_flags = pBeginInfo->flags;

   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
      if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
         result =
            cmd_buffer_begin_render_pass_secondary(cmd_buffer,
                                                   pBeginInfo->pInheritanceInfo);
         if (result != VK_SUCCESS)
            return result;
      }
   }

   cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_RECORDING;

   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_ResetCommandBuffer(VkCommandBuffer commandBuffer,
                        VkCommandBufferResetFlags flags)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   return cmd_buffer_reset(cmd_buffer, flags);
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_ResetCommandPool(VkDevice device,
                      VkCommandPool commandPool,
                      VkCommandPoolResetFlags flags)
{
   V3DV_FROM_HANDLE(v3dv_cmd_pool, pool, commandPool);

   VkCommandBufferResetFlags reset_flags = 0;
   if (flags & VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT)
      reset_flags = VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT;
   list_for_each_entry_safe(struct v3dv_cmd_buffer, cmd_buffer,
                            &pool->cmd_buffers, pool_link) {
      cmd_buffer_reset(cmd_buffer, reset_flags);
   }

   return VK_SUCCESS;
}

static void
cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer)
{
   /* Render areas and scissor/viewport are only relevant inside render passes,
    * otherwise we are dealing with transfer operations where these elements
    * don't apply.
    */
   assert(cmd_buffer->state.pass);
   const VkRect2D *rect = &cmd_buffer->state.render_area;

   /* We should only call this at the beginning of a subpass so we should
    * always have framebuffer information available.
    */
   assert(cmd_buffer->state.framebuffer);
   cmd_buffer->state.tile_aligned_render_area =
      v3dv_subpass_area_is_tile_aligned(cmd_buffer->device, rect,
                                        cmd_buffer->state.framebuffer,
                                        cmd_buffer->state.pass,
                                        cmd_buffer->state.subpass_idx);

   if (!cmd_buffer->state.tile_aligned_render_area) {
      perf_debug("Render area for subpass %d of render pass %p doesn't "
                 "match render pass granularity.\n",
                 cmd_buffer->state.subpass_idx, cmd_buffer->state.pass);
   }
}

static void
cmd_buffer_state_set_attachment_clear_color(struct v3dv_cmd_buffer *cmd_buffer,
                                            uint32_t attachment_idx,
                                            const VkClearColorValue *color)
{
   assert(attachment_idx < cmd_buffer->state.pass->attachment_count);

   const struct v3dv_render_pass_attachment *attachment =
      &cmd_buffer->state.pass->attachments[attachment_idx];

   uint32_t internal_type, internal_bpp;
   const struct v3dv_format *format =
      v3dv_X(cmd_buffer->device, get_format)(attachment->desc.format);

   v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_output_format)
      (format->rt_type, &internal_type, &internal_bpp);

   uint32_t internal_size = 4 << internal_bpp;
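
   /* Note added for clarity (not in the original source): internal_bpp is
    * the HW enum value (assuming 0, 1 and 2 encode 32, 64 and 128 bpp), so
    * this yields 4, 8 or 16 bytes per pixel for the clear color below.
    */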

   struct v3dv_cmd_buffer_attachment_state *attachment_state =
      &cmd_buffer->state.attachments[attachment_idx];

   v3dv_X(cmd_buffer->device, get_hw_clear_color)
      (color, internal_type, internal_size, &attachment_state->clear_value.color[0]);

   attachment_state->vk_clear_value.color = *color;
}

static void
cmd_buffer_state_set_attachment_clear_depth_stencil(
   struct v3dv_cmd_buffer *cmd_buffer,
   uint32_t attachment_idx,
   bool clear_depth, bool clear_stencil,
   const VkClearDepthStencilValue *ds)
{
   struct v3dv_cmd_buffer_attachment_state *attachment_state =
      &cmd_buffer->state.attachments[attachment_idx];

   if (clear_depth)
      attachment_state->clear_value.z = ds->depth;

   if (clear_stencil)
      attachment_state->clear_value.s = ds->stencil;

   attachment_state->vk_clear_value.depthStencil = *ds;
}

static void
cmd_buffer_state_set_clear_values(struct v3dv_cmd_buffer *cmd_buffer,
                                  uint32_t count, const VkClearValue *values)
{
   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   const struct v3dv_render_pass *pass = state->pass;

   /* There could be fewer clear values than attachments in the render pass, in
    * which case we only want to process as many as we have, or there could be
    * more, in which case we want to ignore those for which we don't have a
    * corresponding attachment.
    */
   count = MIN2(count, pass->attachment_count);
   for (uint32_t i = 0; i < count; i++) {
      const struct v3dv_render_pass_attachment *attachment =
         &pass->attachments[i];

      if (attachment->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
         continue;

      VkImageAspectFlags aspects = vk_format_aspects(attachment->desc.format);
      if (aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
         cmd_buffer_state_set_attachment_clear_color(cmd_buffer, i,
                                                     &values[i].color);
      } else if (aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |
                            VK_IMAGE_ASPECT_STENCIL_BIT)) {
         cmd_buffer_state_set_attachment_clear_depth_stencil(
            cmd_buffer, i,
            aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
            aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
            &values[i].depthStencil);
      }
   }
}

static void
cmd_buffer_init_render_pass_attachment_state(struct v3dv_cmd_buffer *cmd_buffer,
                                             const VkRenderPassBeginInfo *pRenderPassBegin)
{
   cmd_buffer_state_set_clear_values(cmd_buffer,
                                     pRenderPassBegin->clearValueCount,
                                     pRenderPassBegin->pClearValues);
}

static void
cmd_buffer_ensure_render_pass_attachment_state(struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   const struct v3dv_render_pass *pass = state->pass;

   if (state->attachment_alloc_count < pass->attachment_count) {
      if (state->attachments > 0) {
         assert(state->attachment_alloc_count > 0);
         vk_free(&cmd_buffer->device->vk.alloc, state->attachments);
      }

      uint32_t size = sizeof(struct v3dv_cmd_buffer_attachment_state) *
                      pass->attachment_count;
      state->attachments = vk_zalloc(&cmd_buffer->device->vk.alloc, size, 8,
                                     VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
      if (!state->attachments) {
         v3dv_flag_oom(cmd_buffer, NULL);
         return;
      }
      state->attachment_alloc_count = pass->attachment_count;
   }

   assert(state->attachment_alloc_count >= pass->attachment_count);
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
                        const VkRenderPassBeginInfo *pRenderPassBegin,
                        VkSubpassContents contents)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_render_pass, pass, pRenderPassBegin->renderPass);
   V3DV_FROM_HANDLE(v3dv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);

   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   state->pass = pass;
   state->framebuffer = framebuffer;

   cmd_buffer_ensure_render_pass_attachment_state(cmd_buffer);
   v3dv_return_if_oom(cmd_buffer, NULL);

   cmd_buffer_init_render_pass_attachment_state(cmd_buffer, pRenderPassBegin);

   state->render_area = pRenderPassBegin->renderArea;

   /* If our render area is smaller than the current clip window we will have
    * to emit a new clip window to constrain it to the render area.
    */
   uint32_t min_render_x = state->render_area.offset.x;
   uint32_t min_render_y = state->render_area.offset.y;
   uint32_t max_render_x = min_render_x + state->render_area.extent.width - 1;
   uint32_t max_render_y = min_render_y + state->render_area.extent.height - 1;
   uint32_t min_clip_x = state->clip_window.offset.x;
   uint32_t min_clip_y = state->clip_window.offset.y;
   uint32_t max_clip_x = min_clip_x + state->clip_window.extent.width - 1;
   uint32_t max_clip_y = min_clip_y + state->clip_window.extent.height - 1;
   if (min_render_x > min_clip_x || min_render_y > min_clip_y ||
       max_render_x < max_clip_x || max_render_y < max_clip_y) {
      state->dirty |= V3DV_CMD_DIRTY_SCISSOR;
   }

   /* Setup for first subpass */
   v3dv_cmd_buffer_subpass_start(cmd_buffer, 0);
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);

   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   assert(state->subpass_idx < state->pass->subpass_count - 1);

   /* Finish the previous subpass */
   v3dv_cmd_buffer_subpass_finish(cmd_buffer);
   cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);

   /* Start the next subpass */
   v3dv_cmd_buffer_subpass_start(cmd_buffer, state->subpass_idx + 1);
}

static void
cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer)
{
   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);

   assert(cmd_buffer->state.pass);
   assert(cmd_buffer->state.subpass_idx < cmd_buffer->state.pass->subpass_count);
   const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   const struct v3dv_render_pass *pass = state->pass;
   const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];

   /* We only need to emit subpass clears as draw calls when the render
    * area is not aligned to tile boundaries or for GFXH-1461.
    */
   if (cmd_buffer->state.tile_aligned_render_area &&
       !subpass->do_depth_clear_with_draw &&
       !subpass->do_stencil_clear_with_draw) {
      return;
   }

   uint32_t att_count = 0;
   VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* 4 color + D/S */

   /* We only need to emit subpass clears as draw calls for color attachments
    * if the render area is not aligned to tile boundaries.
    */
   if (!cmd_buffer->state.tile_aligned_render_area) {
      for (uint32_t i = 0; i < subpass->color_count; i++) {
         const uint32_t att_idx = subpass->color_attachments[i].attachment;
         if (att_idx == VK_ATTACHMENT_UNUSED)
            continue;

         struct v3dv_render_pass_attachment *att = &pass->attachments[att_idx];
         if (att->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
            continue;

         if (state->subpass_idx != att->first_subpass)
            continue;

         atts[att_count].aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
         atts[att_count].colorAttachment = i;
         atts[att_count].clearValue = state->attachments[att_idx].vk_clear_value;
         att_count++;
      }
   }

   /* For D/S we may also need to emit a subpass clear for GFXH-1461 */
   const uint32_t ds_att_idx = subpass->ds_attachment.attachment;
   if (ds_att_idx != VK_ATTACHMENT_UNUSED) {
      struct v3dv_render_pass_attachment *att = &pass->attachments[ds_att_idx];
      if (state->subpass_idx == att->first_subpass) {
         VkImageAspectFlags aspects = vk_format_aspects(att->desc.format);
         if (att->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR ||
             (cmd_buffer->state.tile_aligned_render_area &&
              !subpass->do_depth_clear_with_draw)) {
            aspects &= ~VK_IMAGE_ASPECT_DEPTH_BIT;
         }
         if (att->desc.stencilLoadOp != VK_ATTACHMENT_LOAD_OP_CLEAR ||
             (cmd_buffer->state.tile_aligned_render_area &&
              !subpass->do_stencil_clear_with_draw)) {
            aspects &= ~VK_IMAGE_ASPECT_STENCIL_BIT;
         }
         if (aspects) {
            atts[att_count].aspectMask = aspects;
            atts[att_count].colorAttachment = 0; /* Ignored */
            atts[att_count].clearValue =
               state->attachments[ds_att_idx].vk_clear_value;
            att_count++;
         }
      }
   }

   if (att_count == 0)
      return;

   if (!cmd_buffer->state.tile_aligned_render_area) {
      perf_debug("Render area doesn't match render pass granularity, falling "
                 "back to vkCmdClearAttachments for "
                 "VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
   } else if (subpass->do_depth_clear_with_draw ||
              subpass->do_stencil_clear_with_draw) {
      perf_debug("Subpass clears DEPTH but loads STENCIL (or vice versa), "
                 "falling back to vkCmdClearAttachments for "
                 "VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
   }

   /* From the Vulkan 1.0 spec:
    *
    *    "VK_ATTACHMENT_LOAD_OP_CLEAR specifies that the contents within the
    *     render area will be cleared to a uniform value, which is specified
    *     when a render pass instance is begun."
    *
    * So the clear is only constrained by the render area and not by pipeline
    * state such as scissor or viewport, these are the semantics of
    * vkCmdClearAttachments as well.
    */
   VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
   VkClearRect rect = {
      .rect = state->render_area,
      .baseArrayLayer = 0,
      .layerCount = 1,
   };
   v3dv_CmdClearAttachments(_cmd_buffer, att_count, atts, 1, &rect);
}

static struct v3dv_job *
cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer,
                              uint32_t subpass_idx,
                              enum v3dv_job_type type)
{
   assert(type == V3DV_JOB_TYPE_GPU_CL ||
          type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);

   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   assert(subpass_idx < state->pass->subpass_count);

   /* Starting a new job can trigger a finish of the current one, so don't
    * change the command buffer state for the new job until we are done creating
    * the new job.
    */
   struct v3dv_job *job =
      v3dv_cmd_buffer_start_job(cmd_buffer, subpass_idx, type);
   if (!job)
      return NULL;

   state->subpass_idx = subpass_idx;

   /* If we are starting a new job we need to setup binning. We only do this
    * for V3DV_JOB_TYPE_GPU_CL jobs because V3DV_JOB_TYPE_GPU_CL_SECONDARY
    * jobs are not submitted to the GPU directly, and are instead meant to be
    * branched to from other V3DV_JOB_TYPE_GPU_CL jobs.
    */
   if (type == V3DV_JOB_TYPE_GPU_CL &&
       job->first_subpass == state->subpass_idx) {
      const struct v3dv_subpass *subpass =
         &state->pass->subpasses[state->subpass_idx];

      const struct v3dv_framebuffer *framebuffer = state->framebuffer;

      uint8_t internal_bpp;
      bool msaa;
      v3dv_X(job->device, framebuffer_compute_internal_bpp_msaa)
         (framebuffer, subpass, &internal_bpp, &msaa);

      v3dv_job_start_frame(job,
                           framebuffer->width,
                           framebuffer->height,
                           framebuffer->layers,
                           subpass->color_count,
                           internal_bpp,
                           msaa);
   }

   return job;
}

struct v3dv_job *
v3dv_cmd_buffer_subpass_start(struct v3dv_cmd_buffer *cmd_buffer,
                              uint32_t subpass_idx)
{
   assert(cmd_buffer->state.pass);
   assert(subpass_idx < cmd_buffer->state.pass->subpass_count);

   struct v3dv_job *job =
      cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
                                    V3DV_JOB_TYPE_GPU_CL);
   if (!job)
      return NULL;

   /* Check if our render area is aligned to tile boundaries. We have to do
    * this in each subpass because the subset of attachments used can change
    * and with that the tile size selected by the hardware can change too.
    */
   cmd_buffer_update_tile_alignment(cmd_buffer);

   /* If we can't use TLB clears then we need to emit draw clears for any
    * LOAD_OP_CLEAR attachments in this subpass now. We might also need to emit
    * Depth/Stencil clears if we hit GFXH-1461.
    *
    * Secondary command buffers don't start subpasses (and may not even have
    * framebuffer state), so we only care about this in primaries. The only
    * exception could be a secondary running inside a subpass that needs to
    * record a meta operation (with its own render pass) that relies on
    * attachment load clears, but we don't have any instances of that right
    * now.
    */
   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
      cmd_buffer_emit_subpass_clears(cmd_buffer);

   return job;
}

struct v3dv_job *
v3dv_cmd_buffer_subpass_resume(struct v3dv_cmd_buffer *cmd_buffer,
                               uint32_t subpass_idx)
{
   assert(cmd_buffer->state.pass);
   assert(subpass_idx < cmd_buffer->state.pass->subpass_count);

   struct v3dv_job *job;
   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
      job = cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
                                          V3DV_JOB_TYPE_GPU_CL);
   } else {
      assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
      job = cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
                                          V3DV_JOB_TYPE_GPU_CL_SECONDARY);
   }

   if (!job)
      return NULL;

   job->is_subpass_continue = true;

   return job;
}

void
v3dv_cmd_buffer_subpass_finish(struct v3dv_cmd_buffer *cmd_buffer)
{
   /* We can end up here without a job if the last command recorded into the
    * subpass already finished the job (for example a pipeline barrier). In
    * that case we don't get to set the is_subpass_finish flag, but that is
    * not required for proper behavior.
    */
   struct v3dv_job *job = cmd_buffer->state.job;
   if (job)
      job->is_subpass_finish = true;
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdEndRenderPass(VkCommandBuffer commandBuffer)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);

   /* Finalize last subpass */
   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   assert(state->subpass_idx == state->pass->subpass_count - 1);
   v3dv_cmd_buffer_subpass_finish(cmd_buffer);
   v3dv_cmd_buffer_finish_job(cmd_buffer);

   cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);

   /* We are no longer inside a render pass */
   state->framebuffer = NULL;
   state->pass = NULL;
   state->subpass_idx = -1;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_EndCommandBuffer(VkCommandBuffer commandBuffer)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);

   if (cmd_buffer->state.oom)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   /* Primaries should have ended any recording jobs by the time they hit
    * vkEndRenderPass (if we are inside a render pass). Commands outside
    * a render pass instance (for both primaries and secondaries) spawn
    * complete jobs too. So the only case where we can get here without
    * finishing a recording job is when we are recording a secondary
    * inside a render pass.
    */
   if (cmd_buffer->state.job) {
      assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
             cmd_buffer->state.pass);
      v3dv_cmd_buffer_finish_job(cmd_buffer);
   }

   cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_EXECUTABLE;

   return VK_SUCCESS;
}

static void
clone_bo_list(struct v3dv_cmd_buffer *cmd_buffer,
              struct list_head *dst,
              struct list_head *src)
{
   assert(cmd_buffer);

   list_inithead(dst);
   list_for_each_entry(struct v3dv_bo, bo, src, list_link) {
      struct v3dv_bo *clone_bo =
         vk_alloc(&cmd_buffer->device->vk.alloc, sizeof(struct v3dv_bo), 8,
                  VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
      if (!clone_bo) {
         v3dv_flag_oom(cmd_buffer, NULL);
         return;
      }

      *clone_bo = *bo;
      list_addtail(&clone_bo->list_link, dst);
   }
}

/* Clones a job for inclusion in the given command buffer. Note that this
 * doesn't make a deep copy, so the cloned job doesn't own any resources.
 * Useful when we need to have a job in more than one list, which happens
 * for jobs recorded in secondary command buffers when we want to execute
 * them in primaries.
 */
struct v3dv_job *
v3dv_job_clone_in_cmd_buffer(struct v3dv_job *job,
                             struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_job *clone_job = vk_alloc(&job->device->vk.alloc,
                                         sizeof(struct v3dv_job), 8,
                                         VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!clone_job) {
      v3dv_flag_oom(cmd_buffer, NULL);
      return NULL;
   }

   /* Cloned jobs don't duplicate resources! */
   *clone_job = *job;
   clone_job->is_clone = true;
   clone_job->cmd_buffer = cmd_buffer;
   list_addtail(&clone_job->list_link, &cmd_buffer->jobs);

   /* We need to regen the BO lists so that they point to the BO list in the
    * cloned job. Otherwise functions like list_length() will loop forever.
    */
   if (job->type == V3DV_JOB_TYPE_GPU_CL) {
      clone_bo_list(cmd_buffer, &clone_job->bcl.bo_list, &job->bcl.bo_list);
      clone_bo_list(cmd_buffer, &clone_job->rcl.bo_list, &job->rcl.bo_list);
      clone_bo_list(cmd_buffer, &clone_job->indirect.bo_list,
                    &job->indirect.bo_list);
   }

   return clone_job;
}

static void
cmd_buffer_execute_outside_pass(struct v3dv_cmd_buffer *primary,
                                uint32_t cmd_buffer_count,
                                const VkCommandBuffer *cmd_buffers)
{
   bool pending_barrier = false;
   bool pending_bcl_barrier = false;
   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]);

      assert(!(secondary->usage_flags &
               VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT));

      /* Secondary command buffers that execute outside a render pass create
       * complete jobs with an RCL and tile setup, so we simply want to merge
       * their job list into the primary's. However, because they may be
       * executed into multiple primaries at the same time and we only have a
       * single list_link in each job, we can't just add them to the primary's
       * job list and we instead have to clone them first.
       *
       * Alternatively, we could create an "execute secondary" CPU job that,
       * when executed in a queue, would submit all the jobs in the referenced
       * secondary command buffer. However, this would raise some challenges
       * to make it work with the implementation of wait threads in the queue
       * which we use for event waits, for example.
       */
      list_for_each_entry(struct v3dv_job, secondary_job,
                          &secondary->jobs, list_link) {
         /* These can only happen inside a render pass */
         assert(secondary_job->type != V3DV_JOB_TYPE_GPU_CL_SECONDARY);
         struct v3dv_job *job = v3dv_job_clone_in_cmd_buffer(secondary_job, primary);
         if (!job)
            return;

         if (pending_barrier) {
            job->serialize = true;
            if (pending_bcl_barrier)
               job->needs_bcl_sync = true;
            pending_barrier = false;
            pending_bcl_barrier = false;
         }
      }

      /* If this secondary had any pending barrier state we will need that
       * barrier state consumed with whatever comes after it (first job in
       * the next secondary or the primary, if this was the last secondary).
       */
      assert(secondary->state.has_barrier || !secondary->state.has_bcl_barrier);
      pending_barrier = secondary->state.has_barrier;
      pending_bcl_barrier = secondary->state.has_bcl_barrier;
   }

   if (pending_barrier) {
      primary->state.has_barrier = true;
      primary->state.has_bcl_barrier |= pending_bcl_barrier;
   }
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdExecuteCommands(VkCommandBuffer commandBuffer,
                        uint32_t commandBufferCount,
                        const VkCommandBuffer *pCommandBuffers)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, primary, commandBuffer);

   if (primary->state.pass != NULL) {
      v3dv_X(primary->device, cmd_buffer_execute_inside_pass)
         (primary, commandBufferCount, pCommandBuffers);
   } else {
      cmd_buffer_execute_outside_pass(primary,
                                      commandBufferCount, pCommandBuffers);
   }
}

/* This goes through the list of possible dynamic states in the pipeline and,
 * for those that are not configured as dynamic, copies relevant state into
 * the command buffer.
 */
static void
cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer,
                                      const struct v3dv_dynamic_state *src)
{
   struct v3dv_dynamic_state *dest = &cmd_buffer->state.dynamic;
   uint32_t dynamic_mask = src->mask;
   uint32_t dirty = 0;

   if (!(dynamic_mask & V3DV_DYNAMIC_VIEWPORT)) {
      dest->viewport.count = src->viewport.count;
      if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
                 src->viewport.count * sizeof(VkViewport))) {
         typed_memcpy(dest->viewport.viewports,
                      src->viewport.viewports,
                      src->viewport.count);
         typed_memcpy(dest->viewport.scale, src->viewport.scale,
                      src->viewport.count);
         typed_memcpy(dest->viewport.translate, src->viewport.translate,
                      src->viewport.count);
         dirty |= V3DV_CMD_DIRTY_VIEWPORT;
      }
   }

   if (!(dynamic_mask & V3DV_DYNAMIC_SCISSOR)) {
      dest->scissor.count = src->scissor.count;
      if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
                 src->scissor.count * sizeof(VkRect2D))) {
         typed_memcpy(dest->scissor.scissors,
                      src->scissor.scissors, src->scissor.count);
         dirty |= V3DV_CMD_DIRTY_SCISSOR;
      }
   }

   if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK)) {
      if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask,
                 sizeof(src->stencil_compare_mask))) {
         dest->stencil_compare_mask = src->stencil_compare_mask;
         dirty |= V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK;
      }
   }

   if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_WRITE_MASK)) {
      if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
                 sizeof(src->stencil_write_mask))) {
         dest->stencil_write_mask = src->stencil_write_mask;
         dirty |= V3DV_CMD_DIRTY_STENCIL_WRITE_MASK;
      }
   }

   if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_REFERENCE)) {
      if (memcmp(&dest->stencil_reference, &src->stencil_reference,
                 sizeof(src->stencil_reference))) {
         dest->stencil_reference = src->stencil_reference;
         dirty |= V3DV_CMD_DIRTY_STENCIL_REFERENCE;
      }
   }

   if (!(dynamic_mask & V3DV_DYNAMIC_BLEND_CONSTANTS)) {
      if (memcmp(dest->blend_constants, src->blend_constants,
                 sizeof(src->blend_constants))) {
         memcpy(dest->blend_constants, src->blend_constants,
                sizeof(src->blend_constants));
         dirty |= V3DV_CMD_DIRTY_BLEND_CONSTANTS;
      }
   }

   if (!(dynamic_mask & V3DV_DYNAMIC_DEPTH_BIAS)) {
      if (memcmp(&dest->depth_bias, &src->depth_bias,
                 sizeof(src->depth_bias))) {
         memcpy(&dest->depth_bias, &src->depth_bias, sizeof(src->depth_bias));
         dirty |= V3DV_CMD_DIRTY_DEPTH_BIAS;
      }
   }

   if (!(dynamic_mask & V3DV_DYNAMIC_LINE_WIDTH)) {
      if (dest->line_width != src->line_width) {
         dest->line_width = src->line_width;
         dirty |= V3DV_CMD_DIRTY_LINE_WIDTH;
      }
   }

   cmd_buffer->state.dynamic.mask = dynamic_mask;
   cmd_buffer->state.dirty |= dirty;
}

static void
bind_graphics_pipeline(struct v3dv_cmd_buffer *cmd_buffer,
                       struct v3dv_pipeline *pipeline)
{
   assert(pipeline && !(pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT));
   if (cmd_buffer->state.gfx.pipeline == pipeline)
      return;

   cmd_buffer->state.gfx.pipeline = pipeline;

   cmd_buffer_bind_pipeline_static_state(cmd_buffer, &pipeline->dynamic_state);

   cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PIPELINE;
}

static void
bind_compute_pipeline(struct v3dv_cmd_buffer *cmd_buffer,
                      struct v3dv_pipeline *pipeline)
{
   assert(pipeline && pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);

   if (cmd_buffer->state.compute.pipeline == pipeline)
      return;

   cmd_buffer->state.compute.pipeline = pipeline;
   cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_COMPUTE_PIPELINE;
}

commandBuffer,1871VkPipelineBindPoint pipelineBindPoint,1872VkPipeline _pipeline)1873{1874V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);1875V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, _pipeline);18761877switch (pipelineBindPoint) {1878case VK_PIPELINE_BIND_POINT_COMPUTE:1879bind_compute_pipeline(cmd_buffer, pipeline);1880break;18811882case VK_PIPELINE_BIND_POINT_GRAPHICS:1883bind_graphics_pipeline(cmd_buffer, pipeline);1884break;18851886default:1887assert(!"invalid bind point");1888break;1889}1890}18911892/* FIXME: C&P from radv. tu has similar code. Perhaps common place? */1893void1894v3dv_viewport_compute_xform(const VkViewport *viewport,1895float scale[3],1896float translate[3])1897{1898float x = viewport->x;1899float y = viewport->y;1900float half_width = 0.5f * viewport->width;1901float half_height = 0.5f * viewport->height;1902double n = viewport->minDepth;1903double f = viewport->maxDepth;19041905scale[0] = half_width;1906translate[0] = half_width + x;1907scale[1] = half_height;1908translate[1] = half_height + y;19091910scale[2] = (f - n);1911translate[2] = n;19121913/* It seems that if the scale is small enough the hardware won't clip1914* correctly so we work around this my choosing the smallest scale that1915* seems to work.1916*1917* This case is exercised by CTS:1918* dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_deltazero1919*/1920const float min_abs_scale = 0.000009f;1921if (fabs(scale[2]) < min_abs_scale)1922scale[2] = min_abs_scale * (scale[2] < 0 ? -1.0f : 1.0f);1923}19241925VKAPI_ATTR void VKAPI_CALL1926v3dv_CmdSetViewport(VkCommandBuffer commandBuffer,1927uint32_t firstViewport,1928uint32_t viewportCount,1929const VkViewport *pViewports)1930{1931V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);1932struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;1933const uint32_t total_count = firstViewport + viewportCount;19341935assert(firstViewport < MAX_VIEWPORTS);1936assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);19371938if (state->dynamic.viewport.count < total_count)1939state->dynamic.viewport.count = total_count;19401941if (!memcmp(state->dynamic.viewport.viewports + firstViewport,1942pViewports, viewportCount * sizeof(*pViewports))) {1943return;1944}19451946memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports,1947viewportCount * sizeof(*pViewports));19481949for (uint32_t i = firstViewport; i < total_count; i++) {1950v3dv_viewport_compute_xform(&state->dynamic.viewport.viewports[i],1951state->dynamic.viewport.scale[i],1952state->dynamic.viewport.translate[i]);1953}19541955cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEWPORT;1956}19571958VKAPI_ATTR void VKAPI_CALL1959v3dv_CmdSetScissor(VkCommandBuffer commandBuffer,1960uint32_t firstScissor,1961uint32_t scissorCount,1962const VkRect2D *pScissors)1963{1964V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);1965struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;19661967assert(firstScissor < MAX_SCISSORS);1968assert(firstScissor + scissorCount >= 1 &&1969firstScissor + scissorCount <= MAX_SCISSORS);19701971if (state->dynamic.scissor.count < firstScissor + scissorCount)1972state->dynamic.scissor.count = firstScissor + scissorCount;19731974if (!memcmp(state->dynamic.scissor.scissors + firstScissor,1975pScissors, scissorCount * sizeof(*pScissors))) {1976return;1977}19781979memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,1980scissorCount * sizeof(*pScissors));19811982cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_SCISSOR;1983}19841985static 
void
emit_scissor(struct v3dv_cmd_buffer *cmd_buffer)
{
   if (cmd_buffer->state.dynamic.viewport.count == 0)
      return;

   struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;

   /* FIXME: right now we only support one viewport. viewports[0] would work
    * now, but would need to change if we allow multiple viewports.
    */
   float *vptranslate = dynamic->viewport.translate[0];
   float *vpscale = dynamic->viewport.scale[0];

   float vp_minx = -fabsf(vpscale[0]) + vptranslate[0];
   float vp_maxx = fabsf(vpscale[0]) + vptranslate[0];
   float vp_miny = -fabsf(vpscale[1]) + vptranslate[1];
   float vp_maxy = fabsf(vpscale[1]) + vptranslate[1];

   /* Quoting from v3dx_emit:
    * "Clip to the scissor if it's enabled, but still clip to the
    *  drawable regardless since that controls where the binner
    *  tries to put things.
    *
    *  Additionally, always clip the rendering to the viewport,
    *  since the hardware does guardband clipping, meaning
    *  primitives would rasterize outside of the view volume."
    */
   uint32_t minx, miny, maxx, maxy;

   /* From the Vulkan spec:
    *
    * "The application must ensure (using scissor if necessary) that all
    *  rendering is contained within the render area. The render area must be
    *  contained within the framebuffer dimensions."
    *
    * So it is the application's responsibility to ensure this. Still, we can
    * help by automatically restricting the scissor rect to the render area.
    */
   minx = MAX2(vp_minx, cmd_buffer->state.render_area.offset.x);
   miny = MAX2(vp_miny, cmd_buffer->state.render_area.offset.y);
   maxx = MIN2(vp_maxx, cmd_buffer->state.render_area.offset.x +
                        cmd_buffer->state.render_area.extent.width);
   maxy = MIN2(vp_maxy, cmd_buffer->state.render_area.offset.y +
                        cmd_buffer->state.render_area.extent.height);

   minx = vp_minx;
   miny = vp_miny;
   maxx = vp_maxx;
   maxy = vp_maxy;

   /* Clip against user provided scissor if needed.
    *
    * FIXME: right now we only allow one scissor. 
Below would need to be2039* updated if we support more2040*/2041if (dynamic->scissor.count > 0) {2042VkRect2D *scissor = &dynamic->scissor.scissors[0];2043minx = MAX2(minx, scissor->offset.x);2044miny = MAX2(miny, scissor->offset.y);2045maxx = MIN2(maxx, scissor->offset.x + scissor->extent.width);2046maxy = MIN2(maxy, scissor->offset.y + scissor->extent.height);2047}20482049/* If the scissor is outside the viewport area we end up with2050* min{x,y} > max{x,y}.2051*/2052if (minx > maxx)2053maxx = minx;2054if (miny > maxy)2055maxy = miny;20562057cmd_buffer->state.clip_window.offset.x = minx;2058cmd_buffer->state.clip_window.offset.y = miny;2059cmd_buffer->state.clip_window.extent.width = maxx - minx;2060cmd_buffer->state.clip_window.extent.height = maxy - miny;20612062v3dv_X(cmd_buffer->device, job_emit_clip_window)2063(cmd_buffer->state.job, &cmd_buffer->state.clip_window);20642065cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_SCISSOR;2066}20672068static void2069update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer,2070uint32_t dirty_uniform_state)2071{2072/* We need to update uniform streams if any piece of state that is passed2073* to the shader as a uniform may have changed.2074*2075* If only descriptor sets are dirty then we can safely ignore updates2076* for shader stages that don't access descriptors.2077*/20782079struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;2080assert(pipeline);20812082const bool has_new_pipeline = dirty_uniform_state & V3DV_CMD_DIRTY_PIPELINE;2083const bool has_new_viewport = dirty_uniform_state & V3DV_CMD_DIRTY_VIEWPORT;2084const bool has_new_push_constants = dirty_uniform_state & V3DV_CMD_DIRTY_PUSH_CONSTANTS;2085const bool has_new_descriptors = dirty_uniform_state & V3DV_CMD_DIRTY_DESCRIPTOR_SETS;20862087/* VK_SHADER_STAGE_FRAGMENT_BIT */2088const bool has_new_descriptors_fs =2089has_new_descriptors &&2090(cmd_buffer->state.dirty_descriptor_stages & VK_SHADER_STAGE_FRAGMENT_BIT);20912092const bool has_new_push_constants_fs =2093has_new_push_constants &&2094(cmd_buffer->state.dirty_push_constants_stages & VK_SHADER_STAGE_FRAGMENT_BIT);20952096const bool needs_fs_update = has_new_pipeline ||2097has_new_push_constants_fs ||2098has_new_descriptors_fs;20992100if (needs_fs_update) {2101struct v3dv_shader_variant *fs_variant =2102pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];21032104cmd_buffer->state.uniforms.fs =2105v3dv_write_uniforms(cmd_buffer, pipeline, fs_variant);2106}21072108/* VK_SHADER_STAGE_GEOMETRY_BIT */2109if (pipeline->has_gs) {2110const bool has_new_descriptors_gs =2111has_new_descriptors &&2112(cmd_buffer->state.dirty_descriptor_stages &2113VK_SHADER_STAGE_GEOMETRY_BIT);21142115const bool has_new_push_constants_gs =2116has_new_push_constants &&2117(cmd_buffer->state.dirty_push_constants_stages &2118VK_SHADER_STAGE_GEOMETRY_BIT);21192120const bool needs_gs_update = has_new_viewport ||2121has_new_pipeline ||2122has_new_push_constants_gs ||2123has_new_descriptors_gs;21242125if (needs_gs_update) {2126struct v3dv_shader_variant *gs_variant =2127pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY];21282129struct v3dv_shader_variant *gs_bin_variant =2130pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN];21312132cmd_buffer->state.uniforms.gs =2133v3dv_write_uniforms(cmd_buffer, pipeline, gs_variant);21342135cmd_buffer->state.uniforms.gs_bin =2136v3dv_write_uniforms(cmd_buffer, pipeline, gs_bin_variant);2137}2138}21392140/* VK_SHADER_STAGE_VERTEX_BIT */2141const bool has_new_descriptors_vs =2142has_new_descriptors 
&&
      (cmd_buffer->state.dirty_descriptor_stages & VK_SHADER_STAGE_VERTEX_BIT);

   const bool has_new_push_constants_vs =
      has_new_push_constants &&
      (cmd_buffer->state.dirty_push_constants_stages & VK_SHADER_STAGE_VERTEX_BIT);

   const bool needs_vs_update = has_new_viewport ||
                                has_new_pipeline ||
                                has_new_push_constants_vs ||
                                has_new_descriptors_vs;

   if (needs_vs_update) {
      struct v3dv_shader_variant *vs_variant =
         pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX];

      struct v3dv_shader_variant *vs_bin_variant =
         pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN];

      cmd_buffer->state.uniforms.vs =
         v3dv_write_uniforms(cmd_buffer, pipeline, vs_variant);

      cmd_buffer->state.uniforms.vs_bin =
         v3dv_write_uniforms(cmd_buffer, pipeline, vs_bin_variant);
   }
}

/* This stores command buffer state that we might be about to stomp for
 * a meta operation.
 */
void
v3dv_cmd_buffer_meta_state_push(struct v3dv_cmd_buffer *cmd_buffer,
                                bool push_descriptor_state)
{
   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;

   if (state->subpass_idx != -1) {
      state->meta.subpass_idx = state->subpass_idx;
      state->meta.framebuffer = v3dv_framebuffer_to_handle(state->framebuffer);
      state->meta.pass = v3dv_render_pass_to_handle(state->pass);

      const uint32_t attachment_state_item_size =
         sizeof(struct v3dv_cmd_buffer_attachment_state);
      const uint32_t attachment_state_total_size =
         attachment_state_item_size * state->attachment_alloc_count;
      if (state->meta.attachment_alloc_count < state->attachment_alloc_count) {
         if (state->meta.attachment_alloc_count > 0)
            vk_free(&cmd_buffer->device->vk.alloc, state->meta.attachments);

         state->meta.attachments = vk_zalloc(&cmd_buffer->device->vk.alloc,
                                             attachment_state_total_size, 8,
                                             VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
         if (!state->meta.attachments) {
            v3dv_flag_oom(cmd_buffer, NULL);
            return;
         }
         state->meta.attachment_alloc_count = state->attachment_alloc_count;
      }
      state->meta.attachment_count = state->attachment_alloc_count;
      memcpy(state->meta.attachments, state->attachments,
             attachment_state_total_size);

      state->meta.tile_aligned_render_area = state->tile_aligned_render_area;
      memcpy(&state->meta.render_area, &state->render_area, sizeof(VkRect2D));
   }

   /* We expect meta operations to be graphics-only, so we only take into
    * account the graphics pipeline and the graphics state.
    */
   state->meta.gfx.pipeline = state->gfx.pipeline;
   memcpy(&state->meta.dynamic, &state->dynamic, sizeof(state->dynamic));

   struct v3dv_descriptor_state *gfx_descriptor_state =
      &cmd_buffer->state.gfx.descriptor_state;

   if (push_descriptor_state) {
      if (gfx_descriptor_state->valid != 0) {
         memcpy(&state->meta.gfx.descriptor_state, gfx_descriptor_state,
                sizeof(state->gfx.descriptor_state));
      }
      state->meta.has_descriptor_state = true;
   } else {
      state->meta.has_descriptor_state = false;
   }

   /* FIXME: if we kept track of whether we have bound any push constant
    * state at all we could restrict this to the cases where it is actually
    * necessary.
    */
   memcpy(state->meta.push_constants, cmd_buffer->push_constants_data,
          sizeof(state->meta.push_constants));
}

/* This restores command buffer state after a meta operation.
 */
void
v3dv_cmd_buffer_meta_state_pop(struct v3dv_cmd_buffer *cmd_buffer,
                               uint32_t dirty_dynamic_state,
                               bool 
needs_subpass_resume)
{
   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;

   if (state->meta.subpass_idx != -1) {
      state->pass = v3dv_render_pass_from_handle(state->meta.pass);
      state->framebuffer = v3dv_framebuffer_from_handle(state->meta.framebuffer);

      assert(state->meta.attachment_count <= state->attachment_alloc_count);
      const uint32_t attachment_state_item_size =
         sizeof(struct v3dv_cmd_buffer_attachment_state);
      const uint32_t attachment_state_total_size =
         attachment_state_item_size * state->meta.attachment_count;
      memcpy(state->attachments, state->meta.attachments,
             attachment_state_total_size);

      state->tile_aligned_render_area = state->meta.tile_aligned_render_area;
      memcpy(&state->render_area, &state->meta.render_area, sizeof(VkRect2D));

      /* If needs_subpass_resume is true it means that we emitted the meta
       * operation in its own job (possibly with an RT config that is
       * incompatible with the current subpass), so resuming subpass execution
       * after it requires that we create a new job with the subpass RT setup.
       */
      if (needs_subpass_resume)
         v3dv_cmd_buffer_subpass_resume(cmd_buffer, state->meta.subpass_idx);
   } else {
      state->subpass_idx = -1;
   }

   if (state->meta.gfx.pipeline != NULL) {
      struct v3dv_pipeline *pipeline = state->meta.gfx.pipeline;
      VkPipelineBindPoint pipeline_binding =
         v3dv_pipeline_get_binding_point(pipeline);
      v3dv_CmdBindPipeline(v3dv_cmd_buffer_to_handle(cmd_buffer),
                           pipeline_binding,
                           v3dv_pipeline_to_handle(state->meta.gfx.pipeline));
   } else {
      state->gfx.pipeline = NULL;
   }

   if (dirty_dynamic_state) {
      memcpy(&state->dynamic, &state->meta.dynamic, sizeof(state->dynamic));
      state->dirty |= dirty_dynamic_state;
   }

   if (state->meta.has_descriptor_state) {
      if (state->meta.gfx.descriptor_state.valid != 0) {
         memcpy(&state->gfx.descriptor_state, &state->meta.gfx.descriptor_state,
                sizeof(state->gfx.descriptor_state));
      } else {
         state->gfx.descriptor_state.valid = 0;
      }
   }

   memcpy(cmd_buffer->push_constants_data, state->meta.push_constants,
          sizeof(state->meta.push_constants));

   state->meta.gfx.pipeline = NULL;
   state->meta.framebuffer = VK_NULL_HANDLE;
   state->meta.pass = VK_NULL_HANDLE;
   state->meta.subpass_idx = -1;
   state->meta.has_descriptor_state = false;
}

static struct v3dv_job *
cmd_buffer_pre_draw_split_job(struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_job *job = cmd_buffer->state.job;
   assert(job);

   /* If the job has been flagged with 'always_flush' and it has already
    * recorded any draw calls then we need to start a new job for it.
    */
   if (job->always_flush && job->draw_count > 0) {
      assert(cmd_buffer->state.pass);
      /* First, flag the current job as not being the last in the
       * current subpass.
       */
      job->is_subpass_finish = false;

      /* Now start a new job in the same subpass and flag it as continuing
       * the current subpass.
       */
      job = v3dv_cmd_buffer_subpass_resume(cmd_buffer,
                                           cmd_buffer->state.subpass_idx);
      assert(job->draw_count == 0);

      /* Inherit the 'always flush' behavior */
      job->always_flush = true;
   }

   assert(job->draw_count == 0 || !job->always_flush);
   return job;
}

/**
 * The Vulkan spec states:
 *
 *   "It is legal for a subpass to use no color or depth/stencil
 *    attachments (...) 
This kind of subpass can use shader side effects such2341* as image stores and atomics to produce an output. In this case, the2342* subpass continues to use the width, height, and layers of the framebuffer2343* to define the dimensions of the rendering area, and the2344* rasterizationSamples from each pipeline’s2345* VkPipelineMultisampleStateCreateInfo to define the number of samples used2346* in rasterization."2347*2348* We need to enable MSAA in the TILE_BINNING_MODE_CFG packet, which we2349* emit when we start a new frame at the begining of a subpass. At that point,2350* if the framebuffer doesn't have any attachments we won't enable MSAA and2351* the job won't be valid in the scenario described by the spec.2352*2353* This function is intended to be called before a draw call and will test if2354* we are in that scenario, in which case, it will restart the current job2355* with MSAA enabled.2356*/2357static void2358cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer)2359{2360assert(cmd_buffer->state.job);23612362/* We don't support variableMultisampleRate so we know that all pipelines2363* bound in the same subpass must have matching number of samples, so we2364* can do this check only on the first draw call.2365*/2366if (cmd_buffer->state.job->draw_count > 0)2367return;23682369/* We only need to restart the frame if the pipeline requires MSAA but2370* our frame tiling didn't enable it.2371*/2372if (!cmd_buffer->state.gfx.pipeline->msaa ||2373cmd_buffer->state.job->frame_tiling.msaa) {2374return;2375}23762377/* FIXME: Secondary command buffers don't start frames. Instead, they are2378* recorded into primary jobs that start them. For secondaries, we should2379* still handle this scenario, but we should do that when we record them2380* into primaries by testing if any of the secondaries has multisampled2381* draw calls in them, and then using that info to decide if we need to2382* restart the primary job into which they are being recorded.2383*/2384if (cmd_buffer->level != VK_COMMAND_BUFFER_LEVEL_PRIMARY)2385return;23862387/* Drop the current job and restart it with MSAA enabled */2388struct v3dv_job *old_job = cmd_buffer->state.job;2389cmd_buffer->state.job = NULL;23902391struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc,2392sizeof(struct v3dv_job), 8,2393VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);2394if (!job) {2395v3dv_flag_oom(cmd_buffer, NULL);2396return;2397}23982399v3dv_job_init(job, V3DV_JOB_TYPE_GPU_CL, cmd_buffer->device, cmd_buffer,2400cmd_buffer->state.subpass_idx);2401cmd_buffer->state.job = job;24022403v3dv_job_start_frame(job,2404old_job->frame_tiling.width,2405old_job->frame_tiling.height,2406old_job->frame_tiling.layers,2407old_job->frame_tiling.render_target_count,2408old_job->frame_tiling.internal_bpp,2409true /* msaa */);24102411v3dv_job_destroy(old_job);2412}24132414void2415v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer)2416{2417assert(cmd_buffer->state.gfx.pipeline);2418assert(!(cmd_buffer->state.gfx.pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT));24192420/* If we emitted a pipeline barrier right before this draw we won't have2421* an active job. 
In that case, create a new job continuing the current
    * subpass.
    */
   if (!cmd_buffer->state.job) {
      v3dv_cmd_buffer_subpass_resume(cmd_buffer,
                                     cmd_buffer->state.subpass_idx);
   }

   /* Restart single sample job for MSAA pipeline if needed */
   cmd_buffer_restart_job_for_msaa_if_needed(cmd_buffer);

   /* If the job is configured to flush on every draw call we need to create
    * a new job now.
    */
   struct v3dv_job *job = cmd_buffer_pre_draw_split_job(cmd_buffer);
   job->draw_count++;

   /* GL shader state binds shaders, uniform and vertex attribute state. The
    * compiler injects uniforms to handle some descriptor types (such as
    * textures), so we need to regenerate that when descriptor state changes.
    *
    * We also need to emit new shader state if we have a dirty viewport,
    * since that will require that we emit new uniform state for
    * QUNIFORM_VIEWPORT_*.
    */
   uint32_t *dirty = &cmd_buffer->state.dirty;

   const uint32_t dirty_uniform_state =
      *dirty & (V3DV_CMD_DIRTY_PIPELINE |
                V3DV_CMD_DIRTY_PUSH_CONSTANTS |
                V3DV_CMD_DIRTY_DESCRIPTOR_SETS |
                V3DV_CMD_DIRTY_VIEWPORT);

   if (dirty_uniform_state)
      update_gfx_uniform_state(cmd_buffer, dirty_uniform_state);

   struct v3dv_device *device = cmd_buffer->device;

   if (dirty_uniform_state || (*dirty & V3DV_CMD_DIRTY_VERTEX_BUFFER))
      v3dv_X(device, cmd_buffer_emit_gl_shader_state)(cmd_buffer);

   if (*dirty & (V3DV_CMD_DIRTY_PIPELINE)) {
      v3dv_X(device, cmd_buffer_emit_configuration_bits)(cmd_buffer);
      v3dv_X(device, cmd_buffer_emit_varyings_state)(cmd_buffer);
   }

   if (*dirty & (V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR)) {
      emit_scissor(cmd_buffer);
   }

   if (*dirty & V3DV_CMD_DIRTY_VIEWPORT) {
      v3dv_X(device, cmd_buffer_emit_viewport)(cmd_buffer);
   }

   if (*dirty & V3DV_CMD_DIRTY_INDEX_BUFFER)
      v3dv_X(device, cmd_buffer_emit_index_buffer)(cmd_buffer);

   const uint32_t dynamic_stencil_dirty_flags =
      V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK |
      V3DV_CMD_DIRTY_STENCIL_WRITE_MASK |
      V3DV_CMD_DIRTY_STENCIL_REFERENCE;
   if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | dynamic_stencil_dirty_flags))
      v3dv_X(device, cmd_buffer_emit_stencil)(cmd_buffer);

   if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_DEPTH_BIAS))
      v3dv_X(device, cmd_buffer_emit_depth_bias)(cmd_buffer);

   if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_BLEND_CONSTANTS))
      v3dv_X(device, cmd_buffer_emit_blend)(cmd_buffer);

   if (*dirty & V3DV_CMD_DIRTY_OCCLUSION_QUERY)
      v3dv_X(device, cmd_buffer_emit_occlusion_query)(cmd_buffer);

   if (*dirty & V3DV_CMD_DIRTY_LINE_WIDTH)
      v3dv_X(device, cmd_buffer_emit_line_width)(cmd_buffer);

   if (*dirty & V3DV_CMD_DIRTY_PIPELINE)
      v3dv_X(device, cmd_buffer_emit_sample_state)(cmd_buffer);

   cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_PIPELINE;
}

static void
cmd_buffer_draw(struct v3dv_cmd_buffer *cmd_buffer,
                struct v3dv_draw_info *info)
{
   v3dv_cmd_buffer_emit_pre_draw(cmd_buffer);
   v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw)(cmd_buffer, info);
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdDraw(VkCommandBuffer commandBuffer,
             uint32_t vertexCount,
             uint32_t instanceCount,
             uint32_t firstVertex,
             uint32_t firstInstance)
{
   if (vertexCount == 0 || instanceCount == 0)
      return;

   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   struct v3dv_draw_info info = {};
   info.vertex_count = vertexCount;
   info.instance_count = 
instanceCount;2524info.first_instance = firstInstance;2525info.first_vertex = firstVertex;25262527cmd_buffer_draw(cmd_buffer, &info);2528}25292530VKAPI_ATTR void VKAPI_CALL2531v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer,2532uint32_t indexCount,2533uint32_t instanceCount,2534uint32_t firstIndex,2535int32_t vertexOffset,2536uint32_t firstInstance)2537{2538if (indexCount == 0 || instanceCount == 0)2539return;25402541V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);25422543v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed)2544(cmd_buffer, indexCount, instanceCount,2545firstIndex, vertexOffset, firstInstance);2546}25472548VKAPI_ATTR void VKAPI_CALL2549v3dv_CmdDrawIndirect(VkCommandBuffer commandBuffer,2550VkBuffer _buffer,2551VkDeviceSize offset,2552uint32_t drawCount,2553uint32_t stride)2554{2555/* drawCount is the number of draws to execute, and can be zero. */2556if (drawCount == 0)2557return;25582559V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);2560V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer);25612562v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indirect)2563(cmd_buffer, buffer, offset, drawCount, stride);2564}25652566VKAPI_ATTR void VKAPI_CALL2567v3dv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,2568VkBuffer _buffer,2569VkDeviceSize offset,2570uint32_t drawCount,2571uint32_t stride)2572{2573/* drawCount is the number of draws to execute, and can be zero. */2574if (drawCount == 0)2575return;25762577V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);2578V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer);25792580v3dv_X(cmd_buffer->device, cmd_buffer_emit_indexed_indirect)2581(cmd_buffer, buffer, offset, drawCount, stride);2582}25832584VKAPI_ATTR void VKAPI_CALL2585v3dv_CmdPipelineBarrier(VkCommandBuffer commandBuffer,2586VkPipelineStageFlags srcStageMask,2587VkPipelineStageFlags dstStageMask,2588VkDependencyFlags dependencyFlags,2589uint32_t memoryBarrierCount,2590const VkMemoryBarrier *pMemoryBarriers,2591uint32_t bufferBarrierCount,2592const VkBufferMemoryBarrier *pBufferBarriers,2593uint32_t imageBarrierCount,2594const VkImageMemoryBarrier *pImageBarriers)2595{2596V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);25972598/* We only care about barriers between GPU jobs */2599if (srcStageMask == VK_PIPELINE_STAGE_HOST_BIT ||2600dstStageMask == VK_PIPELINE_STAGE_HOST_BIT) {2601return;2602}26032604/* If we have a recording job, finish it here */2605struct v3dv_job *job = cmd_buffer->state.job;2606if (job)2607v3dv_cmd_buffer_finish_job(cmd_buffer);26082609cmd_buffer->state.has_barrier = true;2610if (dstStageMask & (VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |2611VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |2612VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |2613VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |2614VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |2615VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT)) {2616cmd_buffer->state.has_bcl_barrier = true;2617}2618}26192620VKAPI_ATTR void VKAPI_CALL2621v3dv_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,2622uint32_t firstBinding,2623uint32_t bindingCount,2624const VkBuffer *pBuffers,2625const VkDeviceSize *pOffsets)2626{2627V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);2628struct v3dv_vertex_binding *vb = cmd_buffer->state.vertex_bindings;26292630/* We have to defer setting up vertex buffer since we need the buffer2631* stride from the pipeline.2632*/26332634assert(firstBinding + bindingCount <= MAX_VBS);2635bool vb_state_changed = false;2636for (uint32_t i = 0; i < bindingCount; i++) 
{2637if (vb[firstBinding + i].buffer != v3dv_buffer_from_handle(pBuffers[i])) {2638vb[firstBinding + i].buffer = v3dv_buffer_from_handle(pBuffers[i]);2639vb_state_changed = true;2640}2641if (vb[firstBinding + i].offset != pOffsets[i]) {2642vb[firstBinding + i].offset = pOffsets[i];2643vb_state_changed = true;2644}2645}26462647if (vb_state_changed)2648cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VERTEX_BUFFER;2649}26502651static uint32_t2652get_index_size(VkIndexType index_type)2653{2654switch (index_type) {2655case VK_INDEX_TYPE_UINT8_EXT:2656return 1;2657break;2658case VK_INDEX_TYPE_UINT16:2659return 2;2660break;2661case VK_INDEX_TYPE_UINT32:2662return 4;2663break;2664default:2665unreachable("Unsupported index type");2666}2667}26682669VKAPI_ATTR void VKAPI_CALL2670v3dv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,2671VkBuffer buffer,2672VkDeviceSize offset,2673VkIndexType indexType)2674{2675V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);26762677const uint32_t index_size = get_index_size(indexType);2678if (buffer == cmd_buffer->state.index_buffer.buffer &&2679offset == cmd_buffer->state.index_buffer.offset &&2680index_size == cmd_buffer->state.index_buffer.index_size) {2681return;2682}26832684cmd_buffer->state.index_buffer.buffer = buffer;2685cmd_buffer->state.index_buffer.offset = offset;2686cmd_buffer->state.index_buffer.index_size = index_size;2687cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_INDEX_BUFFER;2688}26892690VKAPI_ATTR void VKAPI_CALL2691v3dv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,2692VkStencilFaceFlags faceMask,2693uint32_t compareMask)2694{2695V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);26962697if (faceMask & VK_STENCIL_FACE_FRONT_BIT)2698cmd_buffer->state.dynamic.stencil_compare_mask.front = compareMask & 0xff;2699if (faceMask & VK_STENCIL_FACE_BACK_BIT)2700cmd_buffer->state.dynamic.stencil_compare_mask.back = compareMask & 0xff;27012702cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK;2703}27042705VKAPI_ATTR void VKAPI_CALL2706v3dv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,2707VkStencilFaceFlags faceMask,2708uint32_t writeMask)2709{2710V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);27112712if (faceMask & VK_STENCIL_FACE_FRONT_BIT)2713cmd_buffer->state.dynamic.stencil_write_mask.front = writeMask & 0xff;2714if (faceMask & VK_STENCIL_FACE_BACK_BIT)2715cmd_buffer->state.dynamic.stencil_write_mask.back = writeMask & 0xff;27162717cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_WRITE_MASK;2718}27192720VKAPI_ATTR void VKAPI_CALL2721v3dv_CmdSetStencilReference(VkCommandBuffer commandBuffer,2722VkStencilFaceFlags faceMask,2723uint32_t reference)2724{2725V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);27262727if (faceMask & VK_STENCIL_FACE_FRONT_BIT)2728cmd_buffer->state.dynamic.stencil_reference.front = reference & 0xff;2729if (faceMask & VK_STENCIL_FACE_BACK_BIT)2730cmd_buffer->state.dynamic.stencil_reference.back = reference & 0xff;27312732cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_REFERENCE;2733}27342735VKAPI_ATTR void VKAPI_CALL2736v3dv_CmdSetDepthBias(VkCommandBuffer commandBuffer,2737float depthBiasConstantFactor,2738float depthBiasClamp,2739float depthBiasSlopeFactor)2740{2741V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);27422743cmd_buffer->state.dynamic.depth_bias.constant_factor = depthBiasConstantFactor;2744cmd_buffer->state.dynamic.depth_bias.depth_bias_clamp = depthBiasClamp;2745cmd_buffer->state.dynamic.depth_bias.slope_factor = 
depthBiasSlopeFactor;2746cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DEPTH_BIAS;2747}27482749VKAPI_ATTR void VKAPI_CALL2750v3dv_CmdSetDepthBounds(VkCommandBuffer commandBuffer,2751float minDepthBounds,2752float maxDepthBounds)2753{2754/* We do not support depth bounds testing so we just ingore this. We are2755* already asserting that pipelines don't enable the feature anyway.2756*/2757}27582759VKAPI_ATTR void VKAPI_CALL2760v3dv_CmdSetLineWidth(VkCommandBuffer commandBuffer,2761float lineWidth)2762{2763V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);27642765cmd_buffer->state.dynamic.line_width = lineWidth;2766cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_LINE_WIDTH;2767}27682769VKAPI_ATTR void VKAPI_CALL2770v3dv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,2771VkPipelineBindPoint pipelineBindPoint,2772VkPipelineLayout _layout,2773uint32_t firstSet,2774uint32_t descriptorSetCount,2775const VkDescriptorSet *pDescriptorSets,2776uint32_t dynamicOffsetCount,2777const uint32_t *pDynamicOffsets)2778{2779V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);2780V3DV_FROM_HANDLE(v3dv_pipeline_layout, layout, _layout);27812782uint32_t dyn_index = 0;27832784assert(firstSet + descriptorSetCount <= MAX_SETS);27852786struct v3dv_descriptor_state *descriptor_state =2787pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE ?2788&cmd_buffer->state.compute.descriptor_state :2789&cmd_buffer->state.gfx.descriptor_state;27902791VkShaderStageFlags dirty_stages = 0;2792bool descriptor_state_changed = false;2793for (uint32_t i = 0; i < descriptorSetCount; i++) {2794V3DV_FROM_HANDLE(v3dv_descriptor_set, set, pDescriptorSets[i]);2795uint32_t index = firstSet + i;27962797descriptor_state->valid |= (1u << index);2798if (descriptor_state->descriptor_sets[index] != set) {2799descriptor_state->descriptor_sets[index] = set;2800dirty_stages |= set->layout->shader_stages;2801descriptor_state_changed = true;2802}28032804for (uint32_t j = 0; j < set->layout->dynamic_offset_count; j++, dyn_index++) {2805uint32_t idx = j + layout->set[i + firstSet].dynamic_offset_start;28062807if (descriptor_state->dynamic_offsets[idx] != pDynamicOffsets[dyn_index]) {2808descriptor_state->dynamic_offsets[idx] = pDynamicOffsets[dyn_index];2809dirty_stages |= set->layout->shader_stages;2810descriptor_state_changed = true;2811}2812}2813}28142815if (descriptor_state_changed) {2816if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {2817cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DESCRIPTOR_SETS;2818cmd_buffer->state.dirty_descriptor_stages |= dirty_stages & VK_SHADER_STAGE_ALL_GRAPHICS;2819} else {2820cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS;2821cmd_buffer->state.dirty_descriptor_stages |= VK_SHADER_STAGE_COMPUTE_BIT;2822}2823}2824}28252826VKAPI_ATTR void VKAPI_CALL2827v3dv_CmdPushConstants(VkCommandBuffer commandBuffer,2828VkPipelineLayout layout,2829VkShaderStageFlags stageFlags,2830uint32_t offset,2831uint32_t size,2832const void *pValues)2833{2834V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);28352836if (!memcmp((uint8_t *) cmd_buffer->push_constants_data + offset, pValues, size))2837return;28382839memcpy((uint8_t *) cmd_buffer->push_constants_data + offset, pValues, size);28402841cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PUSH_CONSTANTS;2842cmd_buffer->state.dirty_push_constants_stages |= stageFlags;2843}28442845VKAPI_ATTR void VKAPI_CALL2846v3dv_CmdSetBlendConstants(VkCommandBuffer commandBuffer,2847const float blendConstants[4])2848{2849V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, 
commandBuffer);2850struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;28512852if (!memcmp(state->dynamic.blend_constants, blendConstants,2853sizeof(state->dynamic.blend_constants))) {2854return;2855}28562857memcpy(state->dynamic.blend_constants, blendConstants,2858sizeof(state->dynamic.blend_constants));28592860cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_BLEND_CONSTANTS;2861}28622863void2864v3dv_cmd_buffer_reset_queries(struct v3dv_cmd_buffer *cmd_buffer,2865struct v3dv_query_pool *pool,2866uint32_t first,2867uint32_t count)2868{2869/* Resets can only happen outside a render pass instance so we should not2870* be in the middle of job recording.2871*/2872assert(cmd_buffer->state.pass == NULL);2873assert(cmd_buffer->state.job == NULL);28742875assert(first < pool->query_count);2876assert(first + count <= pool->query_count);28772878struct v3dv_job *job =2879v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,2880V3DV_JOB_TYPE_CPU_RESET_QUERIES,2881cmd_buffer, -1);2882v3dv_return_if_oom(cmd_buffer, NULL);28832884job->cpu.query_reset.pool = pool;2885job->cpu.query_reset.first = first;2886job->cpu.query_reset.count = count;28872888list_addtail(&job->list_link, &cmd_buffer->jobs);2889}28902891void2892v3dv_cmd_buffer_ensure_array_state(struct v3dv_cmd_buffer *cmd_buffer,2893uint32_t slot_size,2894uint32_t used_count,2895uint32_t *alloc_count,2896void **ptr)2897{2898if (used_count >= *alloc_count) {2899const uint32_t prev_slot_count = *alloc_count;2900void *old_buffer = *ptr;29012902const uint32_t new_slot_count = MAX2(*alloc_count * 2, 4);2903const uint32_t bytes = new_slot_count * slot_size;2904*ptr = vk_alloc(&cmd_buffer->device->vk.alloc, bytes, 8,2905VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);2906if (*ptr == NULL) {2907fprintf(stderr, "Error: failed to allocate CPU buffer for query.\n");2908v3dv_flag_oom(cmd_buffer, NULL);2909return;2910}29112912memcpy(*ptr, old_buffer, prev_slot_count * slot_size);2913*alloc_count = new_slot_count;2914}2915assert(used_count < *alloc_count);2916}29172918void2919v3dv_cmd_buffer_begin_query(struct v3dv_cmd_buffer *cmd_buffer,2920struct v3dv_query_pool *pool,2921uint32_t query,2922VkQueryControlFlags flags)2923{2924/* FIXME: we only support one active query for now */2925assert(cmd_buffer->state.query.active_query.bo == NULL);2926assert(query < pool->query_count);29272928cmd_buffer->state.query.active_query.bo = pool->queries[query].bo;2929cmd_buffer->state.query.active_query.offset = pool->queries[query].offset;2930cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;2931}29322933void2934v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,2935struct v3dv_query_pool *pool,2936uint32_t query)2937{2938assert(query < pool->query_count);2939assert(cmd_buffer->state.query.active_query.bo != NULL);29402941if (cmd_buffer->state.pass) {2942/* Queue the EndQuery in the command buffer state, we will create a CPU2943* job to flag all of these queries as possibly available right after the2944* render pass job in which they have been recorded.2945*/2946struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;2947v3dv_cmd_buffer_ensure_array_state(cmd_buffer,2948sizeof(struct v3dv_end_query_cpu_job_info),2949state->query.end.used_count,2950&state->query.end.alloc_count,2951(void **) &state->query.end.states);2952v3dv_return_if_oom(cmd_buffer, NULL);29532954struct v3dv_end_query_cpu_job_info *info =2955&state->query.end.states[state->query.end.used_count++];29562957info->pool = pool;2958info->query = query;2959} else {2960/* Otherwise, schedule the CPU job immediately 
*/2961struct v3dv_job *job =2962v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,2963V3DV_JOB_TYPE_CPU_END_QUERY,2964cmd_buffer, -1);2965v3dv_return_if_oom(cmd_buffer, NULL);29662967job->cpu.query_end.pool = pool;2968job->cpu.query_end.query = query;2969list_addtail(&job->list_link, &cmd_buffer->jobs);2970}29712972cmd_buffer->state.query.active_query.bo = NULL;2973cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;2974}29752976void2977v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer,2978struct v3dv_query_pool *pool,2979uint32_t first,2980uint32_t count,2981struct v3dv_buffer *dst,2982uint32_t offset,2983uint32_t stride,2984VkQueryResultFlags flags)2985{2986/* Copies can only happen outside a render pass instance so we should not2987* be in the middle of job recording.2988*/2989assert(cmd_buffer->state.pass == NULL);2990assert(cmd_buffer->state.job == NULL);29912992assert(first < pool->query_count);2993assert(first + count <= pool->query_count);29942995struct v3dv_job *job =2996v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,2997V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS,2998cmd_buffer, -1);2999v3dv_return_if_oom(cmd_buffer, NULL);30003001job->cpu.query_copy_results.pool = pool;3002job->cpu.query_copy_results.first = first;3003job->cpu.query_copy_results.count = count;3004job->cpu.query_copy_results.dst = dst;3005job->cpu.query_copy_results.offset = offset;3006job->cpu.query_copy_results.stride = stride;3007job->cpu.query_copy_results.flags = flags;30083009list_addtail(&job->list_link, &cmd_buffer->jobs);3010}30113012void3013v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer,3014struct drm_v3d_submit_tfu *tfu)3015{3016struct v3dv_device *device = cmd_buffer->device;3017struct v3dv_job *job = vk_zalloc(&device->vk.alloc,3018sizeof(struct v3dv_job), 8,3019VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);3020if (!job) {3021v3dv_flag_oom(cmd_buffer, NULL);3022return;3023}30243025v3dv_job_init(job, V3DV_JOB_TYPE_GPU_TFU, device, cmd_buffer, -1);3026job->tfu = *tfu;3027list_addtail(&job->list_link, &cmd_buffer->jobs);3028}30293030VKAPI_ATTR void VKAPI_CALL3031v3dv_CmdSetEvent(VkCommandBuffer commandBuffer,3032VkEvent _event,3033VkPipelineStageFlags stageMask)3034{3035V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);3036V3DV_FROM_HANDLE(v3dv_event, event, _event);30373038/* Event (re)sets can only happen outside a render pass instance so we3039* should not be in the middle of job recording.3040*/3041assert(cmd_buffer->state.pass == NULL);3042assert(cmd_buffer->state.job == NULL);30433044struct v3dv_job *job =3045v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,3046V3DV_JOB_TYPE_CPU_SET_EVENT,3047cmd_buffer, -1);3048v3dv_return_if_oom(cmd_buffer, NULL);30493050job->cpu.event_set.event = event;3051job->cpu.event_set.state = 1;30523053list_addtail(&job->list_link, &cmd_buffer->jobs);3054}30553056VKAPI_ATTR void VKAPI_CALL3057v3dv_CmdResetEvent(VkCommandBuffer commandBuffer,3058VkEvent _event,3059VkPipelineStageFlags stageMask)3060{3061V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);3062V3DV_FROM_HANDLE(v3dv_event, event, _event);30633064/* Event (re)sets can only happen outside a render pass instance so we3065* should not be in the middle of job recording.3066*/3067assert(cmd_buffer->state.pass == NULL);3068assert(cmd_buffer->state.job == NULL);30693070struct v3dv_job *job =3071v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,3072V3DV_JOB_TYPE_CPU_SET_EVENT,3073cmd_buffer, -1);3074v3dv_return_if_oom(cmd_buffer, NULL);30753076job->cpu.event_set.event 
= event;3077job->cpu.event_set.state = 0;30783079list_addtail(&job->list_link, &cmd_buffer->jobs);3080}30813082VKAPI_ATTR void VKAPI_CALL3083v3dv_CmdWaitEvents(VkCommandBuffer commandBuffer,3084uint32_t eventCount,3085const VkEvent *pEvents,3086VkPipelineStageFlags srcStageMask,3087VkPipelineStageFlags dstStageMask,3088uint32_t memoryBarrierCount,3089const VkMemoryBarrier *pMemoryBarriers,3090uint32_t bufferMemoryBarrierCount,3091const VkBufferMemoryBarrier *pBufferMemoryBarriers,3092uint32_t imageMemoryBarrierCount,3093const VkImageMemoryBarrier *pImageMemoryBarriers)3094{3095V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);30963097assert(eventCount > 0);30983099struct v3dv_job *job =3100v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,3101V3DV_JOB_TYPE_CPU_WAIT_EVENTS,3102cmd_buffer, -1);3103v3dv_return_if_oom(cmd_buffer, NULL);31043105const uint32_t event_list_size = sizeof(struct v3dv_event *) * eventCount;31063107job->cpu.event_wait.events =3108vk_alloc(&cmd_buffer->device->vk.alloc, event_list_size, 8,3109VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);3110if (!job->cpu.event_wait.events) {3111v3dv_flag_oom(cmd_buffer, NULL);3112return;3113}3114job->cpu.event_wait.event_count = eventCount;31153116for (uint32_t i = 0; i < eventCount; i++)3117job->cpu.event_wait.events[i] = v3dv_event_from_handle(pEvents[i]);31183119/* vkCmdWaitEvents can be recorded inside a render pass, so we might have3120* an active job.3121*3122* If we are inside a render pass, because we vkCmd(Re)SetEvent can't happen3123* inside a render pass, it is safe to move the wait job so it happens right3124* before the current job we are currently recording for the subpass, if any3125* (it would actually be safe to move it all the way back to right before3126* the start of the render pass).3127*3128* If we are outside a render pass then we should not have any on-going job3129* and we are free to just add the wait job without restrictions.3130*/3131assert(cmd_buffer->state.pass || !cmd_buffer->state.job);3132list_addtail(&job->list_link, &cmd_buffer->jobs);3133}31343135VKAPI_ATTR void VKAPI_CALL3136v3dv_CmdWriteTimestamp(VkCommandBuffer commandBuffer,3137VkPipelineStageFlagBits pipelineStage,3138VkQueryPool queryPool,3139uint32_t query)3140{3141V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);3142V3DV_FROM_HANDLE(v3dv_query_pool, query_pool, queryPool);31433144/* If this is called inside a render pass we need to finish the current3145* job here...3146*/3147if (cmd_buffer->state.pass)3148v3dv_cmd_buffer_finish_job(cmd_buffer);31493150struct v3dv_job *job =3151v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,3152V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY,3153cmd_buffer, -1);3154v3dv_return_if_oom(cmd_buffer, NULL);31553156job->cpu.query_timestamp.pool = query_pool;3157job->cpu.query_timestamp.query = query;31583159list_addtail(&job->list_link, &cmd_buffer->jobs);3160cmd_buffer->state.job = NULL;31613162/* ...and resume the subpass after the timestamp */3163if (cmd_buffer->state.pass)3164v3dv_cmd_buffer_subpass_resume(cmd_buffer, cmd_buffer->state.subpass_idx);3165}31663167static void3168cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer)3169{3170assert(cmd_buffer->state.compute.pipeline);3171assert(cmd_buffer->state.compute.pipeline->active_stages ==3172VK_SHADER_STAGE_COMPUTE_BIT);31733174cmd_buffer->state.dirty &= ~(V3DV_CMD_DIRTY_COMPUTE_PIPELINE |3175V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS);3176cmd_buffer->state.dirty_descriptor_stages &= 
~VK_SHADER_STAGE_COMPUTE_BIT;3177cmd_buffer->state.dirty_push_constants_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT;3178}31793180#define V3D_CSD_CFG012_WG_COUNT_SHIFT 163181#define V3D_CSD_CFG012_WG_OFFSET_SHIFT 03182/* Allow this dispatch to start while the last one is still running. */3183#define V3D_CSD_CFG3_OVERLAP_WITH_PREV (1 << 26)3184/* Maximum supergroup ID. 6 bits. */3185#define V3D_CSD_CFG3_MAX_SG_ID_SHIFT 203186/* Batches per supergroup minus 1. 8 bits. */3187#define V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT 123188/* Workgroups per supergroup, 0 means 16 */3189#define V3D_CSD_CFG3_WGS_PER_SG_SHIFT 83190#define V3D_CSD_CFG3_WG_SIZE_SHIFT 031913192#define V3D_CSD_CFG5_PROPAGATE_NANS (1 << 2)3193#define V3D_CSD_CFG5_SINGLE_SEG (1 << 1)3194#define V3D_CSD_CFG5_THREADING (1 << 0)31953196void3197v3dv_cmd_buffer_rewrite_indirect_csd_job(3198struct v3dv_csd_indirect_cpu_job_info *info,3199const uint32_t *wg_counts)3200{3201assert(info->csd_job);3202struct v3dv_job *job = info->csd_job;32033204assert(job->type == V3DV_JOB_TYPE_GPU_CSD);3205assert(wg_counts[0] > 0 && wg_counts[1] > 0 && wg_counts[2] > 0);32063207struct drm_v3d_submit_csd *submit = &job->csd.submit;32083209job->csd.wg_count[0] = wg_counts[0];3210job->csd.wg_count[1] = wg_counts[1];3211job->csd.wg_count[2] = wg_counts[2];32123213submit->cfg[0] = wg_counts[0] << V3D_CSD_CFG012_WG_COUNT_SHIFT;3214submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT;3215submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT;32163217submit->cfg[4] = DIV_ROUND_UP(info->wg_size, 16) *3218(wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1;3219assert(submit->cfg[4] != ~0);32203221if (info->needs_wg_uniform_rewrite) {3222/* Make sure the GPU is not currently accessing the indirect CL for this3223* job, since we are about to overwrite some of the uniform data.3224*/3225v3dv_bo_wait(job->device, job->indirect.bo, PIPE_TIMEOUT_INFINITE);32263227for (uint32_t i = 0; i < 3; i++) {3228if (info->wg_uniform_offsets[i]) {3229/* Sanity check that our uniform pointers are within the allocated3230* BO space for our indirect CL.3231*/3232assert(info->wg_uniform_offsets[i] >= (uint32_t *) job->indirect.base);3233assert(info->wg_uniform_offsets[i] < (uint32_t *) job->indirect.next);3234*(info->wg_uniform_offsets[i]) = wg_counts[i];3235}3236}3237}3238}32393240static struct v3dv_job *3241cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,3242uint32_t base_offset_x,3243uint32_t base_offset_y,3244uint32_t base_offset_z,3245uint32_t group_count_x,3246uint32_t group_count_y,3247uint32_t group_count_z,3248uint32_t **wg_uniform_offsets_out,3249uint32_t *wg_size_out)3250{3251struct v3dv_pipeline *pipeline = cmd_buffer->state.compute.pipeline;3252assert(pipeline && pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]);3253struct v3dv_shader_variant *cs_variant =3254pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE];32553256struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc,3257sizeof(struct v3dv_job), 8,3258VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);3259if (!job) {3260v3dv_flag_oom(cmd_buffer, NULL);3261return NULL;3262}32633264v3dv_job_init(job, V3DV_JOB_TYPE_GPU_CSD, cmd_buffer->device, cmd_buffer, -1);3265cmd_buffer->state.job = job;32663267struct drm_v3d_submit_csd *submit = &job->csd.submit;32683269job->csd.wg_count[0] = group_count_x;3270job->csd.wg_count[1] = group_count_y;3271job->csd.wg_count[2] = group_count_z;32723273job->csd.wg_base[0] = base_offset_x;3274job->csd.wg_base[1] = base_offset_y;3275job->csd.wg_base[2] = 
base_offset_z;32763277submit->cfg[0] |= group_count_x << V3D_CSD_CFG012_WG_COUNT_SHIFT;3278submit->cfg[1] |= group_count_y << V3D_CSD_CFG012_WG_COUNT_SHIFT;3279submit->cfg[2] |= group_count_z << V3D_CSD_CFG012_WG_COUNT_SHIFT;32803281const struct v3d_compute_prog_data *cpd =3282cs_variant->prog_data.cs;32833284const uint32_t num_wgs = group_count_x * group_count_y * group_count_z;3285const uint32_t wg_size = cpd->local_size[0] *3286cpd->local_size[1] *3287cpd->local_size[2];32883289uint32_t wgs_per_sg =3290v3d_csd_choose_workgroups_per_supergroup(3291&cmd_buffer->device->devinfo,3292cs_variant->prog_data.cs->has_subgroups,3293cs_variant->prog_data.cs->base.has_control_barrier,3294cs_variant->prog_data.cs->base.threads,3295num_wgs, wg_size);32963297uint32_t batches_per_sg = DIV_ROUND_UP(wgs_per_sg * wg_size, 16);3298uint32_t whole_sgs = num_wgs / wgs_per_sg;3299uint32_t rem_wgs = num_wgs - whole_sgs * wgs_per_sg;3300uint32_t num_batches = batches_per_sg * whole_sgs +3301DIV_ROUND_UP(rem_wgs * wg_size, 16);33023303submit->cfg[3] |= (wgs_per_sg & 0xf) << V3D_CSD_CFG3_WGS_PER_SG_SHIFT;3304submit->cfg[3] |= (batches_per_sg - 1) << V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT;3305submit->cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT;3306if (wg_size_out)3307*wg_size_out = wg_size;33083309submit->cfg[4] = num_batches - 1;3310assert(submit->cfg[4] != ~0);33113312assert(pipeline->shared_data->assembly_bo);3313struct v3dv_bo *cs_assembly_bo = pipeline->shared_data->assembly_bo;33143315submit->cfg[5] = cs_assembly_bo->offset + cs_variant->assembly_offset;3316submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;3317if (cs_variant->prog_data.base->single_seg)3318submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG;3319if (cs_variant->prog_data.base->threads == 4)3320submit->cfg[5] |= V3D_CSD_CFG5_THREADING;33213322if (cs_variant->prog_data.cs->shared_size > 0) {3323job->csd.shared_memory =3324v3dv_bo_alloc(cmd_buffer->device,3325cs_variant->prog_data.cs->shared_size * wgs_per_sg,3326"shared_vars", true);3327if (!job->csd.shared_memory) {3328v3dv_flag_oom(cmd_buffer, NULL);3329return job;3330}3331}33323333v3dv_job_add_bo_unchecked(job, cs_assembly_bo);3334struct v3dv_cl_reloc uniforms =3335v3dv_write_uniforms_wg_offsets(cmd_buffer, pipeline,3336cs_variant,3337wg_uniform_offsets_out);3338submit->cfg[6] = uniforms.bo->offset + uniforms.offset;33393340v3dv_job_add_bo(job, uniforms.bo);33413342return job;3343}33443345static void3346cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,3347uint32_t base_offset_x,3348uint32_t base_offset_y,3349uint32_t base_offset_z,3350uint32_t group_count_x,3351uint32_t group_count_y,3352uint32_t group_count_z)3353{3354if (group_count_x == 0 || group_count_y == 0 || group_count_z == 0)3355return;33563357struct v3dv_job *job =3358cmd_buffer_create_csd_job(cmd_buffer,3359base_offset_x,3360base_offset_y,3361base_offset_z,3362group_count_x,3363group_count_y,3364group_count_z,3365NULL, NULL);33663367list_addtail(&job->list_link, &cmd_buffer->jobs);3368cmd_buffer->state.job = NULL;3369}33703371VKAPI_ATTR void VKAPI_CALL3372v3dv_CmdDispatch(VkCommandBuffer commandBuffer,3373uint32_t groupCountX,3374uint32_t groupCountY,3375uint32_t groupCountZ)3376{3377V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);33783379cmd_buffer_emit_pre_dispatch(cmd_buffer);3380cmd_buffer_dispatch(cmd_buffer, 0, 0, 0,3381groupCountX, groupCountY, groupCountZ);3382}33833384VKAPI_ATTR void VKAPI_CALL3385v3dv_CmdDispatchBase(VkCommandBuffer commandBuffer,3386uint32_t baseGroupX,3387uint32_t 
baseGroupY,3388uint32_t baseGroupZ,3389uint32_t groupCountX,3390uint32_t groupCountY,3391uint32_t groupCountZ)3392{3393V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);33943395cmd_buffer_emit_pre_dispatch(cmd_buffer);3396cmd_buffer_dispatch(cmd_buffer,3397baseGroupX, baseGroupY, baseGroupZ,3398groupCountX, groupCountY, groupCountZ);3399}340034013402static void3403cmd_buffer_dispatch_indirect(struct v3dv_cmd_buffer *cmd_buffer,3404struct v3dv_buffer *buffer,3405uint32_t offset)3406{3407/* We can't do indirect dispatches, so instead we record a CPU job that,3408* when executed in the queue, will map the indirect buffer, read the3409* dispatch parameters, and submit a regular dispatch.3410*/3411struct v3dv_job *job =3412v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,3413V3DV_JOB_TYPE_CPU_CSD_INDIRECT,3414cmd_buffer, -1);3415v3dv_return_if_oom(cmd_buffer, NULL);34163417/* We need to create a CSD job now, even if we still don't know the actual3418* dispatch parameters, because the job setup needs to be done using the3419* current command buffer state (i.e. pipeline, descriptor sets, push3420* constants, etc.). So we create the job with default dispatch parameters3421* and we will rewrite the parts we need at submit time if the indirect3422* parameters don't match the ones we used to setup the job.3423*/3424struct v3dv_job *csd_job =3425cmd_buffer_create_csd_job(cmd_buffer,34260, 0, 0,34271, 1, 1,3428&job->cpu.csd_indirect.wg_uniform_offsets[0],3429&job->cpu.csd_indirect.wg_size);3430v3dv_return_if_oom(cmd_buffer, NULL);3431assert(csd_job);34323433job->cpu.csd_indirect.buffer = buffer;3434job->cpu.csd_indirect.offset = offset;3435job->cpu.csd_indirect.csd_job = csd_job;34363437/* If the compute shader reads the workgroup sizes we will also need to3438* rewrite the corresponding uniforms.3439*/3440job->cpu.csd_indirect.needs_wg_uniform_rewrite =3441job->cpu.csd_indirect.wg_uniform_offsets[0] ||3442job->cpu.csd_indirect.wg_uniform_offsets[1] ||3443job->cpu.csd_indirect.wg_uniform_offsets[2];34443445list_addtail(&job->list_link, &cmd_buffer->jobs);3446cmd_buffer->state.job = NULL;3447}34483449VKAPI_ATTR void VKAPI_CALL3450v3dv_CmdDispatchIndirect(VkCommandBuffer commandBuffer,3451VkBuffer _buffer,3452VkDeviceSize offset)3453{3454V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);3455V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer);34563457assert(offset <= UINT32_MAX);34583459cmd_buffer_emit_pre_dispatch(cmd_buffer);3460cmd_buffer_dispatch_indirect(cmd_buffer, buffer, offset);3461}34623463VKAPI_ATTR void VKAPI_CALL3464v3dv_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)3465{3466/* Nothing to do here since we only support a single device */3467assert(deviceMask == 0x1);3468}346934703471
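/* The blocks below are illustrative sketches only, appended for reference;
 * they are not driver code and make assumptions that are called out in each
 * lead-in comment.
 *
 * Sketch 1: how the scale/translate pair produced by
 * v3dv_viewport_compute_xform() maps NDC coordinates to framebuffer
 * coordinates. The viewport values are hypothetical.
 */
#if 0
   VkViewport vp = {
      .x = 0.0f, .y = 0.0f,
      .width = 800.0f, .height = 600.0f,
      .minDepth = 0.0f, .maxDepth = 1.0f,
   };
   float scale[3], translate[3];
   v3dv_viewport_compute_xform(&vp, scale, translate);
   /* scale     = { 400.0, 300.0, 1.0 }
    * translate = { 400.0, 300.0, 0.0 }
    *
    * A post-divide (NDC) position then maps to framebuffer coordinates as:
    *    fb_x = ndc_x * scale[0] + translate[0]
    *    fb_y = ndc_y * scale[1] + translate[1]
    *    fb_z = ndc_z * scale[2] + translate[2]
    */
#endif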
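/* Sketch 2: worked example of the clip window computation in emit_scissor(),
 * using hypothetical numbers. The final clip rectangle is the intersection of
 * the viewport bounding box and the user scissor, with degenerate results
 * clamped so the extent never goes negative.
 *
 *   Viewport: x=100, y=100, width=200, height=200
 *      -> vp_minx=100, vp_maxx=300, vp_miny=100, vp_maxy=300
 *   Scissor:  offset=(150,150), extent=(400,400)
 *      -> minx=MAX2(100,150)=150, maxx=MIN2(300,550)=300
 *      -> miny=MAX2(100,150)=150, maxy=MIN2(300,550)=300
 *   Clip window: offset=(150,150), extent=(150,150)
 */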
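/* Sketch 3: typical call sequence for the meta state push/pop helpers. The
 * meta operation itself (a hypothetical internal copy/clear draw) is elided;
 * only the save/restore pattern is shown.
 */
#if 0
   v3dv_cmd_buffer_meta_state_push(cmd_buffer, true /* push_descriptor_state */);

   /* ... bind an internal pipeline/descriptors and record the meta draw ... */

   /* The dirty mask tells the pop which dynamic state the meta operation may
    * have clobbered, and needs_subpass_resume restarts the interrupted
    * subpass when the meta operation used its own job.
    */
   v3dv_cmd_buffer_meta_state_pop(cmd_buffer,
                                  V3DV_CMD_DIRTY_VIEWPORT |
                                  V3DV_CMD_DIRTY_SCISSOR,
                                  true /* needs_subpass_resume */);
#endif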
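/* Sketch 4: the application-side scenario handled by
 * cmd_buffer_restart_job_for_msaa_if_needed(): a subpass with no attachments
 * whose pipelines still request multisampled rasterization. Only the relevant
 * create-info fields are shown; everything else is elided.
 */
#if 0
   VkPipelineMultisampleStateCreateInfo ms = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
      .rasterizationSamples = VK_SAMPLE_COUNT_4_BIT,
   };
   VkSubpassDescription subpass = {
      .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
      .colorAttachmentCount = 0,        /* no color attachments */
      .pDepthStencilAttachment = NULL,  /* no depth/stencil either */
   };
   /* The framebuffer still defines width/height/layers for the render area,
    * while rasterizationSamples defines the sample count used by the draws.
    */
#endif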
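/* Sketch 5: a barrier that v3dv_CmdPipelineBarrier() flags as a BCL barrier,
 * because the destination stage mask includes a binning-side consumer
 * (VERTEX_INPUT). Here a compute shader writes a vertex buffer that a later
 * draw reads as an attribute; 'cmd' and 'vertex_buffer' are hypothetical
 * application handles.
 */
#if 0
   VkBufferMemoryBarrier barrier = {
      .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
      .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
      .dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT,
      .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
      .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
      .buffer = vertex_buffer,
      .offset = 0,
      .size = VK_WHOLE_SIZE,
   };
   vkCmdPipelineBarrier(cmd,
                        VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                        VK_PIPELINE_STAGE_VERTEX_INPUT_BIT,
                        0, 0, NULL, 1, &barrier, 0, NULL);
#endif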
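/* Sketch 6: application-side descriptor set binding with dynamic offsets, the
 * case tracked by the dyn_index bookkeeping in v3dv_CmdBindDescriptorSets().
 * 'cmd', 'pipeline_layout' and 'descriptor_set' are hypothetical handles;
 * the set is assumed to contain two dynamic uniform buffers.
 */
#if 0
   /* pDynamicOffsets must provide one offset per dynamic descriptor, in
    * set/binding order.
    */
   uint32_t dynamic_offsets[2] = { 0, 256 };
   vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS,
                           pipeline_layout,
                           0 /* firstSet */, 1, &descriptor_set,
                           2, dynamic_offsets);
#endif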
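/* Sketch 7: worked example of the supergroup/batch math in
 * cmd_buffer_create_csd_job(), with hypothetical numbers. The wgs_per_sg
 * value is just an assumed result of
 * v3d_csd_choose_workgroups_per_supergroup().
 *
 *   local size     = 8x8x1   -> wg_size  = 64
 *   dispatch       = 4x1x1   -> num_wgs  = 4
 *   wgs_per_sg     = 2 (assumed)
 *   batches_per_sg = DIV_ROUND_UP(2 * 64, 16) = 8
 *   whole_sgs      = 4 / 2 = 2,  rem_wgs = 0
 *   num_batches    = 8 * 2 + DIV_ROUND_UP(0 * 64, 16) = 16
 *   submit->cfg[4] = num_batches - 1 = 15
 */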
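/* Sketch 8: the application-side shape of an indirect dispatch, which the
 * CPU job created by cmd_buffer_dispatch_indirect() reads back at submit
 * time. 'cmd', 'buffer' and 'offset' are hypothetical application values.
 */
#if 0
   /* The indirect buffer holds a VkDispatchIndirectCommand (three uint32_t
    * group counts) at 'offset'; the queue reads it when the command buffer
    * is submitted and patches the pre-built CSD job accordingly.
    */
   VkDispatchIndirectCommand args = { .x = 16, .y = 16, .z = 1 };
   /* ... upload 'args' into 'buffer' at 'offset' ... */
   vkCmdDispatchIndirect(cmd, buffer, offset);
#endif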