Path: blob/21.2-virgl/src/freedreno/vulkan/tu_query.c
/*
 * Copyright 2016 Red Hat Inc.
 * Based on anv:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "tu_private.h"

#include <assert.h>
#include <fcntl.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>

#include "adreno_pm4.xml.h"
#include "adreno_common.xml.h"
#include "a6xx.xml.h"

#include "nir/nir_builder.h"
#include "util/os_time.h"

#include "tu_cs.h"
#include "vk_util.h"

#define NSEC_PER_SEC 1000000000ull
#define WAIT_TIMEOUT 5
#define STAT_COUNT ((REG_A6XX_RBBM_PRIMCTR_10_LO - REG_A6XX_RBBM_PRIMCTR_0_LO) / 2 + 1)

struct PACKED query_slot {
   uint64_t available;
};

struct PACKED occlusion_slot_value {
   /* The sample counters seem to be written 16-byte aligned even though
    * this query only needs an 8-byte slot. */
   uint64_t value;
   uint64_t _padding;
};

struct PACKED occlusion_query_slot {
   struct query_slot common;
   uint64_t result;

   struct occlusion_slot_value begin;
   struct occlusion_slot_value end;
};

struct PACKED timestamp_query_slot {
   struct query_slot common;
   uint64_t result;
};

struct PACKED primitive_slot_value {
   uint64_t values[2];
};

struct PACKED pipeline_stat_query_slot {
   struct query_slot common;
   uint64_t results[STAT_COUNT];

   uint64_t begin[STAT_COUNT];
   uint64_t end[STAT_COUNT];
};

struct PACKED primitive_query_slot {
   struct query_slot common;
   /* The result of transform feedback queries is two integer values:
    *    results[0] is the count of primitives written,
    *    results[1] is the count of primitives generated.
    * Also a result for each of the 4 streams is stored respectively in
    * the begin/end arrays below.
    */
   uint64_t results[2];

   /* Primitive counters also need to be 16-byte aligned. */
   uint64_t _padding;

   struct primitive_slot_value begin[4];
   struct primitive_slot_value end[4];
};

struct PACKED perfcntr_query_slot {
   uint64_t result;
   uint64_t begin;
   uint64_t end;
};

struct PACKED perf_query_slot {
   struct query_slot common;
   struct perfcntr_query_slot perfcntr;
};

/* Returns the IOVA of a given uint64_t field in a given slot of a query
 * pool. */
#define query_iova(type, pool, query, field)                     \
   pool->bo.iova + pool->stride * (query) + offsetof(type, field)

#define occlusion_query_iova(pool, query, field)                 \
   query_iova(struct occlusion_query_slot, pool, query, field)

#define pipeline_stat_query_iova(pool, query, field)             \
   pool->bo.iova + pool->stride * query +                        \
   offsetof(struct pipeline_stat_query_slot, field)

#define primitive_query_iova(pool, query, field, i)              \
   query_iova(struct primitive_query_slot, pool, query, field) + \
   offsetof(struct primitive_slot_value, values[i])

#define perf_query_iova(pool, query, field, i)                   \
   pool->bo.iova + pool->stride * query +                        \
   sizeof(struct query_slot) +                                   \
   sizeof(struct perfcntr_query_slot) * i +                      \
   offsetof(struct perfcntr_query_slot, field)

#define query_available_iova(pool, query)                        \
   query_iova(struct query_slot, pool, query, available)

#define query_result_iova(pool, query, type, i)                  \
   pool->bo.iova + pool->stride * (query) +                      \
   sizeof(struct query_slot) + sizeof(type) * i

#define query_result_addr(pool, query, type, i)                  \
   pool->bo.map + pool->stride * query +                         \
   sizeof(struct query_slot) + sizeof(type) * i

#define query_is_available(slot) slot->available
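/* For illustration, an expansion of the macros above: for slot 2 of an
 * occlusion pool,
 *
 *    occlusion_query_iova(pool, 2, begin)
 *
 * expands to
 *
 *    pool->bo.iova + pool->stride * 2 +
 *       offsetof(struct occlusion_query_slot, begin)
 *
 * i.e. the GPU address that the ZPASS_DONE sample-count copy targets.
 * pool->stride is the per-type slot size picked in tu_CreateQueryPool.
 */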
static const VkPerformanceCounterUnitKHR
fd_perfcntr_type_to_vk_unit[] = {
   [FD_PERFCNTR_TYPE_UINT] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
   [FD_PERFCNTR_TYPE_UINT64] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
   [FD_PERFCNTR_TYPE_FLOAT] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
   [FD_PERFCNTR_TYPE_PERCENTAGE] = VK_PERFORMANCE_COUNTER_UNIT_PERCENTAGE_KHR,
   [FD_PERFCNTR_TYPE_BYTES] = VK_PERFORMANCE_COUNTER_UNIT_BYTES_KHR,
   /* TODO: could be UNIT_NANOSECONDS_KHR with logic to convert */
   [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
   [FD_PERFCNTR_TYPE_HZ] = VK_PERFORMANCE_COUNTER_UNIT_HERTZ_KHR,
   [FD_PERFCNTR_TYPE_DBM] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
   [FD_PERFCNTR_TYPE_TEMPERATURE] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
   [FD_PERFCNTR_TYPE_VOLTS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
   [FD_PERFCNTR_TYPE_AMPS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
   [FD_PERFCNTR_TYPE_WATTS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
};

/* TODO: this mapping comes from the freedreno implementation, where only
 * UINT64 is used. We should confirm it against the blob Vulkan driver once
 * it starts supporting perf queries.
 */
static const VkPerformanceCounterStorageKHR
fd_perfcntr_type_to_vk_storage[] = {
   [FD_PERFCNTR_TYPE_UINT] = VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR,
   [FD_PERFCNTR_TYPE_UINT64] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
   [FD_PERFCNTR_TYPE_FLOAT] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
   [FD_PERFCNTR_TYPE_PERCENTAGE] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
   [FD_PERFCNTR_TYPE_BYTES] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
   [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
   [FD_PERFCNTR_TYPE_HZ] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
   [FD_PERFCNTR_TYPE_DBM] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
   [FD_PERFCNTR_TYPE_TEMPERATURE] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
   [FD_PERFCNTR_TYPE_VOLTS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
   [FD_PERFCNTR_TYPE_AMPS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
   [FD_PERFCNTR_TYPE_WATTS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
};

/*
 * Returns a pointer to a given slot in a query pool.
 */
static void* slot_address(struct tu_query_pool *pool, uint32_t query)
{
   return (char*)pool->bo.map + query * pool->stride;
}

static void
perfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count,
               uint32_t index, uint32_t *gid, uint32_t *cid)
{
   uint32_t i;

   for (i = 0; i < group_count; i++) {
      if (group[i].num_countables > index) {
         *gid = i;
         *cid = index;
         break;
      }
      index -= group[i].num_countables;
   }

   assert(i < group_count);
}
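/* For example, with group[0].num_countables == 4, a flattened index of 5
 * yields *gid = 1, *cid = 1 (assuming group[1] has at least two countables).
 * This mirrors the flattened order used by
 * tu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR below.
 */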
static int
compare_perfcntr_pass(const void *a, const void *b)
{
   return ((struct tu_perf_query_data *)a)->pass -
          ((struct tu_perf_query_data *)b)->pass;
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_CreateQueryPool(VkDevice _device,
                   const VkQueryPoolCreateInfo *pCreateInfo,
                   const VkAllocationCallbacks *pAllocator,
                   VkQueryPool *pQueryPool)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
   assert(pCreateInfo->queryCount > 0);

   uint32_t pool_size, slot_size;
   const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;

   pool_size = sizeof(struct tu_query_pool);

   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      slot_size = sizeof(struct occlusion_query_slot);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      slot_size = sizeof(struct timestamp_query_slot);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      slot_size = sizeof(struct primitive_query_slot);
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      perf_query_info =
         vk_find_struct_const(pCreateInfo->pNext,
                              QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
      assert(perf_query_info);

      slot_size = sizeof(struct perf_query_slot) +
                  sizeof(struct perfcntr_query_slot) *
                  (perf_query_info->counterIndexCount - 1);

      /* Size of the array pool->tu_perf_query_data */
      pool_size += sizeof(struct tu_perf_query_data) *
                   perf_query_info->counterIndexCount;
      break;
   }
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      slot_size = sizeof(struct pipeline_stat_query_slot);
      break;
   default:
      unreachable("Invalid query type");
   }

   struct tu_query_pool *pool =
      vk_object_alloc(&device->vk, pAllocator, pool_size,
                      VK_OBJECT_TYPE_QUERY_POOL);
   if (!pool)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   if (pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      pool->perf_group = fd_perfcntrs(device->physical_device->gpu_id,
                                      &pool->perf_group_count);

      pool->counter_index_count = perf_query_info->counterIndexCount;

      /* Build the data for all requested perf counters, so we can get the
       * correct group id, countable id, counter register and pass index
       * from just the counter index the application provides at each
       * command submit.
       *
       * Also, since this data will be sorted by pass index later, keep the
       * original indices and store perfcntr results according to them so
       * apps get correct results for their own indices.
       */
      uint32_t regs[pool->perf_group_count], pass[pool->perf_group_count];
      memset(regs, 0x00, pool->perf_group_count * sizeof(regs[0]));
      memset(pass, 0x00, pool->perf_group_count * sizeof(pass[0]));

      for (uint32_t i = 0; i < pool->counter_index_count; i++) {
         uint32_t gid = 0, cid = 0;

         perfcntr_index(pool->perf_group, pool->perf_group_count,
                        perf_query_info->pCounterIndices[i], &gid, &cid);

         pool->perf_query_data[i].gid = gid;
         pool->perf_query_data[i].cid = cid;
         pool->perf_query_data[i].app_idx = i;

         /* When a group runs out of counter registers (num_counters),
          * wrap around and allocate from the start for the next pass.
          */
         if (regs[gid] < pool->perf_group[gid].num_counters) {
            pool->perf_query_data[i].cntr_reg = regs[gid]++;
            pool->perf_query_data[i].pass = pass[gid];
         } else {
            pool->perf_query_data[i].pass = ++pass[gid];
            pool->perf_query_data[i].cntr_reg = regs[gid] = 0;
            regs[gid]++;
         }
      }

      /* Sort by pass index so we can easily prepare a command stream
       * in ascending pass order.
       */
      qsort(pool->perf_query_data, pool->counter_index_count,
            sizeof(pool->perf_query_data[0]),
            compare_perfcntr_pass);
   }
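   /* Worked example of the pass assignment above (hypothetical numbers): if
    * a group exposes num_counters == 2 and an app requests three counters
    * from it, the first two get cntr_reg 0 and 1 in pass 0, and the third
    * wraps to cntr_reg 0 in pass 1. This matches the pass count that
    * tu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR reports via
    * DIV_ROUND_UP(3, 2) == 2.
    */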
   VkResult result = tu_bo_init_new(device, &pool->bo,
         pCreateInfo->queryCount * slot_size, TU_BO_ALLOC_NO_FLAGS);
   if (result != VK_SUCCESS) {
      vk_object_free(&device->vk, pAllocator, pool);
      return result;
   }

   result = tu_bo_map(device, &pool->bo);
   if (result != VK_SUCCESS) {
      tu_bo_finish(device, &pool->bo);
      vk_object_free(&device->vk, pAllocator, pool);
      return result;
   }

   /* Initialize all query statuses to unavailable */
   memset(pool->bo.map, 0, pool->bo.size);

   pool->type = pCreateInfo->queryType;
   pool->stride = slot_size;
   pool->size = pCreateInfo->queryCount;
   pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
   *pQueryPool = tu_query_pool_to_handle(pool);

   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
tu_DestroyQueryPool(VkDevice _device,
                    VkQueryPool _pool,
                    const VkAllocationCallbacks *pAllocator)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, _pool);

   if (!pool)
      return;

   tu_bo_finish(device, &pool->bo);
   vk_object_free(&device->vk, pAllocator, pool);
}

static uint32_t
get_result_count(struct tu_query_pool *pool)
{
   switch (pool->type) {
   /* Occlusion and timestamp queries write one integer value */
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return 1;
   /* Transform feedback queries write two integer values */
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return 2;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      return util_bitcount(pool->pipeline_statistics);
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
      return pool->counter_index_count;
   default:
      assert(!"Invalid query type");
      return 0;
   }
}
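/* For instance (illustrative values), a pipeline statistics pool created
 * with pipelineStatistics == (VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT |
 * VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT) yields a
 * result_count of util_bitcount(...) == 2, plus one trailing availability
 * value when VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is requested.
 */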
static uint32_t
statistics_index(uint32_t *statistics)
{
   uint32_t stat;
   stat = u_bit_scan(statistics);

   switch (1 << stat) {
   case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT:
   case VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT:
      return 0;
   case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT:
      return 1;
   case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT:
      return 2;
   case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT:
      return 4;
   case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT:
      return 5;
   case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT:
      return 6;
   case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT:
      return 7;
   case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT:
      return 8;
   case VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT:
      return 9;
   case VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT:
      return 10;
   default:
      return 0;
   }
}
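/* The index returned above selects which uint64_t pair, captured from the
 * REG_A6XX_RBBM_PRIMCTR_* register block by emit_begin/end_stat_query below,
 * holds the requested statistic. Note that index 3 is never returned here
 * and the indices 0 through 10 span the STAT_COUNT counters.
 */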
/* Wait on the availability status of a query up until a timeout. */
static VkResult
wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
                   uint32_t query)
{
   /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
    * scheduler friendly way instead of busy polling once the patch has
    * landed upstream. */
   struct query_slot *slot = slot_address(pool, query);
   uint64_t abs_timeout = os_time_get_absolute_timeout(
         WAIT_TIMEOUT * NSEC_PER_SEC);
   while (os_time_get_nano() < abs_timeout) {
      if (query_is_available(slot))
         return VK_SUCCESS;
   }
   return vk_error(device->instance, VK_TIMEOUT);
}

/* Writes a query value to a buffer from the CPU. */
static void
write_query_value_cpu(char* base,
                      uint32_t offset,
                      uint64_t value,
                      VkQueryResultFlags flags)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
   } else {
      *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
   }
}

static VkResult
get_query_pool_results(struct tu_device *device,
                       struct tu_query_pool *pool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   assert(dataSize >= stride * queryCount);

   char *result_base = pData;
   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      struct query_slot *slot = slot_address(pool, query);
      bool available = query_is_available(slot);
      uint32_t result_count = get_result_count(pool);
      uint32_t statistics = pool->pipeline_statistics;

      if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
         VkResult wait_result = wait_for_available(device, pool, query);
         if (wait_result != VK_SUCCESS)
            return wait_result;
         available = true;
      } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT
          *    are both not set then no result values are written to pData
          *    for queries that are in the unavailable state at the time of
          *    the call, and vkGetQueryPoolResults returns VK_NOT_READY.
          *    However, availability state is still written to pData for
          *    those queries if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
          */
         result = VK_NOT_READY;
         if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
            result_base += stride;
            continue;
         }
      }

      for (uint32_t k = 0; k < result_count; k++) {
         if (available) {
            uint64_t *result;

            if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
               uint32_t stat_idx = statistics_index(&statistics);
               result = query_result_addr(pool, query, uint64_t, stat_idx);
            } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
               result = query_result_addr(pool, query, struct perfcntr_query_slot, k);
            } else {
               result = query_result_addr(pool, query, uint64_t, k);
            }

            write_query_value_cpu(result_base, k, *result, flags);
         } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
            /* From the Vulkan 1.1.130 spec:
             *
             *    If VK_QUERY_RESULT_PARTIAL_BIT is set,
             *    VK_QUERY_RESULT_WAIT_BIT is not set, and the query’s status
             *    is unavailable, an intermediate result value between zero
             *    and the final result value is written to pData for that
             *    query.
             *
             * Just return 0 here for simplicity since it's a valid result.
             */
            write_query_value_cpu(result_base, k, 0, flags);
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
          *    integer value written for each query is non-zero if the
          *    query’s status was available or zero if the status was
          *    unavailable.
          */
         write_query_value_cpu(result_base, result_count, available, flags);

      result_base += stride;
   }
   return result;
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_GetQueryPoolResults(VkDevice _device,
                       VkQueryPool queryPool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(firstQuery + queryCount <= pool->size);

   if (tu_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
      return get_query_pool_results(device, pool, firstQuery, queryCount,
                                    dataSize, pData, stride, flags);
   default:
      assert(!"Invalid query type");
   }
   return VK_SUCCESS;
}
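/* Host-side usage sketch (hypothetical application code, not part of this
 * driver): reading one occlusion query with an availability word, matching
 * the layout written by get_query_pool_results() above:
 *
 *    uint64_t data[2];
 *    vkGetQueryPoolResults(device, pool, 0, 1, sizeof(data), data,
 *                          sizeof(data),
 *                          VK_QUERY_RESULT_64_BIT |
 *                          VK_QUERY_RESULT_WITH_AVAILABILITY_BIT);
 *
 * data[0] then holds the sample count and data[1] the availability word.
 */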
/* Copies a query value from one buffer to another from the GPU. */
static void
copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
                     struct tu_cs *cs,
                     uint64_t src_iova,
                     uint64_t base_write_iova,
                     uint32_t offset,
                     VkQueryResultFlags flags) {
   uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
         sizeof(uint64_t) : sizeof(uint32_t);
   uint64_t write_iova = base_write_iova + (offset * element_size);

   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
   uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
         CP_MEM_TO_MEM_0_DOUBLE : 0;
   tu_cs_emit(cs, mem_to_mem_flags);
   tu_cs_emit_qw(cs, write_iova);
   tu_cs_emit_qw(cs, src_iova);
}
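/* Note that CP_MEM_TO_MEM_0_DOUBLE selects 64-bit operands; leaving it
 * unset makes the copy operate on 32-bit values. This is how
 * VK_QUERY_RESULT_64_BIT is honored on the GPU copy path without any CPU
 * involvement.
 */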
static void
emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
                             struct tu_cs *cs,
                             struct tu_query_pool *pool,
                             uint32_t firstQuery,
                             uint32_t queryCount,
                             struct tu_buffer *buffer,
                             VkDeviceSize dstOffset,
                             VkDeviceSize stride,
                             VkQueryResultFlags flags)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    vkCmdCopyQueryPoolResults is guaranteed to see the effect of
    *    previous uses of vkCmdResetQueryPool in the same queue, without any
    *    additional synchronization.
    *
    * To ensure that previous writes to the available bit are coherent, first
    * wait for all writes to complete.
    */
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      uint64_t available_iova = query_available_iova(pool, query);
      uint64_t buffer_iova = tu_buffer_iova(buffer) + dstOffset + i * stride;
      uint32_t result_count = get_result_count(pool);
      uint32_t statistics = pool->pipeline_statistics;

      /* Wait for the available bit to be set if executed with the
       * VK_QUERY_RESULT_WAIT_BIT flag. */
      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
         tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                        CP_WAIT_REG_MEM_0_POLL_MEMORY);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
      }

      for (uint32_t k = 0; k < result_count; k++) {
         uint64_t result_iova;

         if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
            uint32_t stat_idx = statistics_index(&statistics);
            result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
         } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
            result_iova = query_result_iova(pool, query,
                                            struct perfcntr_query_slot, k);
         } else {
            result_iova = query_result_iova(pool, query, uint64_t, k);
         }

         if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
            /* Unconditionally copying the bo->result into the buffer here is
             * valid because we only set bo->result on vkCmdEndQuery. Thus,
             * even if the query is unavailable, this will copy the correct
             * partial value of 0.
             */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
         } else {
            /* Conditionally copy bo->result into the buffer based on whether
             * the query is available.
             *
             * NOTE: For the conditional packets to be executed, CP_COND_EXEC
             * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply
             * tests that 0 < available < 2, aka available == 1.
             */
            tu_cs_reserve(cs, 7 + 6);
            tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
            tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */

            /* Start of conditional execution */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
            /* End of conditional execution */
         }
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
                              result_count /* offset */, flags);
      }
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t firstQuery,
                           uint32_t queryCount,
                           VkBuffer dstBuffer,
                           VkDeviceSize dstOffset,
                           VkDeviceSize stride,
                           VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
   struct tu_cs *cs = &cmdbuf->cs;
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
                                          queryCount, buffer, dstOffset,
                                          stride, flags);
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
      unreachable("allowCommandBufferQueryCopies is false");
   default:
      assert(!"Invalid query type");
   }
}

static void
emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
                      struct tu_query_pool *pool,
                      uint32_t firstQuery,
                      uint32_t queryCount)
{
   struct tu_cs *cs = &cmdbuf->cs;

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      uint32_t statistics = pool->pipeline_statistics;

      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query));
      tu_cs_emit_qw(cs, 0x0);

      for (uint32_t k = 0; k < get_result_count(pool); k++) {
         uint64_t result_iova;

         if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
            uint32_t stat_idx = statistics_index(&statistics);
            result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
         } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
            result_iova = query_result_iova(pool, query,
                                            struct perfcntr_query_slot, k);
         } else {
            result_iova = query_result_iova(pool, query, uint64_t, k);
         }

         tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
         tu_cs_emit_qw(cs, result_iova);
         tu_cs_emit_qw(cs, 0x0);
      }
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
                     VkQueryPool queryPool,
                     uint32_t firstQuery,
                     uint32_t queryCount)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
      emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
      break;
   default:
      assert(!"Invalid query type");
   }
}
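/* tu_ResetQueryPool below is the host-side counterpart from
 * VK_EXT_host_query_reset (core in Vulkan 1.2): it clears the same
 * available and result fields directly through the mapped BO instead of
 * emitting CP_MEM_WRITE packets.
 */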
VKAPI_ATTR void VKAPI_CALL
tu_ResetQueryPool(VkDevice device,
                  VkQueryPool queryPool,
                  uint32_t firstQuery,
                  uint32_t queryCount)
{
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      struct query_slot *slot = slot_address(pool, i + firstQuery);
      slot->available = 0;

      for (uint32_t k = 0; k < get_result_count(pool); k++) {
         uint64_t *res;

         if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
            res = query_result_addr(pool, i + firstQuery,
                                    struct perfcntr_query_slot, k);
         } else {
            res = query_result_addr(pool, i + firstQuery, uint64_t, k);
         }

         *res = 0;
      }
   }
}

static void
emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                           struct tu_query_pool *pool,
                           uint32_t query)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    A query must begin and end inside the same subpass of a render pass
    *    instance, or must both begin and end outside of a render pass
    *    instance.
    *
    * Unlike on an immediate-mode renderer, Turnip renders all tiles on
    * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
    * query begins/ends inside the same subpass of a render pass, we need to
    * record the packets on the secondary draw command stream. cmdbuf->draw_cs
    * is then run on every tile during render, so we just need to accumulate
    * sample counts in slot->result to compute the query result.
    */
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR(.qword = begin_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}

static void
emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
                      struct tu_query_pool *pool,
                      uint32_t query)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint64_t begin_iova = pipeline_stat_query_iova(pool, query, begin);

   tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS);
   tu6_emit_event_write(cmdbuf, cs, RST_PIX_CNT);
   tu6_emit_event_write(cmdbuf, cs, TILE_FLUSH);

   tu_cs_emit_wfi(cs);

   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
                  CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, begin_iova);
}

static void
emit_perfcntrs_pass_start(struct tu_cs *cs, uint32_t pass)
{
   tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
   tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(
                     REG_A6XX_CP_SCRATCH_REG(PERF_CNTRS_REG)) |
                  A6XX_CP_REG_TEST_0_BIT(pass) |
                  A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
   tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
}
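/* To sketch the predication above: the cs prepared at device creation
 * presumably writes (1 << pass_index) to the scratch register for the pass
 * being submitted, so A6XX_CP_REG_TEST_0_BIT(pass) succeeds only for
 * counters assigned to that pass and the CP skips everything between
 * tu_cond_exec_start()/tu_cond_exec_end() otherwise.
 */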
static void
emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
                      struct tu_query_pool *pool,
                      uint32_t query)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint32_t last_pass = ~0;

   /* Querying perf counters happens in these steps:
    *
    *    0) There's a scratch reg to set a pass index for perf counters query.
    *       Prepare cmd streams to set each pass index to the reg at device
    *       creation time. See tu_CreateDevice in tu_device.c
    *    1) Emit command streams to read all requested perf counters at all
    *       passes in begin/end query with CP_REG_TEST/CP_COND_REG_EXEC, which
    *       reads the scratch reg where the pass index is set.
    *       See emit_perfcntrs_pass_start.
    *    2) Pick the cs that sets the proper pass index in the reg and
    *       prepend it to the command buffer at each submit time.
    *       See tu_QueueSubmit in tu_drm.c
    *    3) If the pass bit set in the reg matches, the CP executes the
    *       command stream below CP_COND_REG_EXEC.
    */

   tu_cs_emit_wfi(cs);

   for (uint32_t i = 0; i < pool->counter_index_count; i++) {
      struct tu_perf_query_data *data = &pool->perf_query_data[i];

      if (last_pass != data->pass) {
         last_pass = data->pass;

         if (data->pass != 0)
            tu_cond_exec_end(cs);
         emit_perfcntrs_pass_start(cs, data->pass);
      }

      const struct fd_perfcntr_counter *counter =
         &pool->perf_group[data->gid].counters[data->cntr_reg];
      const struct fd_perfcntr_countable *countable =
         &pool->perf_group[data->gid].countables[data->cid];

      tu_cs_emit_pkt4(cs, counter->select_reg, 1);
      tu_cs_emit(cs, countable->selector);
   }
   tu_cond_exec_end(cs);

   last_pass = ~0;
   tu_cs_emit_wfi(cs);

   for (uint32_t i = 0; i < pool->counter_index_count; i++) {
      struct tu_perf_query_data *data = &pool->perf_query_data[i];

      if (last_pass != data->pass) {
         last_pass = data->pass;

         if (data->pass != 0)
            tu_cond_exec_end(cs);
         emit_perfcntrs_pass_start(cs, data->pass);
      }

      const struct fd_perfcntr_counter *counter =
         &pool->perf_group[data->gid].counters[data->cntr_reg];

      uint64_t begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);

      tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
      tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
                     CP_REG_TO_MEM_0_64B);
      tu_cs_emit_qw(cs, begin_iova);
   }
   tu_cond_exec_end(cs);
}
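/* Note the two loops above: the first programs each countable's selector
 * into its counter's select register, and the second (after a WFI) snapshots
 * the per-counter begin values with CP_REG_TO_MEM. emit_end_perf_query
 * mirrors the second loop and accumulates end - begin into the result slots.
 */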
static void
emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
                     struct tu_query_pool *pool,
                     uint32_t query,
                     uint32_t stream_id)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint64_t begin_iova = primitive_query_iova(pool, query, begin[0], 0);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = begin_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
                 VkQueryPool queryPool,
                 uint32_t query,
                 VkQueryControlFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* In freedreno, there is no implementation difference between
       * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
       * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
       */
      emit_begin_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_begin_xfb_query(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
      emit_begin_perf_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      emit_begin_stat_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t query,
                           VkQueryControlFlags flags,
                           uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_begin_xfb_query(cmdbuf, pool, query, index);
      break;
   default:
      assert(!"Invalid query type");
   }
}

static void
emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   /* Ending an occlusion query happens in a few steps:
    *    1) Set the slot->end to UINT64_MAX.
    *    2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
    *       write the current sample count value into slot->end.
    *    3) Since (2) is asynchronous, wait until slot->end is not equal to
    *       UINT64_MAX before continuing via CP_WAIT_REG_MEM.
    *    4) Accumulate the results of the query (slot->end - slot->begin)
    *       into slot->result.
    *    5) If vkCmdEndQuery is *not* called from within the scope of a
    *       render pass, set the slot's available bit since the query is now
    *       done.
    *    6) If vkCmdEndQuery *is* called from within the scope of a render
    *       pass, we cannot mark as available yet since the commands in
    *       draw_cs are not run until vkCmdEndRenderPass.
    */
   const struct tu_render_pass *pass = cmdbuf->state.pass;
   struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t available_iova = query_available_iova(pool, query);
   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
   uint64_t end_iova = occlusion_query_iova(pool, query, end);
   uint64_t result_iova = query_result_iova(pool, query, uint64_t, 0);
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, 0xffffffffffffffffull);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR(.qword = end_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);

   tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
                  CP_WAIT_REG_MEM_0_POLL_MEMORY);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

   /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, begin_iova);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (pass)
      /* Technically, queries should be tracked per-subpass, but here we
       * track at the render pass level to simplify the code a bit. This is
       * safe because the only commands that use the available bit are
       * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
       * cannot be invoked from inside a render pass scope.
       */
      cs = &cmdbuf->draw_epilogue_cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}
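/* A summary of the mechanism above: ZPASS_DONE with
 * RB_SAMPLE_COUNT_CONTROL.copy set makes the hardware dump the current
 * sample counter to RB_SAMPLE_COUNT_ADDR, and since draw_cs replays once per
 * tile, the CP_MEM_TO_MEM accumulation sums each tile's end - begin delta
 * into slot->result.
 */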
static void
emit_end_stat_query(struct tu_cmd_buffer *cmdbuf,
                    struct tu_query_pool *pool,
                    uint32_t query)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint64_t end_iova = pipeline_stat_query_iova(pool, query, end);
   uint64_t available_iova = query_available_iova(pool, query);
   uint64_t result_iova;
   uint64_t stat_start_iova;
   uint64_t stat_stop_iova;

   tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS);
   tu6_emit_event_write(cmdbuf, cs, RST_VTX_CNT);
   tu6_emit_event_write(cmdbuf, cs, STAT_EVENT);

   tu_cs_emit_wfi(cs);

   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
                  CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, end_iova);

   for (int i = 0; i < STAT_COUNT; i++) {
      result_iova = query_result_iova(pool, query, uint64_t, i);
      stat_start_iova = pipeline_stat_query_iova(pool, query, begin[i]);
      stat_stop_iova = pipeline_stat_query_iova(pool, query, end[i]);

      tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
      tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
                     CP_MEM_TO_MEM_0_DOUBLE |
                     CP_MEM_TO_MEM_0_NEG_C);

      tu_cs_emit_qw(cs, result_iova);
      tu_cs_emit_qw(cs, result_iova);
      tu_cs_emit_qw(cs, stat_stop_iova);
      tu_cs_emit_qw(cs, stat_start_iova);
   }

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (cmdbuf->state.pass)
      cs = &cmdbuf->draw_epilogue_cs;

   /* Set the availability to 1 */
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

static void
emit_end_perf_query(struct tu_cmd_buffer *cmdbuf,
                    struct tu_query_pool *pool,
                    uint32_t query)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint64_t available_iova = query_available_iova(pool, query);
   uint64_t end_iova;
   uint64_t begin_iova;
   uint64_t result_iova;
   uint32_t last_pass = ~0;

   for (uint32_t i = 0; i < pool->counter_index_count; i++) {
      struct tu_perf_query_data *data = &pool->perf_query_data[i];

      if (last_pass != data->pass) {
         last_pass = data->pass;

         if (data->pass != 0)
            tu_cond_exec_end(cs);
         emit_perfcntrs_pass_start(cs, data->pass);
      }

      const struct fd_perfcntr_counter *counter =
         &pool->perf_group[data->gid].counters[data->cntr_reg];

      end_iova = perf_query_iova(pool, 0, end, data->app_idx);

      tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
      tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
                     CP_REG_TO_MEM_0_64B);
      tu_cs_emit_qw(cs, end_iova);
   }
   tu_cond_exec_end(cs);

   last_pass = ~0;
   tu_cs_emit_wfi(cs);

   for (uint32_t i = 0; i < pool->counter_index_count; i++) {
      struct tu_perf_query_data *data = &pool->perf_query_data[i];

      if (last_pass != data->pass) {
         last_pass = data->pass;

         if (data->pass != 0)
            tu_cond_exec_end(cs);
         emit_perfcntrs_pass_start(cs, data->pass);
      }

      result_iova = query_result_iova(pool, 0, struct perfcntr_query_slot,
                                      data->app_idx);
      begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);
      end_iova = perf_query_iova(pool, 0, end, data->app_idx);

      /* result += end - begin */
      tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
      tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
                     CP_MEM_TO_MEM_0_DOUBLE |
                     CP_MEM_TO_MEM_0_NEG_C);

      tu_cs_emit_qw(cs, result_iova);
      tu_cs_emit_qw(cs, result_iova);
      tu_cs_emit_qw(cs, end_iova);
      tu_cs_emit_qw(cs, begin_iova);
   }
   tu_cond_exec_end(cs);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (cmdbuf->state.pass)
      cs = &cmdbuf->draw_epilogue_cs;

   /* Set the availability to 1 */
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}
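/* For the transform feedback path below, WRITE_PRIMITIVE_COUNTS dumps the
 * per-stream (written, generated) counter pairs to the address programmed in
 * VPC_SO_STREAM_COUNTS; all four streams land in one block, which is why
 * primitive_query_slot keeps begin[4]/end[4] and primitive_query_iova()
 * takes an extra value index.
 */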
static void
emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
                   struct tu_query_pool *pool,
                   uint32_t query,
                   uint32_t stream_id)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t end_iova = primitive_query_iova(pool, query, end[0], 0);
   uint64_t result_written_iova = query_result_iova(pool, query, uint64_t, 0);
   uint64_t result_generated_iova = query_result_iova(pool, query, uint64_t, 1);
   uint64_t begin_written_iova = primitive_query_iova(pool, query, begin[stream_id], 0);
   uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin[stream_id], 1);
   uint64_t end_written_iova = primitive_query_iova(pool, query, end[stream_id], 0);
   uint64_t end_generated_iova = primitive_query_iova(pool, query, end[stream_id], 1);
   uint64_t available_iova = query_available_iova(pool, query);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = end_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);

   tu_cs_emit_wfi(cs);
   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);

   /* Set the count of written primitives */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, end_written_iova);
   tu_cs_emit_qw(cs, begin_written_iova);

   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);

   /* Set the count of generated primitives */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, end_generated_iova);
   tu_cs_emit_qw(cs, begin_generated_iova);

   /* Set the availability to 1 */
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

/* Implement this bit of spec text from section 17.2 "Query Operation":
 *
 *    If queries are used while executing a render pass instance that has
 *    multiview enabled, the query uses N consecutive query indices in the
 *    query pool (starting at query) where N is the number of bits set in the
 *    view mask in the subpass the query is used in. How the numerical
 *    results of the query are distributed among the queries is
 *    implementation-dependent. For example, some implementations may write
 *    each view’s results to a distinct query, while other implementations
 *    may write the total result to the first query and write zero to the
 *    other queries. However, the sum of the results in all the queries must
 *    accurately reflect the total result of the query summed over all views.
 *    Applications can sum the results from all the queries to compute the
 *    total result.
 *
 * Since we execute all views at once, we write zero to the other queries.
 * Furthermore, because queries must be reset before use, and we set the
 * result to 0 in vkCmdResetQueryPool(), we just need to mark it as
 * available.
 */
static void
handle_multiview_queries(struct tu_cmd_buffer *cmd,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   if (!cmd->state.pass || !cmd->state.subpass->multiview_mask)
      return;

   unsigned views = util_bitcount(cmd->state.subpass->multiview_mask);
   struct tu_cs *cs = &cmd->draw_epilogue_cs;

   for (uint32_t i = 1; i < views; i++) {
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query + i));
      tu_cs_emit_qw(cs, 0x1);
   }
}
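/* Concrete example (illustrative): with a view mask of 0b101, N == 2, so a
 * query uses slots `query` and `query + 1`. The full result lands in `query`
 * via the normal end-query path, while the loop above only marks `query + 1`
 * available; its result stays at the zero written by the reset, which
 * satisfies the summing rule quoted above.
 */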
VKAPI_ATTR void VKAPI_CALL
tu_CmdEndQuery(VkCommandBuffer commandBuffer,
               VkQueryPool queryPool,
               uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_end_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_end_xfb_query(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
      emit_end_perf_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      emit_end_stat_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   handle_multiview_queries(cmdbuf, pool, query);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
                         VkQueryPool queryPool,
                         uint32_t query,
                         uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      assert(index <= 4);
      emit_end_xfb_query(cmdbuf, pool, query, index);
      break;
   default:
      assert(!"Invalid query type");
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
                     VkPipelineStageFlagBits pipelineStage,
                     VkQueryPool queryPool,
                     uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   /* Inside a render pass, just write the timestamp multiple times so that
    * the user gets the last one if we use GMEM. There isn't really much
    * better we can do, and this seems to be what the blob does too.
    */
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
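   /* The timestamp below is read from the CP always-on counter, which ticks
    * at a fixed rate independent of GPU frequency scaling (presumably the
    * 19.2 MHz always-on clock on a6xx); applications convert ticks to
    * nanoseconds with VkPhysicalDeviceLimits::timestampPeriod.
    */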
   /* Stages that will already have been executed by the time the CP executes
    * the REG_TO_MEM. DrawIndirect parameters are read by the CP, so the draw
    * indirect stage counts as top-of-pipe too.
    */
   VkPipelineStageFlags top_of_pipe_flags =
      VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT |
      VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;

   if (pipelineStage & ~top_of_pipe_flags) {
      /* Execute a WFI so that all commands complete. Note that CP_REG_TO_MEM
       * does CP_WAIT_FOR_ME internally, which will wait for the WFI to
       * complete.
       *
       * Stalling the CP like this is really unfortunate, but I don't think
       * there's a better solution that allows all 48 bits of precision
       * because CP_EVENT_WRITE doesn't support 64-bit timestamps.
       */
      tu_cs_emit_wfi(cs);
   }

   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER_LO) |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, query_result_iova(pool, query, uint64_t, 0));

   /* Only flag availability once the entire renderpass is done, similar to
    * the begin/end path.
    */
   cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, query_available_iova(pool, query));
   tu_cs_emit_qw(cs, 0x1);

   /* From the spec for vkCmdWriteTimestamp:
    *
    *    If vkCmdWriteTimestamp is called while executing a render pass
    *    instance that has multiview enabled, the timestamp uses N
    *    consecutive query indices in the query pool (starting at query)
    *    where N is the number of bits set in the view mask of the subpass
    *    the command is executed in. The resulting query values are
    *    determined by an implementation-dependent choice of one of the
    *    following behaviors:
    *
    *    -  The first query is a timestamp value and (if more than one bit
    *       is set in the view mask) zero is written to the remaining
    *       queries. If two timestamps are written in the same subpass, the
    *       sum of the execution time of all views between those commands is
    *       the difference between the first query written by each command.
    *
    *    -  All N queries are timestamp values. If two timestamps are
    *       written in the same subpass, the sum of the execution time of
    *       all views between those commands is the sum of the difference
    *       between corresponding queries written by each command. The
    *       difference between corresponding queries may be the execution
    *       time of a single view.
    *
    * We execute all views in the same draw call, so we implement the first
    * option, the same as regular queries.
    */
   handle_multiview_queries(cmd, pool, query);
}
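/* The two entry points below expose the counters flattened across groups:
 * the index an application later passes in
 * VkQueryPoolPerformanceCreateInfoKHR::pCounterIndices is its position in
 * this enumeration order, which perfcntr_index() decodes back into a
 * (group, countable) pair.
 */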
VKAPI_ATTR VkResult VKAPI_CALL
tu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
      VkPhysicalDevice physicalDevice,
      uint32_t queueFamilyIndex,
      uint32_t* pCounterCount,
      VkPerformanceCounterKHR* pCounters,
      VkPerformanceCounterDescriptionKHR* pCounterDescriptions)
{
   TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);

   uint32_t desc_count = *pCounterCount;
   uint32_t group_count;
   const struct fd_perfcntr_group *group =
      fd_perfcntrs(phydev->gpu_id, &group_count);

   VK_OUTARRAY_MAKE(out, pCounters, pCounterCount);
   VK_OUTARRAY_MAKE(out_desc, pCounterDescriptions, &desc_count);

   for (int i = 0; i < group_count; i++) {
      for (int j = 0; j < group[i].num_countables; j++) {

         vk_outarray_append(&out, counter) {
            counter->scope = VK_QUERY_SCOPE_COMMAND_BUFFER_KHR;
            counter->unit =
               fd_perfcntr_type_to_vk_unit[group[i].countables[j].query_type];
            counter->storage =
               fd_perfcntr_type_to_vk_storage[group[i].countables[j].query_type];

            unsigned char sha1_result[20];
            _mesa_sha1_compute(group[i].countables[j].name,
                               strlen(group[i].countables[j].name),
                               sha1_result);
            memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
         }

         vk_outarray_append(&out_desc, desc) {
            desc->flags = 0;

            snprintf(desc->name, sizeof(desc->name),
                     "%s", group[i].countables[j].name);
            snprintf(desc->category, sizeof(desc->category), "%s", group[i].name);
            snprintf(desc->description, sizeof(desc->description),
                     "%s: %s performance counter",
                     group[i].name, group[i].countables[j].name);
         }
      }
   }

   return vk_outarray_status(&out);
}

VKAPI_ATTR void VKAPI_CALL
tu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
      VkPhysicalDevice physicalDevice,
      const VkQueryPoolPerformanceCreateInfoKHR* pPerformanceQueryCreateInfo,
      uint32_t* pNumPasses)
{
   TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);
   uint32_t group_count = 0;
   uint32_t gid = 0, cid = 0, n_passes;
   const struct fd_perfcntr_group *group =
      fd_perfcntrs(phydev->gpu_id, &group_count);

   uint32_t counters_requested[group_count];
   memset(counters_requested, 0x0, sizeof(counters_requested));
   *pNumPasses = 1;

   for (unsigned i = 0; i < pPerformanceQueryCreateInfo->counterIndexCount; i++) {
      perfcntr_index(group, group_count,
                     pPerformanceQueryCreateInfo->pCounterIndices[i],
                     &gid, &cid);

      counters_requested[gid]++;
   }

   for (uint32_t i = 0; i < group_count; i++) {
      n_passes = DIV_ROUND_UP(counters_requested[i], group[i].num_counters);
      *pNumPasses = MAX2(*pNumPasses, n_passes);
   }
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_AcquireProfilingLockKHR(VkDevice device,
                           const VkAcquireProfilingLockInfoKHR* pInfo)
{
   /* TODO: there is probably something to do for kgsl. */
   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
tu_ReleaseProfilingLockKHR(VkDevice device)
{
   /* TODO: there is probably something to do for kgsl. */
   return;
}