/* Path: src/intel/vulkan/genX_query.c (branch 21.2-virgl) */
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

/* We reserve:
 * - GPR 14 for perf queries
 * - GPR 15 for conditional rendering
 */
#define MI_BUILDER_NUM_ALLOC_GPRS 14
#define MI_BUILDER_CAN_WRITE_BATCH GFX_VER >= 8
#define __gen_get_batch_dwords anv_batch_emit_dwords
#define __gen_address_offset anv_address_add
#define __gen_get_batch_address(b, a) anv_batch_address(b, a)
#include "common/mi_builder.h"
#include "perf/intel_perf.h"
#include "perf/intel_perf_mdapi.h"
#include "perf/intel_perf_regs.h"

#include "vk_util.h"

/* Returns the address of query slot `query` inside the pool's backing BO
 * (slots are laid out back to back, each `pool->stride` bytes).
 */
static struct anv_address
anv_query_address(struct anv_query_pool *pool, uint32_t query)
{
   return (struct anv_address) {
      .bo = pool->bo,
      .offset = query * pool->stride,
   };
}

VkResult genX(CreateQueryPool)(
    VkDevice                                    _device,
    const VkQueryPoolCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkQueryPool*                                pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice = device->physical;
#if GFX_VER >= 8
   const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
   struct intel_perf_counter_pass *counter_pass;
   struct intel_perf_query_info **pass_query;
   uint32_t n_passes = 0;
#endif
   uint32_t data_offset = 0;
   VK_MULTIALLOC(ma);
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together. For most query types have the first 64-bit value is
    * the "available" bit which is 0 when the query is unavailable and 1 when
    * it is available. The 64-bit values that follow are determined by the
    * type of query.
    *
    * For performance queries, we have a requirement to align OA reports at
    * 64bytes so we put those first and have the "available" bit behind
    * together with some other counters.
    */
   uint32_t uint64s_per_slot = 0;

   VK_MULTIALLOC_DECL(&ma, struct anv_query_pool, pool, 1);

   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot = 1 + 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot = 1 + 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries have a min and max for every statistic */
      uint64s_per_slot = 1 + 2 * util_bitcount(pipeline_statistics);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      /* Transform feedback queries are 4 values, begin/end for
       * written/available.
       */
      uint64s_per_slot = 1 + 4;
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      const struct intel_perf_query_field_layout *layout =
         &pdevice->perf->query_layout;

      uint64s_per_slot = 2; /* availability + marker */
      /* Align to the requirement of the layout */
      uint64s_per_slot = align_u32(uint64s_per_slot,
                                   DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
      data_offset = uint64s_per_slot * sizeof(uint64_t);
      /* Add the query data for begin & end commands */
      uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t));
      break;
   }
#if GFX_VER >= 8
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      const struct intel_perf_query_field_layout *layout =
         &pdevice->perf->query_layout;

      perf_query_info = vk_find_struct_const(pCreateInfo->pNext,
                                             QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
      n_passes = intel_perf_get_n_passes(pdevice->perf,
                                         perf_query_info->pCounterIndices,
                                         perf_query_info->counterIndexCount,
                                         NULL);
      /* counter_pass & pass_query are co-allocated with the pool object via
       * the multialloc below.
       */
      vk_multialloc_add(&ma, &counter_pass, struct intel_perf_counter_pass,
                        perf_query_info->counterIndexCount);
      vk_multialloc_add(&ma, &pass_query, struct intel_perf_query_info *,
                        n_passes);
      uint64s_per_slot = 4 /* availability + small batch */;
      /* Align to the requirement of the layout */
      uint64s_per_slot = align_u32(uint64s_per_slot,
                                   DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
      data_offset = uint64s_per_slot * sizeof(uint64_t);
      /* Add the query data for begin & end commands */
      uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t));
      /* Multiply by the number of passes */
      uint64s_per_slot *= n_passes;
      break;
   }
#endif
   default:
      assert(!"Invalid query type");
   }

   if (!vk_object_multialloc(&device->vk, &ma, pAllocator,
                             VK_OBJECT_TYPE_QUERY_POOL))
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) {
      pool->data_offset = data_offset;
      pool->snapshot_size = (pool->stride - data_offset) / 2;
   }
#if GFX_VER >= 8
   else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      pool->pass_size = pool->stride / n_passes;
      pool->data_offset = data_offset;
      pool->snapshot_size = (pool->pass_size - data_offset) / 2;
      pool->n_counters = perf_query_info->counterIndexCount;
      pool->counter_pass = counter_pass;
      intel_perf_get_counters_passes(pdevice->perf,
                                     perf_query_info->pCounterIndices,
                                     perf_query_info->counterIndexCount,
                                     pool->counter_pass);
      pool->n_passes = n_passes;
      pool->pass_query = pass_query;
      intel_perf_get_n_passes(pdevice->perf,
                              perf_query_info->pCounterIndices,
                              perf_query_info->counterIndexCount,
                              pool->pass_query);
   }
#endif

   uint64_t size = pool->slots * (uint64_t)pool->stride;
   result = anv_device_alloc_bo(device, "query-pool", size,
                                ANV_BO_ALLOC_MAPPED |
                                ANV_BO_ALLOC_SNOOPED,
                                0 /* explicit_address */,
                                &pool->bo);
   if (result != VK_SUCCESS)
      goto fail;

#if GFX_VER >= 8
   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      /* Pre-bake, for each pass, the small batch that loads the pass offset
       * into ANV_PERF_QUERY_OFFSET_REG (used by the self-modified commands
       * emitted at CmdBeginQuery/CmdEndQuery time).
       */
      for (uint32_t p = 0; p < pool->n_passes; p++) {
         struct mi_builder b;
         struct anv_batch batch = {
            .start = pool->bo->map + khr_perf_query_preamble_offset(pool, p),
            .end = pool->bo->map + khr_perf_query_preamble_offset(pool, p) + pool->data_offset,
         };
         batch.next = batch.start;

         mi_builder_init(&b, &device->info, &batch);
         mi_store(&b, mi_reg64(ANV_PERF_QUERY_OFFSET_REG),
                      mi_imm(p * (uint64_t)pool->pass_size));
         anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
      }
   }
#endif

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->vk.alloc, pAllocator, pool);

   return result;
}

void genX(DestroyQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 _pool,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_device_release_bo(device, pool->bo);
   vk_object_free(&device->vk, pAllocator, pool);
}

#if GFX_VER >= 8
/**
 * VK_KHR_performance_query layout :
 *
 * --------------------------------------------
 * | availability (8b)             |   |      |
 * |-------------------------------|   |      |
 * | Small batch loading           |   |      |
 * | ANV_PERF_QUERY_OFFSET_REG     |   |      |
 * | (24b)                         | Pass 0   |
 * |-------------------------------|   |      |
 * | some padding (see             |   |      |
 * | query_field_layout:alignment) |   |      |
 * |-------------------------------|   |      |
 * | query data                    |   |      |
 * | (2 * query_field_layout:size) |   |      |
 * |-------------------------------|-- Query 0
 * | availability (8b)             |   |      |
 * |-------------------------------|   |      |
 * | Small batch loading           |   |      |
 * | ANV_PERF_QUERY_OFFSET_REG     |   |      |
 * | (24b)                         | Pass 1   |
 * |-------------------------------|   |      |
 * | some padding (see             |   |      |
 * | query_field_layout:alignment) |   |      |
 * |-------------------------------|   |      |
 * | query data                    |   |      |
 * | (2 * query_field_layout:size) |   |      |
 * |-------------------------------|-----------
 * | availability (8b)             |   |      |
 * |-------------------------------|   |      |
 * | Small batch
loading                       |   |      |
 * | ANV_PERF_QUERY_OFFSET_REG     |   |      |
 * | (24b)                         | Pass 0   |
 * |-------------------------------|   |      |
 * | some padding (see             |   |      |
 * | query_field_layout:alignment) |   |      |
 * |-------------------------------|   |      |
 * | query data                    |   |      |
 * | (2 * query_field_layout:size) |   |      |
 * |-------------------------------|-- Query 1
 * | ...                           |   |      |
 * --------------------------------------------
 */

/* Byte offset of the availability qword for (query, pass) in the pool BO. */
static uint64_t
khr_perf_query_availability_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass)
{
   return query * (uint64_t)pool->stride + pass * (uint64_t)pool->pass_size;
}

/* Byte offset of the begin (end == false) or end (end == true) counter
 * snapshot for (query, pass) in the pool BO.
 */
static uint64_t
khr_perf_query_data_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end)
{
   return query * (uint64_t)pool->stride + pass * (uint64_t)pool->pass_size +
          pool->data_offset + (end ? pool->snapshot_size : 0);
}

static struct anv_address
khr_perf_query_availability_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass)
{
   return anv_address_add(
      (struct anv_address) { .bo = pool->bo, },
      khr_perf_query_availability_offset(pool, query, pass));
}

static struct anv_address
khr_perf_query_data_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end)
{
   return anv_address_add(
      (struct anv_address) { .bo = pool->bo, },
      khr_perf_query_data_offset(pool, query, pass, end));
}

/* Lazily allocates the cmd_buffer->self_mod_locations array used to patch
 * perf query command addresses. Returns false (and flags the batch with
 * VK_ERROR_OUT_OF_HOST_MEMORY) on allocation failure.
 */
static bool
khr_perf_query_ensure_relocs(struct anv_cmd_buffer *cmd_buffer)
{
   if (anv_batch_has_error(&cmd_buffer->batch))
      return false;

   if (cmd_buffer->self_mod_locations)
      return true;

   struct anv_device *device = cmd_buffer->device;
   const struct anv_physical_device *pdevice = device->physical;

   cmd_buffer->self_mod_locations =
      vk_alloc(&cmd_buffer->pool->alloc,
               pdevice->n_perf_query_commands * sizeof(*cmd_buffer->self_mod_locations), 8,
               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (!cmd_buffer->self_mod_locations) {
      anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return false;
   }

   return true;
}
#endif

/**
 * VK_INTEL_performance_query layout :
 *
 * ---------------------------------
 * | availability (8b)             |
 * |-------------------------------|
 * | marker (8b)                   |
 * |-------------------------------|
 * | some padding (see             |
 * | query_field_layout:alignment) |
 * |-------------------------------|
 * | query data                    |
 * | (2 * query_field_layout:size) |
 * ---------------------------------
 */

/* Marker qword lives right after the 8-byte availability qword (see the
 * VK_INTEL_performance_query layout above).
 */
static uint32_t
intel_perf_marker_offset(void)
{
   return 8;
}

/* Byte offset of the begin/end counter snapshot inside an INTEL perf slot. */
static uint32_t
intel_perf_query_data_offset(struct anv_query_pool *pool, bool end)
{
   return pool->data_offset + (end ? pool->snapshot_size : 0);
}

/* Writes one result value at `value_index`, as 64-bit or 32-bit depending on
 * VK_QUERY_RESULT_64_BIT.
 */
static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}

/* CPU pointer to the given query slot in the (mapped, snooped) pool BO. */
static void *
query_slot(struct anv_query_pool *pool, uint32_t query)
{
   return pool->bo->map + query * pool->stride;
}

/* Reads the availability qword(s) for the query from the mapped BO. For KHR
 * perf queries every pass must be available.
 */
static bool
query_is_available(struct anv_query_pool *pool, uint32_t query)
{
#if GFX_VER >= 8
   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      for (uint32_t p = 0; p < pool->n_passes; p++) {
         volatile uint64_t *slot =
            pool->bo->map + khr_perf_query_availability_offset(pool, query, p);
         if (!slot[0])
            return false;
      }
      return true;
   }
#endif

   return *(volatile uint64_t *)query_slot(pool, query);
}

/* Polls availability for up to 2 seconds, checking for device loss along the
 * way; marks the device lost on timeout.
 */
static VkResult
wait_for_available(struct anv_device *device,
                   struct anv_query_pool *pool, uint32_t query)
{
   uint64_t abs_timeout = anv_get_absolute_timeout(2 * NSEC_PER_SEC);

   while (anv_gettime_ns() < abs_timeout) {
      if (query_is_available(pool, query))
         return VK_SUCCESS;
      VkResult status =
anv_device_query_status(device);
      if (status != VK_SUCCESS)
         return status;
   }

   return anv_device_set_lost(device, "query timeout");
}

VkResult genX(GetQueryPoolResults)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    size_t                                      dataSize,
    void*                                       pData,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP ||
          pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
          pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
          pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);

   if (anv_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   void *data_end = pData + dataSize;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      bool available = query_is_available(pool, firstQuery + i);

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = wait_for_available(device, pool, firstQuery + i);
         if (status != VK_SUCCESS) {
            return status;
         }

         available = true;
      }

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       *
       * From VK_KHR_performance_query :
       *
       *    "VK_QUERY_RESULT_PERFORMANCE_QUERY_RECORDED_COUNTERS_BIT_KHR specifies
       *     that the result should contain the number of counters that were recorded
       *     into a query pool of type ename:VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR"
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results) {
            /* From the Vulkan 1.2.132 spec:
             *
             *    "If VK_QUERY_RESULT_PARTIAL_BIT is set,
             *    VK_QUERY_RESULT_WAIT_BIT is not set, and the query’s status
             *    is unavailable, an intermediate result value between zero and
             *    the final result value is written to pData for that query."
             */
            uint64_t result = available ? slot[2] - slot[1] : 0;
            cpu_write_query_result(pData, flags, idx, result);
         }
         idx++;
         break;
      }

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);
            if (write_results) {
               /* Each statistic is stored as a begin/end pair; report the delta. */
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.ver == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);
            }
            idx++;
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
         idx++;
         break;
      }

      case VK_QUERY_TYPE_TIMESTAMP: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[1]);
         idx++;
         break;
      }

#if GFX_VER >= 8
      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
         const struct anv_physical_device *pdevice = device->physical;
         assert((flags & (VK_QUERY_RESULT_WITH_AVAILABILITY_BIT |
                          VK_QUERY_RESULT_PARTIAL_BIT)) == 0);
         /* Accumulate begin/end snapshots for every pass and write the
          * counter values requested at pool creation.
          */
         for (uint32_t p = 0; p < pool->n_passes; p++) {
            const struct intel_perf_query_info *query = pool->pass_query[p];
            struct intel_perf_query_result result;
            intel_perf_query_result_clear(&result);
            intel_perf_query_result_accumulate_fields(&result, query, &device->info,
                                                      pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, false),
                                                      pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, true),
                                                      false /* no_oa_accumulate */);
            anv_perf_write_pass_results(pdevice->perf, pool, p, &result, pData);
         }
         break;
      }
#endif

      case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
         if (!write_results)
            break;
         const void *query_data = query_slot(pool, firstQuery + i);
         const struct intel_perf_query_info *query = &device->physical->perf->queries[0];
         struct intel_perf_query_result result;
         intel_perf_query_result_clear(&result);
         intel_perf_query_result_accumulate_fields(&result, query, &device->info,
                                                   query_data + intel_perf_query_data_offset(pool, false),
                                                   query_data + intel_perf_query_data_offset(pool, true),
                                                   false /* no_oa_accumulate */);
         intel_perf_query_result_write_mdapi(pData, stride,
                                             &device->info,
                                             query, &result);
         const uint64_t *marker = query_data + intel_perf_marker_offset();
         intel_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
         break;
      }

      default:
         unreachable("invalid pool type");
      }

      if (!write_results)
         status = VK_NOT_READY;

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cpu_write_query_result(pData, flags, idx,
available);

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}

/* Emits a PIPE_CONTROL that writes the PS depth count (with a depth stall)
 * to `addr`; used as the occlusion query begin/end snapshot.
 */
static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_address addr)
{
   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WritePSDepthCount;
      pc.DepthStallEnable = true;
      pc.Address = addr;

      /* Extra CS stall needed on GT4 SKL parts. */
      if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}

/* Writes the availability qword via an MI store. */
static void
emit_query_mi_availability(struct mi_builder *b,
                           struct anv_address addr,
                           bool available)
{
   mi_store(b, mi_mem64(addr), mi_imm(available));
}

/* Writes the availability qword via a PIPE_CONTROL post-sync write. */
static void
emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
                           struct anv_address addr,
                           bool available)
{
   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = addr;
      pc.ImmediateData = available;
   }
}

/**
 * Goes through a series of consecutive query indices in the given pool
 * setting all element values to 0 and emitting them as available.
 */
static void
emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
                  struct mi_builder *b, struct anv_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries)
{
   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      /* These queries are written with a PIPE_CONTROL so clear them using the
       * PIPE_CONTROL as well so we don't have to synchronize between 2 types
       * of operations.
       */
      assert((pool->stride % 8) == 0);
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);

         for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
            emit_query_pc_availability(cmd_buffer,
                                       anv_address_add(slot_addr, qword * 8),
                                       false);
         }
         emit_query_pc_availability(cmd_buffer, slot_addr, true);
      }
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);
         mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
         emit_query_mi_availability(b, slot_addr, true);
      }
      break;

#if GFX_VER >= 8
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      for (uint32_t i = 0; i < num_queries; i++) {
         for (uint32_t p = 0; p < pool->n_passes; p++) {
            mi_memset(b, khr_perf_query_data_address(pool, first_index + i, p, false),
                      0, 2 * pool->snapshot_size);
            emit_query_mi_availability(b,
                                       khr_perf_query_availability_address(pool, first_index + i, p),
                                       true);
         }
      }
      break;
   }
#endif

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);
         mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
         emit_query_mi_availability(b, slot_addr, true);
      }
      break;

   default:
      unreachable("Unsupported query type");
   }
}

void genX(CmdResetQueryPool)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      /* PC-written queries are reset with a PC write to stay ordered with
       * the query writes themselves.
       */
      for (uint32_t i = 0; i < queryCount; i++) {
         emit_query_pc_availability(cmd_buffer,
                                    anv_query_address(pool, firstQuery + i),
                                    false);
      }
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
      struct mi_builder b;
      mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++)
         emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
      break;
   }

#if GFX_VER >= 8
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      struct mi_builder b;
      mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++) {
         for (uint32_t p = 0; p < pool->n_passes; p++) {
            emit_query_mi_availability(
               &b,
               khr_perf_query_availability_address(pool, firstQuery + i, p),
               false);
         }
      }
      break;
   }
#endif

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      struct mi_builder b;
      mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++)
         emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
      break;
   }

   default:
      unreachable("Unsupported query type");
   }
}

/* Host-side reset (VK_EXT_host_query_reset / Vulkan 1.2): clear availability
 * directly through the BO's CPU mapping.
 */
void genX(ResetQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
#if GFX_VER >= 8
         for (uint32_t p = 0; p < pool->n_passes; p++) {
            uint64_t *pass_slot = pool->bo->map +
               khr_perf_query_availability_offset(pool, firstQuery + i, p);
            *pass_slot = 0;
         }
#endif
      } else {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         *slot = 0;
      }
   }
}

/* Maps each bit of VkQueryPipelineStatisticFlagBits (in bit order) to the
 * HW counter register that backs it.
 */
static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};

static
void
emit_pipeline_stat(struct mi_builder *b, uint32_t stat,
                   struct anv_address addr)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   mi_store(b, mi_mem64(addr), mi_reg64(vk_pipeline_stat_to_reg[stat]));
}

/* Snapshots the XFB primitives-written and storage-needed counters for the
 * given stream at addr and addr+16 respectively.
 */
static void
emit_xfb_query(struct mi_builder *b, uint32_t stream,
               struct anv_address addr)
{
   assert(stream < MAX_XFB_STREAMS);

   mi_store(b, mi_mem64(anv_address_add(addr, 0)),
                mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8));
   mi_store(b, mi_mem64(anv_address_add(addr, 16)),
                mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8));
}

/* Emits the INTEL perf query snapshot commands (MI_REPORT_PERF_COUNT + SRM
 * of the layout's registers). Fields are walked in reverse order at begin
 * and forward order at end (see the `end ? f : ...` index below).
 */
static void
emit_perf_intel_query(struct anv_cmd_buffer *cmd_buffer,
                      struct anv_query_pool *pool,
                      struct mi_builder *b,
                      struct anv_address query_addr,
                      bool end)
{
   const struct intel_perf_query_field_layout *layout =
      &cmd_buffer->device->physical->perf->query_layout;
   struct anv_address data_addr =
      anv_address_add(query_addr, intel_perf_query_data_offset(pool, end));

   for (uint32_t f = 0; f < layout->n_fields; f++) {
      const struct intel_perf_query_field *field =
         &layout->fields[end ? f : (layout->n_fields - 1 - f)];

      switch (field->type) {
      case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
         anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
            rpc.MemoryAddress = anv_address_add(data_addr, field->location);
         }
         break;

      case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
      case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
      case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
      case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: {
         struct anv_address addr = anv_address_add(data_addr, field->location);
         struct mi_value src = field->size == 8 ?
            mi_reg64(field->mmio_offset) :
            mi_reg32(field->mmio_offset);
         struct mi_value dst = field->size == 8 ?
            mi_mem64(addr) : mi_mem32(addr);
         mi_store(b, dst, src);
         break;
      }

      default:
         unreachable("Invalid query field");
         break;
      }
   }
}

void genX(CmdBeginQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
   genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
}

void genX(CmdBeginQueryIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags,
    uint32_t                                    index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
      break;

#if GFX_VER >= 8
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      if (!khr_perf_query_ensure_relocs(cmd_buffer))
         return;

      const struct anv_physical_device *pdevice = cmd_buffer->device->physical;
      const struct intel_perf_query_field_layout *layout = &pdevice->perf->query_layout;

      /* First pass: compute, on the command streamer, the target address of
       * each snapshot write (base + per-pass offset held in
       * ANV_PERF_QUERY_OFFSET_REG) and record the batch locations to patch.
       */
      uint32_t reloc_idx = 0;
      for (uint32_t end = 0; end < 2; end++) {
         for (uint32_t r = 0; r < layout->n_fields; r++) {
            const struct intel_perf_query_field *field =
               &layout->fields[end ? r : (layout->n_fields - 1 - r)];
            struct mi_value reg_addr =
               mi_iadd(
                  &b,
                  mi_imm(intel_canonical_address(pool->bo->offset +
                                                 khr_perf_query_data_offset(pool, query, 0, end) +
                                                 field->location)),
                  mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
            cmd_buffer->self_mod_locations[reloc_idx++] = mi_store_address(&b, reg_addr);

            if (field->type != INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC &&
                field->size == 8) {
               /* 64-bit SRM fields are written as two 32-bit stores; patch
                * the second store's address too.
                */
               reg_addr =
                  mi_iadd(
                     &b,
                     mi_imm(intel_canonical_address(pool->bo->offset +
                                                    khr_perf_query_data_offset(pool, query, 0, end) +
                                                    field->location + 4)),
                     mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
               cmd_buffer->self_mod_locations[reloc_idx++] = mi_store_address(&b, reg_addr);
            }
         }
      }

      struct mi_value availability_write_offset =
         mi_iadd(
            &b,
            mi_imm(
               intel_canonical_address(
                  pool->bo->offset +
                  khr_perf_query_availability_offset(pool, query, 0 /* pass */))),
            mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
      cmd_buffer->self_mod_locations[reloc_idx++] =
         mi_store_address(&b, availability_write_offset);

      assert(reloc_idx == pdevice->n_perf_query_commands);

      /* Make sure the self-modified addresses land before the commands that
       * consume them execute.
       */
      mi_self_mod_barrier(&b);

      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      cmd_buffer->perf_query_pool = pool;

      /* Second pass: emit the actual snapshot commands with placeholder
       * addresses, resolved against the tokens recorded above.
       */
      cmd_buffer->perf_reloc_idx = 0;
      for (uint32_t r = 0; r < layout->n_fields; r++) {
         const struct intel_perf_query_field *field =
            &layout->fields[layout->n_fields - 1 - r];
         void *dws;

         switch (field->type) {
         case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
            dws = anv_batch_emitn(&cmd_buffer->batch,
                                  GENX(MI_REPORT_PERF_COUNT_length),
                                  GENX(MI_REPORT_PERF_COUNT),
                                  .MemoryAddress = query_addr /* Will be overwritten */);
            _mi_resolve_address_token(&b,
                                      cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                      dws +
                                      GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8);
            break;

         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
            dws =
               anv_batch_emitn(&cmd_buffer->batch,
                               GENX(MI_STORE_REGISTER_MEM_length),
                               GENX(MI_STORE_REGISTER_MEM),
                               .RegisterAddress = field->mmio_offset,
                               .MemoryAddress = query_addr /* Will be overwritten */ );
            _mi_resolve_address_token(&b,
                                      cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                      dws +
                                      GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
            if (field->size == 8) {
               dws =
                  anv_batch_emitn(&cmd_buffer->batch,
                                  GENX(MI_STORE_REGISTER_MEM_length),
                                  GENX(MI_STORE_REGISTER_MEM),
                                  .RegisterAddress = field->mmio_offset + 4,
                                  .MemoryAddress = query_addr /* Will be overwritten */ );
               _mi_resolve_address_token(&b,
                                         cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                         dws +
                                         GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
            }
            break;

         default:
            unreachable("Invalid query field");
            break;
         }
      }
      break;
   }
#endif

   case
VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {1045anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {1046pc.CommandStreamerStallEnable = true;1047pc.StallAtPixelScoreboard = true;1048}1049emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, false);1050break;1051}10521053default:1054unreachable("");1055}1056}10571058void genX(CmdEndQuery)(1059VkCommandBuffer commandBuffer,1060VkQueryPool queryPool,1061uint32_t query)1062{1063genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);1064}10651066void genX(CmdEndQueryIndexedEXT)(1067VkCommandBuffer commandBuffer,1068VkQueryPool queryPool,1069uint32_t query,1070uint32_t index)1071{1072ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);1073ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);1074struct anv_address query_addr = anv_query_address(pool, query);10751076struct mi_builder b;1077mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);10781079switch (pool->type) {1080case VK_QUERY_TYPE_OCCLUSION:1081emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));1082emit_query_pc_availability(cmd_buffer, query_addr, true);1083break;10841085case VK_QUERY_TYPE_PIPELINE_STATISTICS: {1086/* TODO: This might only be necessary for certain stats */1087anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {1088pc.CommandStreamerStallEnable = true;1089pc.StallAtPixelScoreboard = true;1090}10911092uint32_t statistics = pool->pipeline_statistics;1093uint32_t offset = 16;1094while (statistics) {1095uint32_t stat = u_bit_scan(&statistics);1096emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));1097offset += 16;1098}10991100emit_query_mi_availability(&b, query_addr, true);1101break;1102}11031104case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:1105anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {1106pc.CommandStreamerStallEnable = true;1107pc.StallAtPixelScoreboard = true;1108}11091110emit_xfb_query(&b, index, anv_address_add(query_addr, 
                                                16));
      emit_query_mi_availability(&b, query_addr, true);
      break;

#if GFX_VER >= 8
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      cmd_buffer->perf_query_pool = pool;

      if (!khr_perf_query_ensure_relocs(cmd_buffer))
         return;

      const struct anv_physical_device *pdevice = cmd_buffer->device->physical;
      const struct intel_perf_query_field_layout *layout = &pdevice->perf->query_layout;

      /* Emit the end-snapshot commands; perf_reloc_idx continues from the
       * value left by vkCmdBeginQuery so the patch locations recorded there
       * line up with these commands.
       */
      void *dws;
      for (uint32_t r = 0; r < layout->n_fields; r++) {
         const struct intel_perf_query_field *field = &layout->fields[r];

         switch (field->type) {
         case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
            dws = anv_batch_emitn(&cmd_buffer->batch,
                                  GENX(MI_REPORT_PERF_COUNT_length),
                                  GENX(MI_REPORT_PERF_COUNT),
                                  .MemoryAddress = query_addr /* Will be overwritten */);
            _mi_resolve_address_token(&b,
                                      cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                      dws +
                                      GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8);
            break;

         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
            dws =
               anv_batch_emitn(&cmd_buffer->batch,
                               GENX(MI_STORE_REGISTER_MEM_length),
                               GENX(MI_STORE_REGISTER_MEM),
                               .RegisterAddress = field->mmio_offset,
                               .MemoryAddress = query_addr /* Will be overwritten */ );
            _mi_resolve_address_token(&b,
                                      cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                      dws +
                                      GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
            if (field->size == 8) {
               /* High dword of a 64bit register. */
               dws =
                  anv_batch_emitn(&cmd_buffer->batch,
                                  GENX(MI_STORE_REGISTER_MEM_length),
                                  GENX(MI_STORE_REGISTER_MEM),
                                  .RegisterAddress = field->mmio_offset + 4,
                                  .MemoryAddress = query_addr /* Will be overwritten */ );
               _mi_resolve_address_token(&b,
                                         cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                         dws +
                                         GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
            }
            break;

         default:
            unreachable("Invalid query field");
            break;
         }
      }

      /* Finally mark the query available (address patched like the rest). */
      dws =
         anv_batch_emitn(&cmd_buffer->batch,
                         GENX(MI_STORE_DATA_IMM_length),
                         GENX(MI_STORE_DATA_IMM),
                         .ImmediateData = true);
      _mi_resolve_address_token(&b,
                                cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                dws +
                                GENX(MI_STORE_DATA_IMM_Address_start) / 8);

      assert(cmd_buffer->perf_reloc_idx == pdevice->n_perf_query_commands);
      break;
   }
#endif

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      uint32_t marker_offset = intel_perf_marker_offset();
      mi_store(&b, mi_mem64(anv_address_add(query_addr, marker_offset)),
               mi_imm(cmd_buffer->intel_perf_marker));
      emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, true);
      emit_query_mi_availability(&b, query_addr, true);
      break;
   }

   default:
      unreachable("");
   }

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows that we only write the results to one of the queries
    * but we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}

#define TIMESTAMP 0x2358

void
genX(CmdWriteTimestamp)(1229VkCommandBuffer commandBuffer,1230VkPipelineStageFlagBits pipelineStage,1231VkQueryPool queryPool,1232uint32_t query)1233{1234ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);1235ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);1236struct anv_address query_addr = anv_query_address(pool, query);12371238assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);12391240struct mi_builder b;1241mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);12421243switch (pipelineStage) {1244case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:1245mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)),1246mi_reg64(TIMESTAMP));1247break;12481249default:1250/* Everything else is bottom-of-pipe */1251cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;1252genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);12531254anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {1255pc.DestinationAddressType = DAT_PPGTT;1256pc.PostSyncOperation = WriteTimestamp;1257pc.Address = anv_address_add(query_addr, 8);12581259if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4)1260pc.CommandStreamerStallEnable = true;1261}1262break;1263}12641265emit_query_pc_availability(cmd_buffer, query_addr, true);12661267/* When multiview is active the spec requires that N consecutive query1268* indices are used, where N is the number of active views in the subpass.1269* The spec allows that we only write the results to one of the queries1270* but we still need to manage result availability for all the query indices.1271* Since we only emit a single query for all active views in the1272* first index, mark the other query indices as being already available1273* with result 0.1274*/1275if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {1276const uint32_t num_queries =1277util_bitcount(cmd_buffer->state.subpass->view_mask);1278if (num_queries > 1)1279emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);1280}1281}12821283#if GFX_VERx10 >= 
7512841285#define MI_PREDICATE_SRC0 0x24001286#define MI_PREDICATE_SRC1 0x24081287#define MI_PREDICATE_RESULT 0x241812881289/**1290* Writes the results of a query to dst_addr is the value at poll_addr is equal1291* to the reference value.1292*/1293static void1294gpu_write_query_result_cond(struct anv_cmd_buffer *cmd_buffer,1295struct mi_builder *b,1296struct anv_address poll_addr,1297struct anv_address dst_addr,1298uint64_t ref_value,1299VkQueryResultFlags flags,1300uint32_t value_index,1301struct mi_value query_result)1302{1303mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem64(poll_addr));1304mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(ref_value));1305anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {1306mip.LoadOperation = LOAD_LOAD;1307mip.CombineOperation = COMBINE_SET;1308mip.CompareOperation = COMPARE_SRCS_EQUAL;1309}13101311if (flags & VK_QUERY_RESULT_64_BIT) {1312struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);1313mi_store_if(b, mi_mem64(res_addr), query_result);1314} else {1315struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);1316mi_store_if(b, mi_mem32(res_addr), query_result);1317}1318}13191320static void1321gpu_write_query_result(struct mi_builder *b,1322struct anv_address dst_addr,1323VkQueryResultFlags flags,1324uint32_t value_index,1325struct mi_value query_result)1326{1327if (flags & VK_QUERY_RESULT_64_BIT) {1328struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);1329mi_store(b, mi_mem64(res_addr), query_result);1330} else {1331struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);1332mi_store(b, mi_mem32(res_addr), query_result);1333}1334}13351336static struct mi_value1337compute_query_result(struct mi_builder *b, struct anv_address addr)1338{1339return mi_isub(b, mi_mem64(anv_address_add(addr, 8)),1340mi_mem64(anv_address_add(addr, 0)));1341}13421343void genX(CmdCopyQueryPoolResults)(1344VkCommandBuffer commandBuffer,1345VkQueryPool 
                                                queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
   struct mi_value result;

   /* If render target writes are ongoing, request a render target cache flush
    * to ensure proper ordering of the commands from the 3d pipe and the
    * command streamer.
    */
   if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
                                "CopyQueryPoolResults");
   }

   if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
       (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
       /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
        * because we're about to copy values from MI commands, we need to
        * stall the command streamer to make sure the PIPE_CONTROL values have
        * landed, otherwise we could see inconsistent values & availability.
        *
        * From the vulkan spec:
        *
        *    "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
        *    previous uses of vkCmdResetQueryPool in the same queue, without
        *    any additional synchronization."
        */
       pool->type == VK_QUERY_TYPE_OCCLUSION ||
       pool->type == VK_QUERY_TYPE_TIMESTAMP) {
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_CS_STALL_BIT,
                                "CopyQueryPoolResults");
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
   }

   struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
   for (uint32_t i = 0; i < queryCount; i++) {
      struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
      /* idx counts result values written for this query; it ends up as the
       * slot used for the trailing availability value if requested.
       */
      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
         /* Like in the case of vkGetQueryPoolResults, if the query is
          * unavailable and the VK_QUERY_RESULT_PARTIAL_BIT flag is set,
          * conservatively write 0 as the query result. If the
          * VK_QUERY_RESULT_PARTIAL_BIT isn't set, don't write any value.
          */
         gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
                                     1 /* available */, flags, idx, result);
         if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
            gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
                                        0 /* unavailable */, flags, idx, mi_imm(0));
         }
         idx++;
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            /* Each statistic occupies a 16 byte begin/end pair in the pool. */
            result = compute_query_result(&b, anv_address_add(query_addr,
                                                              idx * 16 + 8));

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.ver == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               result = mi_ushr32_imm(&b, result, 2);
            }

            gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
         /* Two values: primitives written, then primitives needed. */
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         result = compute_query_result(&b, anv_address_add(query_addr, 24));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         /* Timestamps are absolute values, not begin/end deltas. */
         result = mi_mem64(anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

#if GFX_VER >= 8
      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
         unreachable("Copy KHR performance query results not implemented");
         break;
#endif

      default:
         unreachable("unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         /* Availability dword lives at offset 0 of each query slot. */
         gpu_write_query_result(&b, dest_addr, flags, idx,
                                mi_mem64(query_addr));
      }

      dest_addr = anv_address_add(dest_addr, destStride);
   }
}

#else
/* Pre-GFX_VERx10 75 (Ivy Bridge) fallback: GPU-side result copy is not
 * implemented on these parts.
 */
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
#endif