Path: blob/21.2-virgl/src/gallium/drivers/freedreno/a4xx/fd4_query.c
/*
 * Copyright (C) 2014 Rob Clark <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <[email protected]>
 */

#include "freedreno_context.h"
#include "freedreno_query_hw.h"
#include "freedreno_util.h"

#include "fd4_context.h"
#include "fd4_draw.h"
#include "fd4_format.h"
#include "fd4_query.h"

struct fd_rb_samp_ctrs {
   uint64_t ctr[16];
};

/*
 * Occlusion Query:
 *
 * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they
 * interpret results
 */

static struct fd_hw_sample *
occlusion_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring)
{
   struct fd_hw_sample *samp =
      fd_hw_sample_init(batch, sizeof(struct fd_rb_samp_ctrs));

   /* low bits of sample addr should be zero (since they are control
    * flags in RB_SAMPLE_COUNT_CONTROL):
    */
   debug_assert((samp->offset & 0x3) == 0);

   /* Set RB_SAMPLE_COUNT_ADDR to samp->offset plus value of
    * HW_QUERY_BASE_REG register:
    */
   OUT_PKT3(ring, CP_SET_CONSTANT, 3);
   OUT_RING(ring, CP_REG(REG_A4XX_RB_SAMPLE_COUNT_CONTROL) | 0x80000000);
   OUT_RING(ring, HW_QUERY_BASE_REG);
   OUT_RING(ring, A4XX_RB_SAMPLE_COUNT_CONTROL_COPY | samp->offset);

   OUT_PKT3(ring, CP_DRAW_INDX_OFFSET, 3);
   OUT_RING(ring, DRAW4(DI_PT_POINTLIST_PSIZE, DI_SRC_SEL_AUTO_INDEX,
                        INDEX4_SIZE_32_BIT, USE_VISIBILITY));
   OUT_RING(ring, 1); /* NumInstances */
   OUT_RING(ring, 0); /* NumIndices */

   fd_event_write(batch, ring, ZPASS_DONE);

   return samp;
}
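/* Note: the ZPASS_DONE dump fills the whole fd_rb_samp_ctrs block (which
 * is why the sample above is sized for it), but only ctr[0], the
 * samples-passed count, is consumed here; the remaining slots are ignored.
 */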
static uint64_t
count_samples(const struct fd_rb_samp_ctrs *start,
              const struct fd_rb_samp_ctrs *end)
{
   return end->ctr[0] - start->ctr[0];
}

static void
occlusion_counter_accumulate_result(struct fd_context *ctx, const void *start,
                                    const void *end,
                                    union pipe_query_result *result)
{
   uint64_t n = count_samples(start, end);
   result->u64 += n;
}

static void
occlusion_predicate_accumulate_result(struct fd_context *ctx, const void *start,
                                      const void *end,
                                      union pipe_query_result *result)
{
   uint64_t n = count_samples(start, end);
   result->b |= (n > 0);
}

/*
 * Time Elapsed Query:
 *
 * Note: we could in theory support timestamp queries, but they
 * won't give sensible results for tilers.
 */
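/* Note: afaict CP_PERFCTR_CP_SEL_0 picks the countable for the counter
 * that is read back as RBBM_PERFCTR_CP_0_LO/_HI in
 * time_elapsed_get_sample() below, so the two need to stay in sync.
 */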
static void
time_elapsed_enable(struct fd_context *ctx,
                    struct fd_ringbuffer *ring) assert_dt
{
   /* Right now, the assignment of countable to counter register is
    * just hard coded.  If we start exposing more countables than we
    * have counters, we will need to be more clever.
    */
   struct fd_batch *batch = fd_context_batch_locked(ctx);
   fd_wfi(batch, ring);
   OUT_PKT0(ring, REG_A4XX_CP_PERFCTR_CP_SEL_0, 1);
   OUT_RING(ring, CP_ALWAYS_COUNT);
   fd_batch_unlock_submit(batch);
   fd_batch_reference(&batch, NULL);
}

static struct fd_hw_sample *
time_elapsed_get_sample(struct fd_batch *batch,
                        struct fd_ringbuffer *ring) assert_dt
{
   struct fd_hw_sample *samp = fd_hw_sample_init(batch, sizeof(uint64_t));

   /* use unused part of vsc_size_mem as scratch space, to avoid
    * extra allocation:
    */
   struct fd_bo *scratch_bo = fd4_context(batch->ctx)->vsc_size_mem;
   const int sample_off = 128;
   const int addr_off = sample_off + 8;

   debug_assert(batch->ctx->screen->max_freq > 0);

   /* Basic issue is that we need to read counter value to a relative
    * destination (with per-tile offset) rather than absolute dest
    * addr.  But there is no pm4 packet that can do that.  This is
    * where it would be *really* nice if we could write our own fw
    * since afaict implementing the sort of packet we need would be
    * trivial.
    *
    * Instead, we:
    * (1) CP_REG_TO_MEM to do a 64b copy of counter to scratch buffer
    * (2) CP_MEM_WRITE to write per-sample offset to scratch buffer
    * (3) CP_REG_TO_MEM w/ accumulate flag to add the per-tile base
    *     address to the per-sample offset in the scratch buffer
    * (4) CP_MEM_TO_REG to copy resulting address from steps #2 and #3
    *     to CP_ME_NRT_ADDR
    * (5) CP_MEM_TO_REG's to copy saved counter value from scratch
    *     buffer to CP_ME_NRT_DATA to trigger the write out to query
    *     result buffer
    *
    * Straightforward, right?
    *
    * Maybe could swap the order of things in the scratch buffer to
    * put address first, and copy back to CP_ME_NRT_ADDR+DATA in one
    * shot, but that's really just polishing a turd..
    */

   fd_wfi(batch, ring);

   /* copy sample counter _LO and _HI to scratch: */
   OUT_PKT3(ring, CP_REG_TO_MEM, 2);
   OUT_RING(ring, CP_REG_TO_MEM_0_REG(REG_A4XX_RBBM_PERFCTR_CP_0_LO) |
                     CP_REG_TO_MEM_0_64B |
                     CP_REG_TO_MEM_0_CNT(2)); /* write 2 regs to mem */
   OUT_RELOC(ring, scratch_bo, sample_off, 0, 0);

   /* ok... here we really *would* like to use the CP_SET_CONSTANT
    * mode which can add a constant to value in reg2 and write to
    * reg1... *but* that only works for banked/context registers,
    * and CP_ME_NRT_DATA isn't one of those.. so we need to do some
    * CP math to the scratch buffer instead:
    *
    * (note first 8 bytes are counter value, use offset 0x8 for
    * address calculation)
    */

   /* per-sample offset to scratch bo: */
   OUT_PKT3(ring, CP_MEM_WRITE, 2);
   OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);
   OUT_RING(ring, samp->offset);

   /* now add to that the per-tile base: */
   OUT_PKT3(ring, CP_REG_TO_MEM, 2);
   OUT_RING(ring, CP_REG_TO_MEM_0_REG(HW_QUERY_BASE_REG) |
                     CP_REG_TO_MEM_0_ACCUMULATE |
                     CP_REG_TO_MEM_0_CNT(0)); /* readback 1 reg */
   OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);

   /* now copy that back to CP_ME_NRT_ADDR: */
   OUT_PKT3(ring, CP_MEM_TO_REG, 2);
   OUT_RING(ring, REG_A4XX_CP_ME_NRT_ADDR);
   OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);

   /* and finally, copy sample from scratch buffer to CP_ME_NRT_DATA
    * to trigger the write to result buffer
    */
   OUT_PKT3(ring, CP_MEM_TO_REG, 2);
   OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
   OUT_RELOC(ring, scratch_bo, sample_off, 0, 0);

   /* and again to get the value of the _HI reg from scratch: */
   OUT_PKT3(ring, CP_MEM_TO_REG, 2);
   OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
   OUT_RELOC(ring, scratch_bo, sample_off + 0x4, 0, 0);

   /* Sigh.. */

   return samp;
}

static void
time_elapsed_accumulate_result(struct fd_context *ctx, const void *start,
                               const void *end, union pipe_query_result *result)
{
   uint64_t n = *(uint64_t *)end - *(uint64_t *)start;
   /* max_freq is in Hz, convert cycle count to ns: */
   result->u64 += n * 1000000000 / ctx->screen->max_freq;
}

static void
timestamp_accumulate_result(struct fd_context *ctx, const void *start,
                            const void *end, union pipe_query_result *result)
{
   /* just return the value from the first tile: */
   if (result->u64 != 0)
      return;
   uint64_t n = *(uint64_t *)start;
   /* max_freq is in Hz, convert cycle count to ns: */
   result->u64 = n * 1000000000 / ctx->screen->max_freq;
}

static const struct fd_hw_sample_provider occlusion_counter = {
   .query_type = PIPE_QUERY_OCCLUSION_COUNTER,
   .get_sample = occlusion_get_sample,
   .accumulate_result = occlusion_counter_accumulate_result,
};

static const struct fd_hw_sample_provider occlusion_predicate = {
   .query_type = PIPE_QUERY_OCCLUSION_PREDICATE,
   .get_sample = occlusion_get_sample,
   .accumulate_result = occlusion_predicate_accumulate_result,
};

static const struct fd_hw_sample_provider occlusion_predicate_conservative = {
   .query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE,
   .get_sample = occlusion_get_sample,
   .accumulate_result = occlusion_predicate_accumulate_result,
};

static const struct fd_hw_sample_provider time_elapsed = {
   .query_type = PIPE_QUERY_TIME_ELAPSED,
   .always = true,
   .enable = time_elapsed_enable,
   .get_sample = time_elapsed_get_sample,
   .accumulate_result = time_elapsed_accumulate_result,
};

/* NOTE: timestamp query isn't going to give terribly sensible results
 * on a tiler.  But it is needed by qapitrace profile heatmap.  If you
 * add in a binning pass, the results get even more non-sensical.  So
 * we just return the timestamp on the first tile and hope that is
 * kind of good enough.
 */
static const struct fd_hw_sample_provider timestamp = {
   .query_type = PIPE_QUERY_TIMESTAMP,
   .always = true,
   .enable = time_elapsed_enable,
   .get_sample = time_elapsed_get_sample,
   .accumulate_result = timestamp_accumulate_result,
};
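/* Plug the a4xx sample providers into the generic hw-query machinery
 * (freedreno_query_hw.[ch]), which creates the queries and feeds the
 * per-tile start/end samples through the accumulate_result callbacks
 * registered above.
 */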
void
fd4_query_context_init(struct pipe_context *pctx) disable_thread_safety_analysis
{
   struct fd_context *ctx = fd_context(pctx);

   ctx->create_query = fd_hw_create_query;
   ctx->query_prepare = fd_hw_query_prepare;
   ctx->query_prepare_tile = fd_hw_query_prepare_tile;
   ctx->query_update_batch = fd_hw_query_update_batch;

   fd_hw_query_register_provider(pctx, &occlusion_counter);
   fd_hw_query_register_provider(pctx, &occlusion_predicate);
   fd_hw_query_register_provider(pctx, &occlusion_predicate_conservative);
   fd_hw_query_register_provider(pctx, &time_elapsed);
   fd_hw_query_register_provider(pctx, &timestamp);
}