Path: blob/21.2-virgl/src/gallium/drivers/r600/r600_query.c
4570 views
/*1* Copyright 2010 Jerome Glisse <[email protected]>2* Copyright 2014 Marek Olšák <[email protected]>3*4* Permission is hereby granted, free of charge, to any person obtaining a5* copy of this software and associated documentation files (the "Software"),6* to deal in the Software without restriction, including without limitation7* on the rights to use, copy, modify, merge, publish, distribute, sub8* license, and/or sell copies of the Software, and to permit persons to whom9* the Software is furnished to do so, subject to the following conditions:10*11* The above copyright notice and this permission notice (including the next12* paragraph) shall be included in all copies or substantial portions of the13* Software.14*15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,17* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL18* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,19* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR20* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE21* USE OR OTHER DEALINGS IN THE SOFTWARE.22*/2324#include "r600_query.h"25#include "r600_pipe.h"26#include "r600_cs.h"27#include "util/u_memory.h"28#include "util/u_upload_mgr.h"29#include "util/os_time.h"30#include "tgsi/tgsi_text.h"3132#define R600_MAX_STREAMS 43334struct r600_hw_query_params {35unsigned start_offset;36unsigned end_offset;37unsigned fence_offset;38unsigned pair_stride;39unsigned pair_count;40};4142/* Queries without buffer handling or suspend/resume. */43struct r600_query_sw {44struct r600_query b;4546uint64_t begin_result;47uint64_t end_result;4849uint64_t begin_time;50uint64_t end_time;5152/* Fence for GPU_FINISHED. */53struct pipe_fence_handle *fence;54};5556static void r600_query_sw_destroy(struct r600_common_screen *rscreen,57struct r600_query *rquery)58{59struct r600_query_sw *query = (struct r600_query_sw *)rquery;6061rscreen->b.fence_reference(&rscreen->b, &query->fence, NULL);62FREE(query);63}6465static enum radeon_value_id winsys_id_from_type(unsigned type)66{67switch (type) {68case R600_QUERY_REQUESTED_VRAM: return RADEON_REQUESTED_VRAM_MEMORY;69case R600_QUERY_REQUESTED_GTT: return RADEON_REQUESTED_GTT_MEMORY;70case R600_QUERY_MAPPED_VRAM: return RADEON_MAPPED_VRAM;71case R600_QUERY_MAPPED_GTT: return RADEON_MAPPED_GTT;72case R600_QUERY_BUFFER_WAIT_TIME: return RADEON_BUFFER_WAIT_TIME_NS;73case R600_QUERY_NUM_MAPPED_BUFFERS: return RADEON_NUM_MAPPED_BUFFERS;74case R600_QUERY_NUM_GFX_IBS: return RADEON_NUM_GFX_IBS;75case R600_QUERY_NUM_SDMA_IBS: return RADEON_NUM_SDMA_IBS;76case R600_QUERY_GFX_BO_LIST_SIZE: return RADEON_GFX_BO_LIST_COUNTER;77case R600_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED;78case R600_QUERY_NUM_EVICTIONS: return RADEON_NUM_EVICTIONS;79case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: return RADEON_NUM_VRAM_CPU_PAGE_FAULTS;80case R600_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE;81case R600_QUERY_VRAM_VIS_USAGE: return RADEON_VRAM_VIS_USAGE;82case R600_QUERY_GTT_USAGE: return RADEON_GTT_USAGE;83case R600_QUERY_GPU_TEMPERATURE: return RADEON_GPU_TEMPERATURE;84case R600_QUERY_CURRENT_GPU_SCLK: return RADEON_CURRENT_SCLK;85case R600_QUERY_CURRENT_GPU_MCLK: return RADEON_CURRENT_MCLK;86case R600_QUERY_CS_THREAD_BUSY: return RADEON_CS_THREAD_TIME;87default: unreachable("query type does not correspond to winsys id");88}89}9091static bool r600_query_sw_begin(struct r600_common_context *rctx,92struct r600_query *rquery)93{94struct r600_query_sw *query = (struct r600_query_sw *)rquery;95enum radeon_value_id ws_id;9697switch(query->b.type) {98case PIPE_QUERY_TIMESTAMP_DISJOINT:99case PIPE_QUERY_GPU_FINISHED:100break;101case R600_QUERY_DRAW_CALLS:102query->begin_result = rctx->num_draw_calls;103break;104case R600_QUERY_DECOMPRESS_CALLS:105query->begin_result = rctx->num_decompress_calls;106break;107case R600_QUERY_MRT_DRAW_CALLS:108query->begin_result = rctx->num_mrt_draw_calls;109break;110case R600_QUERY_PRIM_RESTART_CALLS:111query->begin_result = rctx->num_prim_restart_calls;112break;113case R600_QUERY_SPILL_DRAW_CALLS:114query->begin_result = rctx->num_spill_draw_calls;115break;116case R600_QUERY_COMPUTE_CALLS:117query->begin_result = rctx->num_compute_calls;118break;119case R600_QUERY_SPILL_COMPUTE_CALLS:120query->begin_result = rctx->num_spill_compute_calls;121break;122case R600_QUERY_DMA_CALLS:123query->begin_result = rctx->num_dma_calls;124break;125case R600_QUERY_CP_DMA_CALLS:126query->begin_result = rctx->num_cp_dma_calls;127break;128case R600_QUERY_NUM_VS_FLUSHES:129query->begin_result = rctx->num_vs_flushes;130break;131case R600_QUERY_NUM_PS_FLUSHES:132query->begin_result = rctx->num_ps_flushes;133break;134case R600_QUERY_NUM_CS_FLUSHES:135query->begin_result = rctx->num_cs_flushes;136break;137case R600_QUERY_NUM_CB_CACHE_FLUSHES:138query->begin_result = rctx->num_cb_cache_flushes;139break;140case R600_QUERY_NUM_DB_CACHE_FLUSHES:141query->begin_result = rctx->num_db_cache_flushes;142break;143case R600_QUERY_NUM_RESIDENT_HANDLES:144query->begin_result = rctx->num_resident_handles;145break;146case R600_QUERY_TC_OFFLOADED_SLOTS:147query->begin_result = rctx->tc ? rctx->tc->num_offloaded_slots : 0;148break;149case R600_QUERY_TC_DIRECT_SLOTS:150query->begin_result = rctx->tc ? rctx->tc->num_direct_slots : 0;151break;152case R600_QUERY_TC_NUM_SYNCS:153query->begin_result = rctx->tc ? rctx->tc->num_syncs : 0;154break;155case R600_QUERY_REQUESTED_VRAM:156case R600_QUERY_REQUESTED_GTT:157case R600_QUERY_MAPPED_VRAM:158case R600_QUERY_MAPPED_GTT:159case R600_QUERY_VRAM_USAGE:160case R600_QUERY_VRAM_VIS_USAGE:161case R600_QUERY_GTT_USAGE:162case R600_QUERY_GPU_TEMPERATURE:163case R600_QUERY_CURRENT_GPU_SCLK:164case R600_QUERY_CURRENT_GPU_MCLK:165case R600_QUERY_NUM_MAPPED_BUFFERS:166query->begin_result = 0;167break;168case R600_QUERY_BUFFER_WAIT_TIME:169case R600_QUERY_NUM_GFX_IBS:170case R600_QUERY_NUM_SDMA_IBS:171case R600_QUERY_NUM_BYTES_MOVED:172case R600_QUERY_NUM_EVICTIONS:173case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {174enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);175query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);176break;177}178case R600_QUERY_GFX_BO_LIST_SIZE:179ws_id = winsys_id_from_type(query->b.type);180query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);181query->begin_time = rctx->ws->query_value(rctx->ws,182RADEON_NUM_GFX_IBS);183break;184case R600_QUERY_CS_THREAD_BUSY:185ws_id = winsys_id_from_type(query->b.type);186query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);187query->begin_time = os_time_get_nano();188break;189case R600_QUERY_GALLIUM_THREAD_BUSY:190query->begin_result =191rctx->tc ? util_queue_get_thread_time_nano(&rctx->tc->queue, 0) : 0;192query->begin_time = os_time_get_nano();193break;194case R600_QUERY_GPU_LOAD:195case R600_QUERY_GPU_SHADERS_BUSY:196case R600_QUERY_GPU_TA_BUSY:197case R600_QUERY_GPU_GDS_BUSY:198case R600_QUERY_GPU_VGT_BUSY:199case R600_QUERY_GPU_IA_BUSY:200case R600_QUERY_GPU_SX_BUSY:201case R600_QUERY_GPU_WD_BUSY:202case R600_QUERY_GPU_BCI_BUSY:203case R600_QUERY_GPU_SC_BUSY:204case R600_QUERY_GPU_PA_BUSY:205case R600_QUERY_GPU_DB_BUSY:206case R600_QUERY_GPU_CP_BUSY:207case R600_QUERY_GPU_CB_BUSY:208case R600_QUERY_GPU_SDMA_BUSY:209case R600_QUERY_GPU_PFP_BUSY:210case R600_QUERY_GPU_MEQ_BUSY:211case R600_QUERY_GPU_ME_BUSY:212case R600_QUERY_GPU_SURF_SYNC_BUSY:213case R600_QUERY_GPU_CP_DMA_BUSY:214case R600_QUERY_GPU_SCRATCH_RAM_BUSY:215query->begin_result = r600_begin_counter(rctx->screen,216query->b.type);217break;218case R600_QUERY_NUM_COMPILATIONS:219query->begin_result = p_atomic_read(&rctx->screen->num_compilations);220break;221case R600_QUERY_NUM_SHADERS_CREATED:222query->begin_result = p_atomic_read(&rctx->screen->num_shaders_created);223break;224case R600_QUERY_NUM_SHADER_CACHE_HITS:225query->begin_result =226p_atomic_read(&rctx->screen->num_shader_cache_hits);227break;228case R600_QUERY_GPIN_ASIC_ID:229case R600_QUERY_GPIN_NUM_SIMD:230case R600_QUERY_GPIN_NUM_RB:231case R600_QUERY_GPIN_NUM_SPI:232case R600_QUERY_GPIN_NUM_SE:233break;234default:235unreachable("r600_query_sw_begin: bad query type");236}237238return true;239}240241static bool r600_query_sw_end(struct r600_common_context *rctx,242struct r600_query *rquery)243{244struct r600_query_sw *query = (struct r600_query_sw *)rquery;245enum radeon_value_id ws_id;246247switch(query->b.type) {248case PIPE_QUERY_TIMESTAMP_DISJOINT:249break;250case PIPE_QUERY_GPU_FINISHED:251rctx->b.flush(&rctx->b, &query->fence, PIPE_FLUSH_DEFERRED);252break;253case R600_QUERY_DRAW_CALLS:254query->end_result = rctx->num_draw_calls;255break;256case R600_QUERY_DECOMPRESS_CALLS:257query->end_result = rctx->num_decompress_calls;258break;259case R600_QUERY_MRT_DRAW_CALLS:260query->end_result = rctx->num_mrt_draw_calls;261break;262case R600_QUERY_PRIM_RESTART_CALLS:263query->end_result = rctx->num_prim_restart_calls;264break;265case R600_QUERY_SPILL_DRAW_CALLS:266query->end_result = rctx->num_spill_draw_calls;267break;268case R600_QUERY_COMPUTE_CALLS:269query->end_result = rctx->num_compute_calls;270break;271case R600_QUERY_SPILL_COMPUTE_CALLS:272query->end_result = rctx->num_spill_compute_calls;273break;274case R600_QUERY_DMA_CALLS:275query->end_result = rctx->num_dma_calls;276break;277case R600_QUERY_CP_DMA_CALLS:278query->end_result = rctx->num_cp_dma_calls;279break;280case R600_QUERY_NUM_VS_FLUSHES:281query->end_result = rctx->num_vs_flushes;282break;283case R600_QUERY_NUM_PS_FLUSHES:284query->end_result = rctx->num_ps_flushes;285break;286case R600_QUERY_NUM_CS_FLUSHES:287query->end_result = rctx->num_cs_flushes;288break;289case R600_QUERY_NUM_CB_CACHE_FLUSHES:290query->end_result = rctx->num_cb_cache_flushes;291break;292case R600_QUERY_NUM_DB_CACHE_FLUSHES:293query->end_result = rctx->num_db_cache_flushes;294break;295case R600_QUERY_NUM_RESIDENT_HANDLES:296query->end_result = rctx->num_resident_handles;297break;298case R600_QUERY_TC_OFFLOADED_SLOTS:299query->end_result = rctx->tc ? rctx->tc->num_offloaded_slots : 0;300break;301case R600_QUERY_TC_DIRECT_SLOTS:302query->end_result = rctx->tc ? rctx->tc->num_direct_slots : 0;303break;304case R600_QUERY_TC_NUM_SYNCS:305query->end_result = rctx->tc ? rctx->tc->num_syncs : 0;306break;307case R600_QUERY_REQUESTED_VRAM:308case R600_QUERY_REQUESTED_GTT:309case R600_QUERY_MAPPED_VRAM:310case R600_QUERY_MAPPED_GTT:311case R600_QUERY_VRAM_USAGE:312case R600_QUERY_VRAM_VIS_USAGE:313case R600_QUERY_GTT_USAGE:314case R600_QUERY_GPU_TEMPERATURE:315case R600_QUERY_CURRENT_GPU_SCLK:316case R600_QUERY_CURRENT_GPU_MCLK:317case R600_QUERY_BUFFER_WAIT_TIME:318case R600_QUERY_NUM_MAPPED_BUFFERS:319case R600_QUERY_NUM_GFX_IBS:320case R600_QUERY_NUM_SDMA_IBS:321case R600_QUERY_NUM_BYTES_MOVED:322case R600_QUERY_NUM_EVICTIONS:323case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {324enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);325query->end_result = rctx->ws->query_value(rctx->ws, ws_id);326break;327}328case R600_QUERY_GFX_BO_LIST_SIZE:329ws_id = winsys_id_from_type(query->b.type);330query->end_result = rctx->ws->query_value(rctx->ws, ws_id);331query->end_time = rctx->ws->query_value(rctx->ws,332RADEON_NUM_GFX_IBS);333break;334case R600_QUERY_CS_THREAD_BUSY:335ws_id = winsys_id_from_type(query->b.type);336query->end_result = rctx->ws->query_value(rctx->ws, ws_id);337query->end_time = os_time_get_nano();338break;339case R600_QUERY_GALLIUM_THREAD_BUSY:340query->end_result =341rctx->tc ? util_queue_get_thread_time_nano(&rctx->tc->queue, 0) : 0;342query->end_time = os_time_get_nano();343break;344case R600_QUERY_GPU_LOAD:345case R600_QUERY_GPU_SHADERS_BUSY:346case R600_QUERY_GPU_TA_BUSY:347case R600_QUERY_GPU_GDS_BUSY:348case R600_QUERY_GPU_VGT_BUSY:349case R600_QUERY_GPU_IA_BUSY:350case R600_QUERY_GPU_SX_BUSY:351case R600_QUERY_GPU_WD_BUSY:352case R600_QUERY_GPU_BCI_BUSY:353case R600_QUERY_GPU_SC_BUSY:354case R600_QUERY_GPU_PA_BUSY:355case R600_QUERY_GPU_DB_BUSY:356case R600_QUERY_GPU_CP_BUSY:357case R600_QUERY_GPU_CB_BUSY:358case R600_QUERY_GPU_SDMA_BUSY:359case R600_QUERY_GPU_PFP_BUSY:360case R600_QUERY_GPU_MEQ_BUSY:361case R600_QUERY_GPU_ME_BUSY:362case R600_QUERY_GPU_SURF_SYNC_BUSY:363case R600_QUERY_GPU_CP_DMA_BUSY:364case R600_QUERY_GPU_SCRATCH_RAM_BUSY:365query->end_result = r600_end_counter(rctx->screen,366query->b.type,367query->begin_result);368query->begin_result = 0;369break;370case R600_QUERY_NUM_COMPILATIONS:371query->end_result = p_atomic_read(&rctx->screen->num_compilations);372break;373case R600_QUERY_NUM_SHADERS_CREATED:374query->end_result = p_atomic_read(&rctx->screen->num_shaders_created);375break;376case R600_QUERY_NUM_SHADER_CACHE_HITS:377query->end_result =378p_atomic_read(&rctx->screen->num_shader_cache_hits);379break;380case R600_QUERY_GPIN_ASIC_ID:381case R600_QUERY_GPIN_NUM_SIMD:382case R600_QUERY_GPIN_NUM_RB:383case R600_QUERY_GPIN_NUM_SPI:384case R600_QUERY_GPIN_NUM_SE:385break;386default:387unreachable("r600_query_sw_end: bad query type");388}389390return true;391}392393static bool r600_query_sw_get_result(struct r600_common_context *rctx,394struct r600_query *rquery,395bool wait,396union pipe_query_result *result)397{398struct r600_query_sw *query = (struct r600_query_sw *)rquery;399400switch (query->b.type) {401case PIPE_QUERY_TIMESTAMP_DISJOINT:402/* Convert from cycles per millisecond to cycles per second (Hz). */403result->timestamp_disjoint.frequency =404(uint64_t)rctx->screen->info.clock_crystal_freq * 1000;405result->timestamp_disjoint.disjoint = false;406return true;407case PIPE_QUERY_GPU_FINISHED: {408struct pipe_screen *screen = rctx->b.screen;409struct pipe_context *ctx = rquery->b.flushed ? NULL : &rctx->b;410411result->b = screen->fence_finish(screen, ctx, query->fence,412wait ? PIPE_TIMEOUT_INFINITE : 0);413return result->b;414}415416case R600_QUERY_GFX_BO_LIST_SIZE:417result->u64 = (query->end_result - query->begin_result) /418(query->end_time - query->begin_time);419return true;420case R600_QUERY_CS_THREAD_BUSY:421case R600_QUERY_GALLIUM_THREAD_BUSY:422result->u64 = (query->end_result - query->begin_result) * 100 /423(query->end_time - query->begin_time);424return true;425case R600_QUERY_GPIN_ASIC_ID:426result->u32 = 0;427return true;428case R600_QUERY_GPIN_NUM_SIMD:429result->u32 = rctx->screen->info.num_good_compute_units;430return true;431case R600_QUERY_GPIN_NUM_RB:432result->u32 = rctx->screen->info.max_render_backends;433return true;434case R600_QUERY_GPIN_NUM_SPI:435result->u32 = 1; /* all supported chips have one SPI per SE */436return true;437case R600_QUERY_GPIN_NUM_SE:438result->u32 = rctx->screen->info.max_se;439return true;440}441442result->u64 = query->end_result - query->begin_result;443444switch (query->b.type) {445case R600_QUERY_BUFFER_WAIT_TIME:446case R600_QUERY_GPU_TEMPERATURE:447result->u64 /= 1000;448break;449case R600_QUERY_CURRENT_GPU_SCLK:450case R600_QUERY_CURRENT_GPU_MCLK:451result->u64 *= 1000000;452break;453}454455return true;456}457458459static struct r600_query_ops sw_query_ops = {460.destroy = r600_query_sw_destroy,461.begin = r600_query_sw_begin,462.end = r600_query_sw_end,463.get_result = r600_query_sw_get_result,464.get_result_resource = NULL465};466467static struct pipe_query *r600_query_sw_create(unsigned query_type)468{469struct r600_query_sw *query;470471query = CALLOC_STRUCT(r600_query_sw);472if (!query)473return NULL;474475query->b.type = query_type;476query->b.ops = &sw_query_ops;477478return (struct pipe_query *)query;479}480481void r600_query_hw_destroy(struct r600_common_screen *rscreen,482struct r600_query *rquery)483{484struct r600_query_hw *query = (struct r600_query_hw *)rquery;485struct r600_query_buffer *prev = query->buffer.previous;486487/* Release all query buffers. */488while (prev) {489struct r600_query_buffer *qbuf = prev;490prev = prev->previous;491r600_resource_reference(&qbuf->buf, NULL);492FREE(qbuf);493}494495r600_resource_reference(&query->buffer.buf, NULL);496FREE(rquery);497}498499static struct r600_resource *r600_new_query_buffer(struct r600_common_screen *rscreen,500struct r600_query_hw *query)501{502unsigned buf_size = MAX2(query->result_size,503rscreen->info.min_alloc_size);504505/* Queries are normally read by the CPU after506* being written by the gpu, hence staging is probably a good507* usage pattern.508*/509struct r600_resource *buf = (struct r600_resource*)510pipe_buffer_create(&rscreen->b, 0,511PIPE_USAGE_STAGING, buf_size);512if (!buf)513return NULL;514515if (!query->ops->prepare_buffer(rscreen, query, buf)) {516r600_resource_reference(&buf, NULL);517return NULL;518}519520return buf;521}522523static bool r600_query_hw_prepare_buffer(struct r600_common_screen *rscreen,524struct r600_query_hw *query,525struct r600_resource *buffer)526{527/* Callers ensure that the buffer is currently unused by the GPU. */528uint32_t *results = rscreen->ws->buffer_map(rscreen->ws, buffer->buf, NULL,529PIPE_MAP_WRITE |530PIPE_MAP_UNSYNCHRONIZED);531if (!results)532return false;533534memset(results, 0, buffer->b.b.width0);535536if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||537query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||538query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {539unsigned max_rbs = rscreen->info.max_render_backends;540unsigned enabled_rb_mask = rscreen->info.enabled_rb_mask;541unsigned num_results;542unsigned i, j;543544/* Set top bits for unused backends. */545num_results = buffer->b.b.width0 / query->result_size;546for (j = 0; j < num_results; j++) {547for (i = 0; i < max_rbs; i++) {548if (!(enabled_rb_mask & (1<<i))) {549results[(i * 4)+1] = 0x80000000;550results[(i * 4)+3] = 0x80000000;551}552}553results += 4 * max_rbs;554}555}556557return true;558}559560static void r600_query_hw_get_result_resource(struct r600_common_context *rctx,561struct r600_query *rquery,562bool wait,563enum pipe_query_value_type result_type,564int index,565struct pipe_resource *resource,566unsigned offset);567568static struct r600_query_ops query_hw_ops = {569.destroy = r600_query_hw_destroy,570.begin = r600_query_hw_begin,571.end = r600_query_hw_end,572.get_result = r600_query_hw_get_result,573.get_result_resource = r600_query_hw_get_result_resource,574};575576static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,577struct r600_query_hw *query,578struct r600_resource *buffer,579uint64_t va);580static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,581struct r600_query_hw *query,582struct r600_resource *buffer,583uint64_t va);584static void r600_query_hw_add_result(struct r600_common_screen *rscreen,585struct r600_query_hw *, void *buffer,586union pipe_query_result *result);587static void r600_query_hw_clear_result(struct r600_query_hw *,588union pipe_query_result *);589590static struct r600_query_hw_ops query_hw_default_hw_ops = {591.prepare_buffer = r600_query_hw_prepare_buffer,592.emit_start = r600_query_hw_do_emit_start,593.emit_stop = r600_query_hw_do_emit_stop,594.clear_result = r600_query_hw_clear_result,595.add_result = r600_query_hw_add_result,596};597598bool r600_query_hw_init(struct r600_common_screen *rscreen,599struct r600_query_hw *query)600{601query->buffer.buf = r600_new_query_buffer(rscreen, query);602if (!query->buffer.buf)603return false;604605return true;606}607608static struct pipe_query *r600_query_hw_create(struct r600_common_screen *rscreen,609unsigned query_type,610unsigned index)611{612struct r600_query_hw *query = CALLOC_STRUCT(r600_query_hw);613if (!query)614return NULL;615616query->b.type = query_type;617query->b.ops = &query_hw_ops;618query->ops = &query_hw_default_hw_ops;619620switch (query_type) {621case PIPE_QUERY_OCCLUSION_COUNTER:622case PIPE_QUERY_OCCLUSION_PREDICATE:623case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:624query->result_size = 16 * rscreen->info.max_render_backends;625query->result_size += 16; /* for the fence + alignment */626query->num_cs_dw_begin = 6;627query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rscreen);628break;629case PIPE_QUERY_TIME_ELAPSED:630query->result_size = 24;631query->num_cs_dw_begin = 8;632query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rscreen);633break;634case PIPE_QUERY_TIMESTAMP:635query->result_size = 16;636query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rscreen);637query->flags = R600_QUERY_HW_FLAG_NO_START;638break;639case PIPE_QUERY_PRIMITIVES_EMITTED:640case PIPE_QUERY_PRIMITIVES_GENERATED:641case PIPE_QUERY_SO_STATISTICS:642case PIPE_QUERY_SO_OVERFLOW_PREDICATE:643/* NumPrimitivesWritten, PrimitiveStorageNeeded. */644query->result_size = 32;645query->num_cs_dw_begin = 6;646query->num_cs_dw_end = 6;647query->stream = index;648break;649case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:650/* NumPrimitivesWritten, PrimitiveStorageNeeded. */651query->result_size = 32 * R600_MAX_STREAMS;652query->num_cs_dw_begin = 6 * R600_MAX_STREAMS;653query->num_cs_dw_end = 6 * R600_MAX_STREAMS;654break;655case PIPE_QUERY_PIPELINE_STATISTICS:656/* 11 values on EG, 8 on R600. */657query->result_size = (rscreen->chip_class >= EVERGREEN ? 11 : 8) * 16;658query->result_size += 8; /* for the fence + alignment */659query->num_cs_dw_begin = 6;660query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rscreen);661break;662default:663assert(0);664FREE(query);665return NULL;666}667668if (!r600_query_hw_init(rscreen, query)) {669FREE(query);670return NULL;671}672673return (struct pipe_query *)query;674}675676static void r600_update_occlusion_query_state(struct r600_common_context *rctx,677unsigned type, int diff)678{679if (type == PIPE_QUERY_OCCLUSION_COUNTER ||680type == PIPE_QUERY_OCCLUSION_PREDICATE ||681type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {682bool old_enable = rctx->num_occlusion_queries != 0;683bool old_perfect_enable =684rctx->num_perfect_occlusion_queries != 0;685bool enable, perfect_enable;686687rctx->num_occlusion_queries += diff;688assert(rctx->num_occlusion_queries >= 0);689690if (type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {691rctx->num_perfect_occlusion_queries += diff;692assert(rctx->num_perfect_occlusion_queries >= 0);693}694695enable = rctx->num_occlusion_queries != 0;696perfect_enable = rctx->num_perfect_occlusion_queries != 0;697698if (enable != old_enable || perfect_enable != old_perfect_enable) {699struct r600_context *ctx = (struct r600_context*)rctx;700r600_mark_atom_dirty(ctx, &ctx->db_misc_state.atom);701}702}703}704705static unsigned event_type_for_stream(unsigned stream)706{707switch (stream) {708default:709case 0: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS;710case 1: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS1;711case 2: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS2;712case 3: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS3;713}714}715716static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va,717unsigned stream)718{719radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));720radeon_emit(cs, EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3));721radeon_emit(cs, va);722radeon_emit(cs, va >> 32);723}724725static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,726struct r600_query_hw *query,727struct r600_resource *buffer,728uint64_t va)729{730struct radeon_cmdbuf *cs = &ctx->gfx.cs;731732switch (query->b.type) {733case PIPE_QUERY_OCCLUSION_COUNTER:734case PIPE_QUERY_OCCLUSION_PREDICATE:735case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:736radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));737radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));738radeon_emit(cs, va);739radeon_emit(cs, va >> 32);740break;741case PIPE_QUERY_PRIMITIVES_EMITTED:742case PIPE_QUERY_PRIMITIVES_GENERATED:743case PIPE_QUERY_SO_STATISTICS:744case PIPE_QUERY_SO_OVERFLOW_PREDICATE:745emit_sample_streamout(cs, va, query->stream);746break;747case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:748for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream)749emit_sample_streamout(cs, va + 32 * stream, stream);750break;751case PIPE_QUERY_TIME_ELAPSED:752/* Write the timestamp after the last draw is done.753* (bottom-of-pipe)754*/755r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS,7560, EOP_DATA_SEL_TIMESTAMP,757NULL, va, 0, query->b.type);758break;759case PIPE_QUERY_PIPELINE_STATISTICS:760radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));761radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));762radeon_emit(cs, va);763radeon_emit(cs, va >> 32);764break;765default:766assert(0);767}768r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE,769RADEON_PRIO_QUERY);770}771772static void r600_query_hw_emit_start(struct r600_common_context *ctx,773struct r600_query_hw *query)774{775uint64_t va;776777if (!query->buffer.buf)778return; // previous buffer allocation failure779780r600_update_occlusion_query_state(ctx, query->b.type, 1);781r600_update_prims_generated_query_state(ctx, query->b.type, 1);782783ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_begin + query->num_cs_dw_end,784true);785786/* Get a new query buffer if needed. */787if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) {788struct r600_query_buffer *qbuf = MALLOC_STRUCT(r600_query_buffer);789*qbuf = query->buffer;790query->buffer.results_end = 0;791query->buffer.previous = qbuf;792query->buffer.buf = r600_new_query_buffer(ctx->screen, query);793if (!query->buffer.buf)794return;795}796797/* emit begin query */798va = query->buffer.buf->gpu_address + query->buffer.results_end;799800query->ops->emit_start(ctx, query, query->buffer.buf, va);801802ctx->num_cs_dw_queries_suspend += query->num_cs_dw_end;803}804805static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,806struct r600_query_hw *query,807struct r600_resource *buffer,808uint64_t va)809{810struct radeon_cmdbuf *cs = &ctx->gfx.cs;811uint64_t fence_va = 0;812813switch (query->b.type) {814case PIPE_QUERY_OCCLUSION_COUNTER:815case PIPE_QUERY_OCCLUSION_PREDICATE:816case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:817va += 8;818radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));819radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));820radeon_emit(cs, va);821radeon_emit(cs, va >> 32);822823fence_va = va + ctx->screen->info.max_render_backends * 16 - 8;824break;825case PIPE_QUERY_PRIMITIVES_EMITTED:826case PIPE_QUERY_PRIMITIVES_GENERATED:827case PIPE_QUERY_SO_STATISTICS:828case PIPE_QUERY_SO_OVERFLOW_PREDICATE:829va += 16;830emit_sample_streamout(cs, va, query->stream);831break;832case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:833va += 16;834for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream)835emit_sample_streamout(cs, va + 32 * stream, stream);836break;837case PIPE_QUERY_TIME_ELAPSED:838va += 8;839FALLTHROUGH;840case PIPE_QUERY_TIMESTAMP:841r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS,8420, EOP_DATA_SEL_TIMESTAMP, NULL, va,8430, query->b.type);844fence_va = va + 8;845break;846case PIPE_QUERY_PIPELINE_STATISTICS: {847unsigned sample_size = (query->result_size - 8) / 2;848849va += sample_size;850radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));851radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));852radeon_emit(cs, va);853radeon_emit(cs, va >> 32);854855fence_va = va + sample_size;856break;857}858default:859assert(0);860}861r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE,862RADEON_PRIO_QUERY);863864if (fence_va)865r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS, 0,866EOP_DATA_SEL_VALUE_32BIT,867query->buffer.buf, fence_va, 0x80000000,868query->b.type);869}870871static void r600_query_hw_emit_stop(struct r600_common_context *ctx,872struct r600_query_hw *query)873{874uint64_t va;875876if (!query->buffer.buf)877return; // previous buffer allocation failure878879/* The queries which need begin already called this in begin_query. */880if (query->flags & R600_QUERY_HW_FLAG_NO_START) {881ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_end, false);882}883884/* emit end query */885va = query->buffer.buf->gpu_address + query->buffer.results_end;886887query->ops->emit_stop(ctx, query, query->buffer.buf, va);888889query->buffer.results_end += query->result_size;890891if (!(query->flags & R600_QUERY_HW_FLAG_NO_START))892ctx->num_cs_dw_queries_suspend -= query->num_cs_dw_end;893894r600_update_occlusion_query_state(ctx, query->b.type, -1);895r600_update_prims_generated_query_state(ctx, query->b.type, -1);896}897898static void emit_set_predicate(struct r600_common_context *ctx,899struct r600_resource *buf, uint64_t va,900uint32_t op)901{902struct radeon_cmdbuf *cs = &ctx->gfx.cs;903904radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));905radeon_emit(cs, va);906radeon_emit(cs, op | ((va >> 32) & 0xFF));907r600_emit_reloc(ctx, &ctx->gfx, buf, RADEON_USAGE_READ,908RADEON_PRIO_QUERY);909}910911static void r600_emit_query_predication(struct r600_common_context *ctx,912struct r600_atom *atom)913{914struct r600_query_hw *query = (struct r600_query_hw *)ctx->render_cond;915struct r600_query_buffer *qbuf;916uint32_t op;917bool flag_wait, invert;918919if (!query)920return;921922invert = ctx->render_cond_invert;923flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||924ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;925926switch (query->b.type) {927case PIPE_QUERY_OCCLUSION_COUNTER:928case PIPE_QUERY_OCCLUSION_PREDICATE:929case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:930op = PRED_OP(PREDICATION_OP_ZPASS);931break;932case PIPE_QUERY_SO_OVERFLOW_PREDICATE:933case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:934op = PRED_OP(PREDICATION_OP_PRIMCOUNT);935invert = !invert;936break;937default:938assert(0);939return;940}941942/* if true then invert, see GL_ARB_conditional_render_inverted */943if (invert)944op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */945else946op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */947948op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;949950/* emit predicate packets for all data blocks */951for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {952unsigned results_base = 0;953uint64_t va_base = qbuf->buf->gpu_address;954955while (results_base < qbuf->results_end) {956uint64_t va = va_base + results_base;957958if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {959for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) {960emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);961962/* set CONTINUE bit for all packets except the first */963op |= PREDICATION_CONTINUE;964}965} else {966emit_set_predicate(ctx, qbuf->buf, va, op);967op |= PREDICATION_CONTINUE;968}969970results_base += query->result_size;971}972}973}974975static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)976{977struct r600_common_screen *rscreen =978(struct r600_common_screen *)ctx->screen;979980if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT ||981query_type == PIPE_QUERY_GPU_FINISHED ||982query_type >= PIPE_QUERY_DRIVER_SPECIFIC)983return r600_query_sw_create(query_type);984985return r600_query_hw_create(rscreen, query_type, index);986}987988static void r600_destroy_query(struct pipe_context *ctx, struct pipe_query *query)989{990struct r600_common_context *rctx = (struct r600_common_context *)ctx;991struct r600_query *rquery = (struct r600_query *)query;992993rquery->ops->destroy(rctx->screen, rquery);994}995996static bool r600_begin_query(struct pipe_context *ctx,997struct pipe_query *query)998{999struct r600_common_context *rctx = (struct r600_common_context *)ctx;1000struct r600_query *rquery = (struct r600_query *)query;10011002return rquery->ops->begin(rctx, rquery);1003}10041005void r600_query_hw_reset_buffers(struct r600_common_context *rctx,1006struct r600_query_hw *query)1007{1008struct r600_query_buffer *prev = query->buffer.previous;10091010/* Discard the old query buffers. */1011while (prev) {1012struct r600_query_buffer *qbuf = prev;1013prev = prev->previous;1014r600_resource_reference(&qbuf->buf, NULL);1015FREE(qbuf);1016}10171018query->buffer.results_end = 0;1019query->buffer.previous = NULL;10201021/* Obtain a new buffer if the current one can't be mapped without a stall. */1022if (r600_rings_is_buffer_referenced(rctx, query->buffer.buf->buf, RADEON_USAGE_READWRITE) ||1023!rctx->ws->buffer_wait(rctx->ws, query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) {1024r600_resource_reference(&query->buffer.buf, NULL);1025query->buffer.buf = r600_new_query_buffer(rctx->screen, query);1026} else {1027if (!query->ops->prepare_buffer(rctx->screen, query, query->buffer.buf))1028r600_resource_reference(&query->buffer.buf, NULL);1029}1030}10311032bool r600_query_hw_begin(struct r600_common_context *rctx,1033struct r600_query *rquery)1034{1035struct r600_query_hw *query = (struct r600_query_hw *)rquery;10361037if (query->flags & R600_QUERY_HW_FLAG_NO_START) {1038assert(0);1039return false;1040}10411042if (!(query->flags & R600_QUERY_HW_FLAG_BEGIN_RESUMES))1043r600_query_hw_reset_buffers(rctx, query);10441045r600_query_hw_emit_start(rctx, query);1046if (!query->buffer.buf)1047return false;10481049list_addtail(&query->list, &rctx->active_queries);1050return true;1051}10521053static bool r600_end_query(struct pipe_context *ctx, struct pipe_query *query)1054{1055struct r600_common_context *rctx = (struct r600_common_context *)ctx;1056struct r600_query *rquery = (struct r600_query *)query;10571058return rquery->ops->end(rctx, rquery);1059}10601061bool r600_query_hw_end(struct r600_common_context *rctx,1062struct r600_query *rquery)1063{1064struct r600_query_hw *query = (struct r600_query_hw *)rquery;10651066if (query->flags & R600_QUERY_HW_FLAG_NO_START)1067r600_query_hw_reset_buffers(rctx, query);10681069r600_query_hw_emit_stop(rctx, query);10701071if (!(query->flags & R600_QUERY_HW_FLAG_NO_START))1072list_delinit(&query->list);10731074if (!query->buffer.buf)1075return false;10761077return true;1078}10791080static void r600_get_hw_query_params(struct r600_common_context *rctx,1081struct r600_query_hw *rquery, int index,1082struct r600_hw_query_params *params)1083{1084unsigned max_rbs = rctx->screen->info.max_render_backends;10851086params->pair_stride = 0;1087params->pair_count = 1;10881089switch (rquery->b.type) {1090case PIPE_QUERY_OCCLUSION_COUNTER:1091case PIPE_QUERY_OCCLUSION_PREDICATE:1092case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:1093params->start_offset = 0;1094params->end_offset = 8;1095params->fence_offset = max_rbs * 16;1096params->pair_stride = 16;1097params->pair_count = max_rbs;1098break;1099case PIPE_QUERY_TIME_ELAPSED:1100params->start_offset = 0;1101params->end_offset = 8;1102params->fence_offset = 16;1103break;1104case PIPE_QUERY_TIMESTAMP:1105params->start_offset = 0;1106params->end_offset = 0;1107params->fence_offset = 8;1108break;1109case PIPE_QUERY_PRIMITIVES_EMITTED:1110params->start_offset = 8;1111params->end_offset = 24;1112params->fence_offset = params->end_offset + 4;1113break;1114case PIPE_QUERY_PRIMITIVES_GENERATED:1115params->start_offset = 0;1116params->end_offset = 16;1117params->fence_offset = params->end_offset + 4;1118break;1119case PIPE_QUERY_SO_STATISTICS:1120params->start_offset = 8 - index * 8;1121params->end_offset = 24 - index * 8;1122params->fence_offset = params->end_offset + 4;1123break;1124case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:1125params->pair_count = R600_MAX_STREAMS;1126params->pair_stride = 32;1127FALLTHROUGH;1128case PIPE_QUERY_SO_OVERFLOW_PREDICATE:1129params->start_offset = 0;1130params->end_offset = 16;11311132/* We can re-use the high dword of the last 64-bit value as a1133* fence: it is initialized as 0, and the high bit is set by1134* the write of the streamout stats event.1135*/1136params->fence_offset = rquery->result_size - 4;1137break;1138case PIPE_QUERY_PIPELINE_STATISTICS:1139{1140/* Offsets apply to EG+ */1141static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80};1142params->start_offset = offsets[index];1143params->end_offset = 88 + offsets[index];1144params->fence_offset = 2 * 88;1145break;1146}1147default:1148unreachable("r600_get_hw_query_params unsupported");1149}1150}11511152static unsigned r600_query_read_result(void *map, unsigned start_index, unsigned end_index,1153bool test_status_bit)1154{1155uint32_t *current_result = (uint32_t*)map;1156uint64_t start, end;11571158start = (uint64_t)current_result[start_index] |1159(uint64_t)current_result[start_index+1] << 32;1160end = (uint64_t)current_result[end_index] |1161(uint64_t)current_result[end_index+1] << 32;11621163if (!test_status_bit ||1164((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {1165return end - start;1166}1167return 0;1168}11691170static void r600_query_hw_add_result(struct r600_common_screen *rscreen,1171struct r600_query_hw *query,1172void *buffer,1173union pipe_query_result *result)1174{1175unsigned max_rbs = rscreen->info.max_render_backends;11761177switch (query->b.type) {1178case PIPE_QUERY_OCCLUSION_COUNTER: {1179for (unsigned i = 0; i < max_rbs; ++i) {1180unsigned results_base = i * 16;1181result->u64 +=1182r600_query_read_result(buffer + results_base, 0, 2, true);1183}1184break;1185}1186case PIPE_QUERY_OCCLUSION_PREDICATE:1187case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {1188for (unsigned i = 0; i < max_rbs; ++i) {1189unsigned results_base = i * 16;1190result->b = result->b ||1191r600_query_read_result(buffer + results_base, 0, 2, true) != 0;1192}1193break;1194}1195case PIPE_QUERY_TIME_ELAPSED:1196result->u64 += r600_query_read_result(buffer, 0, 2, false);1197break;1198case PIPE_QUERY_TIMESTAMP:1199result->u64 = *(uint64_t*)buffer;1200break;1201case PIPE_QUERY_PRIMITIVES_EMITTED:1202/* SAMPLE_STREAMOUTSTATS stores this structure:1203* {1204* u64 NumPrimitivesWritten;1205* u64 PrimitiveStorageNeeded;1206* }1207* We only need NumPrimitivesWritten here. */1208result->u64 += r600_query_read_result(buffer, 2, 6, true);1209break;1210case PIPE_QUERY_PRIMITIVES_GENERATED:1211/* Here we read PrimitiveStorageNeeded. */1212result->u64 += r600_query_read_result(buffer, 0, 4, true);1213break;1214case PIPE_QUERY_SO_STATISTICS:1215result->so_statistics.num_primitives_written +=1216r600_query_read_result(buffer, 2, 6, true);1217result->so_statistics.primitives_storage_needed +=1218r600_query_read_result(buffer, 0, 4, true);1219break;1220case PIPE_QUERY_SO_OVERFLOW_PREDICATE:1221result->b = result->b ||1222r600_query_read_result(buffer, 2, 6, true) !=1223r600_query_read_result(buffer, 0, 4, true);1224break;1225case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:1226for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) {1227result->b = result->b ||1228r600_query_read_result(buffer, 2, 6, true) !=1229r600_query_read_result(buffer, 0, 4, true);1230buffer = (char *)buffer + 32;1231}1232break;1233case PIPE_QUERY_PIPELINE_STATISTICS:1234if (rscreen->chip_class >= EVERGREEN) {1235result->pipeline_statistics.ps_invocations +=1236r600_query_read_result(buffer, 0, 22, false);1237result->pipeline_statistics.c_primitives +=1238r600_query_read_result(buffer, 2, 24, false);1239result->pipeline_statistics.c_invocations +=1240r600_query_read_result(buffer, 4, 26, false);1241result->pipeline_statistics.vs_invocations +=1242r600_query_read_result(buffer, 6, 28, false);1243result->pipeline_statistics.gs_invocations +=1244r600_query_read_result(buffer, 8, 30, false);1245result->pipeline_statistics.gs_primitives +=1246r600_query_read_result(buffer, 10, 32, false);1247result->pipeline_statistics.ia_primitives +=1248r600_query_read_result(buffer, 12, 34, false);1249result->pipeline_statistics.ia_vertices +=1250r600_query_read_result(buffer, 14, 36, false);1251result->pipeline_statistics.hs_invocations +=1252r600_query_read_result(buffer, 16, 38, false);1253result->pipeline_statistics.ds_invocations +=1254r600_query_read_result(buffer, 18, 40, false);1255result->pipeline_statistics.cs_invocations +=1256r600_query_read_result(buffer, 20, 42, false);1257} else {1258result->pipeline_statistics.ps_invocations +=1259r600_query_read_result(buffer, 0, 16, false);1260result->pipeline_statistics.c_primitives +=1261r600_query_read_result(buffer, 2, 18, false);1262result->pipeline_statistics.c_invocations +=1263r600_query_read_result(buffer, 4, 20, false);1264result->pipeline_statistics.vs_invocations +=1265r600_query_read_result(buffer, 6, 22, false);1266result->pipeline_statistics.gs_invocations +=1267r600_query_read_result(buffer, 8, 24, false);1268result->pipeline_statistics.gs_primitives +=1269r600_query_read_result(buffer, 10, 26, false);1270result->pipeline_statistics.ia_primitives +=1271r600_query_read_result(buffer, 12, 28, false);1272result->pipeline_statistics.ia_vertices +=1273r600_query_read_result(buffer, 14, 30, false);1274}1275#if 0 /* for testing */1276printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "1277"DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, "1278"Clipper prims=%llu, PS=%llu, CS=%llu\n",1279result->pipeline_statistics.ia_vertices,1280result->pipeline_statistics.ia_primitives,1281result->pipeline_statistics.vs_invocations,1282result->pipeline_statistics.hs_invocations,1283result->pipeline_statistics.ds_invocations,1284result->pipeline_statistics.gs_invocations,1285result->pipeline_statistics.gs_primitives,1286result->pipeline_statistics.c_invocations,1287result->pipeline_statistics.c_primitives,1288result->pipeline_statistics.ps_invocations,1289result->pipeline_statistics.cs_invocations);1290#endif1291break;1292default:1293assert(0);1294}1295}12961297static bool r600_get_query_result(struct pipe_context *ctx,1298struct pipe_query *query, bool wait,1299union pipe_query_result *result)1300{1301struct r600_common_context *rctx = (struct r600_common_context *)ctx;1302struct r600_query *rquery = (struct r600_query *)query;13031304return rquery->ops->get_result(rctx, rquery, wait, result);1305}13061307static void r600_get_query_result_resource(struct pipe_context *ctx,1308struct pipe_query *query,1309bool wait,1310enum pipe_query_value_type result_type,1311int index,1312struct pipe_resource *resource,1313unsigned offset)1314{1315struct r600_common_context *rctx = (struct r600_common_context *)ctx;1316struct r600_query *rquery = (struct r600_query *)query;13171318rquery->ops->get_result_resource(rctx, rquery, wait, result_type, index,1319resource, offset);1320}13211322static void r600_query_hw_clear_result(struct r600_query_hw *query,1323union pipe_query_result *result)1324{1325util_query_clear_result(result, query->b.type);1326}13271328bool r600_query_hw_get_result(struct r600_common_context *rctx,1329struct r600_query *rquery,1330bool wait, union pipe_query_result *result)1331{1332struct r600_common_screen *rscreen = rctx->screen;1333struct r600_query_hw *query = (struct r600_query_hw *)rquery;1334struct r600_query_buffer *qbuf;13351336query->ops->clear_result(query, result);13371338for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {1339unsigned usage = PIPE_MAP_READ |1340(wait ? 0 : PIPE_MAP_DONTBLOCK);1341unsigned results_base = 0;1342void *map;13431344if (rquery->b.flushed)1345map = rctx->ws->buffer_map(rctx->ws, qbuf->buf->buf, NULL, usage);1346else1347map = r600_buffer_map_sync_with_rings(rctx, qbuf->buf, usage);13481349if (!map)1350return false;13511352while (results_base != qbuf->results_end) {1353query->ops->add_result(rscreen, query, map + results_base,1354result);1355results_base += query->result_size;1356}1357}13581359/* Convert the time to expected units. */1360if (rquery->type == PIPE_QUERY_TIME_ELAPSED ||1361rquery->type == PIPE_QUERY_TIMESTAMP) {1362result->u64 = (1000000 * result->u64) / rscreen->info.clock_crystal_freq;1363}1364return true;1365}13661367/* Create the compute shader that is used to collect the results.1368*1369* One compute grid with a single thread is launched for every query result1370* buffer. The thread (optionally) reads a previous summary buffer, then1371* accumulates data from the query result buffer, and writes the result either1372* to a summary buffer to be consumed by the next grid invocation or to the1373* user-supplied buffer.1374*1375* Data layout:1376*1377* CONST1378* 0.x = end_offset1379* 0.y = result_stride1380* 0.z = result_count1381* 0.w = bit field:1382* 1: read previously accumulated values1383* 2: write accumulated values for chaining1384* 4: write result available1385* 8: convert result to boolean (0/1)1386* 16: only read one dword and use that as result1387* 32: apply timestamp conversion1388* 64: store full 64 bits result1389* 128: store signed 32 bits result1390* 256: SO_OVERFLOW mode: take the difference of two successive half-pairs1391* 1.x = fence_offset1392* 1.y = pair_stride1393* 1.z = pair_count1394* 1.w = result_offset1395* 2.x = buffer0 offset1396*1397* BUFFER[0] = query result buffer1398* BUFFER[1] = previous summary buffer1399* BUFFER[2] = next summary buffer or user-supplied buffer1400*/1401static void r600_create_query_result_shader(struct r600_common_context *rctx)1402{1403/* TEMP[0].xy = accumulated result so far1404* TEMP[0].z = result not available1405*1406* TEMP[1].x = current result index1407* TEMP[1].y = current pair index1408*/1409static const char text_tmpl[] =1410"COMP\n"1411"PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"1412"PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"1413"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"1414"DCL BUFFER[0]\n"1415"DCL BUFFER[1]\n"1416"DCL BUFFER[2]\n"1417"DCL CONST[0][0..2]\n"1418"DCL TEMP[0..5]\n"1419"IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n"1420"IMM[1] UINT32 {1, 2, 4, 8}\n"1421"IMM[2] UINT32 {16, 32, 64, 128}\n"1422"IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */1423"IMM[4] UINT32 {256, 0, 0, 0}\n"14241425"AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n"1426"UIF TEMP[5]\n"1427/* Check result availability. */1428"UADD TEMP[1].x, CONST[0][1].xxxx, CONST[0][2].xxxx\n"1429"LOAD TEMP[1].x, BUFFER[0], TEMP[1].xxxx\n"1430"ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n"1431"MOV TEMP[1], TEMP[0].zzzz\n"1432"NOT TEMP[0].z, TEMP[0].zzzz\n"14331434/* Load result if available. */1435"UIF TEMP[1]\n"1436"UADD TEMP[0].x, IMM[0].xxxx, CONST[0][2].xxxx\n"1437"LOAD TEMP[0].xy, BUFFER[0], TEMP[0].xxxx\n"1438"ENDIF\n"1439"ELSE\n"1440/* Load previously accumulated result if requested. */1441"MOV TEMP[0], IMM[0].xxxx\n"1442"AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n"1443"UIF TEMP[4]\n"1444"LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n"1445"ENDIF\n"14461447"MOV TEMP[1].x, IMM[0].xxxx\n"1448"BGNLOOP\n"1449/* Break if accumulated result so far is not available. */1450"UIF TEMP[0].zzzz\n"1451"BRK\n"1452"ENDIF\n"14531454/* Break if result_index >= result_count. */1455"USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n"1456"UIF TEMP[5]\n"1457"BRK\n"1458"ENDIF\n"14591460/* Load fence and check result availability */1461"UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n"1462"UADD TEMP[5].x, TEMP[5].xxxx, CONST[0][2].xxxx\n"1463"LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"1464"ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n"1465"NOT TEMP[0].z, TEMP[0].zzzz\n"1466"UIF TEMP[0].zzzz\n"1467"BRK\n"1468"ENDIF\n"14691470"MOV TEMP[1].y, IMM[0].xxxx\n"1471"BGNLOOP\n"1472/* Load start and end. */1473"UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n"1474"UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n"1475"UADD TEMP[5].x, TEMP[5].xxxx, CONST[0][2].xxxx\n"1476"LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"14771478"UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n"1479"LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"14801481"U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n"14821483"AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n"1484"UIF TEMP[5].zzzz\n"1485/* Load second start/end half-pair and1486* take the difference1487*/1488"UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n"1489"LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"1490"LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"14911492"U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n"1493"U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n"1494"ENDIF\n"14951496"U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n"14971498/* Increment pair index */1499"UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n"1500"USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n"1501"UIF TEMP[5]\n"1502"BRK\n"1503"ENDIF\n"1504"ENDLOOP\n"15051506/* Increment result index */1507"UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n"1508"ENDLOOP\n"1509"ENDIF\n"15101511"AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n"1512"UIF TEMP[4]\n"1513/* Store accumulated data for chaining. */1514"STORE BUFFER[2].xyz, CONST[0][1].wwww, TEMP[0]\n"1515"ELSE\n"1516"AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n"1517"UIF TEMP[4]\n"1518/* Store result availability. */1519"NOT TEMP[0].z, TEMP[0]\n"1520"AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n"1521"STORE BUFFER[2].x, CONST[0][1].wwww, TEMP[0].zzzz\n"15221523"AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"1524"UIF TEMP[4]\n"1525"STORE BUFFER[2].y, CONST[0][1].wwww, IMM[0].xxxx\n"1526"ENDIF\n"1527"ELSE\n"1528/* Store result if it is available. */1529"NOT TEMP[4], TEMP[0].zzzz\n"1530"UIF TEMP[4]\n"1531/* Apply timestamp conversion */1532"AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n"1533"UIF TEMP[4]\n"1534"U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n"1535"U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n"1536"ENDIF\n"15371538/* Convert to boolean */1539"AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n"1540"UIF TEMP[4]\n"1541"U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n"1542"AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n"1543"MOV TEMP[0].y, IMM[0].xxxx\n"1544"ENDIF\n"15451546"AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"1547"UIF TEMP[4]\n"1548"STORE BUFFER[2].xy, CONST[0][1].wwww, TEMP[0].xyxy\n"1549"ELSE\n"1550/* Clamping */1551"UIF TEMP[0].yyyy\n"1552"MOV TEMP[0].x, IMM[0].wwww\n"1553"ENDIF\n"15541555"AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n"1556"UIF TEMP[4]\n"1557"UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n"1558"ENDIF\n"15591560"STORE BUFFER[2].x, CONST[0][1].wwww, TEMP[0].xxxx\n"1561"ENDIF\n"1562"ENDIF\n"1563"ENDIF\n"1564"ENDIF\n"15651566"END\n";15671568char text[sizeof(text_tmpl) + 32];1569struct tgsi_token tokens[1024];1570struct pipe_compute_state state = {};15711572/* Hard code the frequency into the shader so that the backend can1573* use the full range of optimizations for divide-by-constant.1574*/1575snprintf(text, sizeof(text), text_tmpl,1576rctx->screen->info.clock_crystal_freq);15771578if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {1579assert(false);1580return;1581}15821583state.ir_type = PIPE_SHADER_IR_TGSI;1584state.prog = tokens;15851586rctx->query_result_shader = rctx->b.create_compute_state(&rctx->b, &state);1587}15881589static void r600_restore_qbo_state(struct r600_common_context *rctx,1590struct r600_qbo_state *st)1591{1592rctx->b.bind_compute_state(&rctx->b, st->saved_compute);1593rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, true, &st->saved_const0);1594rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo, ~0);1595for (unsigned i = 0; i < 3; ++i)1596pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL);1597}15981599static void r600_query_hw_get_result_resource(struct r600_common_context *rctx,1600struct r600_query *rquery,1601bool wait,1602enum pipe_query_value_type result_type,1603int index,1604struct pipe_resource *resource,1605unsigned offset)1606{1607struct r600_query_hw *query = (struct r600_query_hw *)rquery;1608struct r600_query_buffer *qbuf;1609struct r600_query_buffer *qbuf_prev;1610struct pipe_resource *tmp_buffer = NULL;1611unsigned tmp_buffer_offset = 0;1612struct r600_qbo_state saved_state = {};1613struct pipe_grid_info grid = {};1614struct pipe_constant_buffer constant_buffer = {};1615struct pipe_shader_buffer ssbo[3];1616struct r600_hw_query_params params;1617struct {1618uint32_t end_offset;1619uint32_t result_stride;1620uint32_t result_count;1621uint32_t config;1622uint32_t fence_offset;1623uint32_t pair_stride;1624uint32_t pair_count;1625uint32_t buffer_offset;1626uint32_t buffer0_offset;1627} consts;16281629if (!rctx->query_result_shader) {1630r600_create_query_result_shader(rctx);1631if (!rctx->query_result_shader)1632return;1633}16341635if (query->buffer.previous) {1636u_suballocator_alloc(&rctx->allocator_zeroed_memory, 16, 256,1637&tmp_buffer_offset, &tmp_buffer);1638if (!tmp_buffer)1639return;1640}16411642rctx->save_qbo_state(&rctx->b, &saved_state);16431644r600_get_hw_query_params(rctx, query, index >= 0 ? index : 0, ¶ms);1645consts.end_offset = params.end_offset - params.start_offset;1646consts.fence_offset = params.fence_offset - params.start_offset;1647consts.result_stride = query->result_size;1648consts.pair_stride = params.pair_stride;1649consts.pair_count = params.pair_count;16501651constant_buffer.buffer_size = sizeof(consts);1652constant_buffer.user_buffer = &consts;16531654ssbo[1].buffer = tmp_buffer;1655ssbo[1].buffer_offset = tmp_buffer_offset;1656ssbo[1].buffer_size = 16;16571658ssbo[2] = ssbo[1];16591660rctx->b.bind_compute_state(&rctx->b, rctx->query_result_shader);16611662grid.block[0] = 1;1663grid.block[1] = 1;1664grid.block[2] = 1;1665grid.grid[0] = 1;1666grid.grid[1] = 1;1667grid.grid[2] = 1;16681669consts.config = 0;1670if (index < 0)1671consts.config |= 4;1672if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||1673query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)1674consts.config |= 8;1675else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||1676query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)1677consts.config |= 8 | 256;1678else if (query->b.type == PIPE_QUERY_TIMESTAMP ||1679query->b.type == PIPE_QUERY_TIME_ELAPSED)1680consts.config |= 32;16811682switch (result_type) {1683case PIPE_QUERY_TYPE_U64:1684case PIPE_QUERY_TYPE_I64:1685consts.config |= 64;1686break;1687case PIPE_QUERY_TYPE_I32:1688consts.config |= 128;1689break;1690case PIPE_QUERY_TYPE_U32:1691break;1692}16931694rctx->flags |= rctx->screen->barrier_flags.cp_to_L2;16951696for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {1697if (query->b.type != PIPE_QUERY_TIMESTAMP) {1698qbuf_prev = qbuf->previous;1699consts.result_count = qbuf->results_end / query->result_size;1700consts.config &= ~3;1701if (qbuf != &query->buffer)1702consts.config |= 1;1703if (qbuf->previous)1704consts.config |= 2;1705} else {1706/* Only read the last timestamp. */1707qbuf_prev = NULL;1708consts.result_count = 0;1709consts.config |= 16;1710params.start_offset += qbuf->results_end - query->result_size;1711}17121713ssbo[0].buffer = &qbuf->buf->b.b;1714ssbo[0].buffer_offset = params.start_offset & ~0xff;1715ssbo[0].buffer_size = qbuf->results_end - ssbo[0].buffer_offset;1716consts.buffer0_offset = (params.start_offset & 0xff);1717if (!qbuf->previous) {17181719ssbo[2].buffer = resource;1720ssbo[2].buffer_offset = offset & ~0xff;1721ssbo[2].buffer_size = offset + 8;1722consts.buffer_offset = (offset & 0xff);1723} else1724consts.buffer_offset = 0;17251726rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, false, &constant_buffer);17271728rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, ~0);17291730if (wait && qbuf == &query->buffer) {1731uint64_t va;17321733/* Wait for result availability. Wait only for readiness1734* of the last entry, since the fence writes should be1735* serialized in the CP.1736*/1737va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;1738va += params.fence_offset;17391740r600_gfx_wait_fence(rctx, qbuf->buf, va, 0x80000000, 0x80000000);1741}17421743rctx->b.launch_grid(&rctx->b, &grid);1744rctx->flags |= rctx->screen->barrier_flags.compute_to_L2;1745}17461747r600_restore_qbo_state(rctx, &saved_state);1748pipe_resource_reference(&tmp_buffer, NULL);1749}17501751static void r600_render_condition(struct pipe_context *ctx,1752struct pipe_query *query,1753bool condition,1754enum pipe_render_cond_flag mode)1755{1756struct r600_common_context *rctx = (struct r600_common_context *)ctx;1757struct r600_query_hw *rquery = (struct r600_query_hw *)query;1758struct r600_query_buffer *qbuf;1759struct r600_atom *atom = &rctx->render_cond_atom;17601761/* Compute the size of SET_PREDICATION packets. */1762atom->num_dw = 0;1763if (query) {1764for (qbuf = &rquery->buffer; qbuf; qbuf = qbuf->previous)1765atom->num_dw += (qbuf->results_end / rquery->result_size) * 5;17661767if (rquery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)1768atom->num_dw *= R600_MAX_STREAMS;1769}17701771rctx->render_cond = query;1772rctx->render_cond_invert = condition;1773rctx->render_cond_mode = mode;17741775rctx->set_atom_dirty(rctx, atom, query != NULL);1776}17771778void r600_suspend_queries(struct r600_common_context *ctx)1779{1780struct r600_query_hw *query;17811782LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) {1783r600_query_hw_emit_stop(ctx, query);1784}1785assert(ctx->num_cs_dw_queries_suspend == 0);1786}17871788static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *ctx,1789struct list_head *query_list)1790{1791struct r600_query_hw *query;1792unsigned num_dw = 0;17931794LIST_FOR_EACH_ENTRY(query, query_list, list) {1795/* begin + end */1796num_dw += query->num_cs_dw_begin + query->num_cs_dw_end;17971798/* Workaround for the fact that1799* num_cs_dw_nontimer_queries_suspend is incremented for every1800* resumed query, which raises the bar in need_cs_space for1801* queries about to be resumed.1802*/1803num_dw += query->num_cs_dw_end;1804}1805/* primitives generated query */1806num_dw += ctx->streamout.enable_atom.num_dw;1807/* guess for ZPASS enable or PERFECT_ZPASS_COUNT enable updates */1808num_dw += 13;18091810return num_dw;1811}18121813void r600_resume_queries(struct r600_common_context *ctx)1814{1815struct r600_query_hw *query;1816unsigned num_cs_dw = r600_queries_num_cs_dw_for_resuming(ctx, &ctx->active_queries);18171818assert(ctx->num_cs_dw_queries_suspend == 0);18191820/* Check CS space here. Resuming must not be interrupted by flushes. */1821ctx->need_gfx_cs_space(&ctx->b, num_cs_dw, true);18221823LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) {1824r600_query_hw_emit_start(ctx, query);1825}1826}18271828/* Fix radeon_info::enabled_rb_mask for R600, R700, EVERGREEN, NI. */1829void r600_query_fix_enabled_rb_mask(struct r600_common_screen *rscreen)1830{1831struct r600_common_context *ctx =1832(struct r600_common_context*)rscreen->aux_context;1833struct radeon_cmdbuf *cs = &ctx->gfx.cs;1834struct r600_resource *buffer;1835uint32_t *results;1836unsigned i, mask = 0;1837unsigned max_rbs;18381839if (ctx->family == CHIP_JUNIPER) {1840/*1841* Fix for predication lockups - the chip can only ever have1842* 4 RBs, however it looks like the predication logic assumes1843* there's 8, trying to read results from query buffers never1844* written to. By increasing this number we'll write the1845* status bit for these as per the normal disabled rb logic.1846*/1847ctx->screen->info.max_render_backends = 8;1848}1849max_rbs = ctx->screen->info.max_render_backends;18501851assert(rscreen->chip_class <= CAYMAN);18521853/*1854* if backend_map query is supported by the kernel.1855* Note the kernel drm driver for a long time never filled in the1856* associated data on eg/cm, only on r600/r700, hence ignore the valid1857* bit there if the map is zero.1858* (Albeit some chips with just one active rb can have a valid 0 map.)1859*/1860if (rscreen->info.r600_gb_backend_map_valid &&1861(ctx->chip_class < EVERGREEN || rscreen->info.r600_gb_backend_map != 0)) {1862unsigned num_tile_pipes = rscreen->info.num_tile_pipes;1863unsigned backend_map = rscreen->info.r600_gb_backend_map;1864unsigned item_width, item_mask;18651866if (ctx->chip_class >= EVERGREEN) {1867item_width = 4;1868item_mask = 0x7;1869} else {1870item_width = 2;1871item_mask = 0x3;1872}18731874while (num_tile_pipes--) {1875i = backend_map & item_mask;1876mask |= (1<<i);1877backend_map >>= item_width;1878}1879if (mask != 0) {1880rscreen->info.enabled_rb_mask = mask;1881return;1882}1883}18841885/* otherwise backup path for older kernels */18861887/* create buffer for event data */1888buffer = (struct r600_resource*)1889pipe_buffer_create(ctx->b.screen, 0,1890PIPE_USAGE_STAGING, max_rbs * 16);1891if (!buffer)1892return;18931894/* initialize buffer with zeroes */1895results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_MAP_WRITE);1896if (results) {1897memset(results, 0, max_rbs * 4 * 4);18981899/* emit EVENT_WRITE for ZPASS_DONE */1900radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));1901radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));1902radeon_emit(cs, buffer->gpu_address);1903radeon_emit(cs, buffer->gpu_address >> 32);19041905r600_emit_reloc(ctx, &ctx->gfx, buffer,1906RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);19071908/* analyze results */1909results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_MAP_READ);1910if (results) {1911for(i = 0; i < max_rbs; i++) {1912/* at least highest bit will be set if backend is used */1913if (results[i*4 + 1])1914mask |= (1<<i);1915}1916}1917}19181919r600_resource_reference(&buffer, NULL);19201921if (mask) {1922if (rscreen->debug_flags & DBG_INFO &&1923mask != rscreen->info.enabled_rb_mask) {1924printf("enabled_rb_mask (fixed) = 0x%x\n", mask);1925}1926rscreen->info.enabled_rb_mask = mask;1927}1928}19291930#define XFULL(name_, query_type_, type_, result_type_, group_id_) \1931{ \1932.name = name_, \1933.query_type = R600_QUERY_##query_type_, \1934.type = PIPE_DRIVER_QUERY_TYPE_##type_, \1935.result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, \1936.group_id = group_id_ \1937}19381939#define X(name_, query_type_, type_, result_type_) \1940XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)19411942#define XG(group_, name_, query_type_, type_, result_type_) \1943XFULL(name_, query_type_, type_, result_type_, R600_QUERY_GROUP_##group_)19441945static const struct pipe_driver_query_info r600_driver_query_list[] = {1946X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE),1947X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE),1948X("num-shader-cache-hits", NUM_SHADER_CACHE_HITS, UINT64, CUMULATIVE),1949X("draw-calls", DRAW_CALLS, UINT64, AVERAGE),1950X("decompress-calls", DECOMPRESS_CALLS, UINT64, AVERAGE),1951X("MRT-draw-calls", MRT_DRAW_CALLS, UINT64, AVERAGE),1952X("prim-restart-calls", PRIM_RESTART_CALLS, UINT64, AVERAGE),1953X("spill-draw-calls", SPILL_DRAW_CALLS, UINT64, AVERAGE),1954X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE),1955X("spill-compute-calls", SPILL_COMPUTE_CALLS, UINT64, AVERAGE),1956X("dma-calls", DMA_CALLS, UINT64, AVERAGE),1957X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE),1958X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE),1959X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE),1960X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE),1961X("num-CB-cache-flushes", NUM_CB_CACHE_FLUSHES, UINT64, AVERAGE),1962X("num-DB-cache-flushes", NUM_DB_CACHE_FLUSHES, UINT64, AVERAGE),1963X("num-resident-handles", NUM_RESIDENT_HANDLES, UINT64, AVERAGE),1964X("tc-offloaded-slots", TC_OFFLOADED_SLOTS, UINT64, AVERAGE),1965X("tc-direct-slots", TC_DIRECT_SLOTS, UINT64, AVERAGE),1966X("tc-num-syncs", TC_NUM_SYNCS, UINT64, AVERAGE),1967X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE),1968X("gallium-thread-busy", GALLIUM_THREAD_BUSY, UINT64, AVERAGE),1969X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE),1970X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE),1971X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE),1972X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE),1973X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE),1974X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE),1975X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE),1976X("num-SDMA-IBs", NUM_SDMA_IBS, UINT64, AVERAGE),1977X("GFX-BO-list-size", GFX_BO_LIST_SIZE, UINT64, AVERAGE),1978X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE),1979X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE),1980X("VRAM-CPU-page-faults", NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE),1981X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE),1982X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE),1983X("GTT-usage", GTT_USAGE, BYTES, AVERAGE),19841985/* GPIN queries are for the benefit of old versions of GPUPerfStudio,1986* which use it as a fallback path to detect the GPU type.1987*1988* Note: The names of these queries are significant for GPUPerfStudio1989* (and possibly their order as well). */1990XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE),1991XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE),1992XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE),1993XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE),1994XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE),19951996X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE),1997X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE),1998X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE),19992000/* The following queries must be at the end of the list because their2001* availability is adjusted dynamically based on the DRM version. */2002X("GPU-load", GPU_LOAD, UINT64, AVERAGE),2003X("GPU-shaders-busy", GPU_SHADERS_BUSY, UINT64, AVERAGE),2004X("GPU-ta-busy", GPU_TA_BUSY, UINT64, AVERAGE),2005X("GPU-gds-busy", GPU_GDS_BUSY, UINT64, AVERAGE),2006X("GPU-vgt-busy", GPU_VGT_BUSY, UINT64, AVERAGE),2007X("GPU-ia-busy", GPU_IA_BUSY, UINT64, AVERAGE),2008X("GPU-sx-busy", GPU_SX_BUSY, UINT64, AVERAGE),2009X("GPU-wd-busy", GPU_WD_BUSY, UINT64, AVERAGE),2010X("GPU-bci-busy", GPU_BCI_BUSY, UINT64, AVERAGE),2011X("GPU-sc-busy", GPU_SC_BUSY, UINT64, AVERAGE),2012X("GPU-pa-busy", GPU_PA_BUSY, UINT64, AVERAGE),2013X("GPU-db-busy", GPU_DB_BUSY, UINT64, AVERAGE),2014X("GPU-cp-busy", GPU_CP_BUSY, UINT64, AVERAGE),2015X("GPU-cb-busy", GPU_CB_BUSY, UINT64, AVERAGE),2016X("GPU-sdma-busy", GPU_SDMA_BUSY, UINT64, AVERAGE),2017X("GPU-pfp-busy", GPU_PFP_BUSY, UINT64, AVERAGE),2018X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE),2019X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE),2020X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE),2021X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE),2022X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE),2023};20242025#undef X2026#undef XG2027#undef XFULL20282029static unsigned r600_get_num_queries(struct r600_common_screen *rscreen)2030{2031if (rscreen->info.drm_minor >= 42)2032return ARRAY_SIZE(r600_driver_query_list);2033else2034return ARRAY_SIZE(r600_driver_query_list) - 25;2035}20362037static int r600_get_driver_query_info(struct pipe_screen *screen,2038unsigned index,2039struct pipe_driver_query_info *info)2040{2041struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;2042unsigned num_queries = r600_get_num_queries(rscreen);20432044if (!info) {2045unsigned num_perfcounters =2046r600_get_perfcounter_info(rscreen, 0, NULL);20472048return num_queries + num_perfcounters;2049}20502051if (index >= num_queries)2052return r600_get_perfcounter_info(rscreen, index - num_queries, info);20532054*info = r600_driver_query_list[index];20552056switch (info->query_type) {2057case R600_QUERY_REQUESTED_VRAM:2058case R600_QUERY_VRAM_USAGE:2059case R600_QUERY_MAPPED_VRAM:2060info->max_value.u64 = rscreen->info.vram_size;2061break;2062case R600_QUERY_REQUESTED_GTT:2063case R600_QUERY_GTT_USAGE:2064case R600_QUERY_MAPPED_GTT:2065info->max_value.u64 = rscreen->info.gart_size;2066break;2067case R600_QUERY_GPU_TEMPERATURE:2068info->max_value.u64 = 125;2069break;2070case R600_QUERY_VRAM_VIS_USAGE:2071info->max_value.u64 = rscreen->info.vram_vis_size;2072break;2073}20742075if (info->group_id != ~(unsigned)0 && rscreen->perfcounters)2076info->group_id += rscreen->perfcounters->num_groups;20772078return 1;2079}20802081/* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware2082* performance counter groups, so be careful when changing this and related2083* functions.2084*/2085static int r600_get_driver_query_group_info(struct pipe_screen *screen,2086unsigned index,2087struct pipe_driver_query_group_info *info)2088{2089struct r600_common_screen *rscreen = (struct r600_common_screen *)screen;2090unsigned num_pc_groups = 0;20912092if (rscreen->perfcounters)2093num_pc_groups = rscreen->perfcounters->num_groups;20942095if (!info)2096return num_pc_groups + R600_NUM_SW_QUERY_GROUPS;20972098if (index < num_pc_groups)2099return r600_get_perfcounter_group_info(rscreen, index, info);21002101index -= num_pc_groups;2102if (index >= R600_NUM_SW_QUERY_GROUPS)2103return 0;21042105info->name = "GPIN";2106info->max_active_queries = 5;2107info->num_queries = 5;2108return 1;2109}21102111void r600_query_init(struct r600_common_context *rctx)2112{2113rctx->b.create_query = r600_create_query;2114rctx->b.create_batch_query = r600_create_batch_query;2115rctx->b.destroy_query = r600_destroy_query;2116rctx->b.begin_query = r600_begin_query;2117rctx->b.end_query = r600_end_query;2118rctx->b.get_query_result = r600_get_query_result;2119rctx->b.get_query_result_resource = r600_get_query_result_resource;2120rctx->render_cond_atom.emit = r600_emit_query_predication;21212122if (((struct r600_common_screen*)rctx->b.screen)->info.max_render_backends > 0)2123rctx->b.render_condition = r600_render_condition;21242125list_inithead(&rctx->active_queries);2126}21272128void r600_init_screen_query_functions(struct r600_common_screen *rscreen)2129{2130rscreen->b.get_driver_query_info = r600_get_driver_query_info;2131rscreen->b.get_driver_query_group_info = r600_get_driver_query_group_info;2132}213321342135