Path: blob/21.2-virgl/src/gallium/drivers/crocus/crocus_query.c
4570 views
/*1* Copyright © 2017 Intel Corporation2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice shall be included11* in all copies or substantial portions of the Software.12*13* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS14* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,15* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL16* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER17* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING18* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER19* DEALINGS IN THE SOFTWARE.20*/2122/**23* @file crocus_query.c24*25* ============================= GENXML CODE =============================26* [This file is compiled once per generation.]27* =======================================================================28*29* Query object support. This allows measuring various simple statistics30* via counters on the GPU. We use GenX code for MI_MATH calculations.31*/3233#include <stdio.h>34#include <errno.h>35#include "perf/intel_perf.h"36#include "pipe/p_defines.h"37#include "pipe/p_state.h"38#include "pipe/p_context.h"39#include "pipe/p_screen.h"40#include "util/u_inlines.h"41#include "util/u_upload_mgr.h"42#include "crocus_context.h"43#include "crocus_defines.h"44#include "crocus_fence.h"45#include "crocus_monitor.h"46#include "crocus_resource.h"47#include "crocus_screen.h"4849#include "crocus_genx_macros.h"5051#if GFX_VER == 652// TOOD: Add these to genxml?53#define SO_PRIM_STORAGE_NEEDED(n) (0x2280)54#define SO_NUM_PRIMS_WRITTEN(n) (0x2288)5556// TODO: remove HS/DS/CS57#define GFX6_IA_VERTICES_COUNT_num 0x231058#define GFX6_IA_PRIMITIVES_COUNT_num 0x231859#define GFX6_VS_INVOCATION_COUNT_num 0x232060#define GFX6_HS_INVOCATION_COUNT_num 0x230061#define GFX6_DS_INVOCATION_COUNT_num 0x230862#define GFX6_GS_INVOCATION_COUNT_num 0x232863#define GFX6_GS_PRIMITIVES_COUNT_num 0x233064#define GFX6_CL_INVOCATION_COUNT_num 0x233865#define GFX6_CL_PRIMITIVES_COUNT_num 0x234066#define GFX6_PS_INVOCATION_COUNT_num 0x234867#define GFX6_CS_INVOCATION_COUNT_num 0x229068#define GFX6_PS_DEPTH_COUNT_num 0x23506970#elif GFX_VER >= 771#define SO_PRIM_STORAGE_NEEDED(n) (GENX(SO_PRIM_STORAGE_NEEDED0_num) + (n) * 8)72#define SO_NUM_PRIMS_WRITTEN(n) (GENX(SO_NUM_PRIMS_WRITTEN0_num) + (n) * 8)73#endif7475struct crocus_query {76struct threaded_query b;7778enum pipe_query_type type;79int index;8081bool ready;8283bool stalled;8485uint64_t result;8687struct crocus_state_ref query_state_ref;88struct crocus_query_snapshots *map;89struct crocus_syncobj *syncobj;9091int batch_idx;9293struct crocus_monitor_object *monitor;9495/* Fence for PIPE_QUERY_GPU_FINISHED. */96struct pipe_fence_handle *fence;97};9899struct crocus_query_snapshots {100/** crocus_render_condition's saved MI_PREDICATE_RESULT value. */101uint64_t predicate_result;102103/** Have the start/end snapshots landed? */104uint64_t snapshots_landed;105106/** Starting and ending counter snapshots */107uint64_t start;108uint64_t end;109};110111struct crocus_query_so_overflow {112uint64_t predicate_result;113uint64_t snapshots_landed;114115struct {116uint64_t prim_storage_needed[2];117uint64_t num_prims[2];118} stream[4];119};120121#if GFX_VERx10 >= 75122static struct mi_value123query_mem64(struct crocus_query *q, uint32_t offset)124{125return mi_mem64(rw_bo(crocus_resource_bo(q->query_state_ref.res),126q->query_state_ref.offset + offset));127}128#endif129130/**131* Is this type of query written by PIPE_CONTROL?132*/133static bool134crocus_is_query_pipelined(struct crocus_query *q)135{136switch (q->type) {137case PIPE_QUERY_OCCLUSION_COUNTER:138case PIPE_QUERY_OCCLUSION_PREDICATE:139case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:140case PIPE_QUERY_TIMESTAMP:141case PIPE_QUERY_TIMESTAMP_DISJOINT:142case PIPE_QUERY_TIME_ELAPSED:143return true;144145default:146return false;147}148}149150static void151mark_available(struct crocus_context *ice, struct crocus_query *q)152{153#if GFX_VERx10 >= 75154struct crocus_batch *batch = &ice->batches[q->batch_idx];155struct crocus_screen *screen = batch->screen;156unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;157unsigned offset = offsetof(struct crocus_query_snapshots, snapshots_landed);158struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res);159offset += q->query_state_ref.offset;160161if (!crocus_is_query_pipelined(q)) {162screen->vtbl.store_data_imm64(batch, bo, offset, true);163} else {164/* Order available *after* the query results. */165flags |= PIPE_CONTROL_FLUSH_ENABLE;166crocus_emit_pipe_control_write(batch, "query: mark available",167flags, bo, offset, true);168}169#endif170}171172/**173* Write PS_DEPTH_COUNT to q->(dest) via a PIPE_CONTROL.174*/175static void176crocus_pipelined_write(struct crocus_batch *batch,177struct crocus_query *q,178enum pipe_control_flags flags,179unsigned offset)180{181struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res);182183crocus_emit_pipe_control_write(batch, "query: pipelined snapshot write",184flags,185bo, offset, 0ull);186}187188static void189write_value(struct crocus_context *ice, struct crocus_query *q, unsigned offset)190{191struct crocus_batch *batch = &ice->batches[q->batch_idx];192#if GFX_VER >= 6193struct crocus_screen *screen = batch->screen;194struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res);195#endif196197if (!crocus_is_query_pipelined(q)) {198crocus_emit_pipe_control_flush(batch,199"query: non-pipelined snapshot write",200PIPE_CONTROL_CS_STALL |201PIPE_CONTROL_STALL_AT_SCOREBOARD);202q->stalled = true;203}204205switch (q->type) {206case PIPE_QUERY_OCCLUSION_COUNTER:207case PIPE_QUERY_OCCLUSION_PREDICATE:208case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:209crocus_pipelined_write(&ice->batches[CROCUS_BATCH_RENDER], q,210PIPE_CONTROL_WRITE_DEPTH_COUNT |211PIPE_CONTROL_DEPTH_STALL,212offset);213break;214case PIPE_QUERY_TIME_ELAPSED:215case PIPE_QUERY_TIMESTAMP:216case PIPE_QUERY_TIMESTAMP_DISJOINT:217crocus_pipelined_write(&ice->batches[CROCUS_BATCH_RENDER], q,218PIPE_CONTROL_WRITE_TIMESTAMP,219offset);220break;221case PIPE_QUERY_PRIMITIVES_GENERATED:222#if GFX_VER >= 6223screen->vtbl.store_register_mem64(batch,224q->index == 0 ?225GENX(CL_INVOCATION_COUNT_num) :226SO_PRIM_STORAGE_NEEDED(q->index),227bo, offset, false);228#endif229break;230case PIPE_QUERY_PRIMITIVES_EMITTED:231#if GFX_VER >= 6232screen->vtbl.store_register_mem64(batch,233SO_NUM_PRIMS_WRITTEN(q->index),234bo, offset, false);235#endif236break;237case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE: {238#if GFX_VER >= 6239static const uint32_t index_to_reg[] = {240GENX(IA_VERTICES_COUNT_num),241GENX(IA_PRIMITIVES_COUNT_num),242GENX(VS_INVOCATION_COUNT_num),243GENX(GS_INVOCATION_COUNT_num),244GENX(GS_PRIMITIVES_COUNT_num),245GENX(CL_INVOCATION_COUNT_num),246GENX(CL_PRIMITIVES_COUNT_num),247GENX(PS_INVOCATION_COUNT_num),248GENX(HS_INVOCATION_COUNT_num),249GENX(DS_INVOCATION_COUNT_num),250GENX(CS_INVOCATION_COUNT_num),251};252uint32_t reg = index_to_reg[q->index];253254#if GFX_VER == 6255/* Gfx6 GS code counts full primitives, that is, it won't count individual256* triangles in a triangle strip. Use CL_INVOCATION_COUNT for that.257*/258if (q->index == PIPE_STAT_QUERY_GS_PRIMITIVES)259reg = GENX(CL_INVOCATION_COUNT_num);260#endif261262screen->vtbl.store_register_mem64(batch, reg, bo, offset, false);263#endif264break;265}266default:267assert(false);268}269}270271#if GFX_VER >= 6272static void273write_overflow_values(struct crocus_context *ice, struct crocus_query *q, bool end)274{275struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];276struct crocus_screen *screen = batch->screen;277uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;278struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res);279uint32_t offset = q->query_state_ref.offset;280crocus_emit_pipe_control_flush(batch,281"query: write SO overflow snapshots",282PIPE_CONTROL_CS_STALL |283PIPE_CONTROL_STALL_AT_SCOREBOARD);284for (uint32_t i = 0; i < count; i++) {285int s = q->index + i;286int g_idx = offset + offsetof(struct crocus_query_so_overflow,287stream[s].num_prims[end]);288int w_idx = offset + offsetof(struct crocus_query_so_overflow,289stream[s].prim_storage_needed[end]);290screen->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),291bo, g_idx, false);292screen->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),293bo, w_idx, false);294}295}296#endif297static uint64_t298crocus_raw_timestamp_delta(uint64_t time0, uint64_t time1)299{300if (time0 > time1) {301return (1ULL << TIMESTAMP_BITS) + time1 - time0;302} else {303return time1 - time0;304}305}306307static bool308stream_overflowed(struct crocus_query_so_overflow *so, int s)309{310return (so->stream[s].prim_storage_needed[1] -311so->stream[s].prim_storage_needed[0]) !=312(so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);313}314315static void316calculate_result_on_cpu(const struct intel_device_info *devinfo,317struct crocus_query *q)318{319switch (q->type) {320case PIPE_QUERY_OCCLUSION_PREDICATE:321case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:322q->result = q->map->end != q->map->start;323break;324case PIPE_QUERY_TIMESTAMP:325case PIPE_QUERY_TIMESTAMP_DISJOINT:326/* The timestamp is the single starting snapshot. */327q->result = intel_device_info_timebase_scale(devinfo, q->map->start);328q->result &= (1ull << TIMESTAMP_BITS) - 1;329break;330case PIPE_QUERY_TIME_ELAPSED:331q->result = crocus_raw_timestamp_delta(q->map->start, q->map->end);332q->result = intel_device_info_timebase_scale(devinfo, q->result);333q->result &= (1ull << TIMESTAMP_BITS) - 1;334break;335case PIPE_QUERY_SO_OVERFLOW_PREDICATE:336q->result = stream_overflowed((void *) q->map, q->index);337break;338case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:339q->result = false;340for (int i = 0; i < MAX_VERTEX_STREAMS; i++)341q->result |= stream_overflowed((void *) q->map, i);342break;343case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE:344q->result = q->map->end - q->map->start;345346/* WaDividePSInvocationCountBy4:HSW,BDW */347if (GFX_VERx10 >= 75 && q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)348q->result /= 4;349break;350case PIPE_QUERY_OCCLUSION_COUNTER:351case PIPE_QUERY_PRIMITIVES_GENERATED:352case PIPE_QUERY_PRIMITIVES_EMITTED:353default:354q->result = q->map->end - q->map->start;355break;356}357358q->ready = true;359}360361#if GFX_VERx10 >= 75362/**363* Calculate the streamout overflow for stream \p idx:364*365* (num_prims[1] - num_prims[0]) - (storage_needed[1] - storage_needed[0])366*/367static struct mi_value368calc_overflow_for_stream(struct mi_builder *b,369struct crocus_query *q,370int idx)371{372#define C(counter, i) query_mem64(q, \373offsetof(struct crocus_query_so_overflow, stream[idx].counter[i]))374375return mi_isub(b, mi_isub(b, C(num_prims, 1), C(num_prims, 0)),376mi_isub(b, C(prim_storage_needed, 1),377C(prim_storage_needed, 0)));378#undef C379}380381/**382* Calculate whether any stream has overflowed.383*/384static struct mi_value385calc_overflow_any_stream(struct mi_builder *b, struct crocus_query *q)386{387struct mi_value stream_result[MAX_VERTEX_STREAMS];388for (int i = 0; i < MAX_VERTEX_STREAMS; i++)389stream_result[i] = calc_overflow_for_stream(b, q, i);390391struct mi_value result = stream_result[0];392for (int i = 1; i < MAX_VERTEX_STREAMS; i++)393result = mi_ior(b, result, stream_result[i]);394395return result;396}397398399static bool400query_is_boolean(enum pipe_query_type type)401{402switch (type) {403case PIPE_QUERY_OCCLUSION_PREDICATE:404case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:405case PIPE_QUERY_SO_OVERFLOW_PREDICATE:406case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:407return true;408default:409return false;410}411}412413/**414* Calculate the result using MI_MATH.415*/416static struct mi_value417calculate_result_on_gpu(const struct intel_device_info *devinfo,418struct mi_builder *b,419struct crocus_query *q)420{421struct mi_value result;422struct mi_value start_val =423query_mem64(q, offsetof(struct crocus_query_snapshots, start));424struct mi_value end_val =425query_mem64(q, offsetof(struct crocus_query_snapshots, end));426427switch (q->type) {428case PIPE_QUERY_SO_OVERFLOW_PREDICATE:429result = calc_overflow_for_stream(b, q, q->index);430break;431case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:432result = calc_overflow_any_stream(b, q);433break;434case PIPE_QUERY_TIMESTAMP: {435/* TODO: This discards any fractional bits of the timebase scale.436* We would need to do a bit of fixed point math on the CS ALU, or437* launch an actual shader to calculate this with full precision.438*/439uint32_t scale = 1000000000ull / devinfo->timestamp_frequency;440result = mi_iand(b, mi_imm((1ull << 36) - 1),441mi_imul_imm(b, start_val, scale));442break;443}444case PIPE_QUERY_TIME_ELAPSED: {445/* TODO: This discards fractional bits (see above). */446uint32_t scale = 1000000000ull / devinfo->timestamp_frequency;447result = mi_imul_imm(b, mi_isub(b, end_val, start_val), scale);448break;449}450default:451result = mi_isub(b, end_val, start_val);452break;453}454/* WaDividePSInvocationCountBy4:HSW,BDW */455if (GFX_VERx10 >= 75 &&456q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&457q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)458result = mi_ushr32_imm(b, result, 2);459460if (query_is_boolean(q->type))461result = mi_iand(b, mi_nz(b, result), mi_imm(1));462463return result;464}465#endif466467static struct pipe_query *468crocus_create_query(struct pipe_context *ctx,469unsigned query_type,470unsigned index)471{472struct crocus_query *q = calloc(1, sizeof(struct crocus_query));473474q->type = query_type;475q->index = index;476q->monitor = NULL;477478if (q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&479q->index == PIPE_STAT_QUERY_CS_INVOCATIONS)480q->batch_idx = CROCUS_BATCH_COMPUTE;481else482q->batch_idx = CROCUS_BATCH_RENDER;483return (struct pipe_query *) q;484}485486static struct pipe_query *487crocus_create_batch_query(struct pipe_context *ctx,488unsigned num_queries,489unsigned *query_types)490{491struct crocus_context *ice = (void *) ctx;492struct crocus_query *q = calloc(1, sizeof(struct crocus_query));493if (unlikely(!q))494return NULL;495q->type = PIPE_QUERY_DRIVER_SPECIFIC;496q->index = -1;497q->monitor = crocus_create_monitor_object(ice, num_queries, query_types);498if (unlikely(!q->monitor)) {499free(q);500return NULL;501}502503return (struct pipe_query *) q;504}505506static void507crocus_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query)508{509struct crocus_query *query = (void *) p_query;510struct crocus_screen *screen = (void *) ctx->screen;511if (query->monitor) {512crocus_destroy_monitor_object(ctx, query->monitor);513query->monitor = NULL;514} else {515crocus_syncobj_reference(screen, &query->syncobj, NULL);516screen->base.fence_reference(ctx->screen, &query->fence, NULL);517}518free(query);519}520521522static bool523crocus_begin_query(struct pipe_context *ctx, struct pipe_query *query)524{525struct crocus_context *ice = (void *) ctx;526struct crocus_query *q = (void *) query;527528if (q->monitor)529return crocus_begin_monitor(ctx, q->monitor);530531void *ptr = NULL;532uint32_t size;533534if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||535q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)536size = sizeof(struct crocus_query_so_overflow);537else538size = sizeof(struct crocus_query_snapshots);539540u_upload_alloc(ice->query_buffer_uploader, 0,541size, size, &q->query_state_ref.offset,542&q->query_state_ref.res, &ptr);543544if (!crocus_resource_bo(q->query_state_ref.res))545return false;546547q->map = ptr;548if (!q->map)549return false;550551q->result = 0ull;552q->ready = false;553WRITE_ONCE(q->map->snapshots_landed, false);554555if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {556ice->state.prims_generated_query_active = true;557ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP;558}559560#if GFX_VER <= 5561if (q->type == PIPE_QUERY_OCCLUSION_COUNTER ||562q->type == PIPE_QUERY_OCCLUSION_PREDICATE) {563ice->state.stats_wm++;564ice->state.dirty |= CROCUS_DIRTY_WM | CROCUS_DIRTY_COLOR_CALC_STATE;565}566#endif567#if GFX_VER >= 6568if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||569q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)570write_overflow_values(ice, q, false);571else572#endif573write_value(ice, q,574q->query_state_ref.offset +575offsetof(struct crocus_query_snapshots, start));576577return true;578}579580static bool581crocus_end_query(struct pipe_context *ctx, struct pipe_query *query)582{583struct crocus_context *ice = (void *) ctx;584struct crocus_query *q = (void *) query;585586if (q->monitor)587return crocus_end_monitor(ctx, q->monitor);588589if (q->type == PIPE_QUERY_GPU_FINISHED) {590ctx->flush(ctx, &q->fence, PIPE_FLUSH_DEFERRED);591return true;592}593594struct crocus_batch *batch = &ice->batches[q->batch_idx];595596if (q->type == PIPE_QUERY_TIMESTAMP) {597crocus_begin_query(ctx, query);598crocus_batch_reference_signal_syncobj(batch, &q->syncobj);599mark_available(ice, q);600return true;601}602603#if GFX_VER <= 5604if (q->type == PIPE_QUERY_OCCLUSION_COUNTER ||605q->type == PIPE_QUERY_OCCLUSION_PREDICATE) {606ice->state.stats_wm--;607ice->state.dirty |= CROCUS_DIRTY_WM | CROCUS_DIRTY_COLOR_CALC_STATE;608}609#endif610if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {611ice->state.prims_generated_query_active = false;612ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP;613}614615#if GFX_VER >= 6616if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||617q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)618write_overflow_values(ice, q, true);619else620#endif621write_value(ice, q,622q->query_state_ref.offset +623offsetof(struct crocus_query_snapshots, end));624625crocus_batch_reference_signal_syncobj(batch, &q->syncobj);626mark_available(ice, q);627628return true;629}630631/**632* See if the snapshots have landed for a query, and if so, compute the633* result and mark it ready. Does not flush (unlike crocus_get_query_result).634*/635static void636crocus_check_query_no_flush(struct crocus_context *ice, struct crocus_query *q)637{638struct crocus_screen *screen = (void *) ice->ctx.screen;639const struct intel_device_info *devinfo = &screen->devinfo;640641if (!q->ready && READ_ONCE(q->map->snapshots_landed)) {642calculate_result_on_cpu(devinfo, q);643}644}645646static bool647crocus_get_query_result(struct pipe_context *ctx,648struct pipe_query *query,649bool wait,650union pipe_query_result *result)651{652struct crocus_context *ice = (void *) ctx;653struct crocus_query *q = (void *) query;654655if (q->monitor)656return crocus_get_monitor_result(ctx, q->monitor, wait, result->batch);657658struct crocus_screen *screen = (void *) ctx->screen;659const struct intel_device_info *devinfo = &screen->devinfo;660661if (unlikely(screen->no_hw)) {662result->u64 = 0;663return true;664}665666if (!q->ready) {667struct crocus_batch *batch = &ice->batches[q->batch_idx];668if (q->syncobj == crocus_batch_get_signal_syncobj(batch))669crocus_batch_flush(batch);670671#if GFX_VERx10 >= 75672while (!READ_ONCE(q->map->snapshots_landed)) {673if (wait)674crocus_wait_syncobj(ctx->screen, q->syncobj, INT64_MAX);675else676return false;677}678assert(READ_ONCE(q->map->snapshots_landed));679#else680if (crocus_wait_syncobj(ctx->screen, q->syncobj, wait ? INT64_MAX : 0))681return false;682#endif683calculate_result_on_cpu(devinfo, q);684}685686assert(q->ready);687688result->u64 = q->result;689690return true;691}692693#if GFX_VER >= 7694static void695crocus_get_query_result_resource(struct pipe_context *ctx,696struct pipe_query *query,697bool wait,698enum pipe_query_value_type result_type,699int index,700struct pipe_resource *p_res,701unsigned offset)702{703struct crocus_context *ice = (void *) ctx;704struct crocus_query *q = (void *) query;705struct crocus_batch *batch = &ice->batches[q->batch_idx];706struct crocus_screen *screen = batch->screen;707const struct intel_device_info *devinfo = &batch->screen->devinfo;708struct crocus_resource *res = (void *) p_res;709struct crocus_bo *query_bo = crocus_resource_bo(q->query_state_ref.res);710struct crocus_bo *dst_bo = crocus_resource_bo(p_res);711unsigned snapshots_landed_offset =712offsetof(struct crocus_query_snapshots, snapshots_landed);713714res->bind_history |= PIPE_BIND_QUERY_BUFFER;715716if (index == -1) {717/* They're asking for the availability of the result. If we still718* have commands queued up which produce the result, submit them719* now so that progress happens. Either way, copy the snapshots720* landed field to the destination resource.721*/722if (q->syncobj == crocus_batch_get_signal_syncobj(batch))723crocus_batch_flush(batch);724725screen->vtbl.copy_mem_mem(batch, dst_bo, offset,726query_bo, snapshots_landed_offset,727result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8);728return;729}730731if (!q->ready && READ_ONCE(q->map->snapshots_landed)) {732/* The final snapshots happen to have landed, so let's just compute733* the result on the CPU now...734*/735calculate_result_on_cpu(devinfo, q);736}737738if (q->ready) {739/* We happen to have the result on the CPU, so just copy it. */740if (result_type <= PIPE_QUERY_TYPE_U32) {741screen->vtbl.store_data_imm32(batch, dst_bo, offset, q->result);742} else {743screen->vtbl.store_data_imm64(batch, dst_bo, offset, q->result);744}745746/* Make sure the result lands before they use bind the QBO elsewhere747* and use the result.748*/749// XXX: Why? i965 doesn't do this.750crocus_emit_pipe_control_flush(batch,751"query: unknown QBO flushing hack",752PIPE_CONTROL_CS_STALL);753return;754}755756#if GFX_VERx10 >= 75757bool predicated = !wait && !q->stalled;758759struct mi_builder b;760mi_builder_init(&b, &batch->screen->devinfo, batch);761762struct mi_value result = calculate_result_on_gpu(devinfo, &b, q);763struct mi_value dst =764result_type <= PIPE_QUERY_TYPE_U32 ? mi_mem32(rw_bo(dst_bo, offset))765: mi_mem64(rw_bo(dst_bo, offset));766767if (predicated) {768mi_store(&b, mi_reg32(MI_PREDICATE_RESULT),769mi_mem64(ro_bo(query_bo, snapshots_landed_offset)));770mi_store_if(&b, dst, result);771} else {772mi_store(&b, dst, result);773}774#endif775}776#endif777778static void779crocus_set_active_query_state(struct pipe_context *ctx, bool enable)780{781struct crocus_context *ice = (void *) ctx;782783if (ice->state.statistics_counters_enabled == enable)784return;785786// XXX: most packets aren't paying attention to this yet, because it'd787// have to be done dynamically at draw time, which is a pain788ice->state.statistics_counters_enabled = enable;789ice->state.dirty |= CROCUS_DIRTY_CLIP |790CROCUS_DIRTY_RASTER |791CROCUS_DIRTY_STREAMOUT |792CROCUS_DIRTY_WM;793ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS |794CROCUS_STAGE_DIRTY_TCS |795CROCUS_STAGE_DIRTY_TES |796CROCUS_STAGE_DIRTY_VS;797}798799static void800set_predicate_enable(struct crocus_context *ice, bool value)801{802if (value)803ice->state.predicate = CROCUS_PREDICATE_STATE_RENDER;804else805ice->state.predicate = CROCUS_PREDICATE_STATE_DONT_RENDER;806}807808#if GFX_VER >= 7809static void810set_predicate_for_result(struct crocus_context *ice,811struct crocus_query *q,812bool inverted)813{814struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];815struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res);816817#if GFX_VERx10 < 75818/* IVB doesn't have enough MI for this */819if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||820q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {821ice->state.predicate = CROCUS_PREDICATE_STATE_STALL_FOR_QUERY;822return;823}824#endif825826/* The CPU doesn't have the query result yet; use hardware predication */827ice->state.predicate = CROCUS_PREDICATE_STATE_USE_BIT;828829/* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */830crocus_emit_pipe_control_flush(batch,831"conditional rendering: set predicate",832PIPE_CONTROL_FLUSH_ENABLE);833q->stalled = true;834835#if GFX_VERx10 < 75836struct crocus_screen *screen = batch->screen;837screen->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo,838q->query_state_ref.offset + offsetof(struct crocus_query_snapshots, start));839screen->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, bo,840q->query_state_ref.offset + offsetof(struct crocus_query_snapshots, end));841842uint32_t mi_predicate = MI_PREDICATE | MI_PREDICATE_COMBINEOP_SET |843MI_PREDICATE_COMPAREOP_SRCS_EQUAL;844if (inverted)845mi_predicate |= MI_PREDICATE_LOADOP_LOAD;846else847mi_predicate |= MI_PREDICATE_LOADOP_LOADINV;848crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));849#else850struct mi_builder b;851mi_builder_init(&b, &batch->screen->devinfo, batch);852853struct mi_value result;854855switch (q->type) {856case PIPE_QUERY_SO_OVERFLOW_PREDICATE:857result = calc_overflow_for_stream(&b, q, q->index);858break;859case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:860result = calc_overflow_any_stream(&b, q);861break;862default: {863/* PIPE_QUERY_OCCLUSION_* */864struct mi_value start =865query_mem64(q, offsetof(struct crocus_query_snapshots, start));866struct mi_value end =867query_mem64(q, offsetof(struct crocus_query_snapshots, end));868result = mi_isub(&b, end, start);869break;870}871}872873result = inverted ? mi_z(&b, result) : mi_nz(&b, result);874result = mi_iand(&b, result, mi_imm(1));875876/* We immediately set the predicate on the render batch, as all the877* counters come from 3D operations. However, we may need to predicate878* a compute dispatch, which executes in a different GEM context and has879* a different MI_PREDICATE_RESULT register. So, we save the result to880* memory and reload it in crocus_launch_grid.881*/882mi_value_ref(&b, result);883884mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), result);885mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));886887unsigned mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |888MI_PREDICATE_COMBINEOP_SET |889MI_PREDICATE_COMPAREOP_SRCS_EQUAL;890891crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));892mi_store(&b, query_mem64(q, offsetof(struct crocus_query_snapshots,893predicate_result)), result);894#endif895ice->state.compute_predicate = bo;896}897#endif898899static void900crocus_render_condition(struct pipe_context *ctx,901struct pipe_query *query,902bool condition,903enum pipe_render_cond_flag mode)904{905struct crocus_context *ice = (void *) ctx;906struct crocus_query *q = (void *) query;907908/* The old condition isn't relevant; we'll update it if necessary */909ice->state.compute_predicate = NULL;910ice->condition.query = q;911ice->condition.condition = condition;912ice->condition.mode = mode;913914if (!q) {915ice->state.predicate = CROCUS_PREDICATE_STATE_RENDER;916return;917}918919crocus_check_query_no_flush(ice, q);920921if (q->result || q->ready) {922set_predicate_enable(ice, (q->result != 0) ^ condition);923} else {924if (mode == PIPE_RENDER_COND_NO_WAIT ||925mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) {926perf_debug(&ice->dbg, "Conditional rendering demoted from "927"\"no wait\" to \"wait\".");928}929#if GFX_VER >= 7930set_predicate_for_result(ice, q, condition);931#else932ice->state.predicate = CROCUS_PREDICATE_STATE_STALL_FOR_QUERY;933#endif934}935}936937static void938crocus_resolve_conditional_render(struct crocus_context *ice)939{940struct pipe_context *ctx = (void *) ice;941struct crocus_query *q = ice->condition.query;942struct pipe_query *query = (void *) q;943union pipe_query_result result;944945if (ice->state.predicate != CROCUS_PREDICATE_STATE_USE_BIT)946return;947948assert(q);949950crocus_get_query_result(ctx, query, true, &result);951set_predicate_enable(ice, (q->result != 0) ^ ice->condition.condition);952}953954#if GFX_VER >= 7955static void956crocus_emit_compute_predicate(struct crocus_batch *batch)957{958struct crocus_context *ice = batch->ice;959struct crocus_screen *screen = batch->screen;960screen->vtbl.load_register_mem32(batch, MI_PREDICATE_SRC0,961ice->state.compute_predicate, 0);962screen->vtbl.load_register_imm32(batch, MI_PREDICATE_SRC1, 0);963unsigned mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |964MI_PREDICATE_COMBINEOP_SET |965MI_PREDICATE_COMPAREOP_SRCS_EQUAL;966967crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));968}969#endif970971void972genX(crocus_init_screen_query)(struct crocus_screen *screen)973{974screen->vtbl.resolve_conditional_render = crocus_resolve_conditional_render;975#if GFX_VER >= 7976screen->vtbl.emit_compute_predicate = crocus_emit_compute_predicate;977#endif978}979980void981genX(crocus_init_query)(struct crocus_context *ice)982{983struct pipe_context *ctx = &ice->ctx;984985ctx->create_query = crocus_create_query;986ctx->create_batch_query = crocus_create_batch_query;987ctx->destroy_query = crocus_destroy_query;988ctx->begin_query = crocus_begin_query;989ctx->end_query = crocus_end_query;990ctx->get_query_result = crocus_get_query_result;991#if GFX_VER >= 7992ctx->get_query_result_resource = crocus_get_query_result_resource;993#endif994ctx->set_active_query_state = crocus_set_active_query_state;995ctx->render_condition = crocus_render_condition;996997}9989991000