GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/intel/vulkan/genX_query.c
1
/*
2
* Copyright © 2015 Intel Corporation
3
*
4
* Permission is hereby granted, free of charge, to any person obtaining a
5
* copy of this software and associated documentation files (the "Software"),
6
* to deal in the Software without restriction, including without limitation
7
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
* and/or sell copies of the Software, and to permit persons to whom the
9
* Software is furnished to do so, subject to the following conditions:
10
*
11
* The above copyright notice and this permission notice (including the next
12
* paragraph) shall be included in all copies or substantial portions of the
13
* Software.
14
*
15
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21
* IN THE SOFTWARE.
22
*/
23
24
#include <assert.h>
25
#include <stdbool.h>
26
#include <string.h>
27
#include <unistd.h>
28
#include <fcntl.h>
29
30
#include "anv_private.h"
31
32
#include "genxml/gen_macros.h"
33
#include "genxml/genX_pack.h"
34
35
/* We reserve:
36
* - GPR 14 for perf queries
37
* - GPR 15 for conditional rendering
38
*/
39
#define MI_BUILDER_NUM_ALLOC_GPRS 14
40
#define MI_BUILDER_CAN_WRITE_BATCH GFX_VER >= 8
41
#define __gen_get_batch_dwords anv_batch_emit_dwords
42
#define __gen_address_offset anv_address_add
43
#define __gen_get_batch_address(b, a) anv_batch_address(b, a)
44
#include "common/mi_builder.h"
45
#include "perf/intel_perf.h"
46
#include "perf/intel_perf_mdapi.h"
47
#include "perf/intel_perf_regs.h"
48
49
#include "vk_util.h"
50
51
static struct anv_address
52
anv_query_address(struct anv_query_pool *pool, uint32_t query)
53
{
54
return (struct anv_address) {
55
.bo = pool->bo,
56
.offset = query * pool->stride,
57
};
58
}
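/* Every query slot lives in the pool's single BO at query * pool->stride.
 * For most query types the first 64-bit value of a slot is the
 * availability flag, followed by the type-specific begin/end snapshots
 * (see the sizing switch in CreateQueryPool below).
 */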
59
60
VkResult genX(CreateQueryPool)(
61
VkDevice _device,
62
const VkQueryPoolCreateInfo* pCreateInfo,
63
const VkAllocationCallbacks* pAllocator,
64
VkQueryPool* pQueryPool)
65
{
66
ANV_FROM_HANDLE(anv_device, device, _device);
67
const struct anv_physical_device *pdevice = device->physical;
68
#if GFX_VER >= 8
69
const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
70
struct intel_perf_counter_pass *counter_pass;
71
struct intel_perf_query_info **pass_query;
72
uint32_t n_passes = 0;
73
#endif
74
uint32_t data_offset = 0;
75
VK_MULTIALLOC(ma);
76
VkResult result;
77
78
assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
79
80
/* Query pool slots are made up of some number of 64-bit values packed
81
* tightly together. For most query types, the first 64-bit value is
82
* the "available" bit which is 0 when the query is unavailable and 1 when
83
* it is available. The 64-bit values that follow are determined by the
84
* type of query.
85
*
86
* For performance queries, we have a requirement to align OA reports at
87
* 64 bytes, so we put those first and keep the "available" bit after them,
88
* together with some other counters.
89
*/
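/* For example, with the sizing below an occlusion slot is 3 uint64s
 * (availability + begin/end = 24 bytes), a timestamp slot is 2 uint64s
 * (16 bytes), and a pipeline-statistics slot with N enabled statistics is
 * 1 + 2 * N uint64s.
 */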
90
uint32_t uint64s_per_slot = 0;
91
92
VK_MULTIALLOC_DECL(&ma, struct anv_query_pool, pool, 1);
93
94
VkQueryPipelineStatisticFlags pipeline_statistics = 0;
95
switch (pCreateInfo->queryType) {
96
case VK_QUERY_TYPE_OCCLUSION:
97
/* Occlusion queries have two values: begin and end. */
98
uint64s_per_slot = 1 + 2;
99
break;
100
case VK_QUERY_TYPE_TIMESTAMP:
101
/* Timestamps just have the one timestamp value */
102
uint64s_per_slot = 1 + 1;
103
break;
104
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
105
pipeline_statistics = pCreateInfo->pipelineStatistics;
106
/* We're going to trust this field implicitly so we need to ensure that
107
* no unhandled extension bits leak in.
108
*/
109
pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;
110
111
/* Statistics queries have a min and max for every statistic */
112
uint64s_per_slot = 1 + 2 * util_bitcount(pipeline_statistics);
113
break;
114
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
115
/* Transform feedback queries have 4 values: begin/end for
116
* written/available.
117
*/
118
uint64s_per_slot = 1 + 4;
119
break;
120
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
121
const struct intel_perf_query_field_layout *layout =
122
&pdevice->perf->query_layout;
123
124
uint64s_per_slot = 2; /* availability + marker */
125
/* Align to the requirement of the layout */
126
uint64s_per_slot = align_u32(uint64s_per_slot,
127
DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
128
data_offset = uint64s_per_slot * sizeof(uint64_t);
129
/* Add the query data for begin & end commands */
130
uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t));
131
break;
132
}
133
#if GFX_VER >= 8
134
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
135
const struct intel_perf_query_field_layout *layout =
136
&pdevice->perf->query_layout;
137
138
perf_query_info = vk_find_struct_const(pCreateInfo->pNext,
139
QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
140
n_passes = intel_perf_get_n_passes(pdevice->perf,
141
perf_query_info->pCounterIndices,
142
perf_query_info->counterIndexCount,
143
NULL);
144
vk_multialloc_add(&ma, &counter_pass, struct intel_perf_counter_pass,
145
perf_query_info->counterIndexCount);
146
vk_multialloc_add(&ma, &pass_query, struct intel_perf_query_info *,
147
n_passes);
148
uint64s_per_slot = 4 /* availability + small batch */;
149
/* Align to the requirement of the layout */
150
uint64s_per_slot = align_u32(uint64s_per_slot,
151
DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
152
data_offset = uint64s_per_slot * sizeof(uint64_t);
153
/* Add the query data for begin & end commands */
154
uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t));
155
/* Multiply by the number of passes */
156
uint64s_per_slot *= n_passes;
157
break;
158
}
159
#endif
160
default:
161
assert(!"Invalid query type");
162
}
163
164
if (!vk_object_multialloc(&device->vk, &ma, pAllocator,
165
VK_OBJECT_TYPE_QUERY_POOL))
166
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
167
168
pool->type = pCreateInfo->queryType;
169
pool->pipeline_statistics = pipeline_statistics;
170
pool->stride = uint64s_per_slot * sizeof(uint64_t);
171
pool->slots = pCreateInfo->queryCount;
172
173
if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) {
174
pool->data_offset = data_offset;
175
pool->snapshot_size = (pool->stride - data_offset) / 2;
176
}
177
#if GFX_VER >= 8
178
else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
179
pool->pass_size = pool->stride / n_passes;
180
pool->data_offset = data_offset;
181
pool->snapshot_size = (pool->pass_size - data_offset) / 2;
182
pool->n_counters = perf_query_info->counterIndexCount;
183
pool->counter_pass = counter_pass;
184
intel_perf_get_counters_passes(pdevice->perf,
185
perf_query_info->pCounterIndices,
186
perf_query_info->counterIndexCount,
187
pool->counter_pass);
188
pool->n_passes = n_passes;
189
pool->pass_query = pass_query;
190
intel_perf_get_n_passes(pdevice->perf,
191
perf_query_info->pCounterIndices,
192
perf_query_info->counterIndexCount,
193
pool->pass_query);
194
}
195
#endif
196
197
uint64_t size = pool->slots * (uint64_t)pool->stride;
198
result = anv_device_alloc_bo(device, "query-pool", size,
199
ANV_BO_ALLOC_MAPPED |
200
ANV_BO_ALLOC_SNOOPED,
201
0 /* explicit_address */,
202
&pool->bo);
203
if (result != VK_SUCCESS)
204
goto fail;
205
206
#if GFX_VER >= 8
207
if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
208
for (uint32_t p = 0; p < pool->n_passes; p++) {
209
struct mi_builder b;
210
struct anv_batch batch = {
211
.start = pool->bo->map + khr_perf_query_preamble_offset(pool, p),
212
.end = pool->bo->map + khr_perf_query_preamble_offset(pool, p) + pool->data_offset,
213
};
214
batch.next = batch.start;
215
216
mi_builder_init(&b, &device->info, &batch);
217
mi_store(&b, mi_reg64(ANV_PERF_QUERY_OFFSET_REG),
218
mi_imm(p * (uint64_t)pool->pass_size));
219
anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
220
}
221
}
222
#endif
223
224
*pQueryPool = anv_query_pool_to_handle(pool);
225
226
return VK_SUCCESS;
227
228
fail:
229
vk_free2(&device->vk.alloc, pAllocator, pool);
230
231
return result;
232
}
233
234
void genX(DestroyQueryPool)(
235
VkDevice _device,
236
VkQueryPool _pool,
237
const VkAllocationCallbacks* pAllocator)
238
{
239
ANV_FROM_HANDLE(anv_device, device, _device);
240
ANV_FROM_HANDLE(anv_query_pool, pool, _pool);
241
242
if (!pool)
243
return;
244
245
anv_device_release_bo(device, pool->bo);
246
vk_object_free(&device->vk, pAllocator, pool);
247
}
248
249
#if GFX_VER >= 8
250
/**
251
* VK_KHR_performance_query layout:
252
*
253
* --------------------------------------------
254
* | availability (8b) | | |
255
* |-------------------------------| | |
256
* | Small batch loading | | |
257
* | ANV_PERF_QUERY_OFFSET_REG | | |
258
* | (24b) | | Pass 0 |
259
* |-------------------------------| | |
260
* | some padding (see | | |
261
* | query_field_layout:alignment) | | |
262
* |-------------------------------| | |
263
* | query data | | |
264
* | (2 * query_field_layout:size) | | |
265
* |-------------------------------|-- | Query 0
266
* | availability (8b) | | |
267
* |-------------------------------| | |
268
* | Small batch loading | | |
269
* | ANV_PERF_QUERY_OFFSET_REG | | |
270
* | (24b) | | Pass 1 |
271
* |-------------------------------| | |
272
* | some padding (see | | |
273
* | query_field_layout:alignment) | | |
274
* |-------------------------------| | |
275
* | query data | | |
276
* | (2 * query_field_layout:size) | | |
277
* |-------------------------------|-----------
278
* | availability (8b) | | |
279
* |-------------------------------| | |
280
* | Small batch loading | | |
281
* | ANV_PERF_QUERY_OFFSET_REG | | |
282
* | (24b) | | Pass 0 |
283
* |-------------------------------| | |
284
* | some padding (see | | |
285
* | query_field_layout:alignment) | | |
286
* |-------------------------------| | |
287
* | query data | | |
288
* | (2 * query_field_layout:size) | | |
289
* |-------------------------------|-- | Query 1
290
* | ... | | |
291
* --------------------------------------------
292
*/
293
294
static uint64_t
295
khr_perf_query_availability_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass)
296
{
297
return query * (uint64_t)pool->stride + pass * (uint64_t)pool->pass_size;
298
}
299
300
static uint64_t
301
khr_perf_query_data_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end)
302
{
303
return query * (uint64_t)pool->stride + pass * (uint64_t)pool->pass_size +
304
pool->data_offset + (end ? pool->snapshot_size : 0);
305
}
306
307
static struct anv_address
308
khr_perf_query_availability_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass)
309
{
310
return anv_address_add(
311
(struct anv_address) { .bo = pool->bo, },
312
khr_perf_query_availability_offset(pool, query, pass));
313
}
314
315
static struct anv_address
316
khr_perf_query_data_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end)
317
{
318
return anv_address_add(
319
(struct anv_address) { .bo = pool->bo, },
320
khr_perf_query_data_offset(pool, query, pass, end));
321
}
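/* The helpers above locate data within a VK_KHR_performance_query slot:
 * each query occupies pool->stride bytes and contains n_passes sub-slots
 * of pool->pass_size bytes, each holding its own availability qword, the
 * small preamble batch, and a begin/end pair of snapshots starting at
 * pool->data_offset (see the layout diagram above).
 */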
322
323
static bool
324
khr_perf_query_ensure_relocs(struct anv_cmd_buffer *cmd_buffer)
325
{
326
if (anv_batch_has_error(&cmd_buffer->batch))
327
return false;
328
329
if (cmd_buffer->self_mod_locations)
330
return true;
331
332
struct anv_device *device = cmd_buffer->device;
333
const struct anv_physical_device *pdevice = device->physical;
334
335
cmd_buffer->self_mod_locations =
336
vk_alloc(&cmd_buffer->pool->alloc,
337
pdevice->n_perf_query_commands * sizeof(*cmd_buffer->self_mod_locations), 8,
338
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
339
340
if (!cmd_buffer->self_mod_locations) {
341
anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
342
return false;
343
}
344
345
return true;
346
}
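/* khr_perf_query_ensure_relocs() lazily allocates the array of
 * self-modifying-batch address tokens (one per perf query command, as
 * counted by pdevice->n_perf_query_commands) that CmdBeginQuery/CmdEndQuery
 * patch below, and flags the batch with VK_ERROR_OUT_OF_HOST_MEMORY on
 * allocation failure.
 */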
347
#endif
348
349
/**
350
* VK_INTEL_performance_query layout:
351
*
352
* ---------------------------------
353
* | availability (8b) |
354
* |-------------------------------|
355
* | marker (8b) |
356
* |-------------------------------|
357
* | some padding (see |
358
* | query_field_layout:alignment) |
359
* |-------------------------------|
360
* | query data |
361
* | (2 * query_field_layout:size) |
362
* ---------------------------------
363
*/
364
365
static uint32_t
366
intel_perf_marker_offset(void)
367
{
368
return 8;
369
}
370
371
static uint32_t
372
intel_perf_query_data_offset(struct anv_query_pool *pool, bool end)
373
{
374
return pool->data_offset + (end ? pool->snapshot_size : 0);
375
}
376
377
static void
378
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
379
uint32_t value_index, uint64_t result)
380
{
381
if (flags & VK_QUERY_RESULT_64_BIT) {
382
uint64_t *dst64 = dst_slot;
383
dst64[value_index] = result;
384
} else {
385
uint32_t *dst32 = dst_slot;
386
dst32[value_index] = result;
387
}
388
}
389
390
static void *
391
query_slot(struct anv_query_pool *pool, uint32_t query)
392
{
393
return pool->bo->map + query * pool->stride;
394
}
395
396
static bool
397
query_is_available(struct anv_query_pool *pool, uint32_t query)
398
{
399
#if GFX_VER >= 8
400
if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
401
for (uint32_t p = 0; p < pool->n_passes; p++) {
402
volatile uint64_t *slot =
403
pool->bo->map + khr_perf_query_availability_offset(pool, query, p);
404
if (!slot[0])
405
return false;
406
}
407
return true;
408
}
409
#endif
410
411
return *(volatile uint64_t *)query_slot(pool, query);
412
}
413
414
static VkResult
415
wait_for_available(struct anv_device *device,
416
struct anv_query_pool *pool, uint32_t query)
417
{
418
uint64_t abs_timeout = anv_get_absolute_timeout(2 * NSEC_PER_SEC);
419
420
while (anv_gettime_ns() < abs_timeout) {
421
if (query_is_available(pool, query))
422
return VK_SUCCESS;
423
VkResult status = anv_device_query_status(device);
424
if (status != VK_SUCCESS)
425
return status;
426
}
427
428
return anv_device_set_lost(device, "query timeout");
429
}
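/* Polls availability for up to 2 seconds, bailing out early if the device
 * is reported lost; hitting the timeout marks the device as lost.
 */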
430
431
VkResult genX(GetQueryPoolResults)(
432
VkDevice _device,
433
VkQueryPool queryPool,
434
uint32_t firstQuery,
435
uint32_t queryCount,
436
size_t dataSize,
437
void* pData,
438
VkDeviceSize stride,
439
VkQueryResultFlags flags)
440
{
441
ANV_FROM_HANDLE(anv_device, device, _device);
442
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
443
444
assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
445
pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
446
pool->type == VK_QUERY_TYPE_TIMESTAMP ||
447
pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
448
pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
449
pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);
450
451
if (anv_device_is_lost(device))
452
return VK_ERROR_DEVICE_LOST;
453
454
if (pData == NULL)
455
return VK_SUCCESS;
456
457
void *data_end = pData + dataSize;
458
459
VkResult status = VK_SUCCESS;
460
for (uint32_t i = 0; i < queryCount; i++) {
461
bool available = query_is_available(pool, firstQuery + i);
462
463
if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
464
status = wait_for_available(device, pool, firstQuery + i);
465
if (status != VK_SUCCESS) {
466
return status;
467
}
468
469
available = true;
470
}
471
472
/* From the Vulkan 1.0.42 spec:
473
*
474
* "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
475
* both not set then no result values are written to pData for
476
* queries that are in the unavailable state at the time of the call,
477
* and vkGetQueryPoolResults returns VK_NOT_READY. However,
478
* availability state is still written to pData for those queries if
479
* VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
480
*
481
* From VK_KHR_performance_query :
482
*
483
* "VK_QUERY_RESULT_PERFORMANCE_QUERY_RECORDED_COUNTERS_BIT_KHR specifies
484
* that the result should contain the number of counters that were recorded
485
* into a query pool of type ename:VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR"
486
*/
487
bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);
488
489
uint32_t idx = 0;
490
switch (pool->type) {
491
case VK_QUERY_TYPE_OCCLUSION: {
492
uint64_t *slot = query_slot(pool, firstQuery + i);
493
if (write_results) {
494
/* From the Vulkan 1.2.132 spec:
495
*
496
* "If VK_QUERY_RESULT_PARTIAL_BIT is set,
497
* VK_QUERY_RESULT_WAIT_BIT is not set, and the query’s status
498
* is unavailable, an intermediate result value between zero and
499
* the final result value is written to pData for that query."
500
*/
501
uint64_t result = available ? slot[2] - slot[1] : 0;
502
cpu_write_query_result(pData, flags, idx, result);
503
}
504
idx++;
505
break;
506
}
507
508
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
509
uint64_t *slot = query_slot(pool, firstQuery + i);
510
uint32_t statistics = pool->pipeline_statistics;
511
while (statistics) {
512
uint32_t stat = u_bit_scan(&statistics);
513
if (write_results) {
514
uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];
515
516
/* WaDividePSInvocationCountBy4:HSW,BDW */
517
if ((device->info.ver == 8 || device->info.is_haswell) &&
518
(1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
519
result >>= 2;
520
521
cpu_write_query_result(pData, flags, idx, result);
522
}
523
idx++;
524
}
525
assert(idx == util_bitcount(pool->pipeline_statistics));
526
break;
527
}
528
529
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
530
uint64_t *slot = query_slot(pool, firstQuery + i);
531
if (write_results)
532
cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
533
idx++;
534
if (write_results)
535
cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
536
idx++;
537
break;
538
}
539
540
case VK_QUERY_TYPE_TIMESTAMP: {
541
uint64_t *slot = query_slot(pool, firstQuery + i);
542
if (write_results)
543
cpu_write_query_result(pData, flags, idx, slot[1]);
544
idx++;
545
break;
546
}
547
548
#if GFX_VER >= 8
549
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
550
const struct anv_physical_device *pdevice = device->physical;
551
assert((flags & (VK_QUERY_RESULT_WITH_AVAILABILITY_BIT |
552
VK_QUERY_RESULT_PARTIAL_BIT)) == 0);
553
for (uint32_t p = 0; p < pool->n_passes; p++) {
554
const struct intel_perf_query_info *query = pool->pass_query[p];
555
struct intel_perf_query_result result;
556
intel_perf_query_result_clear(&result);
557
intel_perf_query_result_accumulate_fields(&result, query, &device->info,
558
pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, false),
559
pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, true),
560
false /* no_oa_accumulate */);
561
anv_perf_write_pass_results(pdevice->perf, pool, p, &result, pData);
562
}
563
break;
564
}
565
#endif
566
567
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
568
if (!write_results)
569
break;
570
const void *query_data = query_slot(pool, firstQuery + i);
571
const struct intel_perf_query_info *query = &device->physical->perf->queries[0];
572
struct intel_perf_query_result result;
573
intel_perf_query_result_clear(&result);
574
intel_perf_query_result_accumulate_fields(&result, query, &device->info,
575
query_data + intel_perf_query_data_offset(pool, false),
576
query_data + intel_perf_query_data_offset(pool, true),
577
false /* no_oa_accumulate */);
578
intel_perf_query_result_write_mdapi(pData, stride,
579
&device->info,
580
query, &result);
581
const uint64_t *marker = query_data + intel_perf_marker_offset();
582
intel_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
583
break;
584
}
585
586
default:
587
unreachable("invalid pool type");
588
}
589
590
if (!write_results)
591
status = VK_NOT_READY;
592
593
if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
594
cpu_write_query_result(pData, flags, idx, available);
595
596
pData += stride;
597
if (pData >= data_end)
598
break;
599
}
600
601
return status;
602
}
603
604
static void
605
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
606
struct anv_address addr)
607
{
608
cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
609
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
610
611
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
612
pc.DestinationAddressType = DAT_PPGTT;
613
pc.PostSyncOperation = WritePSDepthCount;
614
pc.DepthStallEnable = true;
615
pc.Address = addr;
616
617
if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4)
618
pc.CommandStreamerStallEnable = true;
619
}
620
}
621
622
static void
623
emit_query_mi_availability(struct mi_builder *b,
624
struct anv_address addr,
625
bool available)
626
{
627
mi_store(b, mi_mem64(addr), mi_imm(available));
628
}
629
630
static void
631
emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
632
struct anv_address addr,
633
bool available)
634
{
635
cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
636
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
637
638
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
639
pc.DestinationAddressType = DAT_PPGTT;
640
pc.PostSyncOperation = WriteImmediateData;
641
pc.Address = addr;
642
pc.ImmediateData = available;
643
}
644
}
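/* Availability is written with the same mechanism used for the query data:
 * a PIPE_CONTROL post-sync write for occlusion/timestamp queries and an MI
 * store for queries captured with MI commands, so no extra synchronization
 * between the two paths is needed (see emit_zero_queries below).
 */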
645
646
/**
647
* Goes through a series of consecutive query indices in the given pool
648
* setting all element values to 0 and marking them as available.
649
*/
650
static void
651
emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
652
struct mi_builder *b, struct anv_query_pool *pool,
653
uint32_t first_index, uint32_t num_queries)
654
{
655
switch (pool->type) {
656
case VK_QUERY_TYPE_OCCLUSION:
657
case VK_QUERY_TYPE_TIMESTAMP:
658
/* These queries are written with a PIPE_CONTROL, so clear them using the
659
* PIPE_CONTROL as well; that way we don't have to synchronize between 2 types
660
* of operations.
661
*/
662
assert((pool->stride % 8) == 0);
663
for (uint32_t i = 0; i < num_queries; i++) {
664
struct anv_address slot_addr =
665
anv_query_address(pool, first_index + i);
666
667
for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
668
emit_query_pc_availability(cmd_buffer,
669
anv_address_add(slot_addr, qword * 8),
670
false);
671
}
672
emit_query_pc_availability(cmd_buffer, slot_addr, true);
673
}
674
break;
675
676
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
677
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
678
for (uint32_t i = 0; i < num_queries; i++) {
679
struct anv_address slot_addr =
680
anv_query_address(pool, first_index + i);
681
mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
682
emit_query_mi_availability(b, slot_addr, true);
683
}
684
break;
685
686
#if GFX_VER >= 8
687
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
688
for (uint32_t i = 0; i < num_queries; i++) {
689
for (uint32_t p = 0; p < pool->n_passes; p++) {
690
mi_memset(b, khr_perf_query_data_address(pool, first_index + i, p, false),
691
0, 2 * pool->snapshot_size);
692
emit_query_mi_availability(b,
693
khr_perf_query_availability_address(pool, first_index + i, p),
694
true);
695
}
696
}
697
break;
698
}
699
#endif
700
701
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
702
for (uint32_t i = 0; i < num_queries; i++) {
703
struct anv_address slot_addr =
704
anv_query_address(pool, first_index + i);
705
mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
706
emit_query_mi_availability(b, slot_addr, true);
707
}
708
break;
709
710
default:
711
unreachable("Unsupported query type");
712
}
713
}
714
715
void genX(CmdResetQueryPool)(
716
VkCommandBuffer commandBuffer,
717
VkQueryPool queryPool,
718
uint32_t firstQuery,
719
uint32_t queryCount)
720
{
721
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
722
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
723
724
switch (pool->type) {
725
case VK_QUERY_TYPE_OCCLUSION:
726
case VK_QUERY_TYPE_TIMESTAMP:
727
for (uint32_t i = 0; i < queryCount; i++) {
728
emit_query_pc_availability(cmd_buffer,
729
anv_query_address(pool, firstQuery + i),
730
false);
731
}
732
break;
733
734
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
735
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
736
struct mi_builder b;
737
mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
738
739
for (uint32_t i = 0; i < queryCount; i++)
740
emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
741
break;
742
}
743
744
#if GFX_VER >= 8
745
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
746
struct mi_builder b;
747
mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
748
749
for (uint32_t i = 0; i < queryCount; i++) {
750
for (uint32_t p = 0; p < pool->n_passes; p++) {
751
emit_query_mi_availability(
752
&b,
753
khr_perf_query_availability_address(pool, firstQuery + i, p),
754
false);
755
}
756
}
757
break;
758
}
759
#endif
760
761
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
762
struct mi_builder b;
763
mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
764
765
for (uint32_t i = 0; i < queryCount; i++)
766
emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
767
break;
768
}
769
770
default:
771
unreachable("Unsupported query type");
772
}
773
}
774
775
void genX(ResetQueryPool)(
776
VkDevice _device,
777
VkQueryPool queryPool,
778
uint32_t firstQuery,
779
uint32_t queryCount)
780
{
781
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
782
783
for (uint32_t i = 0; i < queryCount; i++) {
784
if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
785
#if GFX_VER >= 8
786
for (uint32_t p = 0; p < pool->n_passes; p++) {
787
uint64_t *pass_slot = pool->bo->map +
788
khr_perf_query_availability_offset(pool, firstQuery + i, p);
789
*pass_slot = 0;
790
}
791
#endif
792
} else {
793
uint64_t *slot = query_slot(pool, firstQuery + i);
794
*slot = 0;
795
}
796
}
797
}
798
799
static const uint32_t vk_pipeline_stat_to_reg[] = {
800
GENX(IA_VERTICES_COUNT_num),
801
GENX(IA_PRIMITIVES_COUNT_num),
802
GENX(VS_INVOCATION_COUNT_num),
803
GENX(GS_INVOCATION_COUNT_num),
804
GENX(GS_PRIMITIVES_COUNT_num),
805
GENX(CL_INVOCATION_COUNT_num),
806
GENX(CL_PRIMITIVES_COUNT_num),
807
GENX(PS_INVOCATION_COUNT_num),
808
GENX(HS_INVOCATION_COUNT_num),
809
GENX(DS_INVOCATION_COUNT_num),
810
GENX(CS_INVOCATION_COUNT_num),
811
};
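/* Indexed by the bit position of each VkQueryPipelineStatisticFlagBits
 * value; the STATIC_ASSERT in emit_pipeline_stat() checks that the table
 * covers exactly ANV_PIPELINE_STATISTICS_MASK.
 */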
812
813
static void
814
emit_pipeline_stat(struct mi_builder *b, uint32_t stat,
815
struct anv_address addr)
816
{
817
STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
818
(1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);
819
820
assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
821
mi_store(b, mi_mem64(addr), mi_reg64(vk_pipeline_stat_to_reg[stat]));
822
}
823
824
static void
825
emit_xfb_query(struct mi_builder *b, uint32_t stream,
826
struct anv_address addr)
827
{
828
assert(stream < MAX_XFB_STREAMS);
829
830
mi_store(b, mi_mem64(anv_address_add(addr, 0)),
831
mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8));
832
mi_store(b, mi_mem64(anv_address_add(addr, 16)),
833
mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8));
834
}
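/* Stores the primitives-written and primitive-storage-needed counters for
 * the given stream at addr + 0 and addr + 16. CmdBeginQueryIndexedEXT
 * passes slot + 8 and CmdEndQueryIndexedEXT passes slot + 16, producing
 * the begin/end pairs that GetQueryPoolResults subtracts (slot[2]-slot[1]
 * and slot[4]-slot[3]).
 */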
835
836
static void
837
emit_perf_intel_query(struct anv_cmd_buffer *cmd_buffer,
838
struct anv_query_pool *pool,
839
struct mi_builder *b,
840
struct anv_address query_addr,
841
bool end)
842
{
843
const struct intel_perf_query_field_layout *layout =
844
&cmd_buffer->device->physical->perf->query_layout;
845
struct anv_address data_addr =
846
anv_address_add(query_addr, intel_perf_query_data_offset(pool, end));
847
848
for (uint32_t f = 0; f < layout->n_fields; f++) {
849
const struct intel_perf_query_field *field =
850
&layout->fields[end ? f : (layout->n_fields - 1 - f)];
851
852
switch (field->type) {
853
case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
854
anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
855
rpc.MemoryAddress = anv_address_add(data_addr, field->location);
856
}
857
break;
858
859
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
860
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
861
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
862
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: {
863
struct anv_address addr = anv_address_add(data_addr, field->location);
864
struct mi_value src = field->size == 8 ?
865
mi_reg64(field->mmio_offset) :
866
mi_reg32(field->mmio_offset);
867
struct mi_value dst = field->size == 8 ?
868
mi_mem64(addr) : mi_mem32(addr);
869
mi_store(b, dst, src);
870
break;
871
}
872
873
default:
874
unreachable("Invalid query field");
875
break;
876
}
877
}
878
}
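/* Snapshots every field of the performance query layout into the begin or
 * end half of the slot, using MI_REPORT_PERF_COUNT for OA reports and
 * MI_STORE_REGISTER_MEM (32 or 64 bit) for individual counter registers.
 */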
879
880
void genX(CmdBeginQuery)(
881
VkCommandBuffer commandBuffer,
882
VkQueryPool queryPool,
883
uint32_t query,
884
VkQueryControlFlags flags)
885
{
886
genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
887
}
888
889
void genX(CmdBeginQueryIndexedEXT)(
890
VkCommandBuffer commandBuffer,
891
VkQueryPool queryPool,
892
uint32_t query,
893
VkQueryControlFlags flags,
894
uint32_t index)
895
{
896
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
897
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
898
struct anv_address query_addr = anv_query_address(pool, query);
899
900
struct mi_builder b;
901
mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
902
903
switch (pool->type) {
904
case VK_QUERY_TYPE_OCCLUSION:
905
emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
906
break;
907
908
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
909
/* TODO: This might only be necessary for certain stats */
910
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
911
pc.CommandStreamerStallEnable = true;
912
pc.StallAtPixelScoreboard = true;
913
}
914
915
uint32_t statistics = pool->pipeline_statistics;
916
uint32_t offset = 8;
917
while (statistics) {
918
uint32_t stat = u_bit_scan(&statistics);
919
emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
920
offset += 16;
921
}
922
break;
923
}
924
925
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
926
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
927
pc.CommandStreamerStallEnable = true;
928
pc.StallAtPixelScoreboard = true;
929
}
930
emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
931
break;
932
933
#if GFX_VER >= 8
934
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
935
if (!khr_perf_query_ensure_relocs(cmd_buffer))
936
return;
937
938
const struct anv_physical_device *pdevice = cmd_buffer->device->physical;
939
const struct intel_perf_query_field_layout *layout = &pdevice->perf->query_layout;
940
941
uint32_t reloc_idx = 0;
942
for (uint32_t end = 0; end < 2; end++) {
943
for (uint32_t r = 0; r < layout->n_fields; r++) {
944
const struct intel_perf_query_field *field =
945
&layout->fields[end ? r : (layout->n_fields - 1 - r)];
946
struct mi_value reg_addr =
947
mi_iadd(
948
&b,
949
mi_imm(intel_canonical_address(pool->bo->offset +
950
khr_perf_query_data_offset(pool, query, 0, end) +
951
field->location)),
952
mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
953
cmd_buffer->self_mod_locations[reloc_idx++] = mi_store_address(&b, reg_addr);
954
955
if (field->type != INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC &&
956
field->size == 8) {
957
reg_addr =
958
mi_iadd(
959
&b,
960
mi_imm(intel_canonical_address(pool->bo->offset +
961
khr_perf_query_data_offset(pool, query, 0, end) +
962
field->location + 4)),
963
mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
964
cmd_buffer->self_mod_locations[reloc_idx++] = mi_store_address(&b, reg_addr);
965
}
966
}
967
}
968
969
struct mi_value availability_write_offset =
970
mi_iadd(
971
&b,
972
mi_imm(
973
intel_canonical_address(
974
pool->bo->offset +
975
khr_perf_query_availability_offset(pool, query, 0 /* pass */))),
976
mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
977
cmd_buffer->self_mod_locations[reloc_idx++] =
978
mi_store_address(&b, availability_write_offset);
979
980
assert(reloc_idx == pdevice->n_perf_query_commands);
981
982
mi_self_mod_barrier(&b);
983
984
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
985
pc.CommandStreamerStallEnable = true;
986
pc.StallAtPixelScoreboard = true;
987
}
988
cmd_buffer->perf_query_pool = pool;
989
990
cmd_buffer->perf_reloc_idx = 0;
991
for (uint32_t r = 0; r < layout->n_fields; r++) {
992
const struct intel_perf_query_field *field =
993
&layout->fields[layout->n_fields - 1 - r];
994
void *dws;
995
996
switch (field->type) {
997
case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
998
dws = anv_batch_emitn(&cmd_buffer->batch,
999
GENX(MI_REPORT_PERF_COUNT_length),
1000
GENX(MI_REPORT_PERF_COUNT),
1001
.MemoryAddress = query_addr /* Will be overwritten */);
1002
_mi_resolve_address_token(&b,
1003
cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
1004
dws +
1005
GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8);
1006
break;
1007
1008
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
1009
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
1010
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
1011
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
1012
dws =
1013
anv_batch_emitn(&cmd_buffer->batch,
1014
GENX(MI_STORE_REGISTER_MEM_length),
1015
GENX(MI_STORE_REGISTER_MEM),
1016
.RegisterAddress = field->mmio_offset,
1017
.MemoryAddress = query_addr /* Will be overwritten */ );
1018
_mi_resolve_address_token(&b,
1019
cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
1020
dws +
1021
GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
1022
if (field->size == 8) {
1023
dws =
1024
anv_batch_emitn(&cmd_buffer->batch,
1025
GENX(MI_STORE_REGISTER_MEM_length),
1026
GENX(MI_STORE_REGISTER_MEM),
1027
.RegisterAddress = field->mmio_offset + 4,
1028
.MemoryAddress = query_addr /* Will be overwritten */ );
1029
_mi_resolve_address_token(&b,
1030
cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
1031
dws +
1032
GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
1033
}
1034
break;
1035
1036
default:
1037
unreachable("Invalid query field");
1038
break;
1039
}
1040
}
1041
break;
1042
}
1043
#endif
1044
1045
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
1046
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1047
pc.CommandStreamerStallEnable = true;
1048
pc.StallAtPixelScoreboard = true;
1049
}
1050
emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, false);
1051
break;
1052
}
1053
1054
default:
1055
unreachable("");
1056
}
1057
}
1058
1059
void genX(CmdEndQuery)(
1060
VkCommandBuffer commandBuffer,
1061
VkQueryPool queryPool,
1062
uint32_t query)
1063
{
1064
genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
1065
}
1066
1067
void genX(CmdEndQueryIndexedEXT)(
1068
VkCommandBuffer commandBuffer,
1069
VkQueryPool queryPool,
1070
uint32_t query,
1071
uint32_t index)
1072
{
1073
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1074
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1075
struct anv_address query_addr = anv_query_address(pool, query);
1076
1077
struct mi_builder b;
1078
mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
1079
1080
switch (pool->type) {
1081
case VK_QUERY_TYPE_OCCLUSION:
1082
emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
1083
emit_query_pc_availability(cmd_buffer, query_addr, true);
1084
break;
1085
1086
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
1087
/* TODO: This might only be necessary for certain stats */
1088
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1089
pc.CommandStreamerStallEnable = true;
1090
pc.StallAtPixelScoreboard = true;
1091
}
1092
1093
uint32_t statistics = pool->pipeline_statistics;
1094
uint32_t offset = 16;
1095
while (statistics) {
1096
uint32_t stat = u_bit_scan(&statistics);
1097
emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
1098
offset += 16;
1099
}
1100
1101
emit_query_mi_availability(&b, query_addr, true);
1102
break;
1103
}
1104
1105
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1106
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1107
pc.CommandStreamerStallEnable = true;
1108
pc.StallAtPixelScoreboard = true;
1109
}
1110
1111
emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
1112
emit_query_mi_availability(&b, query_addr, true);
1113
break;
1114
1115
#if GFX_VER >= 8
1116
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
1117
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1118
pc.CommandStreamerStallEnable = true;
1119
pc.StallAtPixelScoreboard = true;
1120
}
1121
cmd_buffer->perf_query_pool = pool;
1122
1123
if (!khr_perf_query_ensure_relocs(cmd_buffer))
1124
return;
1125
1126
const struct anv_physical_device *pdevice = cmd_buffer->device->physical;
1127
const struct intel_perf_query_field_layout *layout = &pdevice->perf->query_layout;
1128
1129
void *dws;
1130
for (uint32_t r = 0; r < layout->n_fields; r++) {
1131
const struct intel_perf_query_field *field = &layout->fields[r];
1132
1133
switch (field->type) {
1134
case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
1135
dws = anv_batch_emitn(&cmd_buffer->batch,
1136
GENX(MI_REPORT_PERF_COUNT_length),
1137
GENX(MI_REPORT_PERF_COUNT),
1138
.MemoryAddress = query_addr /* Will be overwritten */);
1139
_mi_resolve_address_token(&b,
1140
cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
1141
dws +
1142
GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8);
1143
break;
1144
1145
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
1146
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
1147
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
1148
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
1149
dws =
1150
anv_batch_emitn(&cmd_buffer->batch,
1151
GENX(MI_STORE_REGISTER_MEM_length),
1152
GENX(MI_STORE_REGISTER_MEM),
1153
.RegisterAddress = field->mmio_offset,
1154
.MemoryAddress = query_addr /* Will be overwritten */ );
1155
_mi_resolve_address_token(&b,
1156
cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
1157
dws +
1158
GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
1159
if (field->size == 8) {
1160
dws =
1161
anv_batch_emitn(&cmd_buffer->batch,
1162
GENX(MI_STORE_REGISTER_MEM_length),
1163
GENX(MI_STORE_REGISTER_MEM),
1164
.RegisterAddress = field->mmio_offset + 4,
1165
.MemoryAddress = query_addr /* Will be overwritten */ );
1166
_mi_resolve_address_token(&b,
1167
cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
1168
dws +
1169
GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
1170
}
1171
break;
1172
1173
default:
1174
unreachable("Invalid query field");
1175
break;
1176
}
1177
}
1178
1179
dws =
1180
anv_batch_emitn(&cmd_buffer->batch,
1181
GENX(MI_STORE_DATA_IMM_length),
1182
GENX(MI_STORE_DATA_IMM),
1183
.ImmediateData = true);
1184
_mi_resolve_address_token(&b,
1185
cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
1186
dws +
1187
GENX(MI_STORE_DATA_IMM_Address_start) / 8);
1188
1189
assert(cmd_buffer->perf_reloc_idx == pdevice->n_perf_query_commands);
1190
break;
1191
}
1192
#endif
1193
1194
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
1195
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1196
pc.CommandStreamerStallEnable = true;
1197
pc.StallAtPixelScoreboard = true;
1198
}
1199
uint32_t marker_offset = intel_perf_marker_offset();
1200
mi_store(&b, mi_mem64(anv_address_add(query_addr, marker_offset)),
1201
mi_imm(cmd_buffer->intel_perf_marker));
1202
emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, true);
1203
emit_query_mi_availability(&b, query_addr, true);
1204
break;
1205
}
1206
1207
default:
1208
unreachable("");
1209
}
1210
1211
/* When multiview is active the spec requires that N consecutive query
1212
* indices are used, where N is the number of active views in the subpass.
1213
* The spec allows us to write the results to only one of the queries,
1214
* but we still need to manage result availability for all the query indices.
1215
* Since we only emit a single query for all active views in the
1216
* first index, mark the other query indices as being already available
1217
* with result 0.
1218
*/
1219
if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
1220
const uint32_t num_queries =
1221
util_bitcount(cmd_buffer->state.subpass->view_mask);
1222
if (num_queries > 1)
1223
emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
1224
}
1225
}
1226
1227
#define TIMESTAMP 0x2358
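/* MMIO offset of the render command streamer TIMESTAMP register. */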
1228
1229
void genX(CmdWriteTimestamp)(
1230
VkCommandBuffer commandBuffer,
1231
VkPipelineStageFlagBits pipelineStage,
1232
VkQueryPool queryPool,
1233
uint32_t query)
1234
{
1235
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1236
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1237
struct anv_address query_addr = anv_query_address(pool, query);
1238
1239
assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
1240
1241
struct mi_builder b;
1242
mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
1243
1244
switch (pipelineStage) {
1245
case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
1246
mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)),
1247
mi_reg64(TIMESTAMP));
1248
break;
1249
1250
default:
1251
/* Everything else is bottom-of-pipe */
1252
cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
1253
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1254
1255
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1256
pc.DestinationAddressType = DAT_PPGTT;
1257
pc.PostSyncOperation = WriteTimestamp;
1258
pc.Address = anv_address_add(query_addr, 8);
1259
1260
if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4)
1261
pc.CommandStreamerStallEnable = true;
1262
}
1263
break;
1264
}
1265
1266
emit_query_pc_availability(cmd_buffer, query_addr, true);
1267
1268
/* When multiview is active the spec requires that N consecutive query
1269
* indices are used, where N is the number of active views in the subpass.
1270
* The spec allows us to write the results to only one of the queries,
1271
* but we still need to manage result availability for all the query indices.
1272
* Since we only emit a single query for all active views in the
1273
* first index, mark the other query indices as being already available
1274
* with result 0.
1275
*/
1276
if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
1277
const uint32_t num_queries =
1278
util_bitcount(cmd_buffer->state.subpass->view_mask);
1279
if (num_queries > 1)
1280
emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
1281
}
1282
}
1283
1284
#if GFX_VERx10 >= 75
1285
1286
#define MI_PREDICATE_SRC0 0x2400
1287
#define MI_PREDICATE_SRC1 0x2408
1288
#define MI_PREDICATE_RESULT 0x2418
1289
1290
/**
1291
* Writes the results of a query to dst_addr is the value at poll_addr is equal
1292
* to the reference value.
1293
*/
1294
static void
1295
gpu_write_query_result_cond(struct anv_cmd_buffer *cmd_buffer,
1296
struct mi_builder *b,
1297
struct anv_address poll_addr,
1298
struct anv_address dst_addr,
1299
uint64_t ref_value,
1300
VkQueryResultFlags flags,
1301
uint32_t value_index,
1302
struct mi_value query_result)
1303
{
1304
mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem64(poll_addr));
1305
mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(ref_value));
1306
anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
1307
mip.LoadOperation = LOAD_LOAD;
1308
mip.CombineOperation = COMBINE_SET;
1309
mip.CompareOperation = COMPARE_SRCS_EQUAL;
1310
}
1311
1312
if (flags & VK_QUERY_RESULT_64_BIT) {
1313
struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
1314
mi_store_if(b, mi_mem64(res_addr), query_result);
1315
} else {
1316
struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
1317
mi_store_if(b, mi_mem32(res_addr), query_result);
1318
}
1319
}
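/* The predicated store above works by loading the polled availability
 * value into MI_PREDICATE_SRC0 and the reference value into
 * MI_PREDICATE_SRC1, setting the predicate when they compare equal, and
 * then issuing mi_store_if() so the result is only written when the
 * predicate passed.
 */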
1320
1321
static void
1322
gpu_write_query_result(struct mi_builder *b,
1323
struct anv_address dst_addr,
1324
VkQueryResultFlags flags,
1325
uint32_t value_index,
1326
struct mi_value query_result)
1327
{
1328
if (flags & VK_QUERY_RESULT_64_BIT) {
1329
struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
1330
mi_store(b, mi_mem64(res_addr), query_result);
1331
} else {
1332
struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
1333
mi_store(b, mi_mem32(res_addr), query_result);
1334
}
1335
}
1336
1337
static struct mi_value
1338
compute_query_result(struct mi_builder *b, struct anv_address addr)
1339
{
1340
return mi_isub(b, mi_mem64(anv_address_add(addr, 8)),
1341
mi_mem64(anv_address_add(addr, 0)));
1342
}
1343
1344
void genX(CmdCopyQueryPoolResults)(
1345
VkCommandBuffer commandBuffer,
1346
VkQueryPool queryPool,
1347
uint32_t firstQuery,
1348
uint32_t queryCount,
1349
VkBuffer destBuffer,
1350
VkDeviceSize destOffset,
1351
VkDeviceSize destStride,
1352
VkQueryResultFlags flags)
1353
{
1354
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1355
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1356
ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
1357
1358
struct mi_builder b;
1359
mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
1360
struct mi_value result;
1361
1362
/* If render target writes are ongoing, request a render target cache flush
1363
* to ensure proper ordering of the commands from the 3d pipe and the
1364
* command streamer.
1365
*/
1366
if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
1367
anv_add_pending_pipe_bits(cmd_buffer,
1368
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
1369
"CopyQueryPoolResults");
1370
}
1371
1372
if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
1373
(cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
1374
/* Occlusion & timestamp queries are written using a PIPE_CONTROL and
1375
* because we're about to copy values from MI commands, we need to
1376
* stall the command streamer to make sure the PIPE_CONTROL values have
1377
* landed, otherwise we could see inconsistent values & availability.
1378
*
1379
* From the vulkan spec:
1380
*
1381
* "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
1382
* previous uses of vkCmdResetQueryPool in the same queue, without
1383
* any additional synchronization."
1384
*/
1385
pool->type == VK_QUERY_TYPE_OCCLUSION ||
1386
pool->type == VK_QUERY_TYPE_TIMESTAMP) {
1387
anv_add_pending_pipe_bits(cmd_buffer,
1388
ANV_PIPE_CS_STALL_BIT,
1389
"CopyQueryPoolResults");
1390
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1391
}
1392
1393
struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
1394
for (uint32_t i = 0; i < queryCount; i++) {
1395
struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
1396
uint32_t idx = 0;
1397
switch (pool->type) {
1398
case VK_QUERY_TYPE_OCCLUSION:
1399
result = compute_query_result(&b, anv_address_add(query_addr, 8));
1400
/* Like in the case of vkGetQueryPoolResults, if the query is
1401
* unavailable and the VK_QUERY_RESULT_PARTIAL_BIT flag is set,
1402
* conservatively write 0 as the query result. If the
1403
* VK_QUERY_RESULT_PARTIAL_BIT isn't set, don't write any value.
1404
*/
1405
gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
1406
1 /* available */, flags, idx, result);
1407
if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
1408
gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
1409
0 /* unavailable */, flags, idx, mi_imm(0));
1410
}
1411
idx++;
1412
break;
1413
1414
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
1415
uint32_t statistics = pool->pipeline_statistics;
1416
while (statistics) {
1417
uint32_t stat = u_bit_scan(&statistics);
1418
1419
result = compute_query_result(&b, anv_address_add(query_addr,
1420
idx * 16 + 8));
1421
1422
/* WaDividePSInvocationCountBy4:HSW,BDW */
1423
if ((cmd_buffer->device->info.ver == 8 ||
1424
cmd_buffer->device->info.is_haswell) &&
1425
(1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
1426
result = mi_ushr32_imm(&b, result, 2);
1427
}
1428
1429
gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1430
}
1431
assert(idx == util_bitcount(pool->pipeline_statistics));
1432
break;
1433
}
1434
1435
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1436
result = compute_query_result(&b, anv_address_add(query_addr, 8));
1437
gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1438
result = compute_query_result(&b, anv_address_add(query_addr, 24));
1439
gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1440
break;
1441
1442
case VK_QUERY_TYPE_TIMESTAMP:
1443
result = mi_mem64(anv_address_add(query_addr, 8));
1444
gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1445
break;
1446
1447
#if GFX_VER >= 8
1448
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
1449
unreachable("Copy KHR performance query results not implemented");
1450
break;
1451
#endif
1452
1453
default:
1454
unreachable("unhandled query type");
1455
}
1456
1457
if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1458
gpu_write_query_result(&b, dest_addr, flags, idx,
1459
mi_mem64(query_addr));
1460
}
1461
1462
dest_addr = anv_address_add(dest_addr, destStride);
1463
}
1464
}
1465
1466
#else
1467
void genX(CmdCopyQueryPoolResults)(
1468
VkCommandBuffer commandBuffer,
1469
VkQueryPool queryPool,
1470
uint32_t firstQuery,
1471
uint32_t queryCount,
1472
VkBuffer destBuffer,
1473
VkDeviceSize destOffset,
1474
VkDeviceSize destStride,
1475
VkQueryResultFlags flags)
1476
{
1477
anv_finishme("Queries not yet supported on Ivy Bridge");
1478
}
1479
#endif
1480
1481