Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/freedreno/a4xx/fd4_query.c
4574 views
1
/*
 * Copyright (C) 2014 Rob Clark <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <[email protected]>
 */
26
27
#include "freedreno_context.h"
#include "freedreno_query_hw.h"
#include "freedreno_util.h"

#include "fd4_context.h"
#include "fd4_draw.h"
#include "fd4_format.h"
#include "fd4_query.h"
36
/* Layout of the RB sample-counter block the HW writes on ZPASS_DONE:
 * sixteen 64-bit counters.  Occlusion results are read from ctr[0]
 * (see count_samples() below).
 */
struct fd_rb_samp_ctrs {
   uint64_t ctr[16];
};
39
40
/*
 * Occlusion Query:
 *
 * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they
 * interpret results
 */
46
47
static struct fd_hw_sample *
48
occlusion_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring)
49
{
50
struct fd_hw_sample *samp =
51
fd_hw_sample_init(batch, sizeof(struct fd_rb_samp_ctrs));
52
53
/* low bits of sample addr should be zero (since they are control
54
* flags in RB_SAMPLE_COUNT_CONTROL):
55
*/
56
debug_assert((samp->offset & 0x3) == 0);
57
58
/* Set RB_SAMPLE_COUNT_ADDR to samp->offset plus value of
59
* HW_QUERY_BASE_REG register:
60
*/
61
OUT_PKT3(ring, CP_SET_CONSTANT, 3);
62
OUT_RING(ring, CP_REG(REG_A4XX_RB_SAMPLE_COUNT_CONTROL) | 0x80000000);
63
OUT_RING(ring, HW_QUERY_BASE_REG);
64
OUT_RING(ring, A4XX_RB_SAMPLE_COUNT_CONTROL_COPY | samp->offset);
65
66
OUT_PKT3(ring, CP_DRAW_INDX_OFFSET, 3);
67
OUT_RING(ring, DRAW4(DI_PT_POINTLIST_PSIZE, DI_SRC_SEL_AUTO_INDEX,
68
INDEX4_SIZE_32_BIT, USE_VISIBILITY));
69
OUT_RING(ring, 1); /* NumInstances */
70
OUT_RING(ring, 0); /* NumIndices */
71
72
fd_event_write(batch, ring, ZPASS_DONE);
73
74
return samp;
75
}
76
77
static uint64_t
78
count_samples(const struct fd_rb_samp_ctrs *start,
79
const struct fd_rb_samp_ctrs *end)
80
{
81
return end->ctr[0] - start->ctr[0];
82
}
83
84
static void
85
occlusion_counter_accumulate_result(struct fd_context *ctx, const void *start,
86
const void *end,
87
union pipe_query_result *result)
88
{
89
uint64_t n = count_samples(start, end);
90
result->u64 += n;
91
}
92
93
static void
94
occlusion_predicate_accumulate_result(struct fd_context *ctx, const void *start,
95
const void *end,
96
union pipe_query_result *result)
97
{
98
uint64_t n = count_samples(start, end);
99
result->b |= (n > 0);
100
}
101
102
/*
 * Time Elapsed Query:
 *
 * Note: we could in theory support timestamp queries, but they
 * won't give sensible results for tilers.
 */
108
109
static void
110
time_elapsed_enable(struct fd_context *ctx,
111
struct fd_ringbuffer *ring) assert_dt
112
{
113
/* Right now, the assignment of countable to counter register is
114
* just hard coded. If we start exposing more countables than we
115
* have counters, we will need to be more clever.
116
*/
117
struct fd_batch *batch = fd_context_batch_locked(ctx);
118
fd_wfi(batch, ring);
119
OUT_PKT0(ring, REG_A4XX_CP_PERFCTR_CP_SEL_0, 1);
120
OUT_RING(ring, CP_ALWAYS_COUNT);
121
fd_batch_unlock_submit(batch);
122
fd_batch_reference(&batch, NULL);
123
}
124
125
static struct fd_hw_sample *
126
time_elapsed_get_sample(struct fd_batch *batch,
127
struct fd_ringbuffer *ring) assert_dt
128
{
129
struct fd_hw_sample *samp = fd_hw_sample_init(batch, sizeof(uint64_t));
130
131
/* use unused part of vsc_size_mem as scratch space, to avoid
132
* extra allocation:
133
*/
134
struct fd_bo *scratch_bo = fd4_context(batch->ctx)->vsc_size_mem;
135
const int sample_off = 128;
136
const int addr_off = sample_off + 8;
137
138
debug_assert(batch->ctx->screen->max_freq > 0);
139
140
/* Basic issue is that we need to read counter value to a relative
141
* destination (with per-tile offset) rather than absolute dest
142
* addr. But there is no pm4 packet that can do that. This is
143
* where it would be *really* nice if we could write our own fw
144
* since afaict implementing the sort of packet we need would be
145
* trivial.
146
*
147
* Instead, we:
148
* (1) CP_REG_TO_MEM to do a 64b copy of counter to scratch buffer
149
* (2) CP_MEM_WRITE to write per-sample offset to scratch buffer
150
* (3) CP_REG_TO_MEM w/ accumulate flag to add the per-tile base
151
* address to the per-sample offset in the scratch buffer
152
* (4) CP_MEM_TO_REG to copy resulting address from steps #2 and #3
153
* to CP_ME_NRT_ADDR
154
* (5) CP_MEM_TO_REG's to copy saved counter value from scratch
155
* buffer to CP_ME_NRT_DATA to trigger the write out to query
156
* result buffer
157
*
158
* Straightforward, right?
159
*
160
* Maybe could swap the order of things in the scratch buffer to
161
* put address first, and copy back to CP_ME_NRT_ADDR+DATA in one
162
* shot, but that's really just polishing a turd..
163
*/
164
165
fd_wfi(batch, ring);
166
167
/* copy sample counter _LO and _HI to scratch: */
168
OUT_PKT3(ring, CP_REG_TO_MEM, 2);
169
OUT_RING(ring, CP_REG_TO_MEM_0_REG(REG_A4XX_RBBM_PERFCTR_CP_0_LO) |
170
CP_REG_TO_MEM_0_64B |
171
CP_REG_TO_MEM_0_CNT(2)); /* write 2 regs to mem */
172
OUT_RELOC(ring, scratch_bo, sample_off, 0, 0);
173
174
/* ok... here we really *would* like to use the CP_SET_CONSTANT
175
* mode which can add a constant to value in reg2 and write to
176
* reg1... *but* that only works for banked/context registers,
177
* and CP_ME_NRT_DATA isn't one of those.. so we need to do some
178
* CP math to the scratch buffer instead:
179
*
180
* (note first 8 bytes are counter value, use offset 0x8 for
181
* address calculation)
182
*/
183
184
/* per-sample offset to scratch bo: */
185
OUT_PKT3(ring, CP_MEM_WRITE, 2);
186
OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);
187
OUT_RING(ring, samp->offset);
188
189
/* now add to that the per-tile base: */
190
OUT_PKT3(ring, CP_REG_TO_MEM, 2);
191
OUT_RING(ring, CP_REG_TO_MEM_0_REG(HW_QUERY_BASE_REG) |
192
CP_REG_TO_MEM_0_ACCUMULATE |
193
CP_REG_TO_MEM_0_CNT(0)); /* readback 1 regs */
194
OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);
195
196
/* now copy that back to CP_ME_NRT_ADDR: */
197
OUT_PKT3(ring, CP_MEM_TO_REG, 2);
198
OUT_RING(ring, REG_A4XX_CP_ME_NRT_ADDR);
199
OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);
200
201
/* and finally, copy sample from scratch buffer to CP_ME_NRT_DATA
202
* to trigger the write to result buffer
203
*/
204
OUT_PKT3(ring, CP_MEM_TO_REG, 2);
205
OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
206
OUT_RELOC(ring, scratch_bo, sample_off, 0, 0);
207
208
/* and again to get the value of the _HI reg from scratch: */
209
OUT_PKT3(ring, CP_MEM_TO_REG, 2);
210
OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
211
OUT_RELOC(ring, scratch_bo, sample_off + 0x4, 0, 0);
212
213
/* Sigh.. */
214
215
return samp;
216
}
217
218
static void
219
time_elapsed_accumulate_result(struct fd_context *ctx, const void *start,
220
const void *end, union pipe_query_result *result)
221
{
222
uint64_t n = *(uint64_t *)end - *(uint64_t *)start;
223
/* max_freq is in Hz, convert cycle count to ns: */
224
result->u64 += n * 1000000000 / ctx->screen->max_freq;
225
}
226
227
static void
228
timestamp_accumulate_result(struct fd_context *ctx, const void *start,
229
const void *end, union pipe_query_result *result)
230
{
231
/* just return the value from fist tile: */
232
if (result->u64 != 0)
233
return;
234
uint64_t n = *(uint64_t *)start;
235
/* max_freq is in Hz, convert cycle count to ns: */
236
result->u64 = n * 1000000000 / ctx->screen->max_freq;
237
}
238
239
static const struct fd_hw_sample_provider occlusion_counter = {
240
.query_type = PIPE_QUERY_OCCLUSION_COUNTER,
241
.get_sample = occlusion_get_sample,
242
.accumulate_result = occlusion_counter_accumulate_result,
243
};
244
245
static const struct fd_hw_sample_provider occlusion_predicate = {
246
.query_type = PIPE_QUERY_OCCLUSION_PREDICATE,
247
.get_sample = occlusion_get_sample,
248
.accumulate_result = occlusion_predicate_accumulate_result,
249
};
250
251
static const struct fd_hw_sample_provider occlusion_predicate_conservative = {
252
.query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE,
253
.get_sample = occlusion_get_sample,
254
.accumulate_result = occlusion_predicate_accumulate_result,
255
};
256
257
static const struct fd_hw_sample_provider time_elapsed = {
258
.query_type = PIPE_QUERY_TIME_ELAPSED,
259
.always = true,
260
.enable = time_elapsed_enable,
261
.get_sample = time_elapsed_get_sample,
262
.accumulate_result = time_elapsed_accumulate_result,
263
};
264
265
/* NOTE: timestamp query isn't going to give terribly sensible results
 * on a tiler.  But it is needed by qapitrace profile heatmap.  If you
 * add in a binning pass, the results get even more non-sensical.  So
 * we just return the timestamp on the first tile and hope that is
 * kind of good enough.
 */
271
static const struct fd_hw_sample_provider timestamp = {
272
.query_type = PIPE_QUERY_TIMESTAMP,
273
.always = true,
274
.enable = time_elapsed_enable,
275
.get_sample = time_elapsed_get_sample,
276
.accumulate_result = timestamp_accumulate_result,
277
};
278
279
void
280
fd4_query_context_init(struct pipe_context *pctx) disable_thread_safety_analysis
281
{
282
struct fd_context *ctx = fd_context(pctx);
283
284
ctx->create_query = fd_hw_create_query;
285
ctx->query_prepare = fd_hw_query_prepare;
286
ctx->query_prepare_tile = fd_hw_query_prepare_tile;
287
ctx->query_update_batch = fd_hw_query_update_batch;
288
289
fd_hw_query_register_provider(pctx, &occlusion_counter);
290
fd_hw_query_register_provider(pctx, &occlusion_predicate);
291
fd_hw_query_register_provider(pctx, &occlusion_predicate_conservative);
292
fd_hw_query_register_provider(pctx, &time_elapsed);
293
fd_hw_query_register_provider(pctx, &timestamp);
294
}
295
296