/*
 * Source: GitHub repository PojavLauncherTeam/mesa
 * Path: blob/21.2-virgl/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
 */
/*
 * Copyright 2011 Christoph Bumiller
 * Copyright 2015 Samuel Pitoiset
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
23
24
#define NVC0_PUSH_EXPLICIT_SPACE_CHECKING
25
26
#include "nvc0/nvc0_context.h"
27
#include "nvc0/nvc0_query_hw_sm.h"
28
29
#include "nv_object.xml.h"
30
#include "nvc0/nve4_compute.xml.h"
31
#include "nvc0/nvc0_compute.xml.h"
32
33
/* NOTE: intentionally using the same names as NV */
34
#define _Q(t, n, d) { NVC0_HW_SM_QUERY_##t, n, d }
35
static const struct {
36
unsigned type;
37
const char *name;
38
const char *desc;
39
} nvc0_hw_sm_queries[] = {
40
_Q(ACTIVE_CTAS,
41
"active_ctas",
42
"Accumulated number of active blocks per cycle. For every cycle it "
43
"increments by the number of active blocks in the cycle which can be in "
44
"the range 0 to 32."),
45
46
_Q(ACTIVE_CYCLES,
47
"active_cycles",
48
"Number of cycles a multiprocessor has at least one active warp"),
49
50
_Q(ACTIVE_WARPS,
51
"active_warps",
52
"Accumulated number of active warps per cycle. For every cycle it "
53
"increments by the number of active warps in the cycle which can be in "
54
"the range 0 to 64"),
55
56
_Q(ATOM_CAS_COUNT,
57
"atom_cas_count",
58
"Number of warps executing atomic compare and swap operations. Increments "
59
"by one if at least one thread in a warp executes the instruction."),
60
61
_Q(ATOM_COUNT,
62
"atom_count",
63
"Number of warps executing atomic reduction operations. Increments by one "
64
"if at least one thread in a warp executes the instruction"),
65
66
_Q(BRANCH,
67
"branch",
68
"Number of branch instructions executed per warp on a multiprocessor"),
69
70
_Q(DIVERGENT_BRANCH,
71
"divergent_branch",
72
"Number of divergent branches within a warp. This counter will be "
73
"incremented by one if at least one thread in a warp diverges (that is, "
74
"follows a different execution path) via a conditional branch"),
75
76
_Q(GLD_REQUEST,
77
"gld_request",
78
"Number of executed load instructions where the state space is not "
79
"specified and hence generic addressing is used, increments per warp on a "
80
"multiprocessor. It can include the load operations from global,local and "
81
"shared state space"),
82
83
_Q(GLD_MEM_DIV_REPLAY,
84
"global_ld_mem_divergence_replays",
85
"Number of instruction replays for global memory loads. Instruction is "
86
"replayed if the instruction is accessing more than one cache line of "
87
"128 bytes. For each extra cache line access the counter is incremented "
88
"by 1"),
89
90
_Q(GLOBAL_ATOM_CAS,
91
"global_atom_cas",
92
"Number of ATOM.CAS instructions executed per warp."),
93
94
_Q(GLOBAL_LD,
95
"global_load",
96
"Number of executed load instructions where state space is specified as "
97
"global, increments per warp on a multiprocessor."),
98
99
_Q(GLOBAL_ST,
100
"global_store",
101
"Number of executed store instructions where state space is specified as "
102
"global, increments per warp on a multiprocessor."),
103
104
_Q(GST_TRANSACTIONS,
105
"global_store_transaction",
106
"Number of global store transactions. Increments by 1 per transaction. "
107
"Transaction can be 32/64/96/128B"),
108
109
_Q(GST_MEM_DIV_REPLAY,
110
"global_st_mem_divergence_replays",
111
"Number of instruction replays for global memory stores. Instruction is "
112
"replayed if the instruction is accessing more than one cache line of "
113
"128 bytes. For each extra cache line access the counter is incremented "
114
"by 1"),
115
116
_Q(GRED_COUNT,
117
"gred_count",
118
"Number of warps executing reduction operations on global memory. "
119
"Increments by one if at least one thread in a warp executes the "
120
"instruction"),
121
122
_Q(GST_REQUEST,
123
"gst_request",
124
"Number of executed store instructions where the state space is not "
125
"specified and hence generic addressing is used, increments per warp on a "
126
"multiprocessor. It can include the store operations to global,local and "
127
"shared state space"),
128
129
_Q(INST_EXECUTED,
130
"inst_executed",
131
"Number of instructions executed, do not include replays"),
132
133
_Q(INST_ISSUED,
134
"inst_issued",
135
"Number of instructions issued including replays"),
136
137
_Q(INST_ISSUED0,
138
"inst_issued0",
139
"Number of cycles that did not issue any instruction, increments per "
140
"warp."),
141
142
_Q(INST_ISSUED1,
143
"inst_issued1",
144
"Number of single instruction issued per cycle"),
145
146
_Q(INST_ISSUED2,
147
"inst_issued2",
148
"Number of dual instructions issued per cycle"),
149
150
_Q(INST_ISSUED1_0,
151
"inst_issued1_0",
152
"Number of single instruction issued per cycle in pipeline 0"),
153
154
_Q(INST_ISSUED1_1,
155
"inst_issued1_1",
156
"Number of single instruction issued per cycle in pipeline 1"),
157
158
_Q(INST_ISSUED2_0,
159
"inst_issued2_0",
160
"Number of dual instructions issued per cycle in pipeline 0"),
161
162
_Q(INST_ISSUED2_1,
163
"inst_issued2_1",
164
"Number of dual instructions issued per cycle in pipeline 1"),
165
166
_Q(L1_GLD_HIT,
167
"l1_global_load_hit",
168
"Number of cache lines that hit in L1 cache for global memory load "
169
"accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
170
"32, 64 and 128 bit accesses by a warp respectively"),
171
172
_Q(L1_GLD_MISS,
173
"l1_global_load_miss",
174
"Number of cache lines that miss in L1 cache for global memory load "
175
"accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
176
"32, 64 and 128 bit accesses by a warp respectively"),
177
178
_Q(L1_GLD_TRANSACTIONS,
179
"__l1_global_load_transactions",
180
"Number of global load transactions from L1 cache. Increments by 1 per "
181
"transaction. Transaction can be 32/64/96/128B"),
182
183
_Q(L1_GST_TRANSACTIONS,
184
"__l1_global_store_transactions",
185
"Number of global store transactions from L1 cache. Increments by 1 per "
186
"transaction. Transaction can be 32/64/96/128B"),
187
188
_Q(L1_LOCAL_LD_HIT,
189
"l1_local_load_hit",
190
"Number of cache lines that hit in L1 cache for local memory load "
191
"accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
192
"32, 64 and 128 bit accesses by a warp respectively"),
193
194
_Q(L1_LOCAL_LD_MISS,
195
"l1_local_load_miss",
196
"Number of cache lines that miss in L1 cache for local memory load "
197
"accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
198
"32, 64 and 128 bit accesses by a warp respectively"),
199
200
_Q(L1_LOCAL_ST_HIT,
201
"l1_local_store_hit",
202
"Number of cache lines that hit in L1 cache for local memory store "
203
"accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
204
"32, 64 and 128 bit accesses by a warp respectively"),
205
206
_Q(L1_LOCAL_ST_MISS,
207
"l1_local_store_miss",
208
"Number of cache lines that miss in L1 cache for local memory store "
209
"accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
210
"32,64 and 128 bit accesses by a warp respectively"),
211
212
_Q(L1_SHARED_LD_TRANSACTIONS,
213
"l1_shared_load_transactions",
214
"Number of shared load transactions. Increments by 1 per transaction. "
215
"Transaction can be 32/64/96/128B"),
216
217
_Q(L1_SHARED_ST_TRANSACTIONS,
218
"l1_shared_store_transactions",
219
"Number of shared store transactions. Increments by 1 per transaction. "
220
"Transaction can be 32/64/96/128B"),
221
222
_Q(LOCAL_LD,
223
"local_load",
224
"Number of executed load instructions where state space is specified as "
225
"local, increments per warp on a multiprocessor"),
226
227
_Q(LOCAL_LD_TRANSACTIONS,
228
"local_load_transactions",
229
"Number of local load transactions from L1 cache. Increments by 1 per "
230
"transaction. Transaction can be 32/64/96/128B"),
231
232
_Q(LOCAL_ST,
233
"local_store",
234
"Number of executed store instructions where state space is specified as "
235
"local, increments per warp on a multiprocessor"),
236
237
_Q(LOCAL_ST_TRANSACTIONS,
238
"local_store_transactions",
239
"Number of local store transactions to L1 cache. Increments by 1 per "
240
"transaction. Transaction can be 32/64/96/128B."),
241
242
_Q(NOT_PRED_OFF_INST_EXECUTED,
243
"not_predicated_off_thread_inst_executed",
244
"Number of not predicated off instructions executed by all threads, does "
245
"not include replays. For each instruction it increments by the number of "
246
"threads that execute this instruction"),
247
248
_Q(PROF_TRIGGER_0,
249
"prof_trigger_00",
250
"User profiled generic trigger that can be inserted in any place of the "
251
"code to collect the related information. Increments per warp."),
252
253
_Q(PROF_TRIGGER_1,
254
"prof_trigger_01",
255
"User profiled generic trigger that can be inserted in any place of the "
256
"code to collect the related information. Increments per warp."),
257
258
_Q(PROF_TRIGGER_2,
259
"prof_trigger_02",
260
"User profiled generic trigger that can be inserted in any place of the "
261
"code to collect the related information. Increments per warp."),
262
263
_Q(PROF_TRIGGER_3,
264
"prof_trigger_03",
265
"User profiled generic trigger that can be inserted in any place of the "
266
"code to collect the related information. Increments per warp."),
267
268
_Q(PROF_TRIGGER_4,
269
"prof_trigger_04",
270
"User profiled generic trigger that can be inserted in any place of the "
271
"code to collect the related information. Increments per warp."),
272
273
_Q(PROF_TRIGGER_5,
274
"prof_trigger_05",
275
"User profiled generic trigger that can be inserted in any place of the "
276
"code to collect the related information. Increments per warp."),
277
278
_Q(PROF_TRIGGER_6,
279
"prof_trigger_06",
280
"User profiled generic trigger that can be inserted in any place of the "
281
"code to collect the related information. Increments per warp."),
282
283
_Q(PROF_TRIGGER_7,
284
"prof_trigger_07",
285
"User profiled generic trigger that can be inserted in any place of the "
286
"code to collect the related information. Increments per warp."),
287
288
_Q(SHARED_ATOM,
289
"shared_atom",
290
"Number of ATOMS instructions executed per warp."),
291
292
_Q(SHARED_ATOM_CAS,
293
"shared_atom_cas",
294
"Number of ATOMS.CAS instructions executed per warp."),
295
296
_Q(SHARED_LD,
297
"shared_load",
298
"Number of executed load instructions where state space is specified as "
299
"shared, increments per warp on a multiprocessor"),
300
301
_Q(SHARED_LD_BANK_CONFLICT,
302
"shared_load_bank_conflict",
303
"Number of shared load bank conflict generated when the addresses for "
304
"two or more shared memory load requests fall in the same memory bank."),
305
306
_Q(SHARED_LD_REPLAY,
307
"shared_load_replay",
308
"Replays caused due to shared load bank conflict (when the addresses for "
309
"two or more shared memory load requests fall in the same memory bank) or "
310
"when there is no conflict but the total number of words accessed by all "
311
"threads in the warp executing that instruction exceed the number of words "
312
"that can be loaded in one cycle (256 bytes)"),
313
314
_Q(SHARED_LD_TRANSACTIONS,
315
"shared_ld_transactions",
316
"Number of transactions for shared load accesses. Maximum transaction "
317
"size in maxwell is 128 bytes, any warp accessing more that 128 bytes "
318
"will cause multiple transactions for a shared load instruction. This "
319
"also includes extra transactions caused by shared bank conflicts."),
320
321
_Q(SHARED_ST,
322
"shared_store",
323
"Number of executed store instructions where state space is specified as "
324
"shared, increments per warp on a multiprocessor"),
325
326
_Q(SHARED_ST_BANK_CONFLICT,
327
"shared_store_bank_conflict",
328
"Number of shared store bank conflict generated when the addresses for "
329
"two or more shared memory store requests fall in the same memory bank."),
330
331
_Q(SHARED_ST_REPLAY,
332
"shared_store_replay",
333
"Replays caused due to shared store bank conflict (when the addresses for "
334
"two or more shared memory store requests fall in the same memory bank) or "
335
"when there is no conflict but the total number of words accessed by all "
336
"threads in the warp executing that instruction exceed the number of words "
337
"that can be stored in one cycle"),
338
339
_Q(SHARED_ST_TRANSACTIONS,
340
"shared_st_transactions",
341
"Number of transactions for shared store accesses. Maximum transaction "
342
"size in maxwell is 128 bytes, any warp accessing more that 128 bytes "
343
"will cause multiple transactions for a shared store instruction. This "
344
"also includes extra transactions caused by shared bank conflicts."),
345
346
_Q(SM_CTA_LAUNCHED,
347
"sm_cta_launched",
348
"Number of thread blocks launched on a multiprocessor"),
349
350
_Q(THREADS_LAUNCHED,
351
"threads_launched",
352
"Number of threads launched on a multiprocessor"),
353
354
_Q(TH_INST_EXECUTED,
355
"thread_inst_executed",
356
"Number of instructions executed by all threads, does not include "
357
"replays. For each instruction it increments by the number of threads in "
358
"the warp that execute the instruction"),
359
360
_Q(TH_INST_EXECUTED_0,
361
"thread_inst_executed_0",
362
"Number of instructions executed by all threads, does not include "
363
"replays. For each instruction it increments by the number of threads in "
364
"the warp that execute the instruction in pipeline 0"),
365
366
_Q(TH_INST_EXECUTED_1,
367
"thread_inst_executed_1",
368
"Number of instructions executed by all threads, does not include "
369
"replays. For each instruction it increments by the number of threads in "
370
"the warp that execute the instruction in pipeline 1"),
371
372
_Q(TH_INST_EXECUTED_2,
373
"thread_inst_executed_2",
374
"Number of instructions executed by all threads, does not include "
375
"replays. For each instruction it increments by the number of threads in "
376
"the warp that execute the instruction in pipeline 2"),
377
378
_Q(TH_INST_EXECUTED_3,
379
"thread_inst_executed_3",
380
"Number of instructions executed by all threads, does not include "
381
"replays. For each instruction it increments by the number of threads in "
382
"the warp that execute the instruction in pipeline 3"),
383
384
_Q(UNCACHED_GLD_TRANSACTIONS,
385
"uncached_global_load_transaction",
386
"Number of uncached global load transactions. Increments by 1 per "
387
"transaction. Transaction can be 32/64/96/128B."),
388
389
_Q(WARPS_LAUNCHED,
390
"warps_launched",
391
"Number of warps launched on a multiprocessor"),
392
};
393
394
#undef _Q
395
396
static inline const char *
397
nvc0_hw_sm_query_get_name(unsigned query_type)
398
{
399
unsigned i;
400
401
for (i = 0; i < ARRAY_SIZE(nvc0_hw_sm_queries); i++) {
402
if (nvc0_hw_sm_queries[i].type == query_type)
403
return nvc0_hw_sm_queries[i].name;
404
}
405
assert(0);
406
return NULL;
407
}
408
409
/* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */
410
411
/* Code to read out MP counters: They are accessible via mmio, too, but let's
 * just avoid mapping registers in userspace. We'd have to know which MPs are
 * enabled/present, too, and that information is not presently exposed.
 * We could add a kernel interface for it, but reading the counters like this
 * has the advantage of being async (if get_result isn't called immediately).
 */
static const uint64_t nve4_read_hw_sm_counters_code[] =
{
   /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20
    * mov b32 $r8 $tidx
    * mov b32 $r12 $physid
    * mov b32 $r0 $pm0
    * mov b32 $r1 $pm1
    * mov b32 $r2 $pm2
    * mov b32 $r3 $pm3
    * mov b32 $r4 $pm4
    * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b
    * mov b32 $r5 $pm5
    * mov b32 $r6 $pm6
    * mov b32 $r7 $pm7
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c7[0x6a0]
    * ext u32 $r8 $r12 0x414
    * mov b32 $r11 c7[0x6a4]
    * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04
    * ext u32 $r9 $r12 0x208
    * (not $p0) exit
    * set $p1 0x1 eq u32 $r9 0x0
    * mul $r8 u32 $r8 u32 96
    * mul $r12 u32 $r9 u32 16
    * mul $r13 u32 $r9 u32 4
    * add b32 $r9 $r8 $r13
    * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c
    * add b32 $r8 $r8 $r12
    * mov b32 $r12 $r10
    * add b32 $r10 $c $r10 $r8
    * mov b32 $r13 $r11
    * add b32 $r11 $r11 0x0 $c
    * add b32 $r12 $c $r12 $r9
    * st b128 wt g[$r10d] $r0q
    * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00
    * mov b32 $r0 c7[0x6a8]
    * add b32 $r13 $r13 0x0 $c
    * $p1 st b128 wt g[$r12d+0x40] $r4q
    * st b32 wt g[$r12d+0x50] $r0
    * exit */
   0x2202020202020207ULL,
   0x2c00000084021c04ULL,
   0x2c0000000c031c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x22b0420042320207ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x28005c1a80029de4ULL,
   0x7000c01050c21c03ULL,
   0x28005c1a9002dde4ULL,
   0x204282020042e047ULL,
   0x7000c00820c25c03ULL,
   0x80000000000021e7ULL,
   0x190e0000fc93dc03ULL,
   0x1000000180821c02ULL,
   0x1000000040931c02ULL,
   0x1000000010935c02ULL,
   0x4800000034825c03ULL,
   0x22c042c042c04287ULL,
   0x4800000030821c03ULL,
   0x2800000028031de4ULL,
   0x4801000020a29c03ULL,
   0x280000002c035de4ULL,
   0x0800000000b2dc42ULL,
   0x4801000024c31c03ULL,
   0x9400000000a01fc5ULL,
   0x200002e04202c047ULL,
   0x28005c1aa0001de4ULL,
   0x0800000000d35c42ULL,
   0x9400000100c107c5ULL,
   0x9400000140c01f85ULL,
   0x8000000000001de7ULL
};
496
497
/* Pre-assembled counter read-out kernel, NVF0 variant. */
static const uint64_t nvf0_read_hw_sm_counters_code[] =
{
   /* Same kernel as GK104 */
   0x0880808080808080ULL,
   0x86400000109c0022ULL,
   0x86400000019c0032ULL,
   0x86400000021c0002ULL,
   0x86400000029c0006ULL,
   0x86400000031c000aULL,
   0x86400000039c000eULL,
   0x86400000041c0012ULL,
   0x08ac1080108c8080ULL,
   0x86400000049c0016ULL,
   0x86400000051c001aULL,
   0x86400000059c001eULL,
   0xdb201c007f9c201eULL,
   0x64c03ce0d41c002aULL,
   0xc00000020a1c3021ULL,
   0x64c03ce0d49c002eULL,
   0x0810a0808010b810ULL,
   0xc0000001041c3025ULL,
   0x180000000020003cULL,
   0xdb201c007f9c243eULL,
   0xc1c00000301c2021ULL,
   0xc1c00000081c2431ULL,
   0xc1c00000021c2435ULL,
   0xe0800000069c2026ULL,
   0x08b010b010b010a0ULL,
   0xe0800000061c2022ULL,
   0xe4c03c00051c0032ULL,
   0xe0840000041c282aULL,
   0xe4c03c00059c0036ULL,
   0xe08040007f9c2c2eULL,
   0xe0840000049c3032ULL,
   0xfe800000001c2800ULL,
   0x080000b81080b010ULL,
   0x64c03ce0d51c0002ULL,
   0xe08040007f9c3436ULL,
   0xfe80000020043010ULL,
   0xfc800000281c3000ULL,
   0x18000000001c003cULL,
};
539
540
/* Pre-assembled counter read-out kernel, GM107 variant; each word is
 * annotated with its disassembly. */
static const uint64_t gm107_read_hw_sm_counters_code[] =
{
   0x001d0400e4200701ULL, /* sched (st 0x1 wr 0x0) (st 0x1 wr 0x1) (st 0x1 wr 0x2) */
   0xf0c8000002170008ULL, /* mov $r8 $tidx */
   0xf0c800000037000cULL, /* mov $r12 $virtid */
   0xf0c8000000470000ULL, /* mov $r0 $pm0 */
   0x001e8400f0200761ULL, /* sched (st 0x1 wr 0x3) (st 0x1 wr 0x4) (st 0x1 wr 0x5) */
   0xf0c8000000570001ULL, /* mov $r1 $pm1 */
   0xf0c8000000670002ULL, /* mov $r2 $pm2 */
   0xf0c8000000770003ULL, /* mov $r3 $pm3 */
   0x001e8400f42007a1ULL, /* sched (st 0x1 wr 0x5) (st 0x1 wr 0x5) (st 0x1 wr 0x5) */
   0xf0c8000000870004ULL, /* mov $r4 $pm4 */
   0xf0c8000000970005ULL, /* mov $r5 $pm5 */
   0xf0c8000000a70006ULL, /* mov $r6 $pm6 */
   0x001f8401fc2007a1ULL, /* sched (st 0x1 wr 0x5) (st 0x1 wt 0x1) (st 0x1) */
   0xf0c8000000b70007ULL, /* mov $r7 $pm7 */
   0x5b6403800087ff07ULL, /* isetp eq u32 and $p0 0x1 0x0 $r8 0x1 */
   0x4c98079c1a87000aULL, /* mov $r10 c7[0x6a0] 0xf */
   0x001fa400fc2017e1ULL, /* sched (st 0x1 wt 0x2) (st 0x1) (st 0x9) */
   0x3800000091470c08ULL, /* bfe u32 $r8 $r12 0x914 */
   0x4c98079c1a97000bULL, /* mov $r11 c7[0x6a4] 0xf */
   0x3800000020870c09ULL, /* bfe u32 $r9 $r12 0x208 */
   0x001c1800fc2007edULL, /* sched (st 0xd) (st 0x1) (st 0x6 wr 0x0) */
   0xe30000000008000fULL, /* not $p0 exit */
   0x5b6403800097ff0fULL, /* isetp eq u32 and $p1 0x1 0x0 $r9 0x1 */
   0x3838000006070808ULL, /* imul u32 u32 $r8 $r8 0x60 */
   0x003f8400e0c00726ULL, /* sched (st 0x6 wr 0x1) (st 0x6 wr 0x0) (st 0x1 wt 0x1) */
   0x383800000107090cULL, /* imul u32 u32 $r12 $r9 0x10 */
   0x383800000047090dULL, /* imul u32 u32 $r13 $r9 0x4 */
   0x5c10000000d70809ULL, /* iadd $r9 $r8 $r13 */
   0x001f8400fcc017e1ULL, /* sched (st 0x1 wt 0x2) (st 0x6) (st 0x1) */
   0x5c10000000c70808ULL, /* iadd $r8 $r8 $r12 */
   0x5c98078000a7000cULL, /* mov $r12 $r10 0xf */
   0x5c10800000870a0aULL, /* iadd cc $r10 $r10 $r8 */
   0x001f8400fc2007e6ULL, /* sched (st 0x6) (st 0x1) (st 0x1) */
   0x5c98078000b7000dULL, /* mov $r13 $r11 0xf */
   0x5c1008000ff70b0bULL, /* iadd x $r11 $r11 0x0 */
   0x5c10800000970c0cULL, /* iadd cc $r12 $r12 $r9 */
   0x003f983c1c4007e1ULL, /* sched (st 0x1) (st 0x2 rd 0x0 wt 0x3c) (st 0x6 wt 0x1) */
   0x5c1008000ff70d0dULL, /* iadd x $r13 $r13 0x0 */
   0xbfd0000000070a00ULL, /* st e wt b128 g[$r10] $r0 0x1 */
   0x4c98079c1aa70000ULL, /* mov $r0 c7[0x6a8] 0xf */
   0x001fbc00fc2007e6ULL, /* sched (st 0x1) (st 0x1) (st 0xf) */
   0xbfd0000004010c04ULL, /* $p1 st e wt b128 g[$r12+0x40] $r4 0x1 */
   0xbf90000005070c00ULL, /* st e wt b32 g[$r12+0x50] $r0 0x1 */
   0xe30000000007000fULL, /* exit */
};
587
588
/* For simplicity, we will allocate as many group slots as we allocate counter
 * slots. This means that a single counter which wants to source from 2 groups
 * will have to be declared as using 2 counter slots. This shouldn't really be
 * a problem because such queries don't make much sense ... (unless someone is
 * really creative).
 */
/* Configuration of one hardware counter slot. */
struct nvc0_hw_sm_counter_cfg
{
   uint32_t func    : 16; /* mask or 4-bit logic op (depending on mode) */
   uint32_t mode    : 4;  /* LOGOP,B6,LOGOP_B6(_PULSE) */
   uint32_t sig_dom : 1;  /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */
   uint32_t sig_sel : 8;  /* signal group */
   uint32_t src_mask;     /* mask for signal selection (only for NVC0:NVE4) */
   uint32_t src_sel;      /* signal selection for up to 4 sources */
};
603
604
struct nvc0_hw_sm_query_cfg
605
{
606
unsigned type;
607
struct nvc0_hw_sm_counter_cfg ctr[8];
608
uint8_t num_counters;
609
uint8_t norm[2]; /* normalization num,denom */
610
};
611
612
/* Initializers for struct nvc0_hw_sm_counter_cfg:
 *   f = func, m = NVE4_COMPUTE_MP_PM_FUNC_MODE_* suffix, g = signal group,
 *   s = source selection. src_mask is always 0 here.
 * _CA selects signal domain 0 (MP_PM_A), _CB selects domain 1 (MP_PM_B).
 * Designated initializers keep these independent of the struct member order.
 */
#define _CA(f, m, g, s) { .func = (f), .mode = NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, \
                          .sig_dom = 0, .sig_sel = (g), .src_mask = 0, .src_sel = (s) }
#define _CB(f, m, g, s) { .func = (f), .mode = NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, \
                          .sig_dom = 1, .sig_sel = (g), .src_mask = 0, .src_sel = (s) }
/* Array designator for a table indexed by NVE4_HW_SM_QUERY_* values. */
#define _Q(n, c) [NVE4_HW_SM_QUERY_##n] = c
615
616
/* ==== Compute capability 3.0 (GK104:GK110) ==== */
617
static const struct nvc0_hw_sm_query_cfg
618
sm30_active_cycles =
619
{
620
.type = NVC0_HW_SM_QUERY_ACTIVE_CYCLES,
621
.ctr[0] = _CB(0x0001, B6, 0x02, 0x00000000),
622
.num_counters = 1,
623
.norm = { 1, 1 },
624
};
625
626
static const struct nvc0_hw_sm_query_cfg
627
sm30_active_warps =
628
{
629
.type = NVC0_HW_SM_QUERY_ACTIVE_WARPS,
630
.ctr[0] = _CB(0x003f, B6, 0x02, 0x31483104),
631
.num_counters = 1,
632
.norm = { 2, 1 },
633
};
634
635
static const struct nvc0_hw_sm_query_cfg
636
sm30_atom_cas_count =
637
{
638
.type = NVC0_HW_SM_QUERY_ATOM_CAS_COUNT,
639
.ctr[0] = _CA(0x0001, B6, 0x1c, 0x000000004),
640
.num_counters = 1,
641
.norm = { 1, 1 },
642
};
643
644
static const struct nvc0_hw_sm_query_cfg
645
sm30_atom_count =
646
{
647
.type = NVC0_HW_SM_QUERY_ATOM_COUNT,
648
.ctr[0] = _CA(0x0001, B6, 0x1c, 0x00000000),
649
.num_counters = 1,
650
.norm = { 1, 1 },
651
};
652
653
static const struct nvc0_hw_sm_query_cfg
654
sm30_branch =
655
{
656
.type = NVC0_HW_SM_QUERY_BRANCH,
657
.ctr[0] = _CA(0x0001, B6, 0x1c, 0x0000000c),
658
.num_counters = 1,
659
.norm = { 1, 1 },
660
};
661
662
static const struct nvc0_hw_sm_query_cfg
663
sm30_divergent_branch =
664
{
665
.type = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,
666
.ctr[0] = _CA(0x0001, B6, 0x1c, 0x00000010),
667
.num_counters = 1,
668
.norm = { 1, 1 },
669
};
670
671
static const struct nvc0_hw_sm_query_cfg
672
sm30_gld_request =
673
{
674
.type = NVC0_HW_SM_QUERY_GLD_REQUEST,
675
.ctr[0] = _CA(0x0001, B6, 0x1b, 0x00000010),
676
.num_counters = 1,
677
.norm = { 1, 1 },
678
};
679
680
static const struct nvc0_hw_sm_query_cfg
681
sm30_gld_mem_div_replay =
682
{
683
.type = NVC0_HW_SM_QUERY_GLD_MEM_DIV_REPLAY,
684
.ctr[0] = _CB(0x0001, B6, 0x08, 0x00000010),
685
.num_counters = 1,
686
.norm = { 1, 1 },
687
};
688
689
static const struct nvc0_hw_sm_query_cfg
690
sm30_gst_transactions =
691
{
692
.type = NVC0_HW_SM_QUERY_GST_TRANSACTIONS,
693
.ctr[0] = _CB(0x0001, B6, 0x11, 0x00000004),
694
.num_counters = 1,
695
.norm = { 1, 1 },
696
};
697
698
static const struct nvc0_hw_sm_query_cfg
699
sm30_gst_mem_div_replay =
700
{
701
.type = NVC0_HW_SM_QUERY_GST_MEM_DIV_REPLAY,
702
.ctr[0] = _CB(0x0001, B6, 0x08, 0x00000014),
703
.num_counters = 1,
704
.norm = { 1, 1 },
705
};
706
707
static const struct nvc0_hw_sm_query_cfg
708
sm30_gred_count =
709
{
710
.type = NVC0_HW_SM_QUERY_GRED_COUNT,
711
.ctr[0] = _CA(0x0001, B6, 0x1c, 0x00000008),
712
.num_counters = 1,
713
.norm = { 1, 1 },
714
};
715
716
static const struct nvc0_hw_sm_query_cfg
717
sm30_gst_request =
718
{
719
.type = NVC0_HW_SM_QUERY_GST_REQUEST,
720
.ctr[0] = _CA(0x0001, B6, 0x1b, 0x00000014),
721
.num_counters = 1,
722
.norm = { 1, 1 },
723
};
724
725
static const struct nvc0_hw_sm_query_cfg
726
sm30_inst_executed =
727
{
728
.type = NVC0_HW_SM_QUERY_INST_EXECUTED,
729
.ctr[0] = _CA(0x0003, B6, 0x04, 0x00000398),
730
.num_counters = 1,
731
.norm = { 1, 1 },
732
};
733
734
static const struct nvc0_hw_sm_query_cfg
735
sm30_inst_issued1 =
736
{
737
.type = NVC0_HW_SM_QUERY_INST_ISSUED1,
738
.ctr[0] = _CA(0x0001, B6, 0x05, 0x00000004),
739
.num_counters = 1,
740
.norm = { 1, 1 },
741
};
742
743
static const struct nvc0_hw_sm_query_cfg
744
sm30_inst_issued2 =
745
{
746
.type = NVC0_HW_SM_QUERY_INST_ISSUED2,
747
.ctr[0] = _CA(0x0001, B6, 0x05, 0x00000008),
748
.num_counters = 1,
749
.norm = { 1, 1 },
750
};
751
752
static const struct nvc0_hw_sm_query_cfg
753
sm30_l1_gld_hit =
754
{
755
.type = NVC0_HW_SM_QUERY_L1_GLD_HIT,
756
.ctr[0] = _CB(0x0001, B6, 0x10, 0x00000010),
757
.num_counters = 1,
758
.norm = { 1, 1 },
759
};
760
761
static const struct nvc0_hw_sm_query_cfg
762
sm30_l1_gld_miss =
763
{
764
.type = NVC0_HW_SM_QUERY_L1_GLD_MISS,
765
.ctr[0] = _CB(0x0001, B6, 0x10, 0x00000014),
766
.num_counters = 1,
767
.norm = { 1, 1 },
768
};
769
770
static const struct nvc0_hw_sm_query_cfg
771
sm30_l1_gld_transactions =
772
{
773
.type = NVC0_HW_SM_QUERY_L1_GLD_TRANSACTIONS,
774
.ctr[0] = _CB(0x0001, B6, 0x0f, 0x00000000),
775
.num_counters = 1,
776
.norm = { 1, 1 },
777
};
778
779
static const struct nvc0_hw_sm_query_cfg
780
sm30_l1_gst_transactions =
781
{
782
.type = NVC0_HW_SM_QUERY_L1_GST_TRANSACTIONS,
783
.ctr[0] = _CB(0x0001, B6, 0x0f, 0x00000004),
784
.num_counters = 1,
785
.norm = { 1, 1 },
786
};
787
788
static const struct nvc0_hw_sm_query_cfg
789
sm30_l1_local_ld_hit =
790
{
791
.type = NVC0_HW_SM_QUERY_L1_LOCAL_LD_HIT,
792
.ctr[0] = _CB(0x0001, B6, 0x10, 0x00000000),
793
.num_counters = 1,
794
.norm = { 1, 1 },
795
};
796
797
static const struct nvc0_hw_sm_query_cfg
798
sm30_l1_local_ld_miss =
799
{
800
.type = NVC0_HW_SM_QUERY_L1_LOCAL_LD_MISS,
801
.ctr[0] = _CB(0x0001, B6, 0x10, 0x00000004),
802
.num_counters = 1,
803
.norm = { 1, 1 },
804
};
805
806
static const struct nvc0_hw_sm_query_cfg
807
sm30_l1_local_st_hit =
808
{
809
.type = NVC0_HW_SM_QUERY_L1_LOCAL_ST_HIT,
810
.ctr[0] = _CB(0x0001, B6, 0x10, 0x00000008),
811
.num_counters = 1,
812
.norm = { 1, 1 },
813
};
814
815
static const struct nvc0_hw_sm_query_cfg
816
sm30_l1_local_st_miss =
817
{
818
.type = NVC0_HW_SM_QUERY_L1_LOCAL_ST_MISS,
819
.ctr[0] = _CB(0x0001, B6, 0x10, 0x0000000c),
820
.num_counters = 1,
821
.norm = { 1, 1 },
822
};
823
824
static const struct nvc0_hw_sm_query_cfg
825
sm30_l1_shared_ld_transactions =
826
{
827
.type = NVC0_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS,
828
.ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000008),
829
.num_counters = 1,
830
.norm = { 1, 1 },
831
};
832
833
static const struct nvc0_hw_sm_query_cfg
834
sm30_l1_shared_st_transactions =
835
{
836
.type = NVC0_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS,
837
.ctr[0] = _CB(0x0001, B6, 0x0e, 0x0000000c),
838
.num_counters = 1,
839
.norm = { 1, 1 },
840
};
841
842
static const struct nvc0_hw_sm_query_cfg
843
sm30_local_ld =
844
{
845
.type = NVC0_HW_SM_QUERY_LOCAL_LD,
846
.ctr[0] = _CA(0x0001, B6, 0x1b, 0x00000008),
847
.num_counters = 1,
848
.norm = { 1, 1 },
849
};
850
851
static const struct nvc0_hw_sm_query_cfg
852
sm30_local_ld_transactions =
853
{
854
.type = NVC0_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS,
855
.ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000000),
856
.num_counters = 1,
857
.norm = { 1, 1 },
858
};
859
860
static const struct nvc0_hw_sm_query_cfg
861
sm30_local_st =
862
{
863
.type = NVC0_HW_SM_QUERY_LOCAL_ST,
864
.ctr[0] = _CA(0x0001, B6, 0x1b, 0x0000000c),
865
.num_counters = 1,
866
.norm = { 1, 1 },
867
};
868
869
static const struct nvc0_hw_sm_query_cfg
870
sm30_local_st_transactions =
871
{
872
.type = NVC0_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS,
873
.ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000004),
874
.num_counters = 1,
875
.norm = { 1, 1 },
876
};
877
878
static const struct nvc0_hw_sm_query_cfg
879
sm30_prof_trigger_0 =
880
{
881
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_0,
882
.ctr[0] = _CA(0x0001, B6, 0x01, 0x00000000),
883
.num_counters = 1,
884
.norm = { 1, 1 },
885
};
886
887
static const struct nvc0_hw_sm_query_cfg
888
sm30_prof_trigger_1 =
889
{
890
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_1,
891
.ctr[0] = _CA(0x0001, B6, 0x01, 0x00000004),
892
.num_counters = 1,
893
.norm = { 1, 1 },
894
};
895
896
static const struct nvc0_hw_sm_query_cfg
897
sm30_prof_trigger_2 =
898
{
899
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_2,
900
.ctr[0] = _CA(0x0001, B6, 0x01, 0x00000008),
901
.num_counters = 1,
902
.norm = { 1, 1 },
903
};
904
905
static const struct nvc0_hw_sm_query_cfg
906
sm30_prof_trigger_3 =
907
{
908
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_3,
909
.ctr[0] = _CA(0x0001, B6, 0x01, 0x0000000c),
910
.num_counters = 1,
911
.norm = { 1, 1 },
912
};
913
914
static const struct nvc0_hw_sm_query_cfg
915
sm30_prof_trigger_4 =
916
{
917
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_4,
918
.ctr[0] = _CA(0x0001, B6, 0x01, 0x00000010),
919
.num_counters = 1,
920
.norm = { 1, 1 },
921
};
922
923
static const struct nvc0_hw_sm_query_cfg
924
sm30_prof_trigger_5 =
925
{
926
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_5,
927
.ctr[0] = _CA(0x0001, B6, 0x01, 0x00000014),
928
.num_counters = 1,
929
.norm = { 1, 1 },
930
};
931
932
static const struct nvc0_hw_sm_query_cfg
933
sm30_prof_trigger_6 =
934
{
935
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_6,
936
.ctr[0] = _CA(0x0001, B6, 0x01, 0x00000018),
937
.num_counters = 1,
938
.norm = { 1, 1 },
939
};
940
941
static const struct nvc0_hw_sm_query_cfg
942
sm30_prof_trigger_7 =
943
{
944
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_7,
945
.ctr[0] = _CA(0x0001, B6, 0x01, 0x0000001c),
946
.num_counters = 1,
947
.norm = { 1, 1 },
948
};
949
950
static const struct nvc0_hw_sm_query_cfg
951
sm30_shared_ld =
952
{
953
.type = NVC0_HW_SM_QUERY_SHARED_LD,
954
.ctr[0] = _CA(0x0001, B6, 0x1b, 0x00000000),
955
.num_counters = 1,
956
.norm = { 1, 1 },
957
};
958
959
static const struct nvc0_hw_sm_query_cfg
960
sm30_shared_ld_replay =
961
{
962
.type = NVC0_HW_SM_QUERY_SHARED_LD_REPLAY,
963
.ctr[0] = _CB(0x0001, B6, 0x08, 0x00000008),
964
.num_counters = 1,
965
.norm = { 1, 1 },
966
};
967
968
static const struct nvc0_hw_sm_query_cfg
969
sm30_shared_st =
970
{
971
.type = NVC0_HW_SM_QUERY_SHARED_ST,
972
.ctr[0] = _CA(0x0001, B6, 0x1b, 0x00000004),
973
.num_counters = 1,
974
.norm = { 1, 1 },
975
};
976
977
static const struct nvc0_hw_sm_query_cfg
978
sm30_shared_st_replay =
979
{
980
.type = NVC0_HW_SM_QUERY_SHARED_ST_REPLAY,
981
.ctr[0] = _CB(0x0001, B6, 0x08, 0x0000000c),
982
.num_counters = 1,
983
.norm = { 1, 1 },
984
};
985
986
static const struct nvc0_hw_sm_query_cfg
987
sm30_sm_cta_launched =
988
{
989
.type = NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED,
990
.ctr[0] = _CB(0x0001, B6, 0x02, 0x0000001c),
991
.num_counters = 1,
992
.norm = { 1, 1 },
993
};
994
995
static const struct nvc0_hw_sm_query_cfg
996
sm30_threads_launched =
997
{
998
.type = NVC0_HW_SM_QUERY_THREADS_LAUNCHED,
999
.ctr[0] = _CA(0x003f, B6, 0x03, 0x398a4188),
1000
.num_counters = 1,
1001
.norm = { 1, 1 },
1002
};
1003
1004
static const struct nvc0_hw_sm_query_cfg
1005
sm30_uncached_gld_transactions =
1006
{
1007
.type = NVC0_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS,
1008
.ctr[0] = _CB(0x0001, B6, 0x11, 0x00000000),
1009
.num_counters = 1,
1010
.norm = { 1, 1 },
1011
};
1012
1013
static const struct nvc0_hw_sm_query_cfg
1014
sm30_warps_launched =
1015
{
1016
.type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
1017
.ctr[0] = _CA(0x0001, B6, 0x03, 0x00000004),
1018
.num_counters = 1,
1019
.norm = { 1, 1 },
1020
};
1021
1022
/* NOTES:
 * active_warps: bit 0 alternates between 0 and 1 for an odd number of warps
 * inst_executed etc.: we only count a single warp scheduler
 */
1026
static const struct nvc0_hw_sm_query_cfg *sm30_hw_sm_queries[] =
1027
{
1028
&sm30_active_cycles,
1029
&sm30_active_warps,
1030
&sm30_atom_cas_count,
1031
&sm30_atom_count,
1032
&sm30_branch,
1033
&sm30_divergent_branch,
1034
&sm30_gld_request,
1035
&sm30_gld_mem_div_replay,
1036
&sm30_gst_transactions,
1037
&sm30_gst_mem_div_replay,
1038
&sm30_gred_count,
1039
&sm30_gst_request,
1040
&sm30_inst_executed,
1041
&sm30_inst_issued1,
1042
&sm30_inst_issued2,
1043
&sm30_l1_gld_hit,
1044
&sm30_l1_gld_miss,
1045
&sm30_l1_gld_transactions,
1046
&sm30_l1_gst_transactions,
1047
&sm30_l1_local_ld_hit,
1048
&sm30_l1_local_ld_miss,
1049
&sm30_l1_local_st_hit,
1050
&sm30_l1_local_st_miss,
1051
&sm30_l1_shared_ld_transactions,
1052
&sm30_l1_shared_st_transactions,
1053
&sm30_local_ld,
1054
&sm30_local_ld_transactions,
1055
&sm30_local_st,
1056
&sm30_local_st_transactions,
1057
&sm30_prof_trigger_0,
1058
&sm30_prof_trigger_1,
1059
&sm30_prof_trigger_2,
1060
&sm30_prof_trigger_3,
1061
&sm30_prof_trigger_4,
1062
&sm30_prof_trigger_5,
1063
&sm30_prof_trigger_6,
1064
&sm30_prof_trigger_7,
1065
&sm30_shared_ld,
1066
&sm30_shared_ld_replay,
1067
&sm30_shared_st,
1068
&sm30_shared_st_replay,
1069
&sm30_sm_cta_launched,
1070
&sm30_threads_launched,
1071
&sm30_uncached_gld_transactions,
1072
&sm30_warps_launched,
1073
};
1074
1075
/* ==== Compute capability 3.5 (GK110/GK208) ==== */
1076
static const struct nvc0_hw_sm_query_cfg
1077
sm35_atom_cas_count =
1078
{
1079
.type = NVC0_HW_SM_QUERY_ATOM_CAS_COUNT,
1080
.ctr[0] = _CA(0x0001, B6, 0x1a, 0x00000014),
1081
.num_counters = 1,
1082
.norm = { 1, 1 },
1083
};
1084
1085
static const struct nvc0_hw_sm_query_cfg
1086
sm35_atom_count =
1087
{
1088
.type = NVC0_HW_SM_QUERY_ATOM_COUNT,
1089
.ctr[0] = _CA(0x0001, B6, 0x1a, 0x00000010),
1090
.num_counters = 1,
1091
.norm = { 1, 1 },
1092
};
1093
1094
static const struct nvc0_hw_sm_query_cfg
1095
sm35_gred_count =
1096
{
1097
.type = NVC0_HW_SM_QUERY_GRED_COUNT,
1098
.ctr[0] = _CA(0x0001, B6, 0x1a, 0x00000018),
1099
.num_counters = 1,
1100
.norm = { 1, 1 },
1101
};
1102
1103
static const struct nvc0_hw_sm_query_cfg
1104
sm35_not_pred_off_inst_executed =
1105
{
1106
.type = NVC0_HW_SM_QUERY_NOT_PRED_OFF_INST_EXECUTED,
1107
.ctr[0] = _CA(0x003f, B6, 0x14, 0x29062080),
1108
.num_counters = 1,
1109
.norm = { 1, 1 },
1110
};
1111
1112
static const struct nvc0_hw_sm_query_cfg
1113
sm35_shared_ld_replay =
1114
{
1115
.type = NVC0_HW_SM_QUERY_SHARED_LD_REPLAY,
1116
.ctr[0] = _CB(0xaaaa, LOGOP, 0x13, 0x00000018),
1117
.ctr[1] = _CB(0x8888, LOGOP, 0x08, 0x00000151),
1118
.num_counters = 2,
1119
.norm = { 1, 1 },
1120
};
1121
1122
static const struct nvc0_hw_sm_query_cfg
1123
sm35_shared_st_replay =
1124
{
1125
.type = NVC0_HW_SM_QUERY_SHARED_ST_REPLAY,
1126
.ctr[0] = _CB(0xaaaa, LOGOP, 0x13, 0x00000018),
1127
.ctr[1] = _CB(0x8888, LOGOP, 0x08, 0x000001d1),
1128
.num_counters = 2,
1129
.norm = { 1, 1 },
1130
};
1131
1132
static const struct nvc0_hw_sm_query_cfg
1133
sm35_th_inst_executed =
1134
{
1135
.type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED,
1136
.ctr[0] = _CA(0x003f, B6, 0x11, 0x29062080),
1137
.num_counters = 1,
1138
.norm = { 1, 1 },
1139
};
1140
1141
static const struct nvc0_hw_sm_query_cfg *sm35_hw_sm_queries[] =
1142
{
1143
&sm30_active_cycles,
1144
&sm30_active_warps,
1145
&sm35_atom_cas_count,
1146
&sm35_atom_count,
1147
&sm30_gld_request,
1148
&sm30_gld_mem_div_replay,
1149
&sm30_gst_transactions,
1150
&sm30_gst_mem_div_replay,
1151
&sm35_gred_count,
1152
&sm30_gst_request,
1153
&sm30_inst_executed,
1154
&sm30_inst_issued1,
1155
&sm30_inst_issued2,
1156
&sm30_l1_gld_hit,
1157
&sm30_l1_gld_miss,
1158
&sm30_l1_gld_transactions,
1159
&sm30_l1_gst_transactions,
1160
&sm30_l1_local_ld_hit,
1161
&sm30_l1_local_ld_miss,
1162
&sm30_l1_local_st_hit,
1163
&sm30_l1_local_st_miss,
1164
&sm30_l1_shared_ld_transactions,
1165
&sm30_l1_shared_st_transactions,
1166
&sm30_local_ld,
1167
&sm30_local_ld_transactions,
1168
&sm30_local_st,
1169
&sm30_local_st_transactions,
1170
&sm35_not_pred_off_inst_executed,
1171
&sm30_prof_trigger_0,
1172
&sm30_prof_trigger_1,
1173
&sm30_prof_trigger_2,
1174
&sm30_prof_trigger_3,
1175
&sm30_prof_trigger_4,
1176
&sm30_prof_trigger_5,
1177
&sm30_prof_trigger_6,
1178
&sm30_prof_trigger_7,
1179
&sm30_shared_ld,
1180
&sm35_shared_ld_replay,
1181
&sm30_shared_st,
1182
&sm35_shared_st_replay,
1183
&sm30_sm_cta_launched,
1184
&sm35_th_inst_executed,
1185
&sm30_threads_launched,
1186
&sm30_uncached_gld_transactions,
1187
&sm30_warps_launched,
1188
};
1189
1190
/* ==== Compute capability 5.0 (GM107/GM108) ==== */
1191
static const struct nvc0_hw_sm_query_cfg
1192
sm50_active_ctas =
1193
{
1194
.type = NVC0_HW_SM_QUERY_ACTIVE_CTAS,
1195
.ctr[0] = _CB(0x003f, B6, 0x01, 0x29062080),
1196
.num_counters = 1,
1197
.norm = { 1, 1 },
1198
};
1199
1200
static const struct nvc0_hw_sm_query_cfg
1201
sm50_active_cycles =
1202
{
1203
.type = NVC0_HW_SM_QUERY_ACTIVE_CYCLES,
1204
.ctr[0] = _CB(0x0001, B6, 0x00, 0x00000004),
1205
.num_counters = 1,
1206
.norm = { 1, 1 },
1207
};
1208
1209
static const struct nvc0_hw_sm_query_cfg
1210
sm50_active_warps =
1211
{
1212
.type = NVC0_HW_SM_QUERY_ACTIVE_WARPS,
1213
.ctr[0] = _CB(0x003f, B6, 0x00, 0x398a4188),
1214
.num_counters = 1,
1215
.norm = { 1, 1 },
1216
};
1217
1218
static const struct nvc0_hw_sm_query_cfg
1219
sm50_atom_count =
1220
{
1221
.type = NVC0_HW_SM_QUERY_ATOM_COUNT,
1222
.ctr[0] = _CA(0x0001, B6, 0x14, 0x00000004),
1223
.num_counters = 1,
1224
.norm = { 1, 1 },
1225
};
1226
1227
static const struct nvc0_hw_sm_query_cfg
1228
sm50_branch =
1229
{
1230
.type = NVC0_HW_SM_QUERY_BRANCH,
1231
.ctr[0] = _CA(0x0001, B6, 0x1a, 0x00000010),
1232
.num_counters = 1,
1233
.norm = { 1, 1 },
1234
};
1235
1236
static const struct nvc0_hw_sm_query_cfg
1237
sm50_divergent_branch =
1238
{
1239
.type = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,
1240
.ctr[0] = _CA(0x0001, B6, 0x1a, 0x00000004),
1241
.num_counters = 1,
1242
.norm = { 1, 1 },
1243
};
1244
1245
static const struct nvc0_hw_sm_query_cfg
1246
sm50_global_atom_cas =
1247
{
1248
.type = NVC0_HW_SM_QUERY_GLOBAL_ATOM_CAS,
1249
.ctr[0] = _CA(0x0001, B6, 0x14, 0x00000000),
1250
.num_counters = 1,
1251
.norm = { 1, 1 },
1252
};
1253
1254
static const struct nvc0_hw_sm_query_cfg
1255
sm50_global_ld =
1256
{
1257
.type = NVC0_HW_SM_QUERY_GLOBAL_LD,
1258
.ctr[0] = _CA(0x0001, B6, 0x14, 0x0000000c),
1259
.num_counters = 1,
1260
.norm = { 1, 1 },
1261
};
1262
1263
static const struct nvc0_hw_sm_query_cfg
1264
sm50_global_st =
1265
{
1266
.type = NVC0_HW_SM_QUERY_GLOBAL_ST,
1267
.ctr[0] = _CA(0x0001, B6, 0x14, 0x00000010),
1268
.num_counters = 1,
1269
.norm = { 1, 1 },
1270
};
1271
1272
static const struct nvc0_hw_sm_query_cfg
1273
sm50_gred_count =
1274
{
1275
.type = NVC0_HW_SM_QUERY_GRED_COUNT,
1276
.ctr[0] = _CA(0x0001, B6, 0x14, 0x00000008),
1277
.num_counters = 1,
1278
.norm = { 1, 1 },
1279
};
1280
1281
static const struct nvc0_hw_sm_query_cfg
1282
sm50_inst_executed =
1283
{
1284
.type = NVC0_HW_SM_QUERY_INST_EXECUTED,
1285
.ctr[0] = _CA(0x0003, B6, 0x02, 0x00000398),
1286
.num_counters = 1,
1287
.norm = { 1, 1 },
1288
};
1289
1290
static const struct nvc0_hw_sm_query_cfg
1291
sm50_inst_issued0 =
1292
{
1293
.type = NVC0_HW_SM_QUERY_INST_ISSUED0,
1294
.ctr[0] = _CA(0x0001, B6, 0x02, 0x0000000c),
1295
.num_counters = 1,
1296
.norm = { 1, 1 },
1297
};
1298
1299
static const struct nvc0_hw_sm_query_cfg
1300
sm50_inst_issued1 =
1301
{
1302
.type = NVC0_HW_SM_QUERY_INST_ISSUED1,
1303
.ctr[0] = _CA(0x0001, B6, 0x02, 0x00000010),
1304
.num_counters = 1,
1305
.norm = { 1, 1 },
1306
};
1307
1308
static const struct nvc0_hw_sm_query_cfg
1309
sm50_inst_issued2 =
1310
{
1311
.type = NVC0_HW_SM_QUERY_INST_ISSUED2,
1312
.ctr[0] = _CA(0x0001, B6, 0x02, 0x00000014),
1313
.num_counters = 1,
1314
.norm = { 1, 1 },
1315
};
1316
1317
static const struct nvc0_hw_sm_query_cfg
1318
sm50_local_ld =
1319
{
1320
.type = NVC0_HW_SM_QUERY_LOCAL_LD,
1321
.ctr[0] = _CA(0x0001, B6, 0x13, 0x00000004),
1322
.num_counters = 1,
1323
.norm = { 1, 1 },
1324
};
1325
1326
static const struct nvc0_hw_sm_query_cfg
1327
sm50_local_st =
1328
{
1329
.type = NVC0_HW_SM_QUERY_LOCAL_ST,
1330
.ctr[0] = _CA(0x0001, B6, 0x13, 0x00000000),
1331
.num_counters = 1,
1332
.norm = { 1, 1 },
1333
};
1334
1335
static const struct nvc0_hw_sm_query_cfg
1336
sm50_not_pred_off_inst_executed =
1337
{
1338
.type = NVC0_HW_SM_QUERY_NOT_PRED_OFF_INST_EXECUTED,
1339
.ctr[0] = _CA(0x003f, B6, 0x05, 0x29062080),
1340
.num_counters = 1,
1341
.norm = { 1, 1 },
1342
};
1343
1344
static const struct nvc0_hw_sm_query_cfg
1345
sm50_prof_trigger_0 =
1346
{
1347
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_0,
1348
.ctr[0] = _CA(0x0001, B6, 0x00, 0x00000000),
1349
.num_counters = 1,
1350
.norm = { 1, 1 },
1351
};
1352
1353
static const struct nvc0_hw_sm_query_cfg
1354
sm50_prof_trigger_1 =
1355
{
1356
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_1,
1357
.ctr[0] = _CA(0x0001, B6, 0x00, 0x00000004),
1358
.num_counters = 1,
1359
.norm = { 1, 1 },
1360
};
1361
1362
static const struct nvc0_hw_sm_query_cfg
1363
sm50_prof_trigger_2 =
1364
{
1365
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_2,
1366
.ctr[0] = _CA(0x0001, B6, 0x00, 0x00000008),
1367
.num_counters = 1,
1368
.norm = { 1, 1 },
1369
};
1370
1371
static const struct nvc0_hw_sm_query_cfg
1372
sm50_prof_trigger_3 =
1373
{
1374
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_3,
1375
.ctr[0] = _CA(0x0001, B6, 0x00, 0x0000000c),
1376
.num_counters = 1,
1377
.norm = { 1, 1 },
1378
};
1379
1380
static const struct nvc0_hw_sm_query_cfg
1381
sm50_prof_trigger_4 =
1382
{
1383
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_4,
1384
.ctr[0] = _CA(0x0001, B6, 0x00, 0x00000010),
1385
.num_counters = 1,
1386
.norm = { 1, 1 },
1387
};
1388
1389
static const struct nvc0_hw_sm_query_cfg
1390
sm50_prof_trigger_5 =
1391
{
1392
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_5,
1393
.ctr[0] = _CA(0x0001, B6, 0x00, 0x00000014),
1394
.num_counters = 1,
1395
.norm = { 1, 1 },
1396
};
1397
1398
static const struct nvc0_hw_sm_query_cfg
1399
sm50_prof_trigger_6 =
1400
{
1401
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_6,
1402
.ctr[0] = _CA(0x0001, B6, 0x00, 0x00000018),
1403
.num_counters = 1,
1404
.norm = { 1, 1 },
1405
};
1406
1407
static const struct nvc0_hw_sm_query_cfg
1408
sm50_prof_trigger_7 =
1409
{
1410
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_7,
1411
.ctr[0] = _CA(0x0001, B6, 0x00, 0x0000001c),
1412
.num_counters = 1,
1413
.norm = { 1, 1 },
1414
};
1415
1416
static const struct nvc0_hw_sm_query_cfg
1417
sm50_shared_atom =
1418
{
1419
.type = NVC0_HW_SM_QUERY_SHARED_ATOM,
1420
.ctr[0] = _CA(0x0001, B6, 0x13, 0x00000014),
1421
.num_counters = 1,
1422
.norm = { 1, 1 },
1423
};
1424
1425
static const struct nvc0_hw_sm_query_cfg
1426
sm50_shared_atom_cas =
1427
{
1428
.type = NVC0_HW_SM_QUERY_SHARED_ATOM_CAS,
1429
.ctr[0] = _CA(0x0001, B6, 0x13, 0x00000010),
1430
.num_counters = 1,
1431
.norm = { 1, 1 },
1432
};
1433
1434
static const struct nvc0_hw_sm_query_cfg
1435
sm50_shared_ld =
1436
{
1437
.type = NVC0_HW_SM_QUERY_SHARED_LD,
1438
.ctr[0] = _CA(0x0001, B6, 0x13, 0x00000008),
1439
.num_counters = 1,
1440
.norm = { 1, 1 },
1441
};
1442
1443
static const struct nvc0_hw_sm_query_cfg
1444
sm50_shared_ld_bank_conflict =
1445
{
1446
.type = NVC0_HW_SM_QUERY_SHARED_LD_BANK_CONFLICT,
1447
.ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000000),
1448
.num_counters = 1,
1449
.norm = { 1, 1 },
1450
};
1451
1452
static const struct nvc0_hw_sm_query_cfg
1453
sm50_shared_ld_transactions =
1454
{
1455
.type = NVC0_HW_SM_QUERY_SHARED_LD_TRANSACTIONS,
1456
.ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000008),
1457
.num_counters = 1,
1458
.norm = { 1, 1 },
1459
};
1460
1461
static const struct nvc0_hw_sm_query_cfg
1462
sm50_shared_st =
1463
{
1464
.type = NVC0_HW_SM_QUERY_SHARED_ST,
1465
.ctr[0] = _CA(0x0001, B6, 0x13, 0x0000000c),
1466
.num_counters = 1,
1467
.norm = { 1, 1 },
1468
};
1469
1470
static const struct nvc0_hw_sm_query_cfg
1471
sm50_shared_st_bank_conflict =
1472
{
1473
.type = NVC0_HW_SM_QUERY_SHARED_ST_BANK_CONFLICT,
1474
.ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000004),
1475
.num_counters = 1,
1476
.norm = { 1, 1 },
1477
};
1478
1479
static const struct nvc0_hw_sm_query_cfg
1480
sm50_shared_st_transactions =
1481
{
1482
.type = NVC0_HW_SM_QUERY_SHARED_ST_TRANSACTIONS,
1483
.ctr[0] = _CB(0x0001, B6, 0x0e, 0x0000000c),
1484
.num_counters = 1,
1485
.norm = { 1, 1 },
1486
};
1487
1488
static const struct nvc0_hw_sm_query_cfg
1489
sm50_sm_cta_launched =
1490
{
1491
.type = NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED,
1492
.ctr[0] = _CB(0x0001, B6, 0x01, 0x00000018),
1493
.num_counters = 1,
1494
.norm = { 1, 1 },
1495
};
1496
1497
static const struct nvc0_hw_sm_query_cfg
1498
sm50_th_inst_executed =
1499
{
1500
.type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED,
1501
.ctr[0] = _CA(0x003f, B6, 0x04, 0x29062080),
1502
.num_counters = 1,
1503
.norm = { 1, 1 },
1504
};
1505
1506
static const struct nvc0_hw_sm_query_cfg
1507
sm50_warps_launched =
1508
{
1509
.type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
1510
.ctr[0] = _CA(0x0001, B6, 0x02, 0x00000008),
1511
.num_counters = 1,
1512
.norm = { 1, 1 },
1513
};
1514
1515
static const struct nvc0_hw_sm_query_cfg *sm50_hw_sm_queries[] =
1516
{
1517
&sm50_active_ctas,
1518
&sm50_active_cycles,
1519
&sm50_active_warps,
1520
&sm50_atom_count,
1521
&sm50_branch,
1522
&sm50_divergent_branch,
1523
&sm50_global_atom_cas,
1524
&sm50_global_ld,
1525
&sm50_global_st,
1526
&sm50_gred_count,
1527
&sm50_inst_executed,
1528
&sm50_inst_issued0,
1529
&sm50_inst_issued1,
1530
&sm50_inst_issued2,
1531
&sm50_local_ld,
1532
&sm50_local_st,
1533
&sm50_not_pred_off_inst_executed,
1534
&sm50_prof_trigger_0,
1535
&sm50_prof_trigger_1,
1536
&sm50_prof_trigger_2,
1537
&sm50_prof_trigger_3,
1538
&sm50_prof_trigger_4,
1539
&sm50_prof_trigger_5,
1540
&sm50_prof_trigger_6,
1541
&sm50_prof_trigger_7,
1542
&sm50_shared_atom,
1543
&sm50_shared_atom_cas,
1544
&sm50_shared_ld,
1545
&sm50_shared_ld_bank_conflict,
1546
&sm50_shared_ld_transactions,
1547
&sm50_shared_st,
1548
&sm50_shared_st_bank_conflict,
1549
&sm50_shared_st_transactions,
1550
&sm50_sm_cta_launched,
1551
&sm50_th_inst_executed,
1552
&sm50_warps_launched,
1553
};
1554
1555
/* ==== Compute capability 5.2 (GM200/GM204/GM206) ==== */
1556
static const struct nvc0_hw_sm_query_cfg
1557
sm52_atom_count =
1558
{
1559
.type = NVC0_HW_SM_QUERY_ATOM_COUNT,
1560
.ctr[0] = _CA(0x0001, B6, 0x0a, 0x0000001c),
1561
.num_counters = 1,
1562
.norm = { 1, 1 },
1563
};
1564
1565
static const struct nvc0_hw_sm_query_cfg
1566
sm52_global_atom_cas =
1567
{
1568
.type = NVC0_HW_SM_QUERY_GLOBAL_ATOM_CAS,
1569
.ctr[0] = _CA(0x0001, B6, 0x0a, 0x00000018),
1570
.num_counters = 1,
1571
.norm = { 1, 1 },
1572
};
1573
1574
static const struct nvc0_hw_sm_query_cfg
1575
sm52_global_ld =
1576
{
1577
.type = NVC0_HW_SM_QUERY_GLOBAL_LD,
1578
.ctr[0] = _CA(0x0001, B6, 0x0b, 0x00000018),
1579
.num_counters = 1,
1580
.norm = { 1, 1 },
1581
};
1582
1583
static const struct nvc0_hw_sm_query_cfg
1584
sm52_global_st =
1585
{
1586
.type = NVC0_HW_SM_QUERY_GLOBAL_ST,
1587
.ctr[0] = _CA(0x0001, B6, 0x0b, 0x0000001c),
1588
.num_counters = 1,
1589
.norm = { 1, 1 },
1590
};
1591
1592
static const struct nvc0_hw_sm_query_cfg
1593
sm52_gred_count =
1594
{
1595
.type = NVC0_HW_SM_QUERY_GRED_COUNT,
1596
.ctr[0] = _CA(0x0001, B6, 0x0f, 0x00000018),
1597
.num_counters = 1,
1598
.norm = { 1, 1 },
1599
};
1600
1601
static const struct nvc0_hw_sm_query_cfg
1602
sm52_inst_executed =
1603
{
1604
.type = NVC0_HW_SM_QUERY_INST_EXECUTED,
1605
.ctr[0] = _CA(0x0003, B6, 0x03, 0x0000020c),
1606
.num_counters = 1,
1607
.norm = { 1, 1 },
1608
};
1609
1610
static const struct nvc0_hw_sm_query_cfg
1611
sm52_inst_issued0 =
1612
{
1613
.type = NVC0_HW_SM_QUERY_INST_ISSUED0,
1614
.ctr[0] = _CA(0x0001, B6, 0x03, 0x00000000),
1615
.num_counters = 1,
1616
.norm = { 1, 1 },
1617
};
1618
1619
static const struct nvc0_hw_sm_query_cfg
1620
sm52_inst_issued1 =
1621
{
1622
.type = NVC0_HW_SM_QUERY_INST_ISSUED1,
1623
.ctr[0] = _CA(0x0001, B6, 0x03, 0x00000004),
1624
.num_counters = 1,
1625
.norm = { 1, 1 },
1626
};
1627
1628
static const struct nvc0_hw_sm_query_cfg
1629
sm52_inst_issued2 =
1630
{
1631
.type = NVC0_HW_SM_QUERY_INST_ISSUED2,
1632
.ctr[0] = _CA(0x0001, B6, 0x03, 0x00000008),
1633
.num_counters = 1,
1634
.norm = { 1, 1 },
1635
};
1636
1637
static const struct nvc0_hw_sm_query_cfg
1638
sm52_local_ld =
1639
{
1640
.type = NVC0_HW_SM_QUERY_LOCAL_LD,
1641
.ctr[0] = _CA(0x0001, B6, 0x06, 0x0000001c),
1642
.num_counters = 1,
1643
.norm = { 1, 1 },
1644
};
1645
1646
static const struct nvc0_hw_sm_query_cfg
1647
sm52_local_st =
1648
{
1649
.type = NVC0_HW_SM_QUERY_LOCAL_ST,
1650
.ctr[0] = _CA(0x0001, B6, 0x06, 0x00000018),
1651
.num_counters = 1,
1652
.norm = { 1, 1 },
1653
};
1654
1655
static const struct nvc0_hw_sm_query_cfg
1656
sm52_shared_atom =
1657
{
1658
.type = NVC0_HW_SM_QUERY_SHARED_ATOM,
1659
.ctr[0] = _CA(0x0001, B6, 0x08, 0x0000001c),
1660
.num_counters = 1,
1661
.norm = { 1, 1 },
1662
};
1663
1664
static const struct nvc0_hw_sm_query_cfg
1665
sm52_shared_atom_cas =
1666
{
1667
.type = NVC0_HW_SM_QUERY_SHARED_ATOM_CAS,
1668
.ctr[0] = _CA(0x0001, B6, 0x08, 0x00000018),
1669
.num_counters = 1,
1670
.norm = { 1, 1 },
1671
};
1672
1673
static const struct nvc0_hw_sm_query_cfg
1674
sm52_shared_ld =
1675
{
1676
.type = NVC0_HW_SM_QUERY_SHARED_LD,
1677
.ctr[0] = _CA(0x0001, B6, 0x07, 0x00000018),
1678
.num_counters = 1,
1679
.norm = { 1, 1 },
1680
};
1681
1682
static const struct nvc0_hw_sm_query_cfg
1683
sm52_shared_st =
1684
{
1685
.type = NVC0_HW_SM_QUERY_SHARED_ST,
1686
.ctr[0] = _CA(0x0001, B6, 0x07, 0x0000001c),
1687
.num_counters = 1,
1688
.norm = { 1, 1 },
1689
};
1690
1691
static const struct nvc0_hw_sm_query_cfg
1692
sm52_warps_launched =
1693
{
1694
.type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
1695
.ctr[0] = _CA(0x0001, B6, 0x02, 0x0000001c),
1696
.num_counters = 1,
1697
.norm = { 1, 1 },
1698
};
1699
1700
static const struct nvc0_hw_sm_query_cfg *sm52_hw_sm_queries[] =
1701
{
1702
&sm50_active_ctas,
1703
&sm50_active_cycles,
1704
&sm50_active_warps,
1705
&sm52_atom_count,
1706
&sm50_branch,
1707
&sm50_divergent_branch,
1708
&sm52_global_atom_cas,
1709
&sm52_global_ld,
1710
&sm52_global_st,
1711
&sm52_gred_count,
1712
&sm52_inst_executed,
1713
&sm52_inst_issued0,
1714
&sm52_inst_issued1,
1715
&sm52_inst_issued2,
1716
&sm52_local_ld,
1717
&sm52_local_st,
1718
&sm50_not_pred_off_inst_executed,
1719
&sm50_prof_trigger_0,
1720
&sm50_prof_trigger_1,
1721
&sm50_prof_trigger_2,
1722
&sm50_prof_trigger_3,
1723
&sm50_prof_trigger_4,
1724
&sm50_prof_trigger_5,
1725
&sm50_prof_trigger_6,
1726
&sm50_prof_trigger_7,
1727
&sm52_shared_atom,
1728
&sm52_shared_atom_cas,
1729
&sm52_shared_ld,
1730
&sm50_shared_ld_bank_conflict,
1731
&sm50_shared_ld_transactions,
1732
&sm52_shared_st,
1733
&sm50_shared_st_bank_conflict,
1734
&sm50_shared_st_transactions,
1735
&sm50_sm_cta_launched,
1736
&sm50_th_inst_executed,
1737
&sm52_warps_launched,
1738
};
1739
1740
/* The _Q, _CA and _CB helper macros are not used past this point. */
#undef _Q
#undef _CA
#undef _CB
1743
1744
/* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
1745
/* NOTES:
1746
* - MP counters on GF100/GF110 (compute capability 2.0) are buggy
1747
* because there is a context-switch problem that we need to fix.
1748
* Results might be wrong sometimes, be careful!
1749
*/
1750
/*
 * Pre-assembled program, stored as 64-bit words.  The trailing comments
 * reproduce the assembly listing that accompanies it, in order; there are
 * 23 listed instructions and 23 words, so each word is presumably the
 * encoding of the instruction written next to it (not re-verified here).
 */
static const uint64_t nvc0_read_hw_sm_counters_code[] = {
   0x2c00000084021c04ULL, /* mov b32 $r8 $tidx */
   0x2c0000000c025c04ULL, /* mov b32 $r9 $physid */
   0x2c00000010001c04ULL, /* mov b32 $r0 $pm0 */
   0x2c00000014005c04ULL, /* mov b32 $r1 $pm1 */
   0x2c00000018009c04ULL, /* mov b32 $r2 $pm2 */
   0x2c0000001c00dc04ULL, /* mov b32 $r3 $pm3 */
   0x2c00000020011c04ULL, /* mov b32 $r4 $pm4 */
   0x2c00000024015c04ULL, /* mov b32 $r5 $pm5 */
   0x2c00000028019c04ULL, /* mov b32 $r6 $pm6 */
   0x2c0000002c01dc04ULL, /* mov b32 $r7 $pm7 */
   0x190e0000fc81dc03ULL, /* set $p0 0x1 eq u32 $r8 0x0 */
   0x28007c1a80029de4ULL, /* mov b32 $r10 c15[0x6a0] */
   0x28007c1a9002dde4ULL, /* mov b32 $r11 c15[0x6a4] */
   0x7000c01050921c03ULL, /* ext u32 $r8 $r9 0x414 */
   0x80000000000021e7ULL, /* (not $p0) exit */
   0x10000000c0821c02ULL, /* mul $r8 u32 $r8 u32 48 */
   0x4801000020a29c03ULL, /* add b32 $r10 $c $r10 $r8 */
   0x0800000000b2dc42ULL, /* add b32 $r11 $r11 0x0 $c */
   0x28007c1aa0021de4ULL, /* mov b32 $r8 c15[0x6a8] */
   0x9400000000a01fc5ULL, /* st b128 wt g[$r10d+0x00] $r0q */
   0x9400000040a11fc5ULL, /* st b128 wt g[$r10d+0x10] $r4q */
   0x9400000080a21f85ULL, /* st b32 wt g[$r10d+0x20] $r8 */
   0x8000000000001de7ULL  /* exit */
};
1799
1800
/* Expands to a positional initializer for one counter entry: f, then the
 * NVC0_COMPUTE_MP_PM_OP_MODE_<o> constant, a literal 0, then g, m and s. */
#define _C(f, o, g, m, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, g, m, s }
1801
1802
/* ==== Compute capability 2.0 (GF100/GF110) ==== */
1803
static const struct nvc0_hw_sm_query_cfg
1804
sm20_active_cycles =
1805
{
1806
.type = NVC0_HW_SM_QUERY_ACTIVE_CYCLES,
1807
.ctr[0] = _C(0xaaaa, LOGOP, 0x11, 0x000000ff, 0x00000000),
1808
.num_counters = 1,
1809
.norm = { 1, 1 },
1810
};
1811
1812
static const struct nvc0_hw_sm_query_cfg
1813
sm20_active_warps =
1814
{
1815
.type = NVC0_HW_SM_QUERY_ACTIVE_WARPS,
1816
.ctr[0] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000010),
1817
.ctr[1] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000020),
1818
.ctr[2] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000030),
1819
.ctr[3] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000040),
1820
.ctr[4] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000050),
1821
.ctr[5] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000060),
1822
.num_counters = 6,
1823
.norm = { 1, 1 },
1824
};
1825
1826
static const struct nvc0_hw_sm_query_cfg
1827
sm20_atom_count =
1828
{
1829
.type = NVC0_HW_SM_QUERY_ATOM_COUNT,
1830
.ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000030),
1831
.num_counters = 1,
1832
.norm = { 1, 1 },
1833
};
1834
1835
static const struct nvc0_hw_sm_query_cfg
1836
sm20_branch =
1837
{
1838
.type = NVC0_HW_SM_QUERY_BRANCH,
1839
.ctr[0] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000000),
1840
.ctr[1] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000010),
1841
.num_counters = 2,
1842
.norm = { 1, 1 },
1843
};
1844
1845
static const struct nvc0_hw_sm_query_cfg
1846
sm20_divergent_branch =
1847
{
1848
.type = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,
1849
.ctr[0] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000020),
1850
.ctr[1] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000030),
1851
.num_counters = 2,
1852
.norm = { 1, 1 },
1853
};
1854
1855
static const struct nvc0_hw_sm_query_cfg
1856
sm20_gld_request =
1857
{
1858
.type = NVC0_HW_SM_QUERY_GLD_REQUEST,
1859
.ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000030),
1860
.num_counters = 1,
1861
.norm = { 1, 1 },
1862
};
1863
1864
static const struct nvc0_hw_sm_query_cfg
1865
sm20_gred_count =
1866
{
1867
.type = NVC0_HW_SM_QUERY_GRED_COUNT,
1868
.ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000040),
1869
.num_counters = 1,
1870
.norm = { 1, 1 },
1871
};
1872
1873
static const struct nvc0_hw_sm_query_cfg
1874
sm20_gst_request =
1875
{
1876
.type = NVC0_HW_SM_QUERY_GST_REQUEST,
1877
.ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000060),
1878
.num_counters = 1,
1879
.norm = { 1, 1 },
1880
};
1881
1882
static const struct nvc0_hw_sm_query_cfg
1883
sm20_inst_executed =
1884
{
1885
.type = NVC0_HW_SM_QUERY_INST_EXECUTED,
1886
.ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001000),
1887
.ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001010),
1888
.num_counters = 2,
1889
.norm = { 1, 1 },
1890
};
1891
1892
static const struct nvc0_hw_sm_query_cfg
1893
sm20_inst_issued =
1894
{
1895
.type = NVC0_HW_SM_QUERY_INST_ISSUED,
1896
.ctr[0] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007060),
1897
.ctr[1] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007070),
1898
.num_counters = 2,
1899
.norm = { 1, 1 },
1900
};
1901
1902
static const struct nvc0_hw_sm_query_cfg
1903
sm20_local_ld =
1904
{
1905
.type = NVC0_HW_SM_QUERY_LOCAL_LD,
1906
.ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000020),
1907
.num_counters = 1,
1908
.norm = { 1, 1 },
1909
};
1910
1911
static const struct nvc0_hw_sm_query_cfg
1912
sm20_local_st =
1913
{
1914
.type = NVC0_HW_SM_QUERY_LOCAL_ST,
1915
.ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000050),
1916
.num_counters = 1,
1917
.norm = { 1, 1 },
1918
};
1919
1920
static const struct nvc0_hw_sm_query_cfg
1921
sm20_prof_trigger_0 =
1922
{
1923
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_0,
1924
.ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000000),
1925
.num_counters = 1,
1926
.norm = { 1, 1 },
1927
};
1928
1929
static const struct nvc0_hw_sm_query_cfg
1930
sm20_prof_trigger_1 =
1931
{
1932
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_1,
1933
.ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000010),
1934
.num_counters = 1,
1935
.norm = { 1, 1 },
1936
};
1937
1938
static const struct nvc0_hw_sm_query_cfg
1939
sm20_prof_trigger_2 =
1940
{
1941
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_2,
1942
.ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000020),
1943
.num_counters = 1,
1944
.norm = { 1, 1 },
1945
};
1946
1947
static const struct nvc0_hw_sm_query_cfg
1948
sm20_prof_trigger_3 =
1949
{
1950
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_3,
1951
.ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000030),
1952
.num_counters = 1,
1953
.norm = { 1, 1 },
1954
};
1955
1956
static const struct nvc0_hw_sm_query_cfg
1957
sm20_prof_trigger_4 =
1958
{
1959
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_4,
1960
.ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000040),
1961
.num_counters = 1,
1962
.norm = { 1, 1 },
1963
};
1964
1965
static const struct nvc0_hw_sm_query_cfg
1966
sm20_prof_trigger_5 =
1967
{
1968
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_5,
1969
.ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000050),
1970
.num_counters = 1,
1971
.norm = { 1, 1 },
1972
};
1973
1974
static const struct nvc0_hw_sm_query_cfg
1975
sm20_prof_trigger_6 =
1976
{
1977
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_6,
1978
.ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000060),
1979
.num_counters = 1,
1980
.norm = { 1, 1 },
1981
};
1982
1983
static const struct nvc0_hw_sm_query_cfg
1984
sm20_prof_trigger_7 =
1985
{
1986
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_7,
1987
.ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000070),
1988
.num_counters = 1,
1989
.norm = { 1, 1 },
1990
};
1991
1992
static const struct nvc0_hw_sm_query_cfg
1993
sm20_shared_ld =
1994
{
1995
.type = NVC0_HW_SM_QUERY_SHARED_LD,
1996
.ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000010),
1997
.num_counters = 1,
1998
.norm = { 1, 1 },
1999
};
2000
2001
static const struct nvc0_hw_sm_query_cfg
2002
sm20_shared_st =
2003
{
2004
.type = NVC0_HW_SM_QUERY_SHARED_ST,
2005
.ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000040),
2006
.num_counters = 1,
2007
.norm = { 1, 1 },
2008
};
2009
2010
static const struct nvc0_hw_sm_query_cfg
2011
sm20_threads_launched =
2012
{
2013
.type = NVC0_HW_SM_QUERY_THREADS_LAUNCHED,
2014
.ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000010),
2015
.ctr[1] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000020),
2016
.ctr[2] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000030),
2017
.ctr[3] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000040),
2018
.ctr[4] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000050),
2019
.ctr[5] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000060),
2020
.num_counters = 6,
2021
.norm = { 1, 1 },
2022
};
2023
2024
static const struct nvc0_hw_sm_query_cfg
2025
sm20_th_inst_executed_0 =
2026
{
2027
.type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0,
2028
.ctr[0] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000000),
2029
.ctr[1] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000010),
2030
.ctr[2] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000020),
2031
.ctr[3] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000030),
2032
.ctr[4] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000040),
2033
.ctr[5] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000050),
2034
.num_counters = 6,
2035
.norm = { 1, 1 },
2036
};
2037
2038
static const struct nvc0_hw_sm_query_cfg
2039
sm20_th_inst_executed_1 =
2040
{
2041
.type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1,
2042
.ctr[0] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000000),
2043
.ctr[1] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000010),
2044
.ctr[2] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000020),
2045
.ctr[3] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000030),
2046
.ctr[4] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000040),
2047
.ctr[5] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000050),
2048
.num_counters = 6,
2049
.norm = { 1, 1 },
2050
};
2051
2052
static const struct nvc0_hw_sm_query_cfg
2053
sm20_warps_launched =
2054
{
2055
.type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
2056
.ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000000),
2057
.num_counters = 1,
2058
.norm = { 1, 1 },
2059
};
2060
2061
static const struct nvc0_hw_sm_query_cfg *sm20_hw_sm_queries[] =
2062
{
2063
&sm20_active_cycles,
2064
&sm20_active_warps,
2065
&sm20_atom_count,
2066
&sm20_branch,
2067
&sm20_divergent_branch,
2068
&sm20_gld_request,
2069
&sm20_gred_count,
2070
&sm20_gst_request,
2071
&sm20_inst_executed,
2072
&sm20_inst_issued,
2073
&sm20_local_ld,
2074
&sm20_local_st,
2075
&sm20_prof_trigger_0,
2076
&sm20_prof_trigger_1,
2077
&sm20_prof_trigger_2,
2078
&sm20_prof_trigger_3,
2079
&sm20_prof_trigger_4,
2080
&sm20_prof_trigger_5,
2081
&sm20_prof_trigger_6,
2082
&sm20_prof_trigger_7,
2083
&sm20_shared_ld,
2084
&sm20_shared_st,
2085
&sm20_threads_launched,
2086
&sm20_th_inst_executed_0,
2087
&sm20_th_inst_executed_1,
2088
&sm20_warps_launched,
2089
};
2090
2091
/* ==== Compute capability 2.1 (GF108+ except GF110) ==== */
2092
static const struct nvc0_hw_sm_query_cfg
2093
sm21_inst_executed =
2094
{
2095
.type = NVC0_HW_SM_QUERY_INST_EXECUTED,
2096
.ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000000),
2097
.ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000010),
2098
.ctr[2] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000020),
2099
.num_counters = 3,
2100
.norm = { 1, 1 },
2101
};
2102
2103
static const struct nvc0_hw_sm_query_cfg
2104
sm21_inst_issued1_0 =
2105
{
2106
.type = NVC0_HW_SM_QUERY_INST_ISSUED1_0,
2107
.ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000010),
2108
.num_counters = 1,
2109
.norm = { 1, 1 },
2110
};
2111
2112
static const struct nvc0_hw_sm_query_cfg
2113
sm21_inst_issued1_1 =
2114
{
2115
.type = NVC0_HW_SM_QUERY_INST_ISSUED1_1,
2116
.ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000040),
2117
.num_counters = 1,
2118
.norm = { 1, 1 },
2119
};
2120
2121
static const struct nvc0_hw_sm_query_cfg
2122
sm21_inst_issued2_0 =
2123
{
2124
.type = NVC0_HW_SM_QUERY_INST_ISSUED2_0,
2125
.ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000020),
2126
.num_counters = 1,
2127
.norm = { 1, 1 },
2128
};
2129
2130
static const struct nvc0_hw_sm_query_cfg
2131
sm21_inst_issued2_1 =
2132
{
2133
.type = NVC0_HW_SM_QUERY_INST_ISSUED2_1,
2134
.ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000050),
2135
.num_counters = 1,
2136
.norm = { 1, 1 },
2137
};
2138
2139
static const struct nvc0_hw_sm_query_cfg
2140
sm21_th_inst_executed_0 =
2141
{
2142
.type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0,
2143
.ctr[0] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000000),
2144
.ctr[1] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000010),
2145
.ctr[2] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000020),
2146
.ctr[3] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000030),
2147
.ctr[4] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000040),
2148
.ctr[5] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000050),
2149
.num_counters = 6,
2150
.norm = { 1, 1 },
2151
};
2152
2153
static const struct nvc0_hw_sm_query_cfg
2154
sm21_th_inst_executed_1 =
2155
{
2156
.type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1,
2157
.ctr[0] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000000),
2158
.ctr[1] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000010),
2159
.ctr[2] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000020),
2160
.ctr[3] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000030),
2161
.ctr[4] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000040),
2162
.ctr[5] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000050),
2163
.num_counters = 6,
2164
.norm = { 1, 1 },
2165
};
2166
2167
static const struct nvc0_hw_sm_query_cfg
2168
sm21_th_inst_executed_2 =
2169
{
2170
.type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2,
2171
.ctr[0] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000000),
2172
.ctr[1] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000010),
2173
.ctr[2] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000020),
2174
.ctr[3] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000030),
2175
.ctr[4] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000040),
2176
.ctr[5] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000050),
2177
.num_counters = 6,
2178
.norm = { 1, 1 },
2179
};
2180
2181
static const struct nvc0_hw_sm_query_cfg
2182
sm21_th_inst_executed_3 =
2183
{
2184
.type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3,
2185
.ctr[0] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000000),
2186
.ctr[1] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000010),
2187
.ctr[2] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000020),
2188
.ctr[3] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000030),
2189
.ctr[4] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000040),
2190
.ctr[5] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000050),
2191
.num_counters = 6,
2192
.norm = { 1, 1 },
2193
};
2194
2195
static const struct nvc0_hw_sm_query_cfg *sm21_hw_sm_queries[] =
2196
{
2197
&sm20_active_cycles,
2198
&sm20_active_warps,
2199
&sm20_atom_count,
2200
&sm20_branch,
2201
&sm20_divergent_branch,
2202
&sm20_gld_request,
2203
&sm20_gred_count,
2204
&sm20_gst_request,
2205
&sm21_inst_executed,
2206
&sm21_inst_issued1_0,
2207
&sm21_inst_issued1_1,
2208
&sm21_inst_issued2_0,
2209
&sm21_inst_issued2_1,
2210
&sm20_local_ld,
2211
&sm20_local_st,
2212
&sm20_prof_trigger_0,
2213
&sm20_prof_trigger_1,
2214
&sm20_prof_trigger_2,
2215
&sm20_prof_trigger_3,
2216
&sm20_prof_trigger_4,
2217
&sm20_prof_trigger_5,
2218
&sm20_prof_trigger_6,
2219
&sm20_prof_trigger_7,
2220
&sm20_shared_ld,
2221
&sm20_shared_st,
2222
&sm20_threads_launched,
2223
&sm21_th_inst_executed_0,
2224
&sm21_th_inst_executed_1,
2225
&sm21_th_inst_executed_2,
2226
&sm21_th_inst_executed_3,
2227
&sm20_warps_launched,
2228
};
2229
2230
#undef _C
2231
2232
static inline const struct nvc0_hw_sm_query_cfg **
2233
nvc0_hw_sm_get_queries(struct nvc0_screen *screen)
2234
{
2235
struct nouveau_device *dev = screen->base.device;
2236
2237
switch (screen->base.class_3d) {
2238
case GM200_3D_CLASS:
2239
return sm52_hw_sm_queries;
2240
case GM107_3D_CLASS:
2241
return sm50_hw_sm_queries;
2242
case NVF0_3D_CLASS:
2243
return sm35_hw_sm_queries;
2244
case NVE4_3D_CLASS:
2245
return sm30_hw_sm_queries;
2246
case NVC0_3D_CLASS:
2247
case NVC1_3D_CLASS:
2248
case NVC8_3D_CLASS:
2249
if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
2250
return sm20_hw_sm_queries;
2251
return sm21_hw_sm_queries;
2252
}
2253
assert(0);
2254
return NULL;
2255
}
2256
2257
unsigned
2258
nvc0_hw_sm_get_num_queries(struct nvc0_screen *screen)
2259
{
2260
struct nouveau_device *dev = screen->base.device;
2261
2262
switch (screen->base.class_3d) {
2263
case GM200_3D_CLASS:
2264
return ARRAY_SIZE(sm52_hw_sm_queries);
2265
case GM107_3D_CLASS:
2266
return ARRAY_SIZE(sm50_hw_sm_queries);
2267
case NVF0_3D_CLASS:
2268
return ARRAY_SIZE(sm35_hw_sm_queries);
2269
case NVE4_3D_CLASS:
2270
return ARRAY_SIZE(sm30_hw_sm_queries);
2271
case NVC0_3D_CLASS:
2272
case NVC1_3D_CLASS:
2273
case NVC8_3D_CLASS:
2274
if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
2275
return ARRAY_SIZE(sm20_hw_sm_queries);
2276
return ARRAY_SIZE(sm21_hw_sm_queries);
2277
}
2278
return 0;
2279
}
2280
2281
static const struct nvc0_hw_sm_query_cfg *
2282
nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
2283
{
2284
const struct nvc0_hw_sm_query_cfg **queries;
2285
struct nvc0_screen *screen = nvc0->screen;
2286
struct nvc0_query *q = &hq->base;
2287
unsigned num_queries;
2288
unsigned i;
2289
2290
num_queries = nvc0_hw_sm_get_num_queries(screen);
2291
queries = nvc0_hw_sm_get_queries(screen);
2292
2293
for (i = 0; i < num_queries; i++) {
2294
if (NVC0_HW_SM_QUERY(queries[i]->type) == q->type)
2295
return queries[i];
2296
}
2297
assert(0);
2298
return NULL;
2299
}
2300
2301
static void
2302
nvc0_hw_sm_destroy_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
2303
{
2304
struct nvc0_query *q = &hq->base;
2305
nvc0_hw_query_allocate(nvc0, q, 0);
2306
nouveau_fence_ref(NULL, &hq->fence);
2307
FREE(hq);
2308
}
2309
2310
static bool
2311
nve4_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
2312
{
2313
struct nvc0_screen *screen = nvc0->screen;
2314
struct nouveau_pushbuf *push = nvc0->base.pushbuf;
2315
struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
2316
const struct nvc0_hw_sm_query_cfg *cfg;
2317
unsigned i, c;
2318
unsigned num_ab[2] = { 0, 0 };
2319
2320
cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
2321
2322
/* check if we have enough free counter slots */
2323
for (i = 0; i < cfg->num_counters; ++i)
2324
num_ab[cfg->ctr[i].sig_dom]++;
2325
2326
if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 ||
2327
screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) {
2328
NOUVEAU_ERR("Not enough free MP counter slots !\n");
2329
return false;
2330
}
2331
2332
assert(cfg->num_counters <= 4);
2333
PUSH_SPACE(push, 4 * 8 * + 6);
2334
2335
if (!screen->pm.mp_counters_enabled) {
2336
screen->pm.mp_counters_enabled = true;
2337
BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);
2338
PUSH_DATA (push, 0x1fcb);
2339
}
2340
2341
/* set sequence field to 0 (used to check if result is available) */
2342
for (i = 0; i < screen->mp_count; ++i)
2343
hq->data[i * 10 + 10] = 0;
2344
hq->sequence++;
2345
2346
for (i = 0; i < cfg->num_counters; ++i) {
2347
const unsigned d = cfg->ctr[i].sig_dom;
2348
2349
if (!screen->pm.num_hw_sm_active[d]) {
2350
uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));
2351
if (screen->pm.num_hw_sm_active[!d])
2352
m |= 1 << (7 + (8 * d));
2353
BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
2354
PUSH_DATA (push, m);
2355
}
2356
screen->pm.num_hw_sm_active[d]++;
2357
2358
for (c = d * 4; c < (d * 4 + 4); ++c) {
2359
if (!screen->pm.mp_counter[c]) {
2360
hsq->ctr[i] = c;
2361
screen->pm.mp_counter[c] = hsq;
2362
break;
2363
}
2364
}
2365
assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */
2366
2367
/* configure and reset the counter(s) */
2368
if (d == 0)
2369
BEGIN_NVC0(push, NVE4_CP(MP_PM_A_SIGSEL(c & 3)), 1);
2370
else
2371
BEGIN_NVC0(push, NVE4_CP(MP_PM_B_SIGSEL(c & 3)), 1);
2372
PUSH_DATA (push, cfg->ctr[i].sig_sel);
2373
BEGIN_NVC0(push, NVE4_CP(MP_PM_SRCSEL(c)), 1);
2374
PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3));
2375
BEGIN_NVC0(push, NVE4_CP(MP_PM_FUNC(c)), 1);
2376
PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
2377
BEGIN_NVC0(push, NVE4_CP(MP_PM_SET(c)), 1);
2378
PUSH_DATA (push, 0);
2379
}
2380
2381
if (screen->base.class_3d >= GM107_3D_CLASS) {
2382
/* Enable mask for counters, it's 8-bits value where 0:3 is for domain A
2383
* and 4:7 for domain B. For example, the mask for active_warps should be
2384
* 0x70 because it uses 3 counters in domain B. However, let's always
2385
* enable all counters because we don't want to track which ones is
2386
* enabled or not, and this allows to monitor multiple queries at the
2387
* same time. */
2388
BEGIN_NVC0(push, SUBC_CP(0x33e0), 1);
2389
PUSH_DATA (push, 0xff);
2390
}
2391
2392
return true;
2393
}
2394
2395
static bool
2396
nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
2397
{
2398
struct nvc0_screen *screen = nvc0->screen;
2399
struct nouveau_pushbuf *push = nvc0->base.pushbuf;
2400
struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
2401
const struct nvc0_hw_sm_query_cfg *cfg;
2402
unsigned i, c;
2403
2404
if (screen->base.class_3d >= NVE4_3D_CLASS)
2405
return nve4_hw_sm_begin_query(nvc0, hq);
2406
2407
cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
2408
2409
/* check if we have enough free counter slots */
2410
if (screen->pm.num_hw_sm_active[0] + cfg->num_counters > 8) {
2411
NOUVEAU_ERR("Not enough free MP counter slots !\n");
2412
return false;
2413
}
2414
2415
assert(cfg->num_counters <= 8);
2416
PUSH_SPACE(push, 8 * 8 + 2);
2417
2418
/* set sequence field to 0 (used to check if result is available) */
2419
for (i = 0; i < screen->mp_count; ++i) {
2420
const unsigned b = (0x30 / 4) * i;
2421
hq->data[b + 8] = 0;
2422
}
2423
hq->sequence++;
2424
2425
for (i = 0; i < cfg->num_counters; ++i) {
2426
uint32_t mask_sel = 0x00000000;
2427
2428
if (!screen->pm.num_hw_sm_active[0]) {
2429
BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
2430
PUSH_DATA (push, 0x80000000);
2431
}
2432
screen->pm.num_hw_sm_active[0]++;
2433
2434
for (c = 0; c < 8; ++c) {
2435
if (!screen->pm.mp_counter[c]) {
2436
hsq->ctr[i] = c;
2437
screen->pm.mp_counter[c] = hsq;
2438
break;
2439
}
2440
}
2441
2442
/* Oddly-enough, the signal id depends on the slot selected on Fermi but
2443
* not on Kepler. Fortunately, the signal ids are just offsetted by the
2444
* slot id! */
2445
mask_sel |= c;
2446
mask_sel |= (c << 8);
2447
mask_sel |= (c << 16);
2448
mask_sel |= (c << 24);
2449
mask_sel &= cfg->ctr[i].src_mask;
2450
2451
/* configure and reset the counter(s) */
2452
BEGIN_NVC0(push, NVC0_CP(MP_PM_SIGSEL(c)), 1);
2453
PUSH_DATA (push, cfg->ctr[i].sig_sel);
2454
BEGIN_NVC0(push, NVC0_CP(MP_PM_SRCSEL(c)), 1);
2455
PUSH_DATA (push, cfg->ctr[i].src_sel | mask_sel);
2456
BEGIN_NVC0(push, NVC0_CP(MP_PM_OP(c)), 1);
2457
PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
2458
BEGIN_NVC0(push, NVC0_CP(MP_PM_SET(c)), 1);
2459
PUSH_DATA (push, 0);
2460
}
2461
return true;
2462
}
2463
2464
static inline struct nvc0_program *
2465
nvc0_hw_sm_get_program(struct nvc0_screen *screen)
2466
{
2467
struct nvc0_program *prog;
2468
2469
prog = CALLOC_STRUCT(nvc0_program);
2470
if (!prog)
2471
return NULL;
2472
2473
prog->type = PIPE_SHADER_COMPUTE;
2474
prog->translated = true;
2475
prog->parm_size = 12;
2476
2477
if (screen->base.class_3d >= GM107_3D_CLASS) {
2478
prog->code = (uint32_t *)gm107_read_hw_sm_counters_code;
2479
prog->code_size = sizeof(gm107_read_hw_sm_counters_code);
2480
prog->num_gprs = 14;
2481
} else
2482
if (screen->base.class_3d == NVE4_3D_CLASS ||
2483
screen->base.class_3d == NVF0_3D_CLASS) {
2484
if (screen->base.class_3d == NVE4_3D_CLASS) {
2485
prog->code = (uint32_t *)nve4_read_hw_sm_counters_code;
2486
prog->code_size = sizeof(nve4_read_hw_sm_counters_code);
2487
} else {
2488
prog->code = (uint32_t *)nvf0_read_hw_sm_counters_code;
2489
prog->code_size = sizeof(nvf0_read_hw_sm_counters_code);
2490
}
2491
prog->num_gprs = 14;
2492
} else {
2493
prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code;
2494
prog->code_size = sizeof(nvc0_read_hw_sm_counters_code);
2495
prog->num_gprs = 12;
2496
}
2497
return prog;
2498
}
2499
2500
static inline void
2501
nvc0_hw_sm_upload_input(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
2502
{
2503
struct nouveau_pushbuf *push = nvc0->base.pushbuf;
2504
struct nvc0_screen *screen = nvc0->screen;
2505
uint64_t address;
2506
const int s = 5;
2507
2508
address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
2509
2510
PUSH_SPACE(push, 11);
2511
2512
if (screen->base.class_3d >= NVE4_3D_CLASS) {
2513
BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
2514
PUSH_DATAh(push, address + NVC0_CB_AUX_MP_INFO);
2515
PUSH_DATA (push, address + NVC0_CB_AUX_MP_INFO);
2516
BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
2517
PUSH_DATA (push, 3 * 4);
2518
PUSH_DATA (push, 0x1);
2519
BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 3);
2520
PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
2521
} else {
2522
BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3);
2523
PUSH_DATA (push, NVC0_CB_AUX_SIZE);
2524
PUSH_DATAh(push, address);
2525
PUSH_DATA (push, address);
2526
BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + 3);
2527
PUSH_DATA (push, NVC0_CB_AUX_MP_INFO);
2528
}
2529
PUSH_DATA (push, (hq->bo->offset + hq->base_offset));
2530
PUSH_DATAh(push, (hq->bo->offset + hq->base_offset));
2531
PUSH_DATA (push, hq->sequence);
2532
}
2533
2534
static void
2535
nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
2536
{
2537
struct nvc0_screen *screen = nvc0->screen;
2538
struct pipe_context *pipe = &nvc0->base.pipe;
2539
struct nouveau_pushbuf *push = nvc0->base.pushbuf;
2540
const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
2541
struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
2542
struct nvc0_program *old = nvc0->compprog;
2543
struct pipe_grid_info info = {};
2544
uint32_t mask;
2545
uint32_t input[3];
2546
const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 };
2547
const uint grid[3] = { screen->mp_count, screen->gpc_count, 1 };
2548
unsigned c, i;
2549
2550
if (unlikely(!screen->pm.prog))
2551
screen->pm.prog = nvc0_hw_sm_get_program(screen);
2552
2553
/* disable all counting */
2554
PUSH_SPACE(push, 8);
2555
for (c = 0; c < 8; ++c)
2556
if (screen->pm.mp_counter[c]) {
2557
if (is_nve4) {
2558
IMMED_NVC0(push, NVE4_CP(MP_PM_FUNC(c)), 0);
2559
} else {
2560
IMMED_NVC0(push, NVC0_CP(MP_PM_OP(c)), 0);
2561
}
2562
}
2563
/* release counters for this query */
2564
for (c = 0; c < 8; ++c) {
2565
if (screen->pm.mp_counter[c] == hsq) {
2566
uint8_t d = is_nve4 ? c / 4 : 0; /* only one domain for NVC0:NVE4 */
2567
screen->pm.num_hw_sm_active[d]--;
2568
screen->pm.mp_counter[c] = NULL;
2569
}
2570
}
2571
2572
if (screen->base.class_3d >= GM107_3D_CLASS)
2573
IMMED_NVC0(push, SUBC_CP(0x33e0), 0);
2574
2575
BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
2576
hq->bo);
2577
2578
PUSH_SPACE(push, 1);
2579
IMMED_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 0);
2580
2581
/* upload input data for the compute shader which reads MP counters */
2582
nvc0_hw_sm_upload_input(nvc0, hq);
2583
2584
pipe->bind_compute_state(pipe, screen->pm.prog);
2585
for (i = 0; i < 3; i++) {
2586
info.block[i] = block[i];
2587
info.grid[i] = grid[i];
2588
}
2589
info.pc = 0;
2590
info.input = input;
2591
pipe->launch_grid(pipe, &info);
2592
pipe->bind_compute_state(pipe, old);
2593
2594
nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY);
2595
2596
/* re-activate other counters */
2597
PUSH_SPACE(push, 16);
2598
mask = 0;
2599
for (c = 0; c < 8; ++c) {
2600
const struct nvc0_hw_sm_query_cfg *cfg;
2601
unsigned i;
2602
2603
hsq = screen->pm.mp_counter[c];
2604
if (!hsq)
2605
continue;
2606
2607
cfg = nvc0_hw_sm_query_get_cfg(nvc0, &hsq->base);
2608
for (i = 0; i < cfg->num_counters; ++i) {
2609
if (mask & (1 << hsq->ctr[i]))
2610
break;
2611
mask |= 1 << hsq->ctr[i];
2612
if (is_nve4) {
2613
BEGIN_NVC0(push, NVE4_CP(MP_PM_FUNC(hsq->ctr[i])), 1);
2614
} else {
2615
BEGIN_NVC0(push, NVC0_CP(MP_PM_OP(hsq->ctr[i])), 1);
2616
}
2617
PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
2618
}
2619
}
2620
}
2621
2622
static inline bool
2623
nvc0_hw_sm_query_read_data(uint32_t count[32][8],
2624
struct nvc0_context *nvc0, bool wait,
2625
struct nvc0_hw_query *hq,
2626
const struct nvc0_hw_sm_query_cfg *cfg,
2627
unsigned mp_count)
2628
{
2629
struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
2630
unsigned p, c;
2631
2632
for (p = 0; p < mp_count; ++p) {
2633
const unsigned b = (0x30 / 4) * p;
2634
2635
for (c = 0; c < cfg->num_counters; ++c) {
2636
if (hq->data[b + 8] != hq->sequence) {
2637
if (!wait)
2638
return false;
2639
if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
2640
return false;
2641
}
2642
count[p][c] = hq->data[b + hsq->ctr[c]] * (1 << c);
2643
}
2644
}
2645
return true;
2646
}
2647
2648
static inline bool
2649
nve4_hw_sm_query_read_data(uint32_t count[32][8],
2650
struct nvc0_context *nvc0, bool wait,
2651
struct nvc0_hw_query *hq,
2652
const struct nvc0_hw_sm_query_cfg *cfg,
2653
unsigned mp_count)
2654
{
2655
struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
2656
unsigned p, c, d;
2657
2658
for (p = 0; p < mp_count; ++p) {
2659
const unsigned b = (0x60 / 4) * p;
2660
2661
for (c = 0; c < cfg->num_counters; ++c) {
2662
count[p][c] = 0;
2663
for (d = 0; d < ((hsq->ctr[c] & ~3) ? 1 : 4); ++d) {
2664
if (hq->data[b + 20 + d] != hq->sequence) {
2665
if (!wait)
2666
return false;
2667
if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
2668
return false;
2669
}
2670
if (hsq->ctr[c] & ~0x3)
2671
count[p][c] = hq->data[b + 16 + (hsq->ctr[c] & 3)];
2672
else
2673
count[p][c] += hq->data[b + d * 4 + hsq->ctr[c]];
2674
}
2675
}
2676
}
2677
return true;
2678
}
2679
2680
static bool
2681
nvc0_hw_sm_get_query_result(struct nvc0_context *nvc0, struct nvc0_hw_query *hq,
2682
bool wait, union pipe_query_result *result)
2683
{
2684
uint32_t count[32][8];
2685
uint64_t value = 0;
2686
unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32);
2687
unsigned p, c;
2688
const struct nvc0_hw_sm_query_cfg *cfg;
2689
bool ret;
2690
2691
cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
2692
2693
if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
2694
ret = nve4_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
2695
else
2696
ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
2697
if (!ret)
2698
return false;
2699
2700
for (c = 0; c < cfg->num_counters; ++c)
2701
for (p = 0; p < mp_count; ++p)
2702
value += count[p][c];
2703
value = (value * cfg->norm[0]) / cfg->norm[1];
2704
2705
*(uint64_t *)result = value;
2706
return true;
2707
}
2708
2709
static const struct nvc0_hw_query_funcs hw_sm_query_funcs = {
2710
.destroy_query = nvc0_hw_sm_destroy_query,
2711
.begin_query = nvc0_hw_sm_begin_query,
2712
.end_query = nvc0_hw_sm_end_query,
2713
.get_query_result = nvc0_hw_sm_get_query_result,
2714
};
2715
2716
struct nvc0_hw_query *
2717
nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type)
2718
{
2719
struct nvc0_screen *screen = nvc0->screen;
2720
struct nvc0_hw_sm_query *hsq;
2721
struct nvc0_hw_query *hq;
2722
unsigned space;
2723
2724
if (nvc0->screen->base.drm->version < 0x01000101)
2725
return NULL;
2726
2727
if (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST)
2728
return NULL;
2729
2730
hsq = CALLOC_STRUCT(nvc0_hw_sm_query);
2731
if (!hsq)
2732
return NULL;
2733
2734
hq = &hsq->base;
2735
hq->funcs = &hw_sm_query_funcs;
2736
hq->base.type = type;
2737
2738
if (screen->base.class_3d >= NVE4_3D_CLASS) {
2739
/* for each MP:
2740
* [00] = WS0.C0
2741
* [04] = WS0.C1
2742
* [08] = WS0.C2
2743
* [0c] = WS0.C3
2744
* [10] = WS1.C0
2745
* [14] = WS1.C1
2746
* [18] = WS1.C2
2747
* [1c] = WS1.C3
2748
* [20] = WS2.C0
2749
* [24] = WS2.C1
2750
* [28] = WS2.C2
2751
* [2c] = WS2.C3
2752
* [30] = WS3.C0
2753
* [34] = WS3.C1
2754
* [38] = WS3.C2
2755
* [3c] = WS3.C3
2756
* [40] = MP.C4
2757
* [44] = MP.C5
2758
* [48] = MP.C6
2759
* [4c] = MP.C7
2760
* [50] = WS0.sequence
2761
* [54] = WS1.sequence
2762
* [58] = WS2.sequence
2763
* [5c] = WS3.sequence
2764
*/
2765
space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t);
2766
} else {
2767
/*
2768
* Note that padding is used to align memory access to 128 bits.
2769
*
2770
* for each MP:
2771
* [00] = MP.C0
2772
* [04] = MP.C1
2773
* [08] = MP.C2
2774
* [0c] = MP.C3
2775
* [10] = MP.C4
2776
* [14] = MP.C5
2777
* [18] = MP.C6
2778
* [1c] = MP.C7
2779
* [20] = MP.sequence
2780
* [24] = padding
2781
* [28] = padding
2782
* [2c] = padding
2783
*/
2784
space = (8 + 1 + 3) * nvc0->screen->mp_count * sizeof(uint32_t);
2785
}
2786
2787
if (!nvc0_hw_query_allocate(nvc0, &hq->base, space)) {
2788
FREE(hq);
2789
return NULL;
2790
}
2791
2792
return hq;
2793
}
2794
2795
int
2796
nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
2797
struct pipe_driver_query_info *info)
2798
{
2799
int count = 0;
2800
2801
if (screen->base.drm->version >= 0x01000101) {
2802
if (screen->compute)
2803
count = nvc0_hw_sm_get_num_queries(screen);
2804
}
2805
2806
if (!info)
2807
return count;
2808
2809
if (id < count) {
2810
if (screen->compute) {
2811
if (screen->base.class_3d <= GM200_3D_CLASS) {
2812
const struct nvc0_hw_sm_query_cfg **queries =
2813
nvc0_hw_sm_get_queries(screen);
2814
2815
info->name = nvc0_hw_sm_query_get_name(queries[id]->type);
2816
info->query_type = NVC0_HW_SM_QUERY(queries[id]->type);
2817
info->group_id = NVC0_HW_SM_QUERY_GROUP;
2818
return 1;
2819
}
2820
}
2821
}
2822
return 0;
2823
}
2824
2825