/* Path: blob/21.2-virgl/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c */
/*1* Copyright 2011 Christoph Bumiller2* Copyright 2015 Samuel Pitoiset3*4* Permission is hereby granted, free of charge, to any person obtaining a5* copy of this software and associated documentation files (the "Software"),6* to deal in the Software without restriction, including without limitation7* the rights to use, copy, modify, merge, publish, distribute, sublicense,8* and/or sell copies of the Software, and to permit persons to whom the9* Software is furnished to do so, subject to the following conditions:10*11* The above copyright notice and this permission notice shall be included in12* all copies or substantial portions of the Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR18* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,19* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR20* OTHER DEALINGS IN THE SOFTWARE.21*/2223#define NVC0_PUSH_EXPLICIT_SPACE_CHECKING2425#include "nvc0/nvc0_context.h"26#include "nvc0/nvc0_query_hw_sm.h"2728#include "nv_object.xml.h"29#include "nvc0/nve4_compute.xml.h"30#include "nvc0/nvc0_compute.xml.h"3132/* NOTE: intentionally using the same names as NV */33#define _Q(t, n, d) { NVC0_HW_SM_QUERY_##t, n, d }34static const struct {35unsigned type;36const char *name;37const char *desc;38} nvc0_hw_sm_queries[] = {39_Q(ACTIVE_CTAS,40"active_ctas",41"Accumulated number of active blocks per cycle. For every cycle it "42"increments by the number of active blocks in the cycle which can be in "43"the range 0 to 32."),4445_Q(ACTIVE_CYCLES,46"active_cycles",47"Number of cycles a multiprocessor has at least one active warp"),4849_Q(ACTIVE_WARPS,50"active_warps",51"Accumulated number of active warps per cycle. 
For every cycle it "52"increments by the number of active warps in the cycle which can be in "53"the range 0 to 64"),5455_Q(ATOM_CAS_COUNT,56"atom_cas_count",57"Number of warps executing atomic compare and swap operations. Increments "58"by one if at least one thread in a warp executes the instruction."),5960_Q(ATOM_COUNT,61"atom_count",62"Number of warps executing atomic reduction operations. Increments by one "63"if at least one thread in a warp executes the instruction"),6465_Q(BRANCH,66"branch",67"Number of branch instructions executed per warp on a multiprocessor"),6869_Q(DIVERGENT_BRANCH,70"divergent_branch",71"Number of divergent branches within a warp. This counter will be "72"incremented by one if at least one thread in a warp diverges (that is, "73"follows a different execution path) via a conditional branch"),7475_Q(GLD_REQUEST,76"gld_request",77"Number of executed load instructions where the state space is not "78"specified and hence generic addressing is used, increments per warp on a "79"multiprocessor. It can include the load operations from global,local and "80"shared state space"),8182_Q(GLD_MEM_DIV_REPLAY,83"global_ld_mem_divergence_replays",84"Number of instruction replays for global memory loads. Instruction is "85"replayed if the instruction is accessing more than one cache line of "86"128 bytes. For each extra cache line access the counter is incremented "87"by 1"),8889_Q(GLOBAL_ATOM_CAS,90"global_atom_cas",91"Number of ATOM.CAS instructions executed per warp."),9293_Q(GLOBAL_LD,94"global_load",95"Number of executed load instructions where state space is specified as "96"global, increments per warp on a multiprocessor."),9798_Q(GLOBAL_ST,99"global_store",100"Number of executed store instructions where state space is specified as "101"global, increments per warp on a multiprocessor."),102103_Q(GST_TRANSACTIONS,104"global_store_transaction",105"Number of global store transactions. Increments by 1 per transaction. 
"106"Transaction can be 32/64/96/128B"),107108_Q(GST_MEM_DIV_REPLAY,109"global_st_mem_divergence_replays",110"Number of instruction replays for global memory stores. Instruction is "111"replayed if the instruction is accessing more than one cache line of "112"128 bytes. For each extra cache line access the counter is incremented "113"by 1"),114115_Q(GRED_COUNT,116"gred_count",117"Number of warps executing reduction operations on global memory. "118"Increments by one if at least one thread in a warp executes the "119"instruction"),120121_Q(GST_REQUEST,122"gst_request",123"Number of executed store instructions where the state space is not "124"specified and hence generic addressing is used, increments per warp on a "125"multiprocessor. It can include the store operations to global,local and "126"shared state space"),127128_Q(INST_EXECUTED,129"inst_executed",130"Number of instructions executed, do not include replays"),131132_Q(INST_ISSUED,133"inst_issued",134"Number of instructions issued including replays"),135136_Q(INST_ISSUED0,137"inst_issued0",138"Number of cycles that did not issue any instruction, increments per "139"warp."),140141_Q(INST_ISSUED1,142"inst_issued1",143"Number of single instruction issued per cycle"),144145_Q(INST_ISSUED2,146"inst_issued2",147"Number of dual instructions issued per cycle"),148149_Q(INST_ISSUED1_0,150"inst_issued1_0",151"Number of single instruction issued per cycle in pipeline 0"),152153_Q(INST_ISSUED1_1,154"inst_issued1_1",155"Number of single instruction issued per cycle in pipeline 1"),156157_Q(INST_ISSUED2_0,158"inst_issued2_0",159"Number of dual instructions issued per cycle in pipeline 0"),160161_Q(INST_ISSUED2_1,162"inst_issued2_1",163"Number of dual instructions issued per cycle in pipeline 1"),164165_Q(L1_GLD_HIT,166"l1_global_load_hit",167"Number of cache lines that hit in L1 cache for global memory load "168"accesses. 
In case of perfect coalescing this increments by 1,2, and 4 for "169"32, 64 and 128 bit accesses by a warp respectively"),170171_Q(L1_GLD_MISS,172"l1_global_load_miss",173"Number of cache lines that miss in L1 cache for global memory load "174"accesses. In case of perfect coalescing this increments by 1,2, and 4 for "175"32, 64 and 128 bit accesses by a warp respectively"),176177_Q(L1_GLD_TRANSACTIONS,178"__l1_global_load_transactions",179"Number of global load transactions from L1 cache. Increments by 1 per "180"transaction. Transaction can be 32/64/96/128B"),181182_Q(L1_GST_TRANSACTIONS,183"__l1_global_store_transactions",184"Number of global store transactions from L1 cache. Increments by 1 per "185"transaction. Transaction can be 32/64/96/128B"),186187_Q(L1_LOCAL_LD_HIT,188"l1_local_load_hit",189"Number of cache lines that hit in L1 cache for local memory load "190"accesses. In case of perfect coalescing this increments by 1,2, and 4 for "191"32, 64 and 128 bit accesses by a warp respectively"),192193_Q(L1_LOCAL_LD_MISS,194"l1_local_load_miss",195"Number of cache lines that miss in L1 cache for local memory load "196"accesses. In case of perfect coalescing this increments by 1,2, and 4 for "197"32, 64 and 128 bit accesses by a warp respectively"),198199_Q(L1_LOCAL_ST_HIT,200"l1_local_store_hit",201"Number of cache lines that hit in L1 cache for local memory store "202"accesses. In case of perfect coalescing this increments by 1,2, and 4 for "203"32, 64 and 128 bit accesses by a warp respectively"),204205_Q(L1_LOCAL_ST_MISS,206"l1_local_store_miss",207"Number of cache lines that miss in L1 cache for local memory store "208"accesses. In case of perfect coalescing this increments by 1,2, and 4 for "209"32,64 and 128 bit accesses by a warp respectively"),210211_Q(L1_SHARED_LD_TRANSACTIONS,212"l1_shared_load_transactions",213"Number of shared load transactions. Increments by 1 per transaction. 
"214"Transaction can be 32/64/96/128B"),215216_Q(L1_SHARED_ST_TRANSACTIONS,217"l1_shared_store_transactions",218"Number of shared store transactions. Increments by 1 per transaction. "219"Transaction can be 32/64/96/128B"),220221_Q(LOCAL_LD,222"local_load",223"Number of executed load instructions where state space is specified as "224"local, increments per warp on a multiprocessor"),225226_Q(LOCAL_LD_TRANSACTIONS,227"local_load_transactions",228"Number of local load transactions from L1 cache. Increments by 1 per "229"transaction. Transaction can be 32/64/96/128B"),230231_Q(LOCAL_ST,232"local_store",233"Number of executed store instructions where state space is specified as "234"local, increments per warp on a multiprocessor"),235236_Q(LOCAL_ST_TRANSACTIONS,237"local_store_transactions",238"Number of local store transactions to L1 cache. Increments by 1 per "239"transaction. Transaction can be 32/64/96/128B."),240241_Q(NOT_PRED_OFF_INST_EXECUTED,242"not_predicated_off_thread_inst_executed",243"Number of not predicated off instructions executed by all threads, does "244"not include replays. For each instruction it increments by the number of "245"threads that execute this instruction"),246247_Q(PROF_TRIGGER_0,248"prof_trigger_00",249"User profiled generic trigger that can be inserted in any place of the "250"code to collect the related information. Increments per warp."),251252_Q(PROF_TRIGGER_1,253"prof_trigger_01",254"User profiled generic trigger that can be inserted in any place of the "255"code to collect the related information. Increments per warp."),256257_Q(PROF_TRIGGER_2,258"prof_trigger_02",259"User profiled generic trigger that can be inserted in any place of the "260"code to collect the related information. Increments per warp."),261262_Q(PROF_TRIGGER_3,263"prof_trigger_03",264"User profiled generic trigger that can be inserted in any place of the "265"code to collect the related information. 
Increments per warp."),266267_Q(PROF_TRIGGER_4,268"prof_trigger_04",269"User profiled generic trigger that can be inserted in any place of the "270"code to collect the related information. Increments per warp."),271272_Q(PROF_TRIGGER_5,273"prof_trigger_05",274"User profiled generic trigger that can be inserted in any place of the "275"code to collect the related information. Increments per warp."),276277_Q(PROF_TRIGGER_6,278"prof_trigger_06",279"User profiled generic trigger that can be inserted in any place of the "280"code to collect the related information. Increments per warp."),281282_Q(PROF_TRIGGER_7,283"prof_trigger_07",284"User profiled generic trigger that can be inserted in any place of the "285"code to collect the related information. Increments per warp."),286287_Q(SHARED_ATOM,288"shared_atom",289"Number of ATOMS instructions executed per warp."),290291_Q(SHARED_ATOM_CAS,292"shared_atom_cas",293"Number of ATOMS.CAS instructions executed per warp."),294295_Q(SHARED_LD,296"shared_load",297"Number of executed load instructions where state space is specified as "298"shared, increments per warp on a multiprocessor"),299300_Q(SHARED_LD_BANK_CONFLICT,301"shared_load_bank_conflict",302"Number of shared load bank conflict generated when the addresses for "303"two or more shared memory load requests fall in the same memory bank."),304305_Q(SHARED_LD_REPLAY,306"shared_load_replay",307"Replays caused due to shared load bank conflict (when the addresses for "308"two or more shared memory load requests fall in the same memory bank) or "309"when there is no conflict but the total number of words accessed by all "310"threads in the warp executing that instruction exceed the number of words "311"that can be loaded in one cycle (256 bytes)"),312313_Q(SHARED_LD_TRANSACTIONS,314"shared_ld_transactions",315"Number of transactions for shared load accesses. 
Maximum transaction "316"size in maxwell is 128 bytes, any warp accessing more that 128 bytes "317"will cause multiple transactions for a shared load instruction. This "318"also includes extra transactions caused by shared bank conflicts."),319320_Q(SHARED_ST,321"shared_store",322"Number of executed store instructions where state space is specified as "323"shared, increments per warp on a multiprocessor"),324325_Q(SHARED_ST_BANK_CONFLICT,326"shared_store_bank_conflict",327"Number of shared store bank conflict generated when the addresses for "328"two or more shared memory store requests fall in the same memory bank."),329330_Q(SHARED_ST_REPLAY,331"shared_store_replay",332"Replays caused due to shared store bank conflict (when the addresses for "333"two or more shared memory store requests fall in the same memory bank) or "334"when there is no conflict but the total number of words accessed by all "335"threads in the warp executing that instruction exceed the number of words "336"that can be stored in one cycle"),337338_Q(SHARED_ST_TRANSACTIONS,339"shared_st_transactions",340"Number of transactions for shared store accesses. Maximum transaction "341"size in maxwell is 128 bytes, any warp accessing more that 128 bytes "342"will cause multiple transactions for a shared store instruction. This "343"also includes extra transactions caused by shared bank conflicts."),344345_Q(SM_CTA_LAUNCHED,346"sm_cta_launched",347"Number of thread blocks launched on a multiprocessor"),348349_Q(THREADS_LAUNCHED,350"threads_launched",351"Number of threads launched on a multiprocessor"),352353_Q(TH_INST_EXECUTED,354"thread_inst_executed",355"Number of instructions executed by all threads, does not include "356"replays. For each instruction it increments by the number of threads in "357"the warp that execute the instruction"),358359_Q(TH_INST_EXECUTED_0,360"thread_inst_executed_0",361"Number of instructions executed by all threads, does not include "362"replays. 
For each instruction it increments by the number of threads in "363"the warp that execute the instruction in pipeline 0"),364365_Q(TH_INST_EXECUTED_1,366"thread_inst_executed_1",367"Number of instructions executed by all threads, does not include "368"replays. For each instruction it increments by the number of threads in "369"the warp that execute the instruction in pipeline 1"),370371_Q(TH_INST_EXECUTED_2,372"thread_inst_executed_2",373"Number of instructions executed by all threads, does not include "374"replays. For each instruction it increments by the number of threads in "375"the warp that execute the instruction in pipeline 2"),376377_Q(TH_INST_EXECUTED_3,378"thread_inst_executed_3",379"Number of instructions executed by all threads, does not include "380"replays. For each instruction it increments by the number of threads in "381"the warp that execute the instruction in pipeline 3"),382383_Q(UNCACHED_GLD_TRANSACTIONS,384"uncached_global_load_transaction",385"Number of uncached global load transactions. Increments by 1 per "386"transaction. Transaction can be 32/64/96/128B."),387388_Q(WARPS_LAUNCHED,389"warps_launched",390"Number of warps launched on a multiprocessor"),391};392393#undef _Q394395static inline const char *396nvc0_hw_sm_query_get_name(unsigned query_type)397{398unsigned i;399400for (i = 0; i < ARRAY_SIZE(nvc0_hw_sm_queries); i++) {401if (nvc0_hw_sm_queries[i].type == query_type)402return nvc0_hw_sm_queries[i].name;403}404assert(0);405return NULL;406}407408/* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */409410/* Code to read out MP counters: They are accessible via mmio, too, but let's411* just avoid mapping registers in userspace. 
We'd have to know which MPs are412* enabled/present, too, and that information is not presently exposed.413* We could add a kernel interface for it, but reading the counters like this414* has the advantage of being async (if get_result isn't called immediately).415*/416static const uint64_t nve4_read_hw_sm_counters_code[] =417{418/* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20419* mov b32 $r8 $tidx420* mov b32 $r12 $physid421* mov b32 $r0 $pm0422* mov b32 $r1 $pm1423* mov b32 $r2 $pm2424* mov b32 $r3 $pm3425* mov b32 $r4 $pm4426* sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b427* mov b32 $r5 $pm5428* mov b32 $r6 $pm6429* mov b32 $r7 $pm7430* set $p0 0x1 eq u32 $r8 0x0431* mov b32 $r10 c7[0x6a0]432* ext u32 $r8 $r12 0x414433* mov b32 $r11 c7[0x6a4]434* sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04435* ext u32 $r9 $r12 0x208436* (not $p0) exit437* set $p1 0x1 eq u32 $r9 0x0438* mul $r8 u32 $r8 u32 96439* mul $r12 u32 $r9 u32 16440* mul $r13 u32 $r9 u32 4441* add b32 $r9 $r8 $r13442* sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c443* add b32 $r8 $r8 $r12444* mov b32 $r12 $r10445* add b32 $r10 $c $r10 $r8446* mov b32 $r13 $r11447* add b32 $r11 $r11 0x0 $c448* add b32 $r12 $c $r12 $r9449* st b128 wt g[$r10d] $r0q450* sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00451* mov b32 $r0 c7[0x6a8]452* add b32 $r13 $r13 0x0 $c453* $p1 st b128 wt g[$r12d+0x40] $r4q454* st b32 wt g[$r12d+0x50] $r0455* exit 
*/4560x2202020202020207ULL,4570x2c00000084021c04ULL,4580x2c0000000c031c04ULL,4590x2c00000010001c04ULL,4600x2c00000014005c04ULL,4610x2c00000018009c04ULL,4620x2c0000001c00dc04ULL,4630x2c00000020011c04ULL,4640x22b0420042320207ULL,4650x2c00000024015c04ULL,4660x2c00000028019c04ULL,4670x2c0000002c01dc04ULL,4680x190e0000fc81dc03ULL,4690x28005c1a80029de4ULL,4700x7000c01050c21c03ULL,4710x28005c1a9002dde4ULL,4720x204282020042e047ULL,4730x7000c00820c25c03ULL,4740x80000000000021e7ULL,4750x190e0000fc93dc03ULL,4760x1000000180821c02ULL,4770x1000000040931c02ULL,4780x1000000010935c02ULL,4790x4800000034825c03ULL,4800x22c042c042c04287ULL,4810x4800000030821c03ULL,4820x2800000028031de4ULL,4830x4801000020a29c03ULL,4840x280000002c035de4ULL,4850x0800000000b2dc42ULL,4860x4801000024c31c03ULL,4870x9400000000a01fc5ULL,4880x200002e04202c047ULL,4890x28005c1aa0001de4ULL,4900x0800000000d35c42ULL,4910x9400000100c107c5ULL,4920x9400000140c01f85ULL,4930x8000000000001de7ULL494};495496static const uint64_t nvf0_read_hw_sm_counters_code[] =497{498/* Same kernel as GK104 
*/4990x0880808080808080ULL,5000x86400000109c0022ULL,5010x86400000019c0032ULL,5020x86400000021c0002ULL,5030x86400000029c0006ULL,5040x86400000031c000aULL,5050x86400000039c000eULL,5060x86400000041c0012ULL,5070x08ac1080108c8080ULL,5080x86400000049c0016ULL,5090x86400000051c001aULL,5100x86400000059c001eULL,5110xdb201c007f9c201eULL,5120x64c03ce0d41c002aULL,5130xc00000020a1c3021ULL,5140x64c03ce0d49c002eULL,5150x0810a0808010b810ULL,5160xc0000001041c3025ULL,5170x180000000020003cULL,5180xdb201c007f9c243eULL,5190xc1c00000301c2021ULL,5200xc1c00000081c2431ULL,5210xc1c00000021c2435ULL,5220xe0800000069c2026ULL,5230x08b010b010b010a0ULL,5240xe0800000061c2022ULL,5250xe4c03c00051c0032ULL,5260xe0840000041c282aULL,5270xe4c03c00059c0036ULL,5280xe08040007f9c2c2eULL,5290xe0840000049c3032ULL,5300xfe800000001c2800ULL,5310x080000b81080b010ULL,5320x64c03ce0d51c0002ULL,5330xe08040007f9c3436ULL,5340xfe80000020043010ULL,5350xfc800000281c3000ULL,5360x18000000001c003cULL,537};538539static const uint64_t gm107_read_hw_sm_counters_code[] =540{5410x001d0400e4200701ULL, /* sched (st 0x1 wr 0x0) (st 0x1 wr 0x1) (st 0x1 wr 0x2) */5420xf0c8000002170008ULL, /* mov $r8 $tidx */5430xf0c800000037000cULL, /* mov $r12 $virtid */5440xf0c8000000470000ULL, /* mov $r0 $pm0 */5450x001e8400f0200761ULL, /* sched (st 0x1 wr 0x3) (st 0x1 wr 0x4) (st 0x1 wr 0x5) */5460xf0c8000000570001ULL, /* mov $r1 $pm1 */5470xf0c8000000670002ULL, /* mov $r2 $pm2 */5480xf0c8000000770003ULL, /* mov $r3 $pm3 */5490x001e8400f42007a1ULL, /* sched (st 0x1 wr 0x5) (st 0x1 wr 0x5) (st 0x1 wr 0x5) */5500xf0c8000000870004ULL, /* mov $r4 $pm4 */5510xf0c8000000970005ULL, /* mov $r5 $pm5 */5520xf0c8000000a70006ULL, /* mov $r6 $pm6 */5530x001f8401fc2007a1ULL, /* sched (st 0x1 wr 0x5) (st 0x1 wt 0x1) (st 0x1) */5540xf0c8000000b70007ULL, /* mov $r7 $pm7 */5550x5b6403800087ff07ULL, /* isetp eq u32 and $p0 0x1 0x0 $r8 0x1 */5560x4c98079c1a87000aULL, /* mov $r10 c7[0x6a0] 0xf */5570x001fa400fc2017e1ULL, /* sched (st 0x1 wt 0x2) (st 0x1) (st 0x9) 
*/5580x3800000091470c08ULL, /* bfe u32 $r8 $r12 0x914 */5590x4c98079c1a97000bULL, /* mov $r11 c7[0x6a4] 0xf */5600x3800000020870c09ULL, /* bfe u32 $r9 $r12 0x208 */5610x001c1800fc2007edULL, /* sched (st 0xd) (st 0x1) (st 0x6 wr 0x0) */5620xe30000000008000fULL, /* not $p0 exit */5630x5b6403800097ff0fULL, /* isetp eq u32 and $p1 0x1 0x0 $r9 0x1 */5640x3838000006070808ULL, /* imul u32 u32 $r8 $r8 0x60 */5650x003f8400e0c00726ULL, /* sched (st 0x6 wr 0x1) (st 0x6 wr 0x0) (st 0x1 wt 0x1) */5660x383800000107090cULL, /* imul u32 u32 $r12 $r9 0x10 */5670x383800000047090dULL, /* imul u32 u32 $r13 $r9 0x4 */5680x5c10000000d70809ULL, /* iadd $r9 $r8 $r13 */5690x001f8400fcc017e1ULL, /* sched (st 0x1 wt 0x2) (st 0x6) (st 0x1) */5700x5c10000000c70808ULL, /* iadd $r8 $r8 $r12 */5710x5c98078000a7000cULL, /* mov $r12 $r10 0xf */5720x5c10800000870a0aULL, /* iadd cc $r10 $r10 $r8 */5730x001f8400fc2007e6ULL, /* sched (st 0x6) (st 0x1) (st 0x1) */5740x5c98078000b7000dULL, /* mov $r13 $r11 0xf */5750x5c1008000ff70b0bULL, /* iadd x $r11 $r11 0x0 */5760x5c10800000970c0cULL, /* iadd cc $r12 $r12 $r9 */5770x003f983c1c4007e1ULL, /* sched (st 0x1) (st 0x2 rd 0x0 wt 0x3c) (st 0x6 wt 0x1) */5780x5c1008000ff70d0dULL, /* iadd x $r13 $r13 0x0 */5790xbfd0000000070a00ULL, /* st e wt b128 g[$r10] $r0 0x1 */5800x4c98079c1aa70000ULL, /* mov $r0 c7[0x6a8] 0xf */5810x001fbc00fc2007e6ULL, /* sched (st 0x1) (st 0x1) (st 0xf) */5820xbfd0000004010c04ULL, /* $p1 st e wt b128 g[$r12+0x40] $r4 0x1 */5830xbf90000005070c00ULL, /* st e wt b32 g[$r12+0x50] $r0 0x1 */5840xe30000000007000fULL, /* exit */585};586587/* For simplicity, we will allocate as many group slots as we allocate counter588* slots. This means that a single counter which wants to source from 2 groups589* will have to be declared as using 2 counter slots. This shouldn't really be590* a problem because such queries don't make much sense ... 
(unless someone is591* really creative).592*/593struct nvc0_hw_sm_counter_cfg594{595uint32_t func : 16; /* mask or 4-bit logic op (depending on mode) */596uint32_t mode : 4; /* LOGOP,B6,LOGOP_B6(_PULSE) */597uint32_t sig_dom : 1; /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */598uint32_t sig_sel : 8; /* signal group */599uint32_t src_mask; /* mask for signal selection (only for NVC0:NVE4) */600uint32_t src_sel; /* signal selection for up to 4 sources */601};602603struct nvc0_hw_sm_query_cfg604{605unsigned type;606struct nvc0_hw_sm_counter_cfg ctr[8];607uint8_t num_counters;608uint8_t norm[2]; /* normalization num,denom */609};610611#define _CA(f, m, g, s) { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, g, 0, s }612#define _CB(f, m, g, s) { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, g, 0, s }613#define _Q(n, c) [NVE4_HW_SM_QUERY_##n] = c614615/* ==== Compute capability 3.0 (GK104:GK110) ==== */616static const struct nvc0_hw_sm_query_cfg617sm30_active_cycles =618{619.type = NVC0_HW_SM_QUERY_ACTIVE_CYCLES,620.ctr[0] = _CB(0x0001, B6, 0x02, 0x00000000),621.num_counters = 1,622.norm = { 1, 1 },623};624625static const struct nvc0_hw_sm_query_cfg626sm30_active_warps =627{628.type = NVC0_HW_SM_QUERY_ACTIVE_WARPS,629.ctr[0] = _CB(0x003f, B6, 0x02, 0x31483104),630.num_counters = 1,631.norm = { 2, 1 },632};633634static const struct nvc0_hw_sm_query_cfg635sm30_atom_cas_count =636{637.type = NVC0_HW_SM_QUERY_ATOM_CAS_COUNT,638.ctr[0] = _CA(0x0001, B6, 0x1c, 0x000000004),639.num_counters = 1,640.norm = { 1, 1 },641};642643static const struct nvc0_hw_sm_query_cfg644sm30_atom_count =645{646.type = NVC0_HW_SM_QUERY_ATOM_COUNT,647.ctr[0] = _CA(0x0001, B6, 0x1c, 0x00000000),648.num_counters = 1,649.norm = { 1, 1 },650};651652static const struct nvc0_hw_sm_query_cfg653sm30_branch =654{655.type = NVC0_HW_SM_QUERY_BRANCH,656.ctr[0] = _CA(0x0001, B6, 0x1c, 0x0000000c),657.num_counters = 1,658.norm = { 1, 1 },659};660661static const struct nvc0_hw_sm_query_cfg662sm30_divergent_branch 
=663{664.type = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,665.ctr[0] = _CA(0x0001, B6, 0x1c, 0x00000010),666.num_counters = 1,667.norm = { 1, 1 },668};669670static const struct nvc0_hw_sm_query_cfg671sm30_gld_request =672{673.type = NVC0_HW_SM_QUERY_GLD_REQUEST,674.ctr[0] = _CA(0x0001, B6, 0x1b, 0x00000010),675.num_counters = 1,676.norm = { 1, 1 },677};678679static const struct nvc0_hw_sm_query_cfg680sm30_gld_mem_div_replay =681{682.type = NVC0_HW_SM_QUERY_GLD_MEM_DIV_REPLAY,683.ctr[0] = _CB(0x0001, B6, 0x08, 0x00000010),684.num_counters = 1,685.norm = { 1, 1 },686};687688static const struct nvc0_hw_sm_query_cfg689sm30_gst_transactions =690{691.type = NVC0_HW_SM_QUERY_GST_TRANSACTIONS,692.ctr[0] = _CB(0x0001, B6, 0x11, 0x00000004),693.num_counters = 1,694.norm = { 1, 1 },695};696697static const struct nvc0_hw_sm_query_cfg698sm30_gst_mem_div_replay =699{700.type = NVC0_HW_SM_QUERY_GST_MEM_DIV_REPLAY,701.ctr[0] = _CB(0x0001, B6, 0x08, 0x00000014),702.num_counters = 1,703.norm = { 1, 1 },704};705706static const struct nvc0_hw_sm_query_cfg707sm30_gred_count =708{709.type = NVC0_HW_SM_QUERY_GRED_COUNT,710.ctr[0] = _CA(0x0001, B6, 0x1c, 0x00000008),711.num_counters = 1,712.norm = { 1, 1 },713};714715static const struct nvc0_hw_sm_query_cfg716sm30_gst_request =717{718.type = NVC0_HW_SM_QUERY_GST_REQUEST,719.ctr[0] = _CA(0x0001, B6, 0x1b, 0x00000014),720.num_counters = 1,721.norm = { 1, 1 },722};723724static const struct nvc0_hw_sm_query_cfg725sm30_inst_executed =726{727.type = NVC0_HW_SM_QUERY_INST_EXECUTED,728.ctr[0] = _CA(0x0003, B6, 0x04, 0x00000398),729.num_counters = 1,730.norm = { 1, 1 },731};732733static const struct nvc0_hw_sm_query_cfg734sm30_inst_issued1 =735{736.type = NVC0_HW_SM_QUERY_INST_ISSUED1,737.ctr[0] = _CA(0x0001, B6, 0x05, 0x00000004),738.num_counters = 1,739.norm = { 1, 1 },740};741742static const struct nvc0_hw_sm_query_cfg743sm30_inst_issued2 =744{745.type = NVC0_HW_SM_QUERY_INST_ISSUED2,746.ctr[0] = _CA(0x0001, B6, 0x05, 0x00000008),747.num_counters = 
1,748.norm = { 1, 1 },749};750751static const struct nvc0_hw_sm_query_cfg752sm30_l1_gld_hit =753{754.type = NVC0_HW_SM_QUERY_L1_GLD_HIT,755.ctr[0] = _CB(0x0001, B6, 0x10, 0x00000010),756.num_counters = 1,757.norm = { 1, 1 },758};759760static const struct nvc0_hw_sm_query_cfg761sm30_l1_gld_miss =762{763.type = NVC0_HW_SM_QUERY_L1_GLD_MISS,764.ctr[0] = _CB(0x0001, B6, 0x10, 0x00000014),765.num_counters = 1,766.norm = { 1, 1 },767};768769static const struct nvc0_hw_sm_query_cfg770sm30_l1_gld_transactions =771{772.type = NVC0_HW_SM_QUERY_L1_GLD_TRANSACTIONS,773.ctr[0] = _CB(0x0001, B6, 0x0f, 0x00000000),774.num_counters = 1,775.norm = { 1, 1 },776};777778static const struct nvc0_hw_sm_query_cfg779sm30_l1_gst_transactions =780{781.type = NVC0_HW_SM_QUERY_L1_GST_TRANSACTIONS,782.ctr[0] = _CB(0x0001, B6, 0x0f, 0x00000004),783.num_counters = 1,784.norm = { 1, 1 },785};786787static const struct nvc0_hw_sm_query_cfg788sm30_l1_local_ld_hit =789{790.type = NVC0_HW_SM_QUERY_L1_LOCAL_LD_HIT,791.ctr[0] = _CB(0x0001, B6, 0x10, 0x00000000),792.num_counters = 1,793.norm = { 1, 1 },794};795796static const struct nvc0_hw_sm_query_cfg797sm30_l1_local_ld_miss =798{799.type = NVC0_HW_SM_QUERY_L1_LOCAL_LD_MISS,800.ctr[0] = _CB(0x0001, B6, 0x10, 0x00000004),801.num_counters = 1,802.norm = { 1, 1 },803};804805static const struct nvc0_hw_sm_query_cfg806sm30_l1_local_st_hit =807{808.type = NVC0_HW_SM_QUERY_L1_LOCAL_ST_HIT,809.ctr[0] = _CB(0x0001, B6, 0x10, 0x00000008),810.num_counters = 1,811.norm = { 1, 1 },812};813814static const struct nvc0_hw_sm_query_cfg815sm30_l1_local_st_miss =816{817.type = NVC0_HW_SM_QUERY_L1_LOCAL_ST_MISS,818.ctr[0] = _CB(0x0001, B6, 0x10, 0x0000000c),819.num_counters = 1,820.norm = { 1, 1 },821};822823static const struct nvc0_hw_sm_query_cfg824sm30_l1_shared_ld_transactions =825{826.type = NVC0_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS,827.ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000008),828.num_counters = 1,829.norm = { 1, 1 },830};831832static const struct 
nvc0_hw_sm_query_cfg833sm30_l1_shared_st_transactions =834{835.type = NVC0_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS,836.ctr[0] = _CB(0x0001, B6, 0x0e, 0x0000000c),837.num_counters = 1,838.norm = { 1, 1 },839};840841static const struct nvc0_hw_sm_query_cfg842sm30_local_ld =843{844.type = NVC0_HW_SM_QUERY_LOCAL_LD,845.ctr[0] = _CA(0x0001, B6, 0x1b, 0x00000008),846.num_counters = 1,847.norm = { 1, 1 },848};849850static const struct nvc0_hw_sm_query_cfg851sm30_local_ld_transactions =852{853.type = NVC0_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS,854.ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000000),855.num_counters = 1,856.norm = { 1, 1 },857};858859static const struct nvc0_hw_sm_query_cfg860sm30_local_st =861{862.type = NVC0_HW_SM_QUERY_LOCAL_ST,863.ctr[0] = _CA(0x0001, B6, 0x1b, 0x0000000c),864.num_counters = 1,865.norm = { 1, 1 },866};867868static const struct nvc0_hw_sm_query_cfg869sm30_local_st_transactions =870{871.type = NVC0_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS,872.ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000004),873.num_counters = 1,874.norm = { 1, 1 },875};876877static const struct nvc0_hw_sm_query_cfg878sm30_prof_trigger_0 =879{880.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_0,881.ctr[0] = _CA(0x0001, B6, 0x01, 0x00000000),882.num_counters = 1,883.norm = { 1, 1 },884};885886static const struct nvc0_hw_sm_query_cfg887sm30_prof_trigger_1 =888{889.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_1,890.ctr[0] = _CA(0x0001, B6, 0x01, 0x00000004),891.num_counters = 1,892.norm = { 1, 1 },893};894895static const struct nvc0_hw_sm_query_cfg896sm30_prof_trigger_2 =897{898.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_2,899.ctr[0] = _CA(0x0001, B6, 0x01, 0x00000008),900.num_counters = 1,901.norm = { 1, 1 },902};903904static const struct nvc0_hw_sm_query_cfg905sm30_prof_trigger_3 =906{907.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_3,908.ctr[0] = _CA(0x0001, B6, 0x01, 0x0000000c),909.num_counters = 1,910.norm = { 1, 1 },911};912913static const struct nvc0_hw_sm_query_cfg914sm30_prof_trigger_4 =915{916.type = 
NVC0_HW_SM_QUERY_PROF_TRIGGER_4,917.ctr[0] = _CA(0x0001, B6, 0x01, 0x00000010),918.num_counters = 1,919.norm = { 1, 1 },920};921922static const struct nvc0_hw_sm_query_cfg923sm30_prof_trigger_5 =924{925.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_5,926.ctr[0] = _CA(0x0001, B6, 0x01, 0x00000014),927.num_counters = 1,928.norm = { 1, 1 },929};930931static const struct nvc0_hw_sm_query_cfg932sm30_prof_trigger_6 =933{934.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_6,935.ctr[0] = _CA(0x0001, B6, 0x01, 0x00000018),936.num_counters = 1,937.norm = { 1, 1 },938};939940static const struct nvc0_hw_sm_query_cfg941sm30_prof_trigger_7 =942{943.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_7,944.ctr[0] = _CA(0x0001, B6, 0x01, 0x0000001c),945.num_counters = 1,946.norm = { 1, 1 },947};948949static const struct nvc0_hw_sm_query_cfg950sm30_shared_ld =951{952.type = NVC0_HW_SM_QUERY_SHARED_LD,953.ctr[0] = _CA(0x0001, B6, 0x1b, 0x00000000),954.num_counters = 1,955.norm = { 1, 1 },956};957958static const struct nvc0_hw_sm_query_cfg959sm30_shared_ld_replay =960{961.type = NVC0_HW_SM_QUERY_SHARED_LD_REPLAY,962.ctr[0] = _CB(0x0001, B6, 0x08, 0x00000008),963.num_counters = 1,964.norm = { 1, 1 },965};966967static const struct nvc0_hw_sm_query_cfg968sm30_shared_st =969{970.type = NVC0_HW_SM_QUERY_SHARED_ST,971.ctr[0] = _CA(0x0001, B6, 0x1b, 0x00000004),972.num_counters = 1,973.norm = { 1, 1 },974};975976static const struct nvc0_hw_sm_query_cfg977sm30_shared_st_replay =978{979.type = NVC0_HW_SM_QUERY_SHARED_ST_REPLAY,980.ctr[0] = _CB(0x0001, B6, 0x08, 0x0000000c),981.num_counters = 1,982.norm = { 1, 1 },983};984985static const struct nvc0_hw_sm_query_cfg986sm30_sm_cta_launched =987{988.type = NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED,989.ctr[0] = _CB(0x0001, B6, 0x02, 0x0000001c),990.num_counters = 1,991.norm = { 1, 1 },992};993994static const struct nvc0_hw_sm_query_cfg995sm30_threads_launched =996{997.type = NVC0_HW_SM_QUERY_THREADS_LAUNCHED,998.ctr[0] = _CA(0x003f, B6, 0x03, 0x398a4188),999.num_counters = 1,1000.norm = { 
1, 1 },1001};10021003static const struct nvc0_hw_sm_query_cfg1004sm30_uncached_gld_transactions =1005{1006.type = NVC0_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS,1007.ctr[0] = _CB(0x0001, B6, 0x11, 0x00000000),1008.num_counters = 1,1009.norm = { 1, 1 },1010};10111012static const struct nvc0_hw_sm_query_cfg1013sm30_warps_launched =1014{1015.type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,1016.ctr[0] = _CA(0x0001, B6, 0x03, 0x00000004),1017.num_counters = 1,1018.norm = { 1, 1 },1019};10201021/* NOTES:1022* active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps1023* inst_executed etc.: we only count a single warp scheduler1024*/1025static const struct nvc0_hw_sm_query_cfg *sm30_hw_sm_queries[] =1026{1027&sm30_active_cycles,1028&sm30_active_warps,1029&sm30_atom_cas_count,1030&sm30_atom_count,1031&sm30_branch,1032&sm30_divergent_branch,1033&sm30_gld_request,1034&sm30_gld_mem_div_replay,1035&sm30_gst_transactions,1036&sm30_gst_mem_div_replay,1037&sm30_gred_count,1038&sm30_gst_request,1039&sm30_inst_executed,1040&sm30_inst_issued1,1041&sm30_inst_issued2,1042&sm30_l1_gld_hit,1043&sm30_l1_gld_miss,1044&sm30_l1_gld_transactions,1045&sm30_l1_gst_transactions,1046&sm30_l1_local_ld_hit,1047&sm30_l1_local_ld_miss,1048&sm30_l1_local_st_hit,1049&sm30_l1_local_st_miss,1050&sm30_l1_shared_ld_transactions,1051&sm30_l1_shared_st_transactions,1052&sm30_local_ld,1053&sm30_local_ld_transactions,1054&sm30_local_st,1055&sm30_local_st_transactions,1056&sm30_prof_trigger_0,1057&sm30_prof_trigger_1,1058&sm30_prof_trigger_2,1059&sm30_prof_trigger_3,1060&sm30_prof_trigger_4,1061&sm30_prof_trigger_5,1062&sm30_prof_trigger_6,1063&sm30_prof_trigger_7,1064&sm30_shared_ld,1065&sm30_shared_ld_replay,1066&sm30_shared_st,1067&sm30_shared_st_replay,1068&sm30_sm_cta_launched,1069&sm30_threads_launched,1070&sm30_uncached_gld_transactions,1071&sm30_warps_launched,1072};10731074/* ==== Compute capability 3.5 (GK110/GK208) ==== */1075static const struct nvc0_hw_sm_query_cfg1076sm35_atom_cas_count =1077{1078.type = 
NVC0_HW_SM_QUERY_ATOM_CAS_COUNT,1079.ctr[0] = _CA(0x0001, B6, 0x1a, 0x00000014),1080.num_counters = 1,1081.norm = { 1, 1 },1082};10831084static const struct nvc0_hw_sm_query_cfg1085sm35_atom_count =1086{1087.type = NVC0_HW_SM_QUERY_ATOM_COUNT,1088.ctr[0] = _CA(0x0001, B6, 0x1a, 0x00000010),1089.num_counters = 1,1090.norm = { 1, 1 },1091};10921093static const struct nvc0_hw_sm_query_cfg1094sm35_gred_count =1095{1096.type = NVC0_HW_SM_QUERY_GRED_COUNT,1097.ctr[0] = _CA(0x0001, B6, 0x1a, 0x00000018),1098.num_counters = 1,1099.norm = { 1, 1 },1100};11011102static const struct nvc0_hw_sm_query_cfg1103sm35_not_pred_off_inst_executed =1104{1105.type = NVC0_HW_SM_QUERY_NOT_PRED_OFF_INST_EXECUTED,1106.ctr[0] = _CA(0x003f, B6, 0x14, 0x29062080),1107.num_counters = 1,1108.norm = { 1, 1 },1109};11101111static const struct nvc0_hw_sm_query_cfg1112sm35_shared_ld_replay =1113{1114.type = NVC0_HW_SM_QUERY_SHARED_LD_REPLAY,1115.ctr[0] = _CB(0xaaaa, LOGOP, 0x13, 0x00000018),1116.ctr[1] = _CB(0x8888, LOGOP, 0x08, 0x00000151),1117.num_counters = 2,1118.norm = { 1, 1 },1119};11201121static const struct nvc0_hw_sm_query_cfg1122sm35_shared_st_replay =1123{1124.type = NVC0_HW_SM_QUERY_SHARED_ST_REPLAY,1125.ctr[0] = _CB(0xaaaa, LOGOP, 0x13, 0x00000018),1126.ctr[1] = _CB(0x8888, LOGOP, 0x08, 0x000001d1),1127.num_counters = 2,1128.norm = { 1, 1 },1129};11301131static const struct nvc0_hw_sm_query_cfg1132sm35_th_inst_executed =1133{1134.type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED,1135.ctr[0] = _CA(0x003f, B6, 0x11, 0x29062080),1136.num_counters = 1,1137.norm = { 1, 1 },1138};11391140static const struct nvc0_hw_sm_query_cfg *sm35_hw_sm_queries[] 
=1141{1142&sm30_active_cycles,1143&sm30_active_warps,1144&sm35_atom_cas_count,1145&sm35_atom_count,1146&sm30_gld_request,1147&sm30_gld_mem_div_replay,1148&sm30_gst_transactions,1149&sm30_gst_mem_div_replay,1150&sm35_gred_count,1151&sm30_gst_request,1152&sm30_inst_executed,1153&sm30_inst_issued1,1154&sm30_inst_issued2,1155&sm30_l1_gld_hit,1156&sm30_l1_gld_miss,1157&sm30_l1_gld_transactions,1158&sm30_l1_gst_transactions,1159&sm30_l1_local_ld_hit,1160&sm30_l1_local_ld_miss,1161&sm30_l1_local_st_hit,1162&sm30_l1_local_st_miss,1163&sm30_l1_shared_ld_transactions,1164&sm30_l1_shared_st_transactions,1165&sm30_local_ld,1166&sm30_local_ld_transactions,1167&sm30_local_st,1168&sm30_local_st_transactions,1169&sm35_not_pred_off_inst_executed,1170&sm30_prof_trigger_0,1171&sm30_prof_trigger_1,1172&sm30_prof_trigger_2,1173&sm30_prof_trigger_3,1174&sm30_prof_trigger_4,1175&sm30_prof_trigger_5,1176&sm30_prof_trigger_6,1177&sm30_prof_trigger_7,1178&sm30_shared_ld,1179&sm35_shared_ld_replay,1180&sm30_shared_st,1181&sm35_shared_st_replay,1182&sm30_sm_cta_launched,1183&sm35_th_inst_executed,1184&sm30_threads_launched,1185&sm30_uncached_gld_transactions,1186&sm30_warps_launched,1187};11881189/* ==== Compute capability 5.0 (GM107/GM108) ==== */1190static const struct nvc0_hw_sm_query_cfg1191sm50_active_ctas =1192{1193.type = NVC0_HW_SM_QUERY_ACTIVE_CTAS,1194.ctr[0] = _CB(0x003f, B6, 0x01, 0x29062080),1195.num_counters = 1,1196.norm = { 1, 1 },1197};11981199static const struct nvc0_hw_sm_query_cfg1200sm50_active_cycles =1201{1202.type = NVC0_HW_SM_QUERY_ACTIVE_CYCLES,1203.ctr[0] = _CB(0x0001, B6, 0x00, 0x00000004),1204.num_counters = 1,1205.norm = { 1, 1 },1206};12071208static const struct nvc0_hw_sm_query_cfg1209sm50_active_warps =1210{1211.type = NVC0_HW_SM_QUERY_ACTIVE_WARPS,1212.ctr[0] = _CB(0x003f, B6, 0x00, 0x398a4188),1213.num_counters = 1,1214.norm = { 1, 1 },1215};12161217static const struct nvc0_hw_sm_query_cfg1218sm50_atom_count =1219{1220.type = 
NVC0_HW_SM_QUERY_ATOM_COUNT,1221.ctr[0] = _CA(0x0001, B6, 0x14, 0x00000004),1222.num_counters = 1,1223.norm = { 1, 1 },1224};12251226static const struct nvc0_hw_sm_query_cfg1227sm50_branch =1228{1229.type = NVC0_HW_SM_QUERY_BRANCH,1230.ctr[0] = _CA(0x0001, B6, 0x1a, 0x00000010),1231.num_counters = 1,1232.norm = { 1, 1 },1233};12341235static const struct nvc0_hw_sm_query_cfg1236sm50_divergent_branch =1237{1238.type = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,1239.ctr[0] = _CA(0x0001, B6, 0x1a, 0x00000004),1240.num_counters = 1,1241.norm = { 1, 1 },1242};12431244static const struct nvc0_hw_sm_query_cfg1245sm50_global_atom_cas =1246{1247.type = NVC0_HW_SM_QUERY_GLOBAL_ATOM_CAS,1248.ctr[0] = _CA(0x0001, B6, 0x14, 0x00000000),1249.num_counters = 1,1250.norm = { 1, 1 },1251};12521253static const struct nvc0_hw_sm_query_cfg1254sm50_global_ld =1255{1256.type = NVC0_HW_SM_QUERY_GLOBAL_LD,1257.ctr[0] = _CA(0x0001, B6, 0x14, 0x0000000c),1258.num_counters = 1,1259.norm = { 1, 1 },1260};12611262static const struct nvc0_hw_sm_query_cfg1263sm50_global_st =1264{1265.type = NVC0_HW_SM_QUERY_GLOBAL_ST,1266.ctr[0] = _CA(0x0001, B6, 0x14, 0x00000010),1267.num_counters = 1,1268.norm = { 1, 1 },1269};12701271static const struct nvc0_hw_sm_query_cfg1272sm50_gred_count =1273{1274.type = NVC0_HW_SM_QUERY_GRED_COUNT,1275.ctr[0] = _CA(0x0001, B6, 0x14, 0x00000008),1276.num_counters = 1,1277.norm = { 1, 1 },1278};12791280static const struct nvc0_hw_sm_query_cfg1281sm50_inst_executed =1282{1283.type = NVC0_HW_SM_QUERY_INST_EXECUTED,1284.ctr[0] = _CA(0x0003, B6, 0x02, 0x00000398),1285.num_counters = 1,1286.norm = { 1, 1 },1287};12881289static const struct nvc0_hw_sm_query_cfg1290sm50_inst_issued0 =1291{1292.type = NVC0_HW_SM_QUERY_INST_ISSUED0,1293.ctr[0] = _CA(0x0001, B6, 0x02, 0x0000000c),1294.num_counters = 1,1295.norm = { 1, 1 },1296};12971298static const struct nvc0_hw_sm_query_cfg1299sm50_inst_issued1 =1300{1301.type = NVC0_HW_SM_QUERY_INST_ISSUED1,1302.ctr[0] = _CA(0x0001, B6, 0x02, 
0x00000010),1303.num_counters = 1,1304.norm = { 1, 1 },1305};13061307static const struct nvc0_hw_sm_query_cfg1308sm50_inst_issued2 =1309{1310.type = NVC0_HW_SM_QUERY_INST_ISSUED2,1311.ctr[0] = _CA(0x0001, B6, 0x02, 0x00000014),1312.num_counters = 1,1313.norm = { 1, 1 },1314};13151316static const struct nvc0_hw_sm_query_cfg1317sm50_local_ld =1318{1319.type = NVC0_HW_SM_QUERY_LOCAL_LD,1320.ctr[0] = _CA(0x0001, B6, 0x13, 0x00000004),1321.num_counters = 1,1322.norm = { 1, 1 },1323};13241325static const struct nvc0_hw_sm_query_cfg1326sm50_local_st =1327{1328.type = NVC0_HW_SM_QUERY_LOCAL_ST,1329.ctr[0] = _CA(0x0001, B6, 0x13, 0x00000000),1330.num_counters = 1,1331.norm = { 1, 1 },1332};13331334static const struct nvc0_hw_sm_query_cfg1335sm50_not_pred_off_inst_executed =1336{1337.type = NVC0_HW_SM_QUERY_NOT_PRED_OFF_INST_EXECUTED,1338.ctr[0] = _CA(0x003f, B6, 0x05, 0x29062080),1339.num_counters = 1,1340.norm = { 1, 1 },1341};13421343static const struct nvc0_hw_sm_query_cfg1344sm50_prof_trigger_0 =1345{1346.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_0,1347.ctr[0] = _CA(0x0001, B6, 0x00, 0x00000000),1348.num_counters = 1,1349.norm = { 1, 1 },1350};13511352static const struct nvc0_hw_sm_query_cfg1353sm50_prof_trigger_1 =1354{1355.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_1,1356.ctr[0] = _CA(0x0001, B6, 0x00, 0x00000004),1357.num_counters = 1,1358.norm = { 1, 1 },1359};13601361static const struct nvc0_hw_sm_query_cfg1362sm50_prof_trigger_2 =1363{1364.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_2,1365.ctr[0] = _CA(0x0001, B6, 0x00, 0x00000008),1366.num_counters = 1,1367.norm = { 1, 1 },1368};13691370static const struct nvc0_hw_sm_query_cfg1371sm50_prof_trigger_3 =1372{1373.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_3,1374.ctr[0] = _CA(0x0001, B6, 0x00, 0x0000000c),1375.num_counters = 1,1376.norm = { 1, 1 },1377};13781379static const struct nvc0_hw_sm_query_cfg1380sm50_prof_trigger_4 =1381{1382.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_4,1383.ctr[0] = _CA(0x0001, B6, 0x00, 0x00000010),1384.num_counters = 
1,1385.norm = { 1, 1 },1386};13871388static const struct nvc0_hw_sm_query_cfg1389sm50_prof_trigger_5 =1390{1391.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_5,1392.ctr[0] = _CA(0x0001, B6, 0x00, 0x00000014),1393.num_counters = 1,1394.norm = { 1, 1 },1395};13961397static const struct nvc0_hw_sm_query_cfg1398sm50_prof_trigger_6 =1399{1400.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_6,1401.ctr[0] = _CA(0x0001, B6, 0x00, 0x00000018),1402.num_counters = 1,1403.norm = { 1, 1 },1404};14051406static const struct nvc0_hw_sm_query_cfg1407sm50_prof_trigger_7 =1408{1409.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_7,1410.ctr[0] = _CA(0x0001, B6, 0x00, 0x0000001c),1411.num_counters = 1,1412.norm = { 1, 1 },1413};14141415static const struct nvc0_hw_sm_query_cfg1416sm50_shared_atom =1417{1418.type = NVC0_HW_SM_QUERY_SHARED_ATOM,1419.ctr[0] = _CA(0x0001, B6, 0x13, 0x00000014),1420.num_counters = 1,1421.norm = { 1, 1 },1422};14231424static const struct nvc0_hw_sm_query_cfg1425sm50_shared_atom_cas =1426{1427.type = NVC0_HW_SM_QUERY_SHARED_ATOM_CAS,1428.ctr[0] = _CA(0x0001, B6, 0x13, 0x00000010),1429.num_counters = 1,1430.norm = { 1, 1 },1431};14321433static const struct nvc0_hw_sm_query_cfg1434sm50_shared_ld =1435{1436.type = NVC0_HW_SM_QUERY_SHARED_LD,1437.ctr[0] = _CA(0x0001, B6, 0x13, 0x00000008),1438.num_counters = 1,1439.norm = { 1, 1 },1440};14411442static const struct nvc0_hw_sm_query_cfg1443sm50_shared_ld_bank_conflict =1444{1445.type = NVC0_HW_SM_QUERY_SHARED_LD_BANK_CONFLICT,1446.ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000000),1447.num_counters = 1,1448.norm = { 1, 1 },1449};14501451static const struct nvc0_hw_sm_query_cfg1452sm50_shared_ld_transactions =1453{1454.type = NVC0_HW_SM_QUERY_SHARED_LD_TRANSACTIONS,1455.ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000008),1456.num_counters = 1,1457.norm = { 1, 1 },1458};14591460static const struct nvc0_hw_sm_query_cfg1461sm50_shared_st =1462{1463.type = NVC0_HW_SM_QUERY_SHARED_ST,1464.ctr[0] = _CA(0x0001, B6, 0x13, 0x0000000c),1465.num_counters = 1,1466.norm = { 1, 
1 },1467};14681469static const struct nvc0_hw_sm_query_cfg1470sm50_shared_st_bank_conflict =1471{1472.type = NVC0_HW_SM_QUERY_SHARED_ST_BANK_CONFLICT,1473.ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000004),1474.num_counters = 1,1475.norm = { 1, 1 },1476};14771478static const struct nvc0_hw_sm_query_cfg1479sm50_shared_st_transactions =1480{1481.type = NVC0_HW_SM_QUERY_SHARED_ST_TRANSACTIONS,1482.ctr[0] = _CB(0x0001, B6, 0x0e, 0x0000000c),1483.num_counters = 1,1484.norm = { 1, 1 },1485};14861487static const struct nvc0_hw_sm_query_cfg1488sm50_sm_cta_launched =1489{1490.type = NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED,1491.ctr[0] = _CB(0x0001, B6, 0x01, 0x00000018),1492.num_counters = 1,1493.norm = { 1, 1 },1494};14951496static const struct nvc0_hw_sm_query_cfg1497sm50_th_inst_executed =1498{1499.type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED,1500.ctr[0] = _CA(0x003f, B6, 0x04, 0x29062080),1501.num_counters = 1,1502.norm = { 1, 1 },1503};15041505static const struct nvc0_hw_sm_query_cfg1506sm50_warps_launched =1507{1508.type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,1509.ctr[0] = _CA(0x0001, B6, 0x02, 0x00000008),1510.num_counters = 1,1511.norm = { 1, 1 },1512};15131514static const struct nvc0_hw_sm_query_cfg *sm50_hw_sm_queries[] 
=1515{1516&sm50_active_ctas,1517&sm50_active_cycles,1518&sm50_active_warps,1519&sm50_atom_count,1520&sm50_branch,1521&sm50_divergent_branch,1522&sm50_global_atom_cas,1523&sm50_global_ld,1524&sm50_global_st,1525&sm50_gred_count,1526&sm50_inst_executed,1527&sm50_inst_issued0,1528&sm50_inst_issued1,1529&sm50_inst_issued2,1530&sm50_local_ld,1531&sm50_local_st,1532&sm50_not_pred_off_inst_executed,1533&sm50_prof_trigger_0,1534&sm50_prof_trigger_1,1535&sm50_prof_trigger_2,1536&sm50_prof_trigger_3,1537&sm50_prof_trigger_4,1538&sm50_prof_trigger_5,1539&sm50_prof_trigger_6,1540&sm50_prof_trigger_7,1541&sm50_shared_atom,1542&sm50_shared_atom_cas,1543&sm50_shared_ld,1544&sm50_shared_ld_bank_conflict,1545&sm50_shared_ld_transactions,1546&sm50_shared_st,1547&sm50_shared_st_bank_conflict,1548&sm50_shared_st_transactions,1549&sm50_sm_cta_launched,1550&sm50_th_inst_executed,1551&sm50_warps_launched,1552};15531554/* ==== Compute capability 5.2 (GM200/GM204/GM206) ==== */1555static const struct nvc0_hw_sm_query_cfg1556sm52_atom_count =1557{1558.type = NVC0_HW_SM_QUERY_ATOM_COUNT,1559.ctr[0] = _CA(0x0001, B6, 0x0a, 0x0000001c),1560.num_counters = 1,1561.norm = { 1, 1 },1562};15631564static const struct nvc0_hw_sm_query_cfg1565sm52_global_atom_cas =1566{1567.type = NVC0_HW_SM_QUERY_GLOBAL_ATOM_CAS,1568.ctr[0] = _CA(0x0001, B6, 0x0a, 0x00000018),1569.num_counters = 1,1570.norm = { 1, 1 },1571};15721573static const struct nvc0_hw_sm_query_cfg1574sm52_global_ld =1575{1576.type = NVC0_HW_SM_QUERY_GLOBAL_LD,1577.ctr[0] = _CA(0x0001, B6, 0x0b, 0x00000018),1578.num_counters = 1,1579.norm = { 1, 1 },1580};15811582static const struct nvc0_hw_sm_query_cfg1583sm52_global_st =1584{1585.type = NVC0_HW_SM_QUERY_GLOBAL_ST,1586.ctr[0] = _CA(0x0001, B6, 0x0b, 0x0000001c),1587.num_counters = 1,1588.norm = { 1, 1 },1589};15901591static const struct nvc0_hw_sm_query_cfg1592sm52_gred_count =1593{1594.type = NVC0_HW_SM_QUERY_GRED_COUNT,1595.ctr[0] = _CA(0x0001, B6, 0x0f, 0x00000018),1596.num_counters = 
1,1597.norm = { 1, 1 },1598};15991600static const struct nvc0_hw_sm_query_cfg1601sm52_inst_executed =1602{1603.type = NVC0_HW_SM_QUERY_INST_EXECUTED,1604.ctr[0] = _CA(0x0003, B6, 0x03, 0x0000020c),1605.num_counters = 1,1606.norm = { 1, 1 },1607};16081609static const struct nvc0_hw_sm_query_cfg1610sm52_inst_issued0 =1611{1612.type = NVC0_HW_SM_QUERY_INST_ISSUED0,1613.ctr[0] = _CA(0x0001, B6, 0x03, 0x00000000),1614.num_counters = 1,1615.norm = { 1, 1 },1616};16171618static const struct nvc0_hw_sm_query_cfg1619sm52_inst_issued1 =1620{1621.type = NVC0_HW_SM_QUERY_INST_ISSUED1,1622.ctr[0] = _CA(0x0001, B6, 0x03, 0x00000004),1623.num_counters = 1,1624.norm = { 1, 1 },1625};16261627static const struct nvc0_hw_sm_query_cfg1628sm52_inst_issued2 =1629{1630.type = NVC0_HW_SM_QUERY_INST_ISSUED2,1631.ctr[0] = _CA(0x0001, B6, 0x03, 0x00000008),1632.num_counters = 1,1633.norm = { 1, 1 },1634};16351636static const struct nvc0_hw_sm_query_cfg1637sm52_local_ld =1638{1639.type = NVC0_HW_SM_QUERY_LOCAL_LD,1640.ctr[0] = _CA(0x0001, B6, 0x06, 0x0000001c),1641.num_counters = 1,1642.norm = { 1, 1 },1643};16441645static const struct nvc0_hw_sm_query_cfg1646sm52_local_st =1647{1648.type = NVC0_HW_SM_QUERY_LOCAL_ST,1649.ctr[0] = _CA(0x0001, B6, 0x06, 0x00000018),1650.num_counters = 1,1651.norm = { 1, 1 },1652};16531654static const struct nvc0_hw_sm_query_cfg1655sm52_shared_atom =1656{1657.type = NVC0_HW_SM_QUERY_SHARED_ATOM,1658.ctr[0] = _CA(0x0001, B6, 0x08, 0x0000001c),1659.num_counters = 1,1660.norm = { 1, 1 },1661};16621663static const struct nvc0_hw_sm_query_cfg1664sm52_shared_atom_cas =1665{1666.type = NVC0_HW_SM_QUERY_SHARED_ATOM_CAS,1667.ctr[0] = _CA(0x0001, B6, 0x08, 0x00000018),1668.num_counters = 1,1669.norm = { 1, 1 },1670};16711672static const struct nvc0_hw_sm_query_cfg1673sm52_shared_ld =1674{1675.type = NVC0_HW_SM_QUERY_SHARED_LD,1676.ctr[0] = _CA(0x0001, B6, 0x07, 0x00000018),1677.num_counters = 1,1678.norm = { 1, 1 },1679};16801681static const struct 
nvc0_hw_sm_query_cfg1682sm52_shared_st =1683{1684.type = NVC0_HW_SM_QUERY_SHARED_ST,1685.ctr[0] = _CA(0x0001, B6, 0x07, 0x0000001c),1686.num_counters = 1,1687.norm = { 1, 1 },1688};16891690static const struct nvc0_hw_sm_query_cfg1691sm52_warps_launched =1692{1693.type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,1694.ctr[0] = _CA(0x0001, B6, 0x02, 0x0000001c),1695.num_counters = 1,1696.norm = { 1, 1 },1697};16981699static const struct nvc0_hw_sm_query_cfg *sm52_hw_sm_queries[] =1700{1701&sm50_active_ctas,1702&sm50_active_cycles,1703&sm50_active_warps,1704&sm52_atom_count,1705&sm50_branch,1706&sm50_divergent_branch,1707&sm52_global_atom_cas,1708&sm52_global_ld,1709&sm52_global_st,1710&sm52_gred_count,1711&sm52_inst_executed,1712&sm52_inst_issued0,1713&sm52_inst_issued1,1714&sm52_inst_issued2,1715&sm52_local_ld,1716&sm52_local_st,1717&sm50_not_pred_off_inst_executed,1718&sm50_prof_trigger_0,1719&sm50_prof_trigger_1,1720&sm50_prof_trigger_2,1721&sm50_prof_trigger_3,1722&sm50_prof_trigger_4,1723&sm50_prof_trigger_5,1724&sm50_prof_trigger_6,1725&sm50_prof_trigger_7,1726&sm52_shared_atom,1727&sm52_shared_atom_cas,1728&sm52_shared_ld,1729&sm50_shared_ld_bank_conflict,1730&sm50_shared_ld_transactions,1731&sm52_shared_st,1732&sm50_shared_st_bank_conflict,1733&sm50_shared_st_transactions,1734&sm50_sm_cta_launched,1735&sm50_th_inst_executed,1736&sm52_warps_launched,1737};17381739#undef _Q1740#undef _CA1741#undef _CB17421743/* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */1744/* NOTES:1745* - MP counters on GF100/GF110 (compute capability 2.0) are buggy1746* because there is a context-switch problem that we need to fix.1747* Results might be wrong sometimes, be careful!1748*/1749static const uint64_t nvc0_read_hw_sm_counters_code[] =1750{1751/* mov b32 $r8 $tidx1752* mov b32 $r9 $physid1753* mov b32 $r0 $pm01754* mov b32 $r1 $pm11755* mov b32 $r2 $pm21756* mov b32 $r3 $pm31757* mov b32 $r4 $pm41758* mov b32 $r5 $pm51759* mov b32 $r6 $pm61760* mov b32 $r7 $pm71761* set $p0 0x1 
eq u32 $r8 0x01762* mov b32 $r10 c15[0x6a0]1763* mov b32 $r11 c15[0x6a4]1764* ext u32 $r8 $r9 0x4141765* (not $p0) exit1766* mul $r8 u32 $r8 u32 481767* add b32 $r10 $c $r10 $r81768* add b32 $r11 $r11 0x0 $c1769* mov b32 $r8 c15[0x6a8]1770* st b128 wt g[$r10d+0x00] $r0q1771* st b128 wt g[$r10d+0x10] $r4q1772* st b32 wt g[$r10d+0x20] $r81773* exit */17740x2c00000084021c04ULL,17750x2c0000000c025c04ULL,17760x2c00000010001c04ULL,17770x2c00000014005c04ULL,17780x2c00000018009c04ULL,17790x2c0000001c00dc04ULL,17800x2c00000020011c04ULL,17810x2c00000024015c04ULL,17820x2c00000028019c04ULL,17830x2c0000002c01dc04ULL,17840x190e0000fc81dc03ULL,17850x28007c1a80029de4ULL,17860x28007c1a9002dde4ULL,17870x7000c01050921c03ULL,17880x80000000000021e7ULL,17890x10000000c0821c02ULL,17900x4801000020a29c03ULL,17910x0800000000b2dc42ULL,17920x28007c1aa0021de4ULL,17930x9400000000a01fc5ULL,17940x9400000040a11fc5ULL,17950x9400000080a21f85ULL,17960x8000000000001de7ULL1797};17981799#define _C(f, o, g, m, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, g, m, s }18001801/* ==== Compute capability 2.0 (GF100/GF110) ==== */1802static const struct nvc0_hw_sm_query_cfg1803sm20_active_cycles =1804{1805.type = NVC0_HW_SM_QUERY_ACTIVE_CYCLES,1806.ctr[0] = _C(0xaaaa, LOGOP, 0x11, 0x000000ff, 0x00000000),1807.num_counters = 1,1808.norm = { 1, 1 },1809};18101811static const struct nvc0_hw_sm_query_cfg1812sm20_active_warps =1813{1814.type = NVC0_HW_SM_QUERY_ACTIVE_WARPS,1815.ctr[0] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000010),1816.ctr[1] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000020),1817.ctr[2] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000030),1818.ctr[3] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000040),1819.ctr[4] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000050),1820.ctr[5] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000060),1821.num_counters = 6,1822.norm = { 1, 1 },1823};18241825static const struct nvc0_hw_sm_query_cfg1826sm20_atom_count =1827{1828.type = NVC0_HW_SM_QUERY_ATOM_COUNT,1829.ctr[0] = 
_C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000030),1830.num_counters = 1,1831.norm = { 1, 1 },1832};18331834static const struct nvc0_hw_sm_query_cfg1835sm20_branch =1836{1837.type = NVC0_HW_SM_QUERY_BRANCH,1838.ctr[0] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000000),1839.ctr[1] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000010),1840.num_counters = 2,1841.norm = { 1, 1 },1842};18431844static const struct nvc0_hw_sm_query_cfg1845sm20_divergent_branch =1846{1847.type = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,1848.ctr[0] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000020),1849.ctr[1] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000030),1850.num_counters = 2,1851.norm = { 1, 1 },1852};18531854static const struct nvc0_hw_sm_query_cfg1855sm20_gld_request =1856{1857.type = NVC0_HW_SM_QUERY_GLD_REQUEST,1858.ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000030),1859.num_counters = 1,1860.norm = { 1, 1 },1861};18621863static const struct nvc0_hw_sm_query_cfg1864sm20_gred_count =1865{1866.type = NVC0_HW_SM_QUERY_GRED_COUNT,1867.ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000040),1868.num_counters = 1,1869.norm = { 1, 1 },1870};18711872static const struct nvc0_hw_sm_query_cfg1873sm20_gst_request =1874{1875.type = NVC0_HW_SM_QUERY_GST_REQUEST,1876.ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000060),1877.num_counters = 1,1878.norm = { 1, 1 },1879};18801881static const struct nvc0_hw_sm_query_cfg1882sm20_inst_executed =1883{1884.type = NVC0_HW_SM_QUERY_INST_EXECUTED,1885.ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001000),1886.ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001010),1887.num_counters = 2,1888.norm = { 1, 1 },1889};18901891static const struct nvc0_hw_sm_query_cfg1892sm20_inst_issued =1893{1894.type = NVC0_HW_SM_QUERY_INST_ISSUED,1895.ctr[0] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007060),1896.ctr[1] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007070),1897.num_counters = 2,1898.norm = { 1, 1 },1899};19001901static const struct 
nvc0_hw_sm_query_cfg1902sm20_local_ld =1903{1904.type = NVC0_HW_SM_QUERY_LOCAL_LD,1905.ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000020),1906.num_counters = 1,1907.norm = { 1, 1 },1908};19091910static const struct nvc0_hw_sm_query_cfg1911sm20_local_st =1912{1913.type = NVC0_HW_SM_QUERY_LOCAL_ST,1914.ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000050),1915.num_counters = 1,1916.norm = { 1, 1 },1917};19181919static const struct nvc0_hw_sm_query_cfg1920sm20_prof_trigger_0 =1921{1922.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_0,1923.ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000000),1924.num_counters = 1,1925.norm = { 1, 1 },1926};19271928static const struct nvc0_hw_sm_query_cfg1929sm20_prof_trigger_1 =1930{1931.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_1,1932.ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000010),1933.num_counters = 1,1934.norm = { 1, 1 },1935};19361937static const struct nvc0_hw_sm_query_cfg1938sm20_prof_trigger_2 =1939{1940.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_2,1941.ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000020),1942.num_counters = 1,1943.norm = { 1, 1 },1944};19451946static const struct nvc0_hw_sm_query_cfg1947sm20_prof_trigger_3 =1948{1949.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_3,1950.ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000030),1951.num_counters = 1,1952.norm = { 1, 1 },1953};19541955static const struct nvc0_hw_sm_query_cfg1956sm20_prof_trigger_4 =1957{1958.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_4,1959.ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000040),1960.num_counters = 1,1961.norm = { 1, 1 },1962};19631964static const struct nvc0_hw_sm_query_cfg1965sm20_prof_trigger_5 =1966{1967.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_5,1968.ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000050),1969.num_counters = 1,1970.norm = { 1, 1 },1971};19721973static const struct nvc0_hw_sm_query_cfg1974sm20_prof_trigger_6 =1975{1976.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_6,1977.ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 
0x00000060),1978.num_counters = 1,1979.norm = { 1, 1 },1980};19811982static const struct nvc0_hw_sm_query_cfg1983sm20_prof_trigger_7 =1984{1985.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_7,1986.ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000070),1987.num_counters = 1,1988.norm = { 1, 1 },1989};19901991static const struct nvc0_hw_sm_query_cfg1992sm20_shared_ld =1993{1994.type = NVC0_HW_SM_QUERY_SHARED_LD,1995.ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000010),1996.num_counters = 1,1997.norm = { 1, 1 },1998};19992000static const struct nvc0_hw_sm_query_cfg2001sm20_shared_st =2002{2003.type = NVC0_HW_SM_QUERY_SHARED_ST,2004.ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000040),2005.num_counters = 1,2006.norm = { 1, 1 },2007};20082009static const struct nvc0_hw_sm_query_cfg2010sm20_threads_launched =2011{2012.type = NVC0_HW_SM_QUERY_THREADS_LAUNCHED,2013.ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000010),2014.ctr[1] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000020),2015.ctr[2] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000030),2016.ctr[3] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000040),2017.ctr[4] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000050),2018.ctr[5] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000060),2019.num_counters = 6,2020.norm = { 1, 1 },2021};20222023static const struct nvc0_hw_sm_query_cfg2024sm20_th_inst_executed_0 =2025{2026.type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0,2027.ctr[0] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000000),2028.ctr[1] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000010),2029.ctr[2] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000020),2030.ctr[3] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000030),2031.ctr[4] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000040),2032.ctr[5] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000050),2033.num_counters = 6,2034.norm = { 1, 1 },2035};20362037static const struct nvc0_hw_sm_query_cfg2038sm20_th_inst_executed_1 =2039{2040.type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1,2041.ctr[0] = _C(0xaaaa, LOGOP, 
0x30, 0x000000ff, 0x00000000),2042.ctr[1] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000010),2043.ctr[2] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000020),2044.ctr[3] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000030),2045.ctr[4] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000040),2046.ctr[5] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000050),2047.num_counters = 6,2048.norm = { 1, 1 },2049};20502051static const struct nvc0_hw_sm_query_cfg2052sm20_warps_launched =2053{2054.type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,2055.ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000000),2056.num_counters = 1,2057.norm = { 1, 1 },2058};20592060static const struct nvc0_hw_sm_query_cfg *sm20_hw_sm_queries[] =2061{2062&sm20_active_cycles,2063&sm20_active_warps,2064&sm20_atom_count,2065&sm20_branch,2066&sm20_divergent_branch,2067&sm20_gld_request,2068&sm20_gred_count,2069&sm20_gst_request,2070&sm20_inst_executed,2071&sm20_inst_issued,2072&sm20_local_ld,2073&sm20_local_st,2074&sm20_prof_trigger_0,2075&sm20_prof_trigger_1,2076&sm20_prof_trigger_2,2077&sm20_prof_trigger_3,2078&sm20_prof_trigger_4,2079&sm20_prof_trigger_5,2080&sm20_prof_trigger_6,2081&sm20_prof_trigger_7,2082&sm20_shared_ld,2083&sm20_shared_st,2084&sm20_threads_launched,2085&sm20_th_inst_executed_0,2086&sm20_th_inst_executed_1,2087&sm20_warps_launched,2088};20892090/* ==== Compute capability 2.1 (GF108+ except GF110) ==== */2091static const struct nvc0_hw_sm_query_cfg2092sm21_inst_executed =2093{2094.type = NVC0_HW_SM_QUERY_INST_EXECUTED,2095.ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000000),2096.ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000010),2097.ctr[2] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000020),2098.num_counters = 3,2099.norm = { 1, 1 },2100};21012102static const struct nvc0_hw_sm_query_cfg2103sm21_inst_issued1_0 =2104{2105.type = NVC0_HW_SM_QUERY_INST_ISSUED1_0,2106.ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000010),2107.num_counters = 1,2108.norm = { 1, 1 },2109};21102111static const struct 
nvc0_hw_sm_query_cfg2112sm21_inst_issued1_1 =2113{2114.type = NVC0_HW_SM_QUERY_INST_ISSUED1_1,2115.ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000040),2116.num_counters = 1,2117.norm = { 1, 1 },2118};21192120static const struct nvc0_hw_sm_query_cfg2121sm21_inst_issued2_0 =2122{2123.type = NVC0_HW_SM_QUERY_INST_ISSUED2_0,2124.ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000020),2125.num_counters = 1,2126.norm = { 1, 1 },2127};21282129static const struct nvc0_hw_sm_query_cfg2130sm21_inst_issued2_1 =2131{2132.type = NVC0_HW_SM_QUERY_INST_ISSUED2_1,2133.ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000050),2134.num_counters = 1,2135.norm = { 1, 1 },2136};21372138static const struct nvc0_hw_sm_query_cfg2139sm21_th_inst_executed_0 =2140{2141.type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0,2142.ctr[0] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000000),2143.ctr[1] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000010),2144.ctr[2] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000020),2145.ctr[3] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000030),2146.ctr[4] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000040),2147.ctr[5] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000050),2148.num_counters = 6,2149.norm = { 1, 1 },2150};21512152static const struct nvc0_hw_sm_query_cfg2153sm21_th_inst_executed_1 =2154{2155.type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1,2156.ctr[0] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000000),2157.ctr[1] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000010),2158.ctr[2] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000020),2159.ctr[3] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000030),2160.ctr[4] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000040),2161.ctr[5] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000050),2162.num_counters = 6,2163.norm = { 1, 1 },2164};21652166static const struct nvc0_hw_sm_query_cfg2167sm21_th_inst_executed_2 =2168{2169.type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2,2170.ctr[0] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000000),2171.ctr[1] = _C(0xaaaa, LOGOP, 
0xa4, 0x000000ff, 0x00000010),2172.ctr[2] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000020),2173.ctr[3] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000030),2174.ctr[4] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000040),2175.ctr[5] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000050),2176.num_counters = 6,2177.norm = { 1, 1 },2178};21792180static const struct nvc0_hw_sm_query_cfg2181sm21_th_inst_executed_3 =2182{2183.type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3,2184.ctr[0] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000000),2185.ctr[1] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000010),2186.ctr[2] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000020),2187.ctr[3] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000030),2188.ctr[4] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000040),2189.ctr[5] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000050),2190.num_counters = 6,2191.norm = { 1, 1 },2192};21932194static const struct nvc0_hw_sm_query_cfg *sm21_hw_sm_queries[] =2195{2196&sm20_active_cycles,2197&sm20_active_warps,2198&sm20_atom_count,2199&sm20_branch,2200&sm20_divergent_branch,2201&sm20_gld_request,2202&sm20_gred_count,2203&sm20_gst_request,2204&sm21_inst_executed,2205&sm21_inst_issued1_0,2206&sm21_inst_issued1_1,2207&sm21_inst_issued2_0,2208&sm21_inst_issued2_1,2209&sm20_local_ld,2210&sm20_local_st,2211&sm20_prof_trigger_0,2212&sm20_prof_trigger_1,2213&sm20_prof_trigger_2,2214&sm20_prof_trigger_3,2215&sm20_prof_trigger_4,2216&sm20_prof_trigger_5,2217&sm20_prof_trigger_6,2218&sm20_prof_trigger_7,2219&sm20_shared_ld,2220&sm20_shared_st,2221&sm20_threads_launched,2222&sm21_th_inst_executed_0,2223&sm21_th_inst_executed_1,2224&sm21_th_inst_executed_2,2225&sm21_th_inst_executed_3,2226&sm20_warps_launched,2227};22282229#undef _C22302231static inline const struct nvc0_hw_sm_query_cfg **2232nvc0_hw_sm_get_queries(struct nvc0_screen *screen)2233{2234struct nouveau_device *dev = screen->base.device;22352236switch (screen->base.class_3d) {2237case GM200_3D_CLASS:2238return sm52_hw_sm_queries;2239case 
GM107_3D_CLASS:2240return sm50_hw_sm_queries;2241case NVF0_3D_CLASS:2242return sm35_hw_sm_queries;2243case NVE4_3D_CLASS:2244return sm30_hw_sm_queries;2245case NVC0_3D_CLASS:2246case NVC1_3D_CLASS:2247case NVC8_3D_CLASS:2248if (dev->chipset == 0xc0 || dev->chipset == 0xc8)2249return sm20_hw_sm_queries;2250return sm21_hw_sm_queries;2251}2252assert(0);2253return NULL;2254}22552256unsigned2257nvc0_hw_sm_get_num_queries(struct nvc0_screen *screen)2258{2259struct nouveau_device *dev = screen->base.device;22602261switch (screen->base.class_3d) {2262case GM200_3D_CLASS:2263return ARRAY_SIZE(sm52_hw_sm_queries);2264case GM107_3D_CLASS:2265return ARRAY_SIZE(sm50_hw_sm_queries);2266case NVF0_3D_CLASS:2267return ARRAY_SIZE(sm35_hw_sm_queries);2268case NVE4_3D_CLASS:2269return ARRAY_SIZE(sm30_hw_sm_queries);2270case NVC0_3D_CLASS:2271case NVC1_3D_CLASS:2272case NVC8_3D_CLASS:2273if (dev->chipset == 0xc0 || dev->chipset == 0xc8)2274return ARRAY_SIZE(sm20_hw_sm_queries);2275return ARRAY_SIZE(sm21_hw_sm_queries);2276}2277return 0;2278}22792280static const struct nvc0_hw_sm_query_cfg *2281nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)2282{2283const struct nvc0_hw_sm_query_cfg **queries;2284struct nvc0_screen *screen = nvc0->screen;2285struct nvc0_query *q = &hq->base;2286unsigned num_queries;2287unsigned i;22882289num_queries = nvc0_hw_sm_get_num_queries(screen);2290queries = nvc0_hw_sm_get_queries(screen);22912292for (i = 0; i < num_queries; i++) {2293if (NVC0_HW_SM_QUERY(queries[i]->type) == q->type)2294return queries[i];2295}2296assert(0);2297return NULL;2298}22992300static void2301nvc0_hw_sm_destroy_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)2302{2303struct nvc0_query *q = &hq->base;2304nvc0_hw_query_allocate(nvc0, q, 0);2305nouveau_fence_ref(NULL, &hq->fence);2306FREE(hq);2307}23082309static bool2310nve4_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)2311{2312struct nvc0_screen *screen = 
nvc0->screen;2313struct nouveau_pushbuf *push = nvc0->base.pushbuf;2314struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);2315const struct nvc0_hw_sm_query_cfg *cfg;2316unsigned i, c;2317unsigned num_ab[2] = { 0, 0 };23182319cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);23202321/* check if we have enough free counter slots */2322for (i = 0; i < cfg->num_counters; ++i)2323num_ab[cfg->ctr[i].sig_dom]++;23242325if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 ||2326screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) {2327NOUVEAU_ERR("Not enough free MP counter slots !\n");2328return false;2329}23302331assert(cfg->num_counters <= 4);2332PUSH_SPACE(push, 4 * 8 * + 6);23332334if (!screen->pm.mp_counters_enabled) {2335screen->pm.mp_counters_enabled = true;2336BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);2337PUSH_DATA (push, 0x1fcb);2338}23392340/* set sequence field to 0 (used to check if result is available) */2341for (i = 0; i < screen->mp_count; ++i)2342hq->data[i * 10 + 10] = 0;2343hq->sequence++;23442345for (i = 0; i < cfg->num_counters; ++i) {2346const unsigned d = cfg->ctr[i].sig_dom;23472348if (!screen->pm.num_hw_sm_active[d]) {2349uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));2350if (screen->pm.num_hw_sm_active[!d])2351m |= 1 << (7 + (8 * d));2352BEGIN_NVC0(push, SUBC_SW(0x0600), 1);2353PUSH_DATA (push, m);2354}2355screen->pm.num_hw_sm_active[d]++;23562357for (c = d * 4; c < (d * 4 + 4); ++c) {2358if (!screen->pm.mp_counter[c]) {2359hsq->ctr[i] = c;2360screen->pm.mp_counter[c] = hsq;2361break;2362}2363}2364assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */23652366/* configure and reset the counter(s) */2367if (d == 0)2368BEGIN_NVC0(push, NVE4_CP(MP_PM_A_SIGSEL(c & 3)), 1);2369else2370BEGIN_NVC0(push, NVE4_CP(MP_PM_B_SIGSEL(c & 3)), 1);2371PUSH_DATA (push, cfg->ctr[i].sig_sel);2372BEGIN_NVC0(push, NVE4_CP(MP_PM_SRCSEL(c)), 1);2373PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3));2374BEGIN_NVC0(push, NVE4_CP(MP_PM_FUNC(c)), 1);2375PUSH_DATA (push, 
(cfg->ctr[i].func << 4) | cfg->ctr[i].mode);2376BEGIN_NVC0(push, NVE4_CP(MP_PM_SET(c)), 1);2377PUSH_DATA (push, 0);2378}23792380if (screen->base.class_3d >= GM107_3D_CLASS) {2381/* Enable mask for counters, it's 8-bits value where 0:3 is for domain A2382* and 4:7 for domain B. For example, the mask for active_warps should be2383* 0x70 because it uses 3 counters in domain B. However, let's always2384* enable all counters because we don't want to track which ones is2385* enabled or not, and this allows to monitor multiple queries at the2386* same time. */2387BEGIN_NVC0(push, SUBC_CP(0x33e0), 1);2388PUSH_DATA (push, 0xff);2389}23902391return true;2392}23932394static bool2395nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)2396{2397struct nvc0_screen *screen = nvc0->screen;2398struct nouveau_pushbuf *push = nvc0->base.pushbuf;2399struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);2400const struct nvc0_hw_sm_query_cfg *cfg;2401unsigned i, c;24022403if (screen->base.class_3d >= NVE4_3D_CLASS)2404return nve4_hw_sm_begin_query(nvc0, hq);24052406cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);24072408/* check if we have enough free counter slots */2409if (screen->pm.num_hw_sm_active[0] + cfg->num_counters > 8) {2410NOUVEAU_ERR("Not enough free MP counter slots !\n");2411return false;2412}24132414assert(cfg->num_counters <= 8);2415PUSH_SPACE(push, 8 * 8 + 2);24162417/* set sequence field to 0 (used to check if result is available) */2418for (i = 0; i < screen->mp_count; ++i) {2419const unsigned b = (0x30 / 4) * i;2420hq->data[b + 8] = 0;2421}2422hq->sequence++;24232424for (i = 0; i < cfg->num_counters; ++i) {2425uint32_t mask_sel = 0x00000000;24262427if (!screen->pm.num_hw_sm_active[0]) {2428BEGIN_NVC0(push, SUBC_SW(0x0600), 1);2429PUSH_DATA (push, 0x80000000);2430}2431screen->pm.num_hw_sm_active[0]++;24322433for (c = 0; c < 8; ++c) {2434if (!screen->pm.mp_counter[c]) {2435hsq->ctr[i] = c;2436screen->pm.mp_counter[c] = hsq;2437break;2438}2439}24402441/* 
Oddly-enough, the signal id depends on the slot selected on Fermi but2442* not on Kepler. Fortunately, the signal ids are just offsetted by the2443* slot id! */2444mask_sel |= c;2445mask_sel |= (c << 8);2446mask_sel |= (c << 16);2447mask_sel |= (c << 24);2448mask_sel &= cfg->ctr[i].src_mask;24492450/* configure and reset the counter(s) */2451BEGIN_NVC0(push, NVC0_CP(MP_PM_SIGSEL(c)), 1);2452PUSH_DATA (push, cfg->ctr[i].sig_sel);2453BEGIN_NVC0(push, NVC0_CP(MP_PM_SRCSEL(c)), 1);2454PUSH_DATA (push, cfg->ctr[i].src_sel | mask_sel);2455BEGIN_NVC0(push, NVC0_CP(MP_PM_OP(c)), 1);2456PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);2457BEGIN_NVC0(push, NVC0_CP(MP_PM_SET(c)), 1);2458PUSH_DATA (push, 0);2459}2460return true;2461}24622463static inline struct nvc0_program *2464nvc0_hw_sm_get_program(struct nvc0_screen *screen)2465{2466struct nvc0_program *prog;24672468prog = CALLOC_STRUCT(nvc0_program);2469if (!prog)2470return NULL;24712472prog->type = PIPE_SHADER_COMPUTE;2473prog->translated = true;2474prog->parm_size = 12;24752476if (screen->base.class_3d >= GM107_3D_CLASS) {2477prog->code = (uint32_t *)gm107_read_hw_sm_counters_code;2478prog->code_size = sizeof(gm107_read_hw_sm_counters_code);2479prog->num_gprs = 14;2480} else2481if (screen->base.class_3d == NVE4_3D_CLASS ||2482screen->base.class_3d == NVF0_3D_CLASS) {2483if (screen->base.class_3d == NVE4_3D_CLASS) {2484prog->code = (uint32_t *)nve4_read_hw_sm_counters_code;2485prog->code_size = sizeof(nve4_read_hw_sm_counters_code);2486} else {2487prog->code = (uint32_t *)nvf0_read_hw_sm_counters_code;2488prog->code_size = sizeof(nvf0_read_hw_sm_counters_code);2489}2490prog->num_gprs = 14;2491} else {2492prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code;2493prog->code_size = sizeof(nvc0_read_hw_sm_counters_code);2494prog->num_gprs = 12;2495}2496return prog;2497}24982499static inline void2500nvc0_hw_sm_upload_input(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)2501{2502struct nouveau_pushbuf *push = 
nvc0->base.pushbuf;2503struct nvc0_screen *screen = nvc0->screen;2504uint64_t address;2505const int s = 5;25062507address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);25082509PUSH_SPACE(push, 11);25102511if (screen->base.class_3d >= NVE4_3D_CLASS) {2512BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);2513PUSH_DATAh(push, address + NVC0_CB_AUX_MP_INFO);2514PUSH_DATA (push, address + NVC0_CB_AUX_MP_INFO);2515BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);2516PUSH_DATA (push, 3 * 4);2517PUSH_DATA (push, 0x1);2518BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 3);2519PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));2520} else {2521BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3);2522PUSH_DATA (push, NVC0_CB_AUX_SIZE);2523PUSH_DATAh(push, address);2524PUSH_DATA (push, address);2525BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + 3);2526PUSH_DATA (push, NVC0_CB_AUX_MP_INFO);2527}2528PUSH_DATA (push, (hq->bo->offset + hq->base_offset));2529PUSH_DATAh(push, (hq->bo->offset + hq->base_offset));2530PUSH_DATA (push, hq->sequence);2531}25322533static void2534nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)2535{2536struct nvc0_screen *screen = nvc0->screen;2537struct pipe_context *pipe = &nvc0->base.pipe;2538struct nouveau_pushbuf *push = nvc0->base.pushbuf;2539const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;2540struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);2541struct nvc0_program *old = nvc0->compprog;2542struct pipe_grid_info info = {};2543uint32_t mask;2544uint32_t input[3];2545const uint block[3] = { 32, is_nve4 ? 
4 : 1, 1 };2546const uint grid[3] = { screen->mp_count, screen->gpc_count, 1 };2547unsigned c, i;25482549if (unlikely(!screen->pm.prog))2550screen->pm.prog = nvc0_hw_sm_get_program(screen);25512552/* disable all counting */2553PUSH_SPACE(push, 8);2554for (c = 0; c < 8; ++c)2555if (screen->pm.mp_counter[c]) {2556if (is_nve4) {2557IMMED_NVC0(push, NVE4_CP(MP_PM_FUNC(c)), 0);2558} else {2559IMMED_NVC0(push, NVC0_CP(MP_PM_OP(c)), 0);2560}2561}2562/* release counters for this query */2563for (c = 0; c < 8; ++c) {2564if (screen->pm.mp_counter[c] == hsq) {2565uint8_t d = is_nve4 ? c / 4 : 0; /* only one domain for NVC0:NVE4 */2566screen->pm.num_hw_sm_active[d]--;2567screen->pm.mp_counter[c] = NULL;2568}2569}25702571if (screen->base.class_3d >= GM107_3D_CLASS)2572IMMED_NVC0(push, SUBC_CP(0x33e0), 0);25732574BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,2575hq->bo);25762577PUSH_SPACE(push, 1);2578IMMED_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 0);25792580/* upload input data for the compute shader which reads MP counters */2581nvc0_hw_sm_upload_input(nvc0, hq);25822583pipe->bind_compute_state(pipe, screen->pm.prog);2584for (i = 0; i < 3; i++) {2585info.block[i] = block[i];2586info.grid[i] = grid[i];2587}2588info.pc = 0;2589info.input = input;2590pipe->launch_grid(pipe, &info);2591pipe->bind_compute_state(pipe, old);25922593nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY);25942595/* re-activate other counters */2596PUSH_SPACE(push, 16);2597mask = 0;2598for (c = 0; c < 8; ++c) {2599const struct nvc0_hw_sm_query_cfg *cfg;2600unsigned i;26012602hsq = screen->pm.mp_counter[c];2603if (!hsq)2604continue;26052606cfg = nvc0_hw_sm_query_get_cfg(nvc0, &hsq->base);2607for (i = 0; i < cfg->num_counters; ++i) {2608if (mask & (1 << hsq->ctr[i]))2609break;2610mask |= 1 << hsq->ctr[i];2611if (is_nve4) {2612BEGIN_NVC0(push, NVE4_CP(MP_PM_FUNC(hsq->ctr[i])), 1);2613} else {2614BEGIN_NVC0(push, NVC0_CP(MP_PM_OP(hsq->ctr[i])), 1);2615}2616PUSH_DATA (push, 
(cfg->ctr[i].func << 4) | cfg->ctr[i].mode);2617}2618}2619}26202621static inline bool2622nvc0_hw_sm_query_read_data(uint32_t count[32][8],2623struct nvc0_context *nvc0, bool wait,2624struct nvc0_hw_query *hq,2625const struct nvc0_hw_sm_query_cfg *cfg,2626unsigned mp_count)2627{2628struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);2629unsigned p, c;26302631for (p = 0; p < mp_count; ++p) {2632const unsigned b = (0x30 / 4) * p;26332634for (c = 0; c < cfg->num_counters; ++c) {2635if (hq->data[b + 8] != hq->sequence) {2636if (!wait)2637return false;2638if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))2639return false;2640}2641count[p][c] = hq->data[b + hsq->ctr[c]] * (1 << c);2642}2643}2644return true;2645}26462647static inline bool2648nve4_hw_sm_query_read_data(uint32_t count[32][8],2649struct nvc0_context *nvc0, bool wait,2650struct nvc0_hw_query *hq,2651const struct nvc0_hw_sm_query_cfg *cfg,2652unsigned mp_count)2653{2654struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);2655unsigned p, c, d;26562657for (p = 0; p < mp_count; ++p) {2658const unsigned b = (0x60 / 4) * p;26592660for (c = 0; c < cfg->num_counters; ++c) {2661count[p][c] = 0;2662for (d = 0; d < ((hsq->ctr[c] & ~3) ? 
1 : 4); ++d) {2663if (hq->data[b + 20 + d] != hq->sequence) {2664if (!wait)2665return false;2666if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))2667return false;2668}2669if (hsq->ctr[c] & ~0x3)2670count[p][c] = hq->data[b + 16 + (hsq->ctr[c] & 3)];2671else2672count[p][c] += hq->data[b + d * 4 + hsq->ctr[c]];2673}2674}2675}2676return true;2677}26782679static bool2680nvc0_hw_sm_get_query_result(struct nvc0_context *nvc0, struct nvc0_hw_query *hq,2681bool wait, union pipe_query_result *result)2682{2683uint32_t count[32][8];2684uint64_t value = 0;2685unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32);2686unsigned p, c;2687const struct nvc0_hw_sm_query_cfg *cfg;2688bool ret;26892690cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);26912692if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)2693ret = nve4_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);2694else2695ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);2696if (!ret)2697return false;26982699for (c = 0; c < cfg->num_counters; ++c)2700for (p = 0; p < mp_count; ++p)2701value += count[p][c];2702value = (value * cfg->norm[0]) / cfg->norm[1];27032704*(uint64_t *)result = value;2705return true;2706}27072708static const struct nvc0_hw_query_funcs hw_sm_query_funcs = {2709.destroy_query = nvc0_hw_sm_destroy_query,2710.begin_query = nvc0_hw_sm_begin_query,2711.end_query = nvc0_hw_sm_end_query,2712.get_query_result = nvc0_hw_sm_get_query_result,2713};27142715struct nvc0_hw_query *2716nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type)2717{2718struct nvc0_screen *screen = nvc0->screen;2719struct nvc0_hw_sm_query *hsq;2720struct nvc0_hw_query *hq;2721unsigned space;27222723if (nvc0->screen->base.drm->version < 0x01000101)2724return NULL;27252726if (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST)2727return NULL;27282729hsq = CALLOC_STRUCT(nvc0_hw_sm_query);2730if (!hsq)2731return NULL;27322733hq = &hsq->base;2734hq->funcs = 
&hw_sm_query_funcs;2735hq->base.type = type;27362737if (screen->base.class_3d >= NVE4_3D_CLASS) {2738/* for each MP:2739* [00] = WS0.C02740* [04] = WS0.C12741* [08] = WS0.C22742* [0c] = WS0.C32743* [10] = WS1.C02744* [14] = WS1.C12745* [18] = WS1.C22746* [1c] = WS1.C32747* [20] = WS2.C02748* [24] = WS2.C12749* [28] = WS2.C22750* [2c] = WS2.C32751* [30] = WS3.C02752* [34] = WS3.C12753* [38] = WS3.C22754* [3c] = WS3.C32755* [40] = MP.C42756* [44] = MP.C52757* [48] = MP.C62758* [4c] = MP.C72759* [50] = WS0.sequence2760* [54] = WS1.sequence2761* [58] = WS2.sequence2762* [5c] = WS3.sequence2763*/2764space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t);2765} else {2766/*2767* Note that padding is used to align memory access to 128 bits.2768*2769* for each MP:2770* [00] = MP.C02771* [04] = MP.C12772* [08] = MP.C22773* [0c] = MP.C32774* [10] = MP.C42775* [14] = MP.C52776* [18] = MP.C62777* [1c] = MP.C72778* [20] = MP.sequence2779* [24] = padding2780* [28] = padding2781* [2c] = padding2782*/2783space = (8 + 1 + 3) * nvc0->screen->mp_count * sizeof(uint32_t);2784}27852786if (!nvc0_hw_query_allocate(nvc0, &hq->base, space)) {2787FREE(hq);2788return NULL;2789}27902791return hq;2792}27932794int2795nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id,2796struct pipe_driver_query_info *info)2797{2798int count = 0;27992800if (screen->base.drm->version >= 0x01000101) {2801if (screen->compute)2802count = nvc0_hw_sm_get_num_queries(screen);2803}28042805if (!info)2806return count;28072808if (id < count) {2809if (screen->compute) {2810if (screen->base.class_3d <= GM200_3D_CLASS) {2811const struct nvc0_hw_sm_query_cfg **queries =2812nvc0_hw_sm_get_queries(screen);28132814info->name = nvc0_hw_sm_query_get_name(queries[id]->type);2815info->query_type = NVC0_HW_SM_QUERY(queries[id]->type);2816info->group_id = NVC0_HW_SM_QUERY_GROUP;2817return 1;2818}2819}2820}2821return 0;2822}282328242825