/* Path: blob/21.2-virgl/src/intel/perf/intel_perf.h (4547 views) */
/*1* Copyright © 2018 Intel Corporation2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING19* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS20* IN THE SOFTWARE.21*/2223#ifndef INTEL_PERF_H24#define INTEL_PERF_H2526#include <stdio.h>27#include <stdbool.h>28#include <stdint.h>29#include <string.h>3031#if defined(MAJOR_IN_SYSMACROS)32#include <sys/sysmacros.h>33#elif defined(MAJOR_IN_MKDEV)34#include <sys/mkdev.h>35#endif3637#include "util/hash_table.h"38#include "compiler/glsl/list.h"39#include "util/ralloc.h"4041#include "drm-uapi/i915_drm.h"4243#ifdef __cplusplus44extern "C" {45#endif4647struct intel_device_info;4849struct intel_perf_config;50struct intel_perf_query_info;5152#define INTEL_PERF_INVALID_CTX_ID (0xffffffff)5354enum intel_perf_counter_type {55INTEL_PERF_COUNTER_TYPE_EVENT,56INTEL_PERF_COUNTER_TYPE_DURATION_NORM,57INTEL_PERF_COUNTER_TYPE_DURATION_RAW,58INTEL_PERF_COUNTER_TYPE_THROUGHPUT,59INTEL_PERF_COUNTER_TYPE_RAW,60INTEL_PERF_COUNTER_TYPE_TIMESTAMP,61};6263enum intel_perf_counter_data_type 
{64INTEL_PERF_COUNTER_DATA_TYPE_BOOL32,65INTEL_PERF_COUNTER_DATA_TYPE_UINT32,66INTEL_PERF_COUNTER_DATA_TYPE_UINT64,67INTEL_PERF_COUNTER_DATA_TYPE_FLOAT,68INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE,69};7071enum intel_perf_counter_units {72/* size */73INTEL_PERF_COUNTER_UNITS_BYTES,7475/* frequency */76INTEL_PERF_COUNTER_UNITS_HZ,7778/* time */79INTEL_PERF_COUNTER_UNITS_NS,80INTEL_PERF_COUNTER_UNITS_US,8182/**/83INTEL_PERF_COUNTER_UNITS_PIXELS,84INTEL_PERF_COUNTER_UNITS_TEXELS,85INTEL_PERF_COUNTER_UNITS_THREADS,86INTEL_PERF_COUNTER_UNITS_PERCENT,8788/* events */89INTEL_PERF_COUNTER_UNITS_MESSAGES,90INTEL_PERF_COUNTER_UNITS_NUMBER,91INTEL_PERF_COUNTER_UNITS_CYCLES,92INTEL_PERF_COUNTER_UNITS_EVENTS,93INTEL_PERF_COUNTER_UNITS_UTILIZATION,9495/**/96INTEL_PERF_COUNTER_UNITS_EU_SENDS_TO_L3_CACHE_LINES,97INTEL_PERF_COUNTER_UNITS_EU_ATOMIC_REQUESTS_TO_L3_CACHE_LINES,98INTEL_PERF_COUNTER_UNITS_EU_REQUESTS_TO_L3_CACHE_LINES,99INTEL_PERF_COUNTER_UNITS_EU_BYTES_PER_L3_CACHE_LINE,100101INTEL_PERF_COUNTER_UNITS_MAX102};103104struct intel_pipeline_stat {105uint32_t reg;106uint32_t numerator;107uint32_t denominator;108};109110/*111* The largest OA formats we can use include:112* For Haswell:113* 1 timestamp, 45 A counters, 8 B counters and 8 C counters.114* For Gfx8+115* 1 timestamp, 1 clock, 36 A counters, 8 B counters and 8 C counters116*117* Plus 2 PERF_CNT registers and 1 RPSTAT register.118*/119#define MAX_OA_REPORT_COUNTERS (62 + 2 + 1)120121/*122* When currently allocate only one page for pipeline statistics queries. 
Here123* we derived the maximum number of counters for that amount.124*/125#define STATS_BO_SIZE 4096126#define STATS_BO_END_OFFSET_BYTES (STATS_BO_SIZE / 2)127#define MAX_STAT_COUNTERS (STATS_BO_END_OFFSET_BYTES / 8)128129#define I915_PERF_OA_SAMPLE_SIZE (8 + /* drm_i915_perf_record_header */ \130256) /* OA counter report */131132struct intel_perf_query_result {133/**134* Storage for the final accumulated OA counters.135*/136uint64_t accumulator[MAX_OA_REPORT_COUNTERS];137138/**139* Hw ID used by the context on which the query was running.140*/141uint32_t hw_id;142143/**144* Number of reports accumulated to produce the results.145*/146uint32_t reports_accumulated;147148/**149* Frequency in the slices of the GT at the begin and end of the150* query.151*/152uint64_t slice_frequency[2];153154/**155* Frequency in the unslice of the GT at the begin and end of the156* query.157*/158uint64_t unslice_frequency[2];159160/**161* Frequency of the whole GT at the begin and end of the query.162*/163uint64_t gt_frequency[2];164165/**166* Timestamp of the query.167*/168uint64_t begin_timestamp;169170/**171* Whether the query was interrupted by another workload (aka preemption).172*/173bool query_disjoint;174};175176struct intel_perf_query_counter {177const char *name;178const char *desc;179const char *symbol_name;180const char *category;181enum intel_perf_counter_type type;182enum intel_perf_counter_data_type data_type;183enum intel_perf_counter_units units;184uint64_t raw_max;185size_t offset;186187union {188uint64_t (*oa_counter_read_uint64)(struct intel_perf_config *perf,189const struct intel_perf_query_info *query,190const struct intel_perf_query_result *results);191float (*oa_counter_read_float)(struct intel_perf_config *perf,192const struct intel_perf_query_info *query,193const struct intel_perf_query_result *results);194struct intel_pipeline_stat pipeline_stat;195};196};197198struct intel_perf_query_register_prog {199uint32_t reg;200uint32_t val;201};202203/* Register 
programming for a given query */204struct intel_perf_registers {205const struct intel_perf_query_register_prog *flex_regs;206uint32_t n_flex_regs;207208const struct intel_perf_query_register_prog *mux_regs;209uint32_t n_mux_regs;210211const struct intel_perf_query_register_prog *b_counter_regs;212uint32_t n_b_counter_regs;213};214215struct intel_perf_query_info {216struct intel_perf_config *perf;217218enum intel_perf_query_type {219INTEL_PERF_QUERY_TYPE_OA,220INTEL_PERF_QUERY_TYPE_RAW,221INTEL_PERF_QUERY_TYPE_PIPELINE,222} kind;223const char *name;224const char *symbol_name;225const char *guid;226struct intel_perf_query_counter *counters;227int n_counters;228int max_counters;229size_t data_size;230231/* OA specific */232uint64_t oa_metrics_set_id;233int oa_format;234235/* For indexing into the accumulator[] ... */236int gpu_time_offset;237int gpu_clock_offset;238int a_offset;239int b_offset;240int c_offset;241int perfcnt_offset;242int rpstat_offset;243244struct intel_perf_registers config;245};246247/* When not using the MI_RPC command, this structure describes the list of248* register offsets as well as their storage location so that they can be249* stored through a series of MI_SRM commands and accumulated with250* intel_perf_query_result_accumulate_snapshots().251*/252struct intel_perf_query_field_layout {253/* Alignment for the layout */254uint32_t alignment;255256/* Size of the whole layout */257uint32_t size;258259uint32_t n_fields;260261struct intel_perf_query_field {262/* MMIO location of this register */263uint16_t mmio_offset;264265/* Location of this register in the storage */266uint16_t location;267268/* Type of register, for accumulation (see intel_perf_query_info:*_offset269* fields)270*/271enum intel_perf_query_field_type {272INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC,273INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT,274INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT,275INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B,276INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C,277} type;278279/* Index of 
register in the given type (for instance A31 or B2,280* etc...)281*/282uint8_t index;283284/* 4, 8 or 256 */285uint16_t size;286287/* If not 0, mask to apply to the register value. */288uint64_t mask;289} *fields;290};291292struct intel_perf_query_counter_info {293struct intel_perf_query_counter *counter;294295uint64_t query_mask;296297/**298* Each counter can be a part of many groups, each time at different index.299* This struct stores one of those locations.300*/301struct {302int group_idx; /* query/group number */303int counter_idx; /* index inside of query/group */304} location;305};306307struct intel_perf_config {308/* Whether i915 has DRM_I915_QUERY_PERF_CONFIG support. */309bool i915_query_supported;310311/* Version of the i915-perf subsystem, refer to i915_drm.h. */312int i915_perf_version;313314/* Powergating configuration for the running the query. */315struct drm_i915_gem_context_param_sseu sseu;316317struct intel_perf_query_info *queries;318int n_queries;319320struct intel_perf_query_counter_info *counter_infos;321int n_counters;322323struct intel_perf_query_field_layout query_layout;324325/* Variables referenced in the XML meta data for OA performance326* counters, e.g in the normalization equations.327*328* All uint64_t for consistent operand types in generated code329*/330struct {331uint64_t timestamp_frequency; /** $GpuTimestampFrequency */332uint64_t n_eus; /** $EuCoresTotalCount */333uint64_t n_eu_slices; /** $EuSlicesTotalCount */334uint64_t n_eu_sub_slices; /** $EuSubslicesTotalCount */335uint64_t eu_threads_count; /** $EuThreadsCount */336uint64_t slice_mask; /** $SliceMask */337uint64_t subslice_mask; /** $SubsliceMask */338uint64_t gt_min_freq; /** $GpuMinFrequency */339uint64_t gt_max_freq; /** $GpuMaxFrequency */340uint64_t revision; /** $SkuRevisionId */341bool query_mode; /** $QueryMode */342} sys_vars;343344/* OA metric sets, indexed by GUID, as know by Mesa at build time, to345* cross-reference with the GUIDs of configs advertised by 
the kernel at346* runtime347*/348struct hash_table *oa_metrics_table;349350/* When MDAPI hasn't configured the metric we need to use by the time the351* query begins, this OA metric is used as a fallback.352*/353uint64_t fallback_raw_oa_metric;354355/* Whether we have support for this platform. If true && n_queries == 0,356* this means we will not be able to use i915-perf because of it is in357* paranoid mode.358*/359bool platform_supported;360361/* Location of the device's sysfs entry. */362char sysfs_dev_dir[256];363364struct {365void *(*bo_alloc)(void *bufmgr, const char *name, uint64_t size);366void (*bo_unreference)(void *bo);367void *(*bo_map)(void *ctx, void *bo, unsigned flags);368void (*bo_unmap)(void *bo);369bool (*batch_references)(void *batch, void *bo);370void (*bo_wait_rendering)(void *bo);371int (*bo_busy)(void *bo);372void (*emit_stall_at_pixel_scoreboard)(void *ctx);373void (*emit_mi_report_perf_count)(void *ctx,374void *bo,375uint32_t offset_in_bytes,376uint32_t report_id);377void (*batchbuffer_flush)(void *ctx,378const char *file, int line);379void (*store_register_mem)(void *ctx, void *bo, uint32_t reg, uint32_t reg_size, uint32_t offset);380381} vtbl;382};383384struct intel_perf_counter_pass {385struct intel_perf_query_info *query;386struct intel_perf_query_counter *counter;387uint32_t pass;388};389390/** Initialize the intel_perf_config object for a given device.391*392* include_pipeline_statistics : Whether to add a pipeline statistic query393* intel_perf_query_info object394*395* use_register_snapshots : Whether the queries should include counters396* that rely on register snapshots using command397* streamer instructions (not possible when using398* only the OA buffer data).399*/400void intel_perf_init_metrics(struct intel_perf_config *perf_cfg,401const struct intel_device_info *devinfo,402int drm_fd,403bool include_pipeline_statistics,404bool use_register_snapshots);405406/** Query i915 for a metric id using guid.407*/408bool 
intel_perf_load_metric_id(struct intel_perf_config *perf_cfg,409const char *guid,410uint64_t *metric_id);411412/** Load a configuation's content from i915 using a guid.413*/414struct intel_perf_registers *intel_perf_load_configuration(struct intel_perf_config *perf_cfg,415int fd, const char *guid);416417/** Store a configuration into i915 using guid and return a new metric id.418*419* If guid is NULL, then a generated one will be provided by hashing the420* content of the configuration.421*/422uint64_t intel_perf_store_configuration(struct intel_perf_config *perf_cfg, int fd,423const struct intel_perf_registers *config,424const char *guid);425426/** Read the slice/unslice frequency from 2 OA reports and store then into427* result.428*/429void intel_perf_query_result_read_frequencies(struct intel_perf_query_result *result,430const struct intel_device_info *devinfo,431const uint32_t *start,432const uint32_t *end);433434/** Store the GT frequency as reported by the RPSTAT register.435*/436void intel_perf_query_result_read_gt_frequency(struct intel_perf_query_result *result,437const struct intel_device_info *devinfo,438const uint32_t start,439const uint32_t end);440441/** Store PERFCNT registers values.442*/443void intel_perf_query_result_read_perfcnts(struct intel_perf_query_result *result,444const struct intel_perf_query_info *query,445const uint64_t *start,446const uint64_t *end);447448/** Accumulate the delta between 2 OA reports into result for a given query.449*/450void intel_perf_query_result_accumulate(struct intel_perf_query_result *result,451const struct intel_perf_query_info *query,452const struct intel_device_info *devinfo,453const uint32_t *start,454const uint32_t *end);455456/** Accumulate the delta between 2 snapshots of OA perf registers (layout457* should match description specified through intel_perf_query_register_layout).458*/459void intel_perf_query_result_accumulate_fields(struct intel_perf_query_result *result,460const struct 
intel_perf_query_info *query,461const struct intel_device_info *devinfo,462const void *start,463const void *end,464bool no_oa_accumulate);465466void intel_perf_query_result_clear(struct intel_perf_query_result *result);467468/** Debug helper printing out query data.469*/470void intel_perf_query_result_print_fields(const struct intel_perf_query_info *query,471const struct intel_device_info *devinfo,472const void *data);473474static inline size_t475intel_perf_query_counter_get_size(const struct intel_perf_query_counter *counter)476{477switch (counter->data_type) {478case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32:479return sizeof(uint32_t);480case INTEL_PERF_COUNTER_DATA_TYPE_UINT32:481return sizeof(uint32_t);482case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:483return sizeof(uint64_t);484case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:485return sizeof(float);486case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE:487return sizeof(double);488default:489unreachable("invalid counter data type");490}491}492493static inline struct intel_perf_config *494intel_perf_new(void *ctx)495{496struct intel_perf_config *perf = rzalloc(ctx, struct intel_perf_config);497return perf;498}499500/** Whether we have the ability to hold off preemption on a batch so we don't501* have to look at the OA buffer to subtract unrelated workloads off the502* values captured through MI_* commands.503*/504static inline bool505intel_perf_has_hold_preemption(const struct intel_perf_config *perf)506{507return perf->i915_perf_version >= 3;508}509510/** Whether we have the ability to lock EU array power configuration for the511* duration of the performance recording. 
This is useful on Gfx11 where the HW512* architecture requires half the EU for particular workloads.513*/514static inline bool515intel_perf_has_global_sseu(const struct intel_perf_config *perf)516{517return perf->i915_perf_version >= 4;518}519520uint32_t intel_perf_get_n_passes(struct intel_perf_config *perf,521const uint32_t *counter_indices,522uint32_t counter_indices_count,523struct intel_perf_query_info **pass_queries);524void intel_perf_get_counters_passes(struct intel_perf_config *perf,525const uint32_t *counter_indices,526uint32_t counter_indices_count,527struct intel_perf_counter_pass *counter_pass);528529#ifdef __cplusplus530} // extern "C"531#endif532533#endif /* INTEL_PERF_H */534535536