// Path: blob/21.2-virgl/src/intel/ds/intel_pps_driver.cc
// 4547 views
/*1* Copyright © 2020-2021 Collabora, Ltd.2* Author: Antonio Caggiano <[email protected]>3* Author: Corentin Noël <[email protected]>4*5* SPDX-License-Identifier: MIT6*/78#include "intel_pps_driver.h"910#include <dirent.h>11#include <fcntl.h>12#include <math.h>13#include <poll.h>14#include <strings.h>15#include <sys/ioctl.h>16#include <unistd.h>1718#include <i915_drm.h>19#include <intel/perf/intel_perf_query.h>2021#include <pps/pps.h>22#include <pps/pps_algorithm.h>2324#include "intel_pps_perf.h"2526namespace pps27{28uint64_t IntelDriver::get_min_sampling_period_ns()29{30return 500000;31}3233void IntelDriver::enable_counter(uint32_t counter_id)34{35auto &counter = counters[counter_id];36auto &group = groups[counter.group];37if (perf->query) {38if (perf->query->symbol_name != group.name) {39PPS_LOG_ERROR(40"Unable to enable metrics from different sets: %u "41"belongs to %s but %s is currently in use.",42counter_id,43perf->query->symbol_name,44group.name.c_str());45return;46}47}4849enabled_counters.emplace_back(counter);50if (!perf->query) {51perf->query = perf->find_query_by_name(group.name);52}53}5455void IntelDriver::enable_all_counters()56{57// We can only enable one metric set at a time so at least enable one.58for (auto &group : groups) {59if (group.name == "RenderBasic") {60for (uint32_t counter_id : group.counters) {61auto &counter = counters[counter_id];62enabled_counters.emplace_back(counter);63}6465perf->query = perf->find_query_by_name(group.name);66break;67}68}69}7071static uint64_t timespec_diff(timespec *begin, timespec *end)72{73return 1000000000ull * (end->tv_sec - begin->tv_sec) + end->tv_nsec - begin->tv_nsec;74}7576/// @brief This function tries to correlate CPU time with GPU time77std::optional<TimestampCorrelation> IntelDriver::query_correlation_timestamps() const78{79TimestampCorrelation corr = {};8081clock_t correlation_clock_id = CLOCK_BOOTTIME;8283drm_i915_reg_read reg_read = {};84const uint64_t render_ring_timestamp = 
0x2358;85reg_read.offset = render_ring_timestamp | I915_REG_READ_8B_WA;8687constexpr size_t attempt_count = 3;88struct {89timespec cpu_ts_begin;90timespec cpu_ts_end;91uint64_t gpu_ts;92} attempts[attempt_count] = {};9394uint32_t best = 0;9596// Gather 3 correlations97for (uint32_t i = 0; i < attempt_count; i++) {98clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_begin);99if (perf_ioctl(drm_device.fd, DRM_IOCTL_I915_REG_READ, ®_read) < 0) {100return std::nullopt;101}102clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_end);103104attempts[i].gpu_ts = reg_read.val;105}106107// Now select the best108for (uint32_t i = 1; i < attempt_count; i++) {109if (timespec_diff(&attempts[i].cpu_ts_begin, &attempts[i].cpu_ts_end) <110timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end)) {111best = i;112}113}114115corr.cpu_timestamp =116(attempts[best].cpu_ts_begin.tv_sec * 1000000000ull + attempts[best].cpu_ts_begin.tv_nsec) +117timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end) / 2;118corr.gpu_timestamp = attempts[best].gpu_ts;119120return corr;121}122123void IntelDriver::get_new_correlation()124{125// Rotate left correlations by one position so to make space at the end126std::rotate(correlations.begin(), correlations.begin() + 1, correlations.end());127128// Then we overwrite the last correlation with a new one129if (auto corr = query_correlation_timestamps()) {130correlations.back() = *corr;131} else {132PPS_LOG_FATAL("Failed to get correlation timestamps");133}134}135136bool IntelDriver::init_perfcnt()137{138assert(!perf && "Intel perf should not be initialized at this point");139140perf = std::make_unique<IntelPerf>(drm_device.fd);141142for (auto &query : perf->get_queries()) {143// Create group144CounterGroup group = {};145group.id = groups.size();146group.name = query->symbol_name;147148for (int i = 0; i < query->n_counters; ++i) {149intel_perf_query_counter &counter = query->counters[i];150151// Create counter152Counter 
counter_desc = {};153counter_desc.id = counters.size();154counter_desc.name = counter.symbol_name;155counter_desc.group = group.id;156counter_desc.getter = [counter, query, this](157const Counter &c, const Driver &dri) -> Counter::Value {158switch (counter.data_type) {159case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:160case INTEL_PERF_COUNTER_DATA_TYPE_UINT32:161case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32:162return (int64_t)counter.oa_counter_read_uint64(perf->cfg, query, &result);163break;164case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE:165case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:166return counter.oa_counter_read_float(perf->cfg, query, &result);167break;168}169170return {};171};172173// Add counter id to the group174group.counters.emplace_back(counter_desc.id);175176// Store counter177counters.emplace_back(std::move(counter_desc));178}179180// Store group181groups.emplace_back(std::move(group));182}183184assert(groups.size() && "Failed to query groups");185assert(counters.size() && "Failed to query counters");186187// Clear accumulations188intel_perf_query_result_clear(&result);189190return true;191}192193void IntelDriver::enable_perfcnt(uint64_t sampling_period_ns)194{195this->sampling_period_ns = sampling_period_ns;196197// Fill correlations with an initial one198if (auto corr = query_correlation_timestamps()) {199correlations.fill(*corr);200} else {201PPS_LOG_FATAL("Failed to get correlation timestamps");202}203204if (!perf->open(sampling_period_ns)) {205PPS_LOG_FATAL("Failed to open intel perf");206}207}208209/// @brief Transforms the GPU timestop into a CPU timestamp equivalent210uint64_t IntelDriver::correlate_gpu_timestamp(const uint32_t gpu_ts)211{212auto &corr_a = correlations[0];213auto &corr_b = correlations[correlations.size() - 1];214215// A correlation timestamp has 36 bits, so get the first 32 to make it work with gpu_ts216uint64_t mask = 0xffffffff;217uint32_t corr_a_gpu_ts = corr_a.gpu_timestamp & mask;218uint32_t corr_b_gpu_ts = corr_b.gpu_timestamp & 
mask;219220// Make sure it is within the interval [a,b)221assert(gpu_ts >= corr_a_gpu_ts && "GPU TS < Corr a");222assert(gpu_ts < corr_b_gpu_ts && "GPU TS >= Corr b");223224uint32_t gpu_delta = gpu_ts - corr_a_gpu_ts;225// Factor to convert gpu time to cpu time226double gpu_to_cpu = (corr_b.cpu_timestamp - corr_a.cpu_timestamp) /227double(corr_b.gpu_timestamp - corr_a.gpu_timestamp);228uint64_t cpu_delta = gpu_delta * gpu_to_cpu;229return corr_a.cpu_timestamp + cpu_delta;230}231232void IntelDriver::disable_perfcnt()233{234perf = nullptr;235groups.clear();236counters.clear();237enabled_counters.clear();238}239240struct Report {241uint32_t version;242uint32_t timestamp;243uint32_t id;244};245246/// @brief Some perf record durations can be really short247/// @return True if the duration is at least close to the sampling period248static bool close_enough(uint64_t duration, uint64_t sampling_period)249{250return duration > sampling_period - 100000;251}252253/// @brief Transforms the raw data received in from the driver into records254std::vector<PerfRecord> IntelDriver::parse_perf_records(const std::vector<uint8_t> &data,255const size_t byte_count)256{257std::vector<PerfRecord> records;258records.reserve(128);259260PerfRecord record;261record.reserve(512);262263const uint8_t *iter = data.data();264const uint8_t *end = iter + byte_count;265266uint64_t prev_cpu_timestamp = last_cpu_timestamp;267268while (iter < end) {269// Iterate a record at a time270auto header = reinterpret_cast<const drm_i915_perf_record_header *>(iter);271272if (header->type == DRM_I915_PERF_RECORD_SAMPLE) {273// Report is next to the header274auto report = reinterpret_cast<const Report *>(header + 1);275auto cpu_timestamp = correlate_gpu_timestamp(report->timestamp);276auto duration = cpu_timestamp - prev_cpu_timestamp;277278// Skip perf-records that are too short by checking279// the distance between last report and this one280if (close_enough(duration, sampling_period_ns)) {281prev_cpu_timestamp = 
cpu_timestamp;282283// Add the new record to the list284record.resize(header->size); // Possibly 264?285memcpy(record.data(), iter, header->size);286records.emplace_back(record);287}288}289290// Go to the next record291iter += header->size;292}293294return records;295}296297/// @brief Read all the available data from the metric set currently in use298void IntelDriver::read_data_from_metric_set()299{300assert(metric_buffer.size() >= 1024 && "Metric buffer should have space for reading");301302ssize_t bytes_read = 0;303while ((bytes_read = perf->read_oa_stream(metric_buffer.data() + total_bytes_read,304metric_buffer.size() - total_bytes_read)) > 0 ||305errno == EINTR) {306total_bytes_read += std::max(ssize_t(0), bytes_read);307308// Increase size of the buffer for the next read309if (metric_buffer.size() / 2 < total_bytes_read) {310metric_buffer.resize(metric_buffer.size() * 2);311}312}313314assert(total_bytes_read < metric_buffer.size() && "Buffer not big enough");315}316317bool IntelDriver::dump_perfcnt()318{319if (!perf->oa_stream_ready()) {320return false;321}322323read_data_from_metric_set();324325get_new_correlation();326327auto new_records = parse_perf_records(metric_buffer, total_bytes_read);328if (new_records.empty()) {329PPS_LOG("No new records");330// No new records from the GPU yet331return false;332} else {333PPS_LOG("Records parsed bytes: %lu", total_bytes_read);334// Records are parsed correctly, so we can reset the335// number of bytes read so far from the metric set336total_bytes_read = 0;337}338339APPEND(records, new_records);340341if (records.size() < 2) {342// Not enough records to accumulate343return false;344}345346return true;347}348349uint32_t IntelDriver::gpu_next()350{351if (records.size() < 2) {352// Not enough records to accumulate353return 0;354}355356// Get first and second357auto record_a = reinterpret_cast<const drm_i915_perf_record_header *>(records[0].data());358auto record_b = reinterpret_cast<const drm_i915_perf_record_header 
*>(records[1].data());359360intel_perf_query_result_accumulate_fields(&result,361&perf->query.value(),362&perf->devinfo,363record_a + 1,364record_b + 1,365false /* no_oa_accumulate */);366367// Get last timestamp368auto report_b = reinterpret_cast<const Report *>(record_b + 1);369auto gpu_timestamp = report_b->timestamp;370371// Consume first record372records.erase(std::begin(records), std::begin(records) + 1);373374return gpu_timestamp;375}376377uint64_t IntelDriver::cpu_next()378{379if (auto gpu_timestamp = gpu_next()) {380auto cpu_timestamp = correlate_gpu_timestamp(gpu_timestamp);381382last_cpu_timestamp = cpu_timestamp;383return cpu_timestamp;384}385386return 0;387}388389uint64_t IntelDriver::next()390{391// Reset accumulation392intel_perf_query_result_clear(&result);393return cpu_next();394}395396} // namespace pps397398399