/* Path: blob/21.2-virgl/src/gallium/drivers/freedreno/freedreno_autotune.h */
/*1* Copyright © 2021 Google, Inc.2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,19* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE20* SOFTWARE.21*/2223#ifndef FREEDRENO_AUTOTUNE_H24#define FREEDRENO_AUTOTUNE_H2526#include "util/hash_table.h"27#include "util/list.h"2829#include "freedreno_util.h"3031struct fd_autotune_results;3233/**34* "autotune" our decisions about bypass vs GMEM rendering, based on historical35* data about a given render target.36*37* In deciding which path to take there are tradeoffs, including some that38* are not reasonably estimateable without having some additional information:39*40* (1) If you know you are touching every pixel (ie. 
there is a glClear()),41* then the GMEM path will at least not cost more memory bandwidth than42* sysmem[1]43*44* (2) If there is no clear, GMEM could potentially cost *more* bandwidth45* due to sysmem->GMEM restore pass.46*47* (3) If you see a high draw count, that is an indication that there will be48* enough pixels accessed multiple times to benefit from the reduced49* memory bandwidth that GMEM brings50*51* (4) But high draw count where there is not much overdraw can actually be52* faster in bypass mode if it is pushing a lot of state change, due to53* not having to go thru the state changes per-tile[2]54*55* The approach taken is to measure the samples-passed for the batch to estimate56* the amount of overdraw to detect cases where the number of pixels touched is57* low.58*59* Note however, that (at least since a5xx) we have PERF_RB_{Z,C}_{READ,WRITE}60* performance countables, which give a more direct measurement of what we want61* to know (ie. is framebuffer memory access high enough to prefer GMEM), but62* with the downside of consuming half of the available RB counters. With the63* additional complication that external perfcntr collection (fdperf, perfetto)64* and the drive could be stomping on each other's feet. 
(Also reading the65* perfcntrs accurately requires a WFI.)66*67* [1] ignoring UBWC68* [2] ignoring early-tile-exit optimizations, but any draw that touches all/69* most of the tiles late in the tile-pass can defeat that70*/71struct fd_autotune {7273/**74* Cache to map batch->key (also used for batch-cache) to historical75* information about rendering to that particular render target.76*/77struct hash_table *ht;7879/**80* List of recently used historical results (to age out old results)81*/82struct list_head lru;8384/**85* GPU buffer used to communicate back results to the CPU86*/87struct fd_bo *results_mem;88struct fd_autotune_results *results;8990/**91* List of per-batch results that we are waiting for the GPU to finish92* with before reading back the results.93*/94struct list_head pending_results;9596uint32_t fence_counter;97uint32_t idx_counter;98};99100/**101* The layout of the memory used to read back per-batch results from the102* GPU103*104* Note this struct is intentionally aligned to 4k. And hw requires the105* sample start/stop locations to be 128b aligned.106*/107struct fd_autotune_results {108109/**110* The GPU writes back a "fence" seqno value from the cmdstream after111* it finishes writing it's result slot, so that the CPU knows when112* results are valid113*/114uint32_t fence;115116uint32_t __pad0;117uint64_t __pad1;118119/**120* From the cmdstream, the captured samples-passed values are recorded121* at the start and end of the batch.122*123* Note that we do the math on the CPU to avoid a WFI. But pre-emption124* may force us to revisit that.125*/126struct {127uint64_t samples_start;128uint64_t __pad0;129uint64_t samples_end;130uint64_t __pad1;131} result[127];132};133134#define offset(base, ptr) ((uint8_t *)(ptr) - (uint8_t *)(base))135#define results_ptr(at, member) \136(at)->results_mem, offset((at)->results, &(at)->results->member), 0, 0137138struct fd_batch_history;139140/**141* Tracks the results from an individual batch. 
Initially created per batch,142* and appended to the tail of at->pending_results. At a later time, when143* the GPU has finished writing the results,144*145* ralloc parent is the associated fd_batch_history146*/147struct fd_batch_result {148149/**150* The index/slot in fd_autotune_results::result[] to write start/end151* counter to152*/153unsigned idx;154155/**156* Fence value to write back to fd_autotune_results::fence after both157* start/end values written158*/159uint32_t fence;160161/*162* Below here, only used internally within autotune163*/164struct fd_batch_history *history;165struct list_head node;166uint32_t cost;167uint64_t samples_passed;168};169170void fd_autotune_init(struct fd_autotune *at, struct fd_device *dev);171void fd_autotune_fini(struct fd_autotune *at);172173struct fd_batch;174bool fd_autotune_use_bypass(struct fd_autotune *at,175struct fd_batch *batch) assert_dt;176177#endif /* FREEDRENO_AUTOTUNE_H */178179180