/* Path: blob/21.2-virgl/src/gallium/drivers/freedreno/freedreno_autotune.h */
/*1* Copyright © 2021 Google, Inc.2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,19* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE20* SOFTWARE.21*/2223#ifndef FREEDRENO_AUTOTUNE_H24#define FREEDRENO_AUTOTUNE_H2526#include "util/hash_table.h"27#include "util/list.h"2829#include "freedreno_util.h"3031struct fd_autotune_results;3233/**34* "autotune" our decisions about bypass vs GMEM rendering, based on historical35* data about a given render target.36*37* In deciding which path to take there are tradeoffs, including some that38* are not reasonably estimateable without having some additional information:39*40* (1) If you know you are touching every pixel (ie. 
there is a glClear()),41* then the GMEM path will at least not cost more memory bandwidth than42* sysmem[1]43*44* (2) If there is no clear, GMEM could potentially cost *more* bandwidth45* due to sysmem->GMEM restore pass.46*47* (3) If you see a high draw count, that is an indication that there will be48* enough pixels accessed multiple times to benefit from the reduced49* memory bandwidth that GMEM brings50*51* (4) But high draw count where there is not much overdraw can actually be52* faster in bypass mode if it is pushing a lot of state change, due to53* not having to go thru the state changes per-tile[2]54*55* The approach taken is to measure the samples-passed for the batch to estimate56* the amount of overdraw to detect cases where the number of pixels touched is57* low.58*59* Note however, that (at least since a5xx) we have PERF_RB_{Z,C}_{READ,WRITE}60* performance countables, which give a more direct measurement of what we want61* to know (ie. is framebuffer memory access high enough to prefer GMEM), but62* with the downside of consuming half of the available RB counters. With the63* additional complication that external perfcntr collection (fdperf, perfetto)64* and the drive could be stomping on each other's feet. 
(Also reading the65* perfcntrs accurately requires a WFI.)66*67* [1] ignoring UBWC68* [2] ignoring early-tile-exit optimizations, but any draw that touches all/69* most of the tiles late in the tile-pass can defeat that70*/71struct fd_autotune {7273/**74* Cache to map batch->key (also used for batch-cache) to historical75* information about rendering to that particular render target.76*/77struct hash_table *ht;7879/**80* List of recently used historical results (to age out old results)81*/82struct list_head lru;8384/**85* GPU buffer used to communicate back results to the CPU86*/87struct fd_bo *results_mem;88struct fd_autotune_results *results;8990/**91* List of per-batch results that we are waiting for the GPU to finish92* with before reading back the results.93*/94struct list_head pending_results;9596uint32_t fence_counter;97uint32_t idx_counter;98};99100/**101* The layout of the memory used to read back per-batch results from the102* GPU103*104* Note this struct is intentionally aligned to 4k. And hw requires the105* sample start/stop locations to be 128b aligned.106*/107struct fd_autotune_results {108109/**110* The GPU writes back a "fence" seqno value from the cmdstream after111* it finishes writing it's result slot, so that the CPU knows when112* results are valid113*/114uint32_t fence;115116uint32_t __pad0;117uint64_t __pad1;118119/**120* From the cmdstream, the captured samples-passed values are recorded121* at the start and end of the batch.122*123* Note that we do the math on the CPU to avoid a WFI. But pre-emption124* may force us to revisit that.125*/126struct {127uint64_t samples_start;128uint64_t __pad0;129uint64_t samples_end;130uint64_t __pad1;131} result[127];132};133134#define offset(base, ptr) ((uint8_t *)(ptr) - (uint8_t *)(base))135#define results_ptr(at, member) \136(at)->results_mem, offset((at)->results, &(at)->results->member), 0, 0137138struct fd_batch_history;139140/**141* Tracks the results from an individual batch. 
Initially created per batch,142* and appended to the tail of at->pending_results. At a later time, when143* the GPU has finished writing the results,144*145* ralloc parent is the associated fd_batch_history146*/147struct fd_batch_result {148149/**150* The index/slot in fd_autotune_results::result[] to write start/end151* counter to152*/153unsigned idx;154155/**156* Fence value to write back to fd_autotune_results::fence after both157* start/end values written158*/159uint32_t fence;160161/*162* Below here, only used internally within autotune163*/164struct fd_batch_history *history;165struct list_head node;166uint32_t cost;167uint64_t samples_passed;168};169170void fd_autotune_init(struct fd_autotune *at, struct fd_device *dev);171void fd_autotune_fini(struct fd_autotune *at);172173struct fd_batch;174bool fd_autotune_use_bypass(struct fd_autotune *at,175struct fd_batch *batch) assert_dt;176177#endif /* FREEDRENO_AUTOTUNE_H */178179180