CoCalc -- pan

GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/panfrost/lib/pan_tiler.c
⁴⁵⁶⁰ views
1
/*
2
 * Copyright (C) 2019 Collabora, Ltd.
3
 *
4
 * Permission is hereby granted, free of charge, to any person obtaining a
5
 * copy of this software and associated documentation files (the "Software"),
6
 * to deal in the Software without restriction, including without limitation
7
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
 * and/or sell copies of the Software, and to permit persons to whom the
9
 * Software is furnished to do so, subject to the following conditions:
10
 *
11
 * The above copyright notice and this permission notice (including the next
12
 * paragraph) shall be included in all copies or substantial portions of the
13
 * Software.
14
 *
15
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
 * SOFTWARE.
22
 *
23
 * Authors:
24
 *   Alyssa Rosenzweig <[email protected]>
25
 */
26

27
#include "util/u_math.h"
28
#include "util/macros.h"
29
#include "pan_device.h"
30
#include "pan_encoder.h"
31
#include "panfrost-quirks.h"
32

33
/* Mali GPUs are tiled-mode renderers, rather than immediate-mode.
34
 * Conceptually, the screen is divided into 16x16 tiles. Vertex shaders run.
35
 * Then, a fixed-function hardware block (the tiler) consumes the gl_Position
36
 * results. For each triangle specified, it marks each containing tile as
37
 * containing that triangle. This set of "triangles per tile" form the "polygon
38
 * list". Finally, the rasterization unit consumes the polygon list to invoke
39
 * the fragment shader.
40
 *
41
 * In practice, it's a bit more complicated than this. On Midgard chips with an
42
 * "advanced tiling unit" (all except T720/T820/T830), 16x16 is the logical
43
 * tile size, but Midgard features "hierarchical tiling", where power-of-two
44
 * multiples of the base tile size can be used: hierarchy level 0 (16x16),
45
 * level 1 (32x32), level 2 (64x64), per public information about Midgard's
46
 * tiling. In fact, tiling goes up to 4096x4096 (!), although in practice
47
 * 128x128 is the largest usually used (though higher modes are enabled).  The
48
 * idea behind hierarchical tiling is to use low tiling levels for small
49
 * triangles and high levels for large triangles, to minimize memory bandwidth
50
 * and repeated fragment shader invocations (the former issue inherent to
51
 * immediate-mode rendering and the latter common in traditional tilers).
52
 *
53
 * The tiler itself works by reading varyings in and writing a polygon list
54
 * out. Unfortunately (for us), both of these buffers are managed in main
55
 * memory; although they ideally will be cached, it is the drivers'
56
 * responsibility to allocate these buffers. Varying buffer allocation is
57
 * handled elsewhere, as it is not tiler specific; the real issue is allocating
58
 * the polygon list.
59
 *
60
 * This is hard, because from the driver's perspective, we have no information
61
 * about what geometry will actually look like on screen; that information is
62
 * only gained from running the vertex shader. (Theoretically, we could run the
63
 * vertex shaders in software as a prepass, or in hardware with transform
64
 * feedback as a prepass, but either idea is ludicrous on so many levels).
65
 *
66
 * Instead, Mali uses a bit of a hybrid approach, splitting the polygon list
67
 * into three distinct pieces. First, the driver statically determines which
68
 * tile hierarchy levels to use (more on that later). At this point, we know the
69
 * framebuffer dimensions and all the possible tilings of the framebuffer, so
70
 * we know exactly how many tiles exist across all hierarchy levels. The first
71
 * piece of the polygon list is the header, which is exactly 8 bytes per tile,
72
 * plus padding and a small 64-byte prologue. (If that doesn't remind you of
73
 * AFBC, it should. See pan_afbc.c for some fun parallels). The next part is
74
 * the polygon list body, which seems to contain 512 bytes per tile, again
75
 * across every level of the hierarchy. These two parts form the polygon list
76
 * buffer. This buffer has a statically determinable size, approximately equal
77
 * to the # of tiles across all hierarchy levels * (8 bytes + 512 bytes), plus
78
 * alignment / minimum restrictions / etc.
79
 *
80
 * The third piece is the easy one (for us): the tiler heap. In essence, the
81
 * tiler heap is a gigantic slab that's as big as could possibly be necessary
82
 * in the worst case imaginable. Just... a gigantic allocation that we give a
83
 * start and end pointer to. What's the catch? The tiler heap is lazily
84
 * allocated; that is, a huge amount of memory is _reserved_, but only a tiny
85
 * bit is actually allocated upfront. The GPU just keeps using the
86
 * unallocated-but-reserved portions as it goes along, generating page faults
87
 * if it goes beyond the allocation, and then the kernel is instructed to
88
 * expand the allocation on page fault (known in the vendor kernel as growable
89
 * memory). This is quite a bit of bookkeeping of its own, but that task is
90
 * pushed to kernel space and we can mostly ignore it here, just remembering to
91
 * set the GROWABLE flag so the kernel actually uses this path rather than
92
 * allocating a gigantic amount up front and burning a hole in RAM.
93
 *
94
 * As far as determining which hierarchy levels to use, the simple answer is
95
 * that right now, we don't. In the tiler configuration fields (consistent from
96
 * the earliest Midgard's SFBD through the latest Bifrost traces we have),
97
 * there is a hierarchy_mask field, controlling which levels (tile sizes) are
98
 * enabled. Ideally, the hierarchical tiling dream -- mapping big polygons to
99
 * big tiles and small polygons to small tiles -- would be realized here as
100
 * well. As long as there are polygons at all needing tiling, we always have to
101
 * have big tiles available, in case there are big polygons. But we don't
102
 * necessarily need small tiles available. Ideally, when there are small
103
 * polygons, small tiles are enabled (to avoid waste from putting small
104
 * triangles in the big tiles); when there are not, small tiles are disabled to
105
 * avoid enabling more levels than necessary, which potentially costs in memory
106
 * bandwidth / power / tiler performance.
107
 *
108
 * Of course, the driver has to figure this out statically. When tile
109
 * hierarchies are actually established, this occurs by the tiler in
110
 * fixed-function hardware, after the vertex shaders have run and there is
111
 * sufficient information to figure out the size of triangles. The driver has
112
 * no such luxury, again barring insane hacks like additionally running the
113
 * vertex shaders in software or in hardware via transform feedback. Thus, for
114
 * the driver, we need a heuristic approach.
115
 *
116
 * There are lots of heuristics to guess triangle size statically you could
117
 * imagine, but one approach shines as particularly simple-stupid: assume all
118
 * on-screen triangles are equal size and spread equidistantly throughout the
119
 * screen. Let's be clear, this is NOT A VALID ASSUMPTION. But if we roll with
120
 * it, then we see:
121
 *
122
 *      Triangle Area   = (Screen Area / # of triangles)
123
 *                      = (Width * Height) / (# of triangles)
124
 *
125
 * Or if you prefer, we can also make a third CRAZY assumption that we only draw
126
 * right triangles with edges parallel/perpendicular to the sides of the screen
127
 * with no overdraw, forming a triangle grid across the screen:
128
 *
129
 * |--w--|
130
 *  _____   |
131
 * | /| /|  |
132
 * |/_|/_|  h
133
 * | /| /|  |
134
 * |/_|/_|  |
135
 *
136
 * Then you can use some middle school geometry and algebra to work out the
137
 * triangle dimensions. I started working on this, but realised I didn't need
138
 * to in order to make my point, but couldn't bear to erase that ASCII art. Anyway.
139
 *
140
 * POINT IS, by considering the ratio of screen area and triangle count, we can
141
 * estimate the triangle size. For a small size, use small bins; for a large
142
 * size, use large bins. Intuitively, this metric makes sense: when there are
143
 * few triangles on a large screen, you're probably compositing a UI and
144
 * therefore the triangles are large; when there are a lot of triangles on a
145
 * small screen, you're probably rendering a 3D mesh and therefore the
146
 * triangles are tiny. (Or better said -- there will be tiny triangles, even if
147
 * there are also large triangles. There have to be unless you expect crazy
148
 * overdraw. Generally, it's better to allow more small bin sizes than
149
 * necessary than not allow enough.)
150
 *
151
 * From this heuristic (or whatever), we determine the minimum allowable tile
152
 * size, and we use that to decide the hierarchy masking, selecting from the
153
 * minimum "ideal" tile size to the maximum tile size (2048x2048 in practice).
154
 *
155
 * Once we have that mask and the framebuffer dimensions, we can compute the
156
 * size of the statically-sized polygon list structures, allocate them, and go!
157
 *
158
 * -----
159
 *
160
 * On T720, T820, and T830, there is no support for hierarchical tiling.
161
 * Instead, the hardware allows the driver to select the tile size dynamically
162
 * on a per-framebuffer basis, including allowing rectangular/non-square tiles.
163
 * Rules for tile size selection are as follows:
164
 *
165
 *  - Dimensions must be powers-of-two.
166
 *  - The smallest tile is 16x16.
167
 *  - The tile width/height is at most the framebuffer w/h (clamp up to 16 pix)
168
 *  - There must be no more than 64 tiles in either dimension.
169
 *
170
 * Within these constraints, the driver is free to pick a tile size according
171
 * to some heuristic, similar to units with an advanced tiling unit.
172
 *
173
 * To pick a size without any heuristics, we may satisfy the constraints by
174
 * defaulting to 16x16 (a power-of-two). This fits the minimum. For the size
175
 * constraint, consider:
176
 *
177
 *      # of tiles < 64
178
 *      ceil (fb / tile) < 64
179
 *      (fb / tile) <= (64 - 1)
180
 *      tile >= fb / (64 - 1), e.g. tile = next_power_of_two(fb / (64 - 1))
181
 *
182
 * Hence we clamp up to align_pot(fb / (64 - 1)).
183
 
184
 * Extending to use a selection heuristic left for future work.
185
 *
186
 * Once the tile size (w, h) is chosen, we compute the hierarchy "mask":
187
 *
188
 *      hierarchy_mask = (log2(h / 16) << 6) | log2(w / 16)
189
 *
190
 * Of course with no hierarchical tiling, this is not a mask; it's just a field
191
 * specifying the tile size. But I digress.
192
 *
193
 * We also compute the polygon list sizes (with framebuffer size W, H) as:
194
 *
195
 *      full_size = 0x200 + 0x200 * ceil(W / w) * ceil(H / h)
196
 *      offset = 8 * ceil(W / w) * ceil(H / h)
197
 *
198
 * It further appears necessary to round down offset to the nearest 0x200.
199
 * Possibly we would also round down full_size to the nearest 0x200 but
200
 * full_size/0x200 = (1 + ceil(W / w) * ceil(H / h)) is an integer so there's
201
 * nothing to do.
202
 */
203

204
/* Hierarchical tiling spans from 16x16 to 4096x4096 tiles */
205

206
#define MIN_TILE_SIZE 16
207
#define MAX_TILE_SIZE 4096
208

209
/* Constants as shifts for easier power-of-two iteration */
210

211
#define MIN_TILE_SHIFT util_logbase2(MIN_TILE_SIZE)
212
#define MAX_TILE_SHIFT util_logbase2(MAX_TILE_SIZE)
213

214
/* The hierarchy has a 64-byte prologue */
215
#define PROLOGUE_SIZE 0x40
216

217
/* For each tile (across all hierarchy levels), there is 8 bytes of header */
218
#define HEADER_BYTES_PER_TILE 0x8
219

220
/* Likewise, each tile per level has 512 bytes of body */
221
#define FULL_BYTES_PER_TILE 0x200
222

223
/* Count the tiles needed to cover a width-x-height framebuffer with
 * tile_width-x-tile_height tiles, rounding up independently along each axis
 * so that partially covered edge tiles are included. For 16x16 tiles this
 * matches the usual Midgard tile count. This is deliberately not shared with
 * the AFBC/checksum code: those care about the stride (not just the overall
 * count) and only deal with a single fixed tile size.
 *
 * Both tile dimensions must be nonzero. Callers in this file only pass
 * powers of two, but the ceiling division below does not rely on that, so
 * other tile sizes are counted correctly as well. */

static unsigned
pan_tile_count(unsigned width, unsigned height, unsigned tile_width, unsigned tile_height)
{
        /* Quotient plus one for any remainder. Unlike (x + tile - 1) / tile,
         * this form cannot wrap around for very large dimensions */
        unsigned tile_count_x = (width / tile_width) + ((width % tile_width) ? 1 : 0);
        unsigned tile_count_y = (height / tile_height) + ((height % tile_height) ? 1 : 0);

        return tile_count_x * tile_count_y;
}
241

242
/* For `masked_count` of the smallest tile sizes masked out, computes how the
243
 * size of the polygon list header. We iterate the tile sizes (16x16 through
244
 * 2048x2048). For each tile size, we figure out how many tiles there are at
245
 * this hierarchy level and therefore many bytes this level is, leaving us with
246
 * a byte count for each level. We then just sum up the byte counts across the
247
 * levels to find a byte count for all levels. */
248

249
static unsigned
250
panfrost_hierarchy_size(
251
                unsigned width,
252
                unsigned height,
253
                unsigned mask,
254
                unsigned bytes_per_tile)
255
{
256
        unsigned size = PROLOGUE_SIZE;
257

258
        /* Iterate hierarchy levels */
259

260
        for (unsigned b = 0; b < (MAX_TILE_SHIFT - MIN_TILE_SHIFT); ++b) {
261
                /* Check if this level is enabled */
262
                if (!(mask & (1 << b)))
263
                        continue;
264

265
                /* Shift from a level to a tile size */
266
                unsigned tile_size = (1 << b) * MIN_TILE_SIZE;
267

268
                unsigned tile_count = pan_tile_count(width, height, tile_size, tile_size);
269
                unsigned level_count = bytes_per_tile * tile_count;
270

271
                size += level_count;
272
        }
273

274
        /* This size will be used as an offset, so ensure it's aligned */
275
        return ALIGN_POT(size, 0x200);
276
}
277

278
/* Size, in bytes, of one polygon list structure on GPUs without hierarchical
 * tiling, where `dim` is the packed tile-size field rather than a mask.
 * Computes
 *
 *      0x200 + round_down(bytes_per_tile * ceil(W / w) * ceil(H / h), 0x200)
 *
 * with (w, h) decoded from `dim`. Used for both the header and the body
 * size; in effect this sizes a single hierarchy level.
 */

static unsigned
panfrost_flat_size(unsigned width, unsigned height, unsigned dim, unsigned bytes_per_tile)
{
        /* The width exponent lives in bits 0-2, the height exponent in
         * bits 6-8 */
        unsigned exp_w = dim & 0x7;
        unsigned exp_h = (dim >> 6) & 0x7;

        /* NOTE(review): panfrost_choose_tile_size encodes log2(size / 16),
         * yet the base used here is 8, so the decoded tile is half the
         * chosen edge length and the result is larger than the documented
         * formula gives. Presumably a deliberate over-allocation -- confirm
         * before changing. */
        unsigned tw = 8 << exp_w;
        unsigned th = 8 << exp_h;

        /* Bytes needed for ceil(W/w) * ceil(H/h) tiles */
        unsigned raw = bytes_per_tile * pan_tile_count(width, height, tw, th);

        /* Drop the low nine bits to round down to 0x200, then add the
         * fixed 0x200 term */
        return 0x200 + (raw & ~0x1FFu);
}
301

302
/* Given a hierarchy mask and a framebuffer size, compute the header size */
303

304
unsigned
305
panfrost_tiler_header_size(unsigned width, unsigned height, unsigned mask, bool hierarchy)
306
{
307
        if (hierarchy)
308
                return panfrost_hierarchy_size(width, height, mask, HEADER_BYTES_PER_TILE);
309
        else
310
                return panfrost_flat_size(width, height, mask, HEADER_BYTES_PER_TILE);
311
}
312

313
/* The combined header/body is sized similarly (but it is significantly
314
 * larger), except that it can be empty when the tiler disabled, rather than
315
 * getting clamped to a minimum size.
316
 */
317

318
unsigned
319
panfrost_tiler_full_size(unsigned width, unsigned height, unsigned mask, bool hierarchy)
320
{
321
        if (hierarchy)
322
                return panfrost_hierarchy_size(width, height, mask, FULL_BYTES_PER_TILE);
323
        else
324
                return panfrost_flat_size(width, height, mask, FULL_BYTES_PER_TILE);
325
}
326

327
/* On GPUs without hierarchical tiling, we choose a tile size directly and
 * stuff it into the field otherwise known as hierarchy mask (not a mask).
 *
 * Returns log2(tile_w / 16) | (log2(tile_h / 16) << 6), where the tile
 * dimensions are the smallest powers of two that are at least 16 pixels and
 * keep the tile count in each direction at 63 or fewer. vertex_count is
 * accepted for a future selection heuristic and is currently unused. */

static unsigned
panfrost_choose_tile_size(
        unsigned width, unsigned height, unsigned vertex_count)
{
        /* Figure out the ideal tile size. Eventually a heuristic should be
         * used for this; for now start from the minimum */

        unsigned best_w = 16;
        unsigned best_h = 16;

        /* Clamp so there are less than 64 tiles in each direction. We need
         * ceil(fb / tile) <= 63, i.e. tile >= ceil(fb / 63). The division
         * must round up: with a truncating divide, a 1025 pixel wide
         * framebuffer would keep 16 pixel tiles and end up with 65 of
         * them. */

        unsigned min_w = (width / 63) + ((width % 63) ? 1 : 0);
        unsigned min_h = (height / 63) + ((height % 63) ? 1 : 0);

        best_w = MAX2(best_w, util_next_power_of_two(min_w));
        best_h = MAX2(best_h, util_next_power_of_two(min_h));

        /* We have our ideal tile size, so encode it as exponents relative
         * to the 16 pixel minimum */

        unsigned exp_w = util_logbase2(best_w / 16);
        unsigned exp_h = util_logbase2(best_h / 16);

        return exp_w | (exp_h << 6);
}
352

353
/* Pick the value for the tiler's hierarchy mask field. No heuristic is
 * implemented yet: with hierarchical tiling every level is enabled (0xFF),
 * which performs well but presumably costs memory bandwidth / power on
 * scenes that do not need the smaller levels. Without hierarchical tiling
 * the field instead carries the packed tile size. With no geometry at all,
 * nothing is enabled and 0 is returned. */

unsigned
panfrost_choose_hierarchy_mask(
        unsigned width, unsigned height,
        unsigned vertex_count, bool hierarchy)
{
        /* Nothing to tile unless there is geometry */
        unsigned mask = 0x00;

        if (vertex_count) {
                /* TODO: Proper tests for choosing fewer hierarchy levels */
                mask = hierarchy ?
                        0xFF :
                        panfrost_choose_tile_size(width, height, vertex_count);
        }

        return mask;
}
376

377
unsigned
378
panfrost_tiler_get_polygon_list_size(const struct panfrost_device *dev,
379
                                     unsigned fb_width, unsigned fb_height,
380
                                     bool has_draws)
381
{
382
        if (pan_is_bifrost(dev))
383
                return 0;
384

385
        if (!has_draws)
386
                return MALI_MIDGARD_TILER_MINIMUM_HEADER_SIZE + 4;
387

388
        bool hierarchy = !(dev->quirks & MIDGARD_NO_HIER_TILING);
389
        unsigned hierarchy_mask =
390
                panfrost_choose_hierarchy_mask(fb_width, fb_height, 1, hierarchy);
391

392
        return panfrost_tiler_full_size(fb_width, fb_height, hierarchy_mask, hierarchy) +
393
                panfrost_tiler_header_size(fb_width, fb_height, hierarchy_mask, hierarchy);
394
}
395

396
Product

Resources

Company