Path: blob/21.2-virgl/src/panfrost/shared/pan_tiling.c
/*
 * Copyright (c) 2011-2013 Luc Verhaegen <[email protected]>
 * Copyright (c) 2018 Alyssa Rosenzweig <[email protected]>
 * Copyright (c) 2018 Vasily Khoruzhick <[email protected]>
 * Copyright (c) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */

#include "pan_tiling.h"
#include <stdbool.h>
#include "util/macros.h"

/* This file implements software encode/decode of the tiling format used for
 * textures and framebuffers primarily on Utgard GPUs. Names for this format
 * include "Utgard-style tiling", "(Mali) swizzled textures", and
 * "U-interleaved" (the former two names being used in the community
 * Lima/Panfrost drivers; the latter name used internally at Arm).
 * Conceptually, like any tiling scheme, the pixel reordering exploits 2D
 * spatial locality, improving cache behaviour in both the horizontal and
 * vertical directions.
 *
 * This format is tiled: first, the image dimensions must be aligned to 16
 * pixels in each axis. Once aligned, the image is divided into 16x16 tiles.
 * This size harmonizes with other properties of the GPU; on Midgard,
 * framebuffer tiles are logically 16x16 (this is the tile size used in
 * Transaction Elimination and the minimum tile size used in Hierarchical
 * Tiling). Conversely, for a standard 4 bytes-per-pixel format (like
 * RGBA8888), 16 pixels * 4 bytes/pixel = 64 bytes, equal to the cache line
 * size.
 *
 * Within each 16x16 block, the bits are reordered according to this pattern:
 *
 * | y3 | (x3 ^ y3) | y2 | (y2 ^ x2) | y1 | (y1 ^ x1) | y0 | (y0 ^ x0) |
 *
 * Basically, interleaving the X and Y bits, with XORs thrown in for every
 * adjacent bit pair.
 *
 * This is cheap to implement for both encode and decode, in both hardware and
 * software. In hardware, lines are simply rerouted to reorder and some XOR
 * gates are thrown in. Software has to be a bit more clever.
 *
 * In software, the trick is to divide the pattern into two lines:
 *
 *   | y3 | y3 | y2 | y2 | y1 | y1 | y0 | y0 |
 * ^ |  0 | x3 |  0 | x2 |  0 | x1 |  0 | x0 |
 *
 * That is, duplicate the bits of the Y and space out the bits of the X. The
 * top line is a function only of Y, so it can be calculated once per row and
 * stored in a register. The bottom line is simply X with the bits spaced out.
 * Spacing out the X is easy enough with a LUT, or by subtracting+ANDing the
 * mask pattern (abusing carry bits).
 *
 * This format is also supported on Midgard GPUs, where it *can* be used for
 * textures and framebuffers. That said, in practice it is usually used as a
 * fallback layout; Midgard introduces Arm FrameBuffer Compression, which is
 * significantly more efficient than Utgard-style tiling and preferred for
 * both textures and framebuffers, where possible. Where AFBC is not
 * supported, for instance for sRGB textures and framebuffers, this tiling
 * scheme is used instead, at a performance penalty.
 */

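/* As an illustrative reference only (this helper is not used by the optimized
 * paths below), the in-tile index can be computed bit-by-bit straight from
 * the pattern above. It is equivalent to
 * bit_duplication[y & 0xF] ^ space_4[x & 0xF] with the tables defined below. */

static inline unsigned
pan_tile_index_reference(unsigned x, unsigned y)
{
   unsigned index = 0;

   for (int bit = 3; bit >= 0; --bit) {
      unsigned xb = (x >> bit) & 1;
      unsigned yb = (y >> bit) & 1;

      /* Emit | yN | (xN ^ yN) | for each bit pair, MSB first */
      index = (index << 2) | (yb << 1) | (xb ^ yb);
   }

   return index;
}
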
/* Given the lower 4-bits of the Y coordinate, we would like to
 * duplicate every bit over. So instead of 0b1010, we would like
 * 0b11001100. The idea is that for the bits in the solely Y place, we
 * get a Y place, and the bits in the XOR place *also* get a Y. */

const uint32_t bit_duplication[16] = {
   0b00000000,
   0b00000011,
   0b00001100,
   0b00001111,
   0b00110000,
   0b00110011,
   0b00111100,
   0b00111111,
   0b11000000,
   0b11000011,
   0b11001100,
   0b11001111,
   0b11110000,
   0b11110011,
   0b11111100,
   0b11111111,
};

/* Space the bits out of a 4-bit nibble */

const unsigned space_4[16] = {
   0b0000000,
   0b0000001,
   0b0000100,
   0b0000101,
   0b0010000,
   0b0010001,
   0b0010100,
   0b0010101,
   0b1000000,
   0b1000001,
   0b1000100,
   0b1000101,
   0b1010000,
   0b1010001,
   0b1010100,
   0b1010101
};

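/* For reference, both tables can also be computed with shifts and masks
 * rather than looked up. This is one of several equivalent spellings (the
 * header comment mentions a subtract-and-AND carry trick; this is the plain
 * shift/OR form), shown here only to document what the tables encode:
 *
 *    spaced = (n | (n << 2)) & 0b00110011;
 *    spaced = (spaced | (spaced << 1)) & 0b01010101;    // == space_4[n]
 *    duplicated = spaced | (spaced << 1);               // == bit_duplication[n]
 */
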
/* The scheme uses 16x16 tiles */

#define TILE_WIDTH 16
#define TILE_HEIGHT 16
#define PIXELS_PER_TILE (TILE_WIDTH * TILE_HEIGHT)

/* We need a 128-bit type for idiomatically tiling bpp128 formats. The type
 * must only support copies and sizeof, so emulating with a packed structure
 * works well enough, but if there's a native 128-bit type we may as well
 * prefer that. */

#ifdef __SIZEOF_INT128__
typedef __uint128_t pan_uint128_t;
#else
typedef struct {
   uint64_t lo;
   uint64_t hi;
} __attribute__((packed)) pan_uint128_t;
#endif

typedef struct {
   uint16_t lo;
   uint8_t hi;
} __attribute__((packed)) pan_uint24_t;

/* Optimized routine to tile an aligned (w & 0xF == 0) texture. Explanation:
 *
 * dest_start precomputes the offset to the beginning of the first horizontal
 * tile we're writing to, knowing that x is 16-aligned. Tiles themselves are
 * stored linearly, so we get the X tile number by shifting and then multiply
 * by the bytes per tile.
 *
 * We iterate across the pixels we're trying to store in source-order. For
 * each row in the destination image, we figure out which row of 16x16 blocks
 * we're in, by slicing off the lower 4-bits (block_y).
 *
 * dest then precomputes the location of the top-left corner of the block the
 * row starts in. In pixel coordinates (where the origin is the top-left),
 * (block_y, 0) is the top-left corner of the leftmost tile in this row. While
 * pixels are reordered within a block, the blocks themselves are stored
 * linearly, so multiplying block_y by the pixel stride of the destination
 * image equals the byte offset of the top-left corner of the block this row
 * is in.
 *
 * On the other hand, the source is linear, so we compute the locations of the
 * start and end of the row in the source by simple linear addressing.
 *
 * For indexing within the tile, we need to XOR with the [y3 y3 y2 y2 y1 y1 y0
 * y0] value. Since this is constant across a row, we look it up per-row and
 * store it in expanded_y.
 *
 * Finally, we iterate each row in source order. In the outer loop, we iterate
 * over each 16-pixel-wide tile. Within each tile, we iterate the 16 pixels
 * (this should be unrolled), calculating the index within the tile and
 * writing.
 */

#define TILED_ACCESS_TYPE(pixel_t, shift) \
static ALWAYS_INLINE void \
panfrost_access_tiled_image_##pixel_t \
                              (void *dst, void *src, \
                               uint16_t sx, uint16_t sy, \
                               uint16_t w, uint16_t h, \
                               uint32_t dst_stride, \
                               uint32_t src_stride, \
                               bool is_store) \
{ \
   uint8_t *dest_start = dst + ((sx >> 4) * PIXELS_PER_TILE * sizeof(pixel_t)); \
   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
      uint16_t block_y = y & ~0x0f; \
      uint8_t *dest = (uint8_t *) (dest_start + (block_y * dst_stride)); \
      pixel_t *source = src + (src_y * src_stride); \
      pixel_t *source_end = source + w; \
      unsigned expanded_y = bit_duplication[y & 0xF] << shift; \
      for (; source < source_end; dest += (PIXELS_PER_TILE << shift)) { \
         for (uint8_t i = 0; i < 16; ++i) { \
            unsigned index = expanded_y ^ (space_4[i] << shift); \
            if (is_store) \
               *((pixel_t *) (dest + index)) = *(source++); \
            else \
               *(source++) = *((pixel_t *) (dest + index)); \
         } \
      } \
   } \
} \

TILED_ACCESS_TYPE(uint8_t, 0);
TILED_ACCESS_TYPE(uint16_t, 1);
TILED_ACCESS_TYPE(uint32_t, 2);
TILED_ACCESS_TYPE(uint64_t, 3);
TILED_ACCESS_TYPE(pan_uint128_t, 4);

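/* In the instantiations above, the shift parameter is log2(sizeof(pixel_t)),
 * so that expanded_y, the spaced X bits and the resulting index are all
 * expressed in bytes rather than pixels. For example, with the uint32_t
 * variant (shift 2), the pixel at (x, y) = (5, 3) within a tile lands at byte
 * offset (bit_duplication[3] ^ space_4[5]) << 2 = (0b00001111 ^ 0b0010001)
 * << 2 = 30 * 4 = 120 from the start of its tile. */
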
#define TILED_UNALIGNED_TYPE(pixel_t, is_store, tile_shift) { \
   const unsigned mask = (1 << tile_shift) - 1; \
   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
      unsigned block_y = y & ~mask; \
      unsigned block_start_s = block_y * dst_stride; \
      unsigned source_start = src_y * src_stride; \
      unsigned expanded_y = bit_duplication[y & mask]; \
\
      for (int x = sx, src_x = 0; src_x < w; ++x, ++src_x) { \
         unsigned block_x_s = (x >> tile_shift) * (1 << (tile_shift * 2)); \
         unsigned index = expanded_y ^ space_4[x & mask]; \
         uint8_t *source = src + source_start + sizeof(pixel_t) * src_x; \
         uint8_t *dest = dst + block_start_s + sizeof(pixel_t) * (block_x_s + index); \
\
         pixel_t *outp = (pixel_t *) (is_store ? dest : source); \
         pixel_t *inp = (pixel_t *) (is_store ? source : dest); \
         *outp = *inp; \
      } \
   } \
}

#define TILED_UNALIGNED_TYPES(store, shift) { \
   if (bpp == 8) \
      TILED_UNALIGNED_TYPE(uint8_t, store, shift) \
   else if (bpp == 16) \
      TILED_UNALIGNED_TYPE(uint16_t, store, shift) \
   else if (bpp == 24) \
      TILED_UNALIGNED_TYPE(pan_uint24_t, store, shift) \
   else if (bpp == 32) \
      TILED_UNALIGNED_TYPE(uint32_t, store, shift) \
   else if (bpp == 64) \
      TILED_UNALIGNED_TYPE(uint64_t, store, shift) \
   else if (bpp == 128) \
      TILED_UNALIGNED_TYPE(pan_uint128_t, store, shift) \
}

static void
panfrost_access_tiled_image_generic(void *dst, void *src,
                                    unsigned sx, unsigned sy,
                                    unsigned w, unsigned h,
                                    uint32_t dst_stride,
                                    uint32_t src_stride,
                                    const struct util_format_description *desc,
                                    bool _is_store)
{
   unsigned bpp = desc->block.bits;

   if (desc->block.width > 1) {
      w = DIV_ROUND_UP(w, desc->block.width);
      h = DIV_ROUND_UP(h, desc->block.height);

      if (_is_store)
         TILED_UNALIGNED_TYPES(true, 2)
      else
         TILED_UNALIGNED_TYPES(false, 2)
   } else {
      if (_is_store)
         TILED_UNALIGNED_TYPES(true, 4)
      else
         TILED_UNALIGNED_TYPES(false, 4)
   }
}

#define OFFSET(src, _x, _y) (void *) ((uint8_t *) src + ((_y) - orig_y) * src_stride + (((_x) - orig_x) * (bpp / 8)))

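/* Accesses with unaligned (x, y, w, h) are split into up to five regions:
 * partial tiles along the top, bottom, left and right edges go through the
 * generic per-pixel path above, while the remaining 16x16-aligned interior
 * (if any) is handled by the optimized per-bpp routines.
 *
 *    +--------------------------+
 *    |           top            |
 *    +------+-----------+-------+
 *    | left |  aligned  | right |
 *    |      |  interior |       |
 *    +------+-----------+-------+
 *    |          bottom          |
 *    +--------------------------+
 *
 * Block-compressed and 24-bit formats always take the generic path. */
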
static ALWAYS_INLINE void
panfrost_access_tiled_image(void *dst, void *src,
                            unsigned x, unsigned y,
                            unsigned w, unsigned h,
                            uint32_t dst_stride,
                            uint32_t src_stride,
                            enum pipe_format format,
                            bool is_store)
{
   const struct util_format_description *desc = util_format_description(format);

   if (desc->block.width > 1 || desc->block.bits == 24) {
      panfrost_access_tiled_image_generic(dst, (void *) src,
                                          x, y, w, h,
                                          dst_stride, src_stride, desc, is_store);

      return;
   }

   unsigned bpp = desc->block.bits;
   unsigned first_full_tile_x = DIV_ROUND_UP(x, TILE_WIDTH) * TILE_WIDTH;
   unsigned first_full_tile_y = DIV_ROUND_UP(y, TILE_HEIGHT) * TILE_HEIGHT;
   unsigned last_full_tile_x = ((x + w) / TILE_WIDTH) * TILE_WIDTH;
   unsigned last_full_tile_y = ((y + h) / TILE_HEIGHT) * TILE_HEIGHT;

   /* First, tile the top portion */

   unsigned orig_x = x, orig_y = y;

   if (first_full_tile_y != y) {
      unsigned dist = MIN2(first_full_tile_y - y, h);

      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
                                          x, y, w, dist,
                                          dst_stride, src_stride, desc, is_store);

      if (dist == h)
         return;

      y += dist;
      h -= dist;
   }

   /* Next, the bottom portion */
   if (last_full_tile_y != (y + h)) {
      unsigned dist = (y + h) - last_full_tile_y;

      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, last_full_tile_y),
                                          x, last_full_tile_y, w, dist,
                                          dst_stride, src_stride, desc, is_store);

      h -= dist;
   }

   /* The left portion */
   if (first_full_tile_x != x) {
      unsigned dist = MIN2(first_full_tile_x - x, w);

      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
                                          x, y, dist, h,
                                          dst_stride, src_stride, desc, is_store);

      if (dist == w)
         return;

      x += dist;
      w -= dist;
   }

   /* Finally, the right portion */
   if (last_full_tile_x != (x + w)) {
      unsigned dist = (x + w) - last_full_tile_x;

      panfrost_access_tiled_image_generic(dst, OFFSET(src, last_full_tile_x, y),
                                          last_full_tile_x, y, dist, h,
                                          dst_stride, src_stride, desc, is_store);

      w -= dist;
   }

   if (bpp == 8)
      panfrost_access_tiled_image_uint8_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
   else if (bpp == 16)
      panfrost_access_tiled_image_uint16_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
   else if (bpp == 32)
      panfrost_access_tiled_image_uint32_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
   else if (bpp == 64)
      panfrost_access_tiled_image_uint64_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
   else if (bpp == 128)
      panfrost_access_tiled_image_pan_uint128_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
}

void
panfrost_store_tiled_image(void *dst, const void *src,
                           unsigned x, unsigned y,
                           unsigned w, unsigned h,
                           uint32_t dst_stride,
                           uint32_t src_stride,
                           enum pipe_format format)
{
   panfrost_access_tiled_image(dst, (void *) src,
                               x, y, w, h,
                               dst_stride, src_stride, format, true);
}

void
panfrost_load_tiled_image(void *dst, const void *src,
                          unsigned x, unsigned y,
                          unsigned w, unsigned h,
                          uint32_t dst_stride,
                          uint32_t src_stride,
                          enum pipe_format format)
{
   panfrost_access_tiled_image((void *) src, dst,
                               x, y, w, h,
                               src_stride, dst_stride, format, false);
}

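/* Usage sketch (illustrative only, with made-up buffer names): storing a
 * linear staging buffer into a 16x16-tiled RGBA8 image. x/y/w/h are in
 * pixels and strides are in bytes; dst_stride is the byte stride of one
 * pixel row of the tiled image (e.g. aligned_width * 4 for RGBA8) and
 * src_stride that of the linear source.
 *
 *    panfrost_store_tiled_image(tiled_bo_map, staging,
 *                               x, y, w, h,
 *                               aligned_width * 4, w * 4,
 *                               PIPE_FORMAT_R8G8B8A8_UNORM);
 *
 * panfrost_load_tiled_image is the inverse, detiling into a linear buffer. */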