Path: blob/21.2-virgl/src/panfrost/shared/pan_tiling.c
/*
 * Copyright (c) 2011-2013 Luc Verhaegen <[email protected]>
 * Copyright (c) 2018 Alyssa Rosenzweig <[email protected]>
 * Copyright (c) 2018 Vasily Khoruzhick <[email protected]>
 * Copyright (c) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */

#include "pan_tiling.h"
#include <stdbool.h>
#include "util/macros.h"

/* This file implements software encode/decode of the tiling format used for
 * textures and framebuffers primarily on Utgard GPUs. Names for this format
 * include "Utgard-style tiling", "(Mali) swizzled textures", and
 * "U-interleaved" (the former two names being used in the community
 * Lima/Panfrost drivers; the latter name used internally at Arm).
 * Conceptually, like any tiling scheme, the pixel reordering exploits 2D
 * spatial locality, improving cache behaviour in both the horizontal and
 * vertical directions.
 *
 * This format is tiled: first, the image dimensions must be aligned to 16
 * pixels in each axis. Once aligned, the image is divided into 16x16 tiles.
 * This size harmonizes with other properties of the GPU; on Midgard,
 * framebuffer tiles are logically 16x16 (this is the tile size used in
 * Transaction Elimination and the minimum tile size used in Hierarchical
 * Tiling). Conversely, for a standard 4 bytes-per-pixel format (like
 * RGBA8888), 16 pixels * 4 bytes/pixel = 64 bytes, equal to the cache line
 * size.
 *
 * Within each 16x16 block, the bits are reordered according to this pattern:
 *
 * | y3 | (x3 ^ y3) | y2 | (y2 ^ x2) | y1 | (y1 ^ x1) | y0 | (y0 ^ x0) |
 *
 * Basically, interleaving the X and Y bits, with XORs thrown in for every
 * adjacent bit pair.
 *
 * This is cheap to implement for both encode and decode, in both hardware and
 * software. In hardware, lines are simply rerouted to reorder and some XOR
 * gates are thrown in. Software has to be a bit more clever.
 *
 * In software, the trick is to divide the pattern into two lines:
 *
 *   | y3 | y3 | y2 | y2 | y1 | y1 | y0 | y0 |
 * ^ |  0 | x3 |  0 | x2 |  0 | x1 |  0 | x0 |
 *
 * That is, duplicate the bits of the Y and space out the bits of the X. The
 * top line is a function only of Y, so it can be calculated once per row and
 * stored in a register. The bottom line is simply X with the bits spaced out.
 * Spacing out the X is easy enough with a LUT, or by subtracting+ANDing the
 * mask pattern (abusing carry bits).
 *
 * This format is also supported on Midgard GPUs, where it *can* be used for
 * textures and framebuffers. That said, in practice it is usually used as a
 * fallback layout; Midgard introduces Arm FrameBuffer Compression, which is
 * significantly more efficient than Utgard-style tiling and preferred for
 * both textures and framebuffers, where possible. Where AFBC is not
 * supported, for instance for sRGB textures and framebuffers, this tiling
 * scheme is used instead, at a performance penalty.
 */

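/* As an illustrative reference only (this helper is not used by the optimized
 * paths below), the in-tile index can be computed bit-by-bit straight from
 * the pattern above. It is equivalent to
 * bit_duplication[y & 0xF] ^ space_4[x & 0xF] with the tables defined below. */

static inline unsigned
pan_tile_index_reference(unsigned x, unsigned y)
{
   unsigned index = 0;

   for (int bit = 3; bit >= 0; --bit) {
      unsigned xb = (x >> bit) & 1;
      unsigned yb = (y >> bit) & 1;

      /* Emit | yN | (xN ^ yN) | for each bit pair, MSB first */
      index = (index << 2) | (yb << 1) | (xb ^ yb);
   }

   return index;
}
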
/* Given the lower 4-bits of the Y coordinate, we would like to
 * duplicate every bit over. So instead of 0b1010, we would like
 * 0b11001100. The idea is that for the bits in the solely Y place, we
 * get a Y place, and the bits in the XOR place *also* get a Y. */

const uint32_t bit_duplication[16] = {
   0b00000000,
   0b00000011,
   0b00001100,
   0b00001111,
   0b00110000,
   0b00110011,
   0b00111100,
   0b00111111,
   0b11000000,
   0b11000011,
   0b11001100,
   0b11001111,
   0b11110000,
   0b11110011,
   0b11111100,
   0b11111111,
};

/* Space the bits out of a 4-bit nibble */

const unsigned space_4[16] = {
   0b0000000,
   0b0000001,
   0b0000100,
   0b0000101,
   0b0010000,
   0b0010001,
   0b0010100,
   0b0010101,
   0b1000000,
   0b1000001,
   0b1000100,
   0b1000101,
   0b1010000,
   0b1010001,
   0b1010100,
   0b1010101
};

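/* For reference, both tables can also be computed with shifts and masks
 * rather than looked up. This is one of several equivalent spellings (the
 * header comment mentions a subtract-and-AND carry trick; this is the plain
 * shift/OR form), shown here only to document what the tables encode:
 *
 *    spaced = (n | (n << 2)) & 0b00110011;
 *    spaced = (spaced | (spaced << 1)) & 0b01010101;    // == space_4[n]
 *    duplicated = spaced | (spaced << 1);               // == bit_duplication[n]
 */
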
/* The scheme uses 16x16 tiles */

#define TILE_WIDTH 16
#define TILE_HEIGHT 16
#define PIXELS_PER_TILE (TILE_WIDTH * TILE_HEIGHT)

/* We need a 128-bit type for idiomatically tiling bpp128 formats. The type
 * must only support copies and sizeof, so emulating with a packed structure
 * works well enough, but if there's a native 128-bit type we may as well
 * prefer that. */

#ifdef __SIZEOF_INT128__
typedef __uint128_t pan_uint128_t;
#else
typedef struct {
   uint64_t lo;
   uint64_t hi;
} __attribute__((packed)) pan_uint128_t;
#endif

typedef struct {
   uint16_t lo;
   uint8_t hi;
} __attribute__((packed)) pan_uint24_t;

/* Optimized routine to tile an aligned (w & 0xF == 0) texture. Explanation:
 *
 * dest_start precomputes the offset to the beginning of the first horizontal
 * tile we're writing to, knowing that x is 16-aligned. Tiles themselves are
 * stored linearly, so we get the X tile number by shifting and then multiply
 * by the bytes per tile.
 *
 * We iterate across the pixels we're trying to store in source-order. For
 * each row in the destination image, we figure out which row of 16x16 blocks
 * we're in, by slicing off the lower 4-bits (block_y).
 *
 * dest then precomputes the location of the top-left corner of the block the
 * row starts in. In pixel coordinates (where the origin is the top-left),
 * (block_y, 0) is the top-left corner of the leftmost tile in this row. While
 * pixels are reordered within a block, the blocks themselves are stored
 * linearly, so multiplying block_y by the pixel stride of the destination
 * image equals the byte offset of the top-left corner of the block this row
 * is in.
 *
 * On the other hand, the source is linear, so we compute the locations of the
 * start and end of the row in the source by simple linear addressing.
 *
 * For indexing within the tile, we need to XOR with the [y3 y3 y2 y2 y1 y1 y0
 * y0] value. Since this is constant across a row, we look it up per-row and
 * store it in expanded_y.
 *
 * Finally, we iterate each row in source order. In the outer loop, we iterate
 * over each 16-pixel-wide tile. Within each tile, we iterate the 16 pixels
 * (this should be unrolled), calculating the index within the tile and
 * writing.
 */

#define TILED_ACCESS_TYPE(pixel_t, shift) \
static ALWAYS_INLINE void \
panfrost_access_tiled_image_##pixel_t \
                              (void *dst, void *src, \
                               uint16_t sx, uint16_t sy, \
                               uint16_t w, uint16_t h, \
                               uint32_t dst_stride, \
                               uint32_t src_stride, \
                               bool is_store) \
{ \
   uint8_t *dest_start = dst + ((sx >> 4) * PIXELS_PER_TILE * sizeof(pixel_t)); \
   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
      uint16_t block_y = y & ~0x0f; \
      uint8_t *dest = (uint8_t *) (dest_start + (block_y * dst_stride)); \
      pixel_t *source = src + (src_y * src_stride); \
      pixel_t *source_end = source + w; \
      unsigned expanded_y = bit_duplication[y & 0xF] << shift; \
      for (; source < source_end; dest += (PIXELS_PER_TILE << shift)) { \
         for (uint8_t i = 0; i < 16; ++i) { \
            unsigned index = expanded_y ^ (space_4[i] << shift); \
            if (is_store) \
               *((pixel_t *) (dest + index)) = *(source++); \
            else \
               *(source++) = *((pixel_t *) (dest + index)); \
         } \
      } \
   } \
} \

TILED_ACCESS_TYPE(uint8_t, 0);
TILED_ACCESS_TYPE(uint16_t, 1);
TILED_ACCESS_TYPE(uint32_t, 2);
TILED_ACCESS_TYPE(uint64_t, 3);
TILED_ACCESS_TYPE(pan_uint128_t, 4);

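/* In the instantiations above, the shift parameter is log2(sizeof(pixel_t)),
 * so that expanded_y, the spaced X bits and the resulting index are all
 * expressed in bytes rather than pixels. For example, with the uint32_t
 * variant (shift 2), the pixel at (x, y) = (5, 3) within a tile lands at byte
 * offset (bit_duplication[3] ^ space_4[5]) << 2 = (0b00001111 ^ 0b0010001)
 * << 2 = 30 * 4 = 120 from the start of its tile. */
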
#define TILED_UNALIGNED_TYPE(pixel_t, is_store, tile_shift) { \
   const unsigned mask = (1 << tile_shift) - 1; \
   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
      unsigned block_y = y & ~mask; \
      unsigned block_start_s = block_y * dst_stride; \
      unsigned source_start = src_y * src_stride; \
      unsigned expanded_y = bit_duplication[y & mask]; \
\
      for (int x = sx, src_x = 0; src_x < w; ++x, ++src_x) { \
         unsigned block_x_s = (x >> tile_shift) * (1 << (tile_shift * 2)); \
         unsigned index = expanded_y ^ space_4[x & mask]; \
         uint8_t *source = src + source_start + sizeof(pixel_t) * src_x; \
         uint8_t *dest = dst + block_start_s + sizeof(pixel_t) * (block_x_s + index); \
\
         pixel_t *outp = (pixel_t *) (is_store ? dest : source); \
         pixel_t *inp = (pixel_t *) (is_store ? source : dest); \
         *outp = *inp; \
      } \
   } \
}

#define TILED_UNALIGNED_TYPES(store, shift) { \
   if (bpp == 8) \
      TILED_UNALIGNED_TYPE(uint8_t, store, shift) \
   else if (bpp == 16) \
      TILED_UNALIGNED_TYPE(uint16_t, store, shift) \
   else if (bpp == 24) \
      TILED_UNALIGNED_TYPE(pan_uint24_t, store, shift) \
   else if (bpp == 32) \
      TILED_UNALIGNED_TYPE(uint32_t, store, shift) \
   else if (bpp == 64) \
      TILED_UNALIGNED_TYPE(uint64_t, store, shift) \
   else if (bpp == 128) \
      TILED_UNALIGNED_TYPE(pan_uint128_t, store, shift) \
}

static void
panfrost_access_tiled_image_generic(void *dst, void *src,
                                    unsigned sx, unsigned sy,
                                    unsigned w, unsigned h,
                                    uint32_t dst_stride,
                                    uint32_t src_stride,
                                    const struct util_format_description *desc,
                                    bool _is_store)
{
   unsigned bpp = desc->block.bits;

   if (desc->block.width > 1) {
      w = DIV_ROUND_UP(w, desc->block.width);
      h = DIV_ROUND_UP(h, desc->block.height);

      if (_is_store)
         TILED_UNALIGNED_TYPES(true, 2)
      else
         TILED_UNALIGNED_TYPES(false, 2)
   } else {
      if (_is_store)
         TILED_UNALIGNED_TYPES(true, 4)
      else
         TILED_UNALIGNED_TYPES(false, 4)
   }
}

#define OFFSET(src, _x, _y) (void *) ((uint8_t *) src + ((_y) - orig_y) * src_stride + (((_x) - orig_x) * (bpp / 8)))

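/* Accesses with unaligned (x, y, w, h) are split into up to five regions:
 * partial tiles along the top, bottom, left and right edges go through the
 * generic per-pixel path above, while the remaining 16x16-aligned interior
 * (if any) is handled by the optimized per-bpp routines.
 *
 *    +--------------------------+
 *    |           top            |
 *    +------+-----------+-------+
 *    | left |  aligned  | right |
 *    |      |  interior |       |
 *    +------+-----------+-------+
 *    |          bottom          |
 *    +--------------------------+
 *
 * Block-compressed and 24-bit formats always take the generic path. */
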
static ALWAYS_INLINE void
panfrost_access_tiled_image(void *dst, void *src,
                            unsigned x, unsigned y,
                            unsigned w, unsigned h,
                            uint32_t dst_stride,
                            uint32_t src_stride,
                            enum pipe_format format,
                            bool is_store)
{
   const struct util_format_description *desc = util_format_description(format);

   if (desc->block.width > 1 || desc->block.bits == 24) {
      panfrost_access_tiled_image_generic(dst, (void *) src,
                                          x, y, w, h,
                                          dst_stride, src_stride, desc, is_store);

      return;
   }

   unsigned bpp = desc->block.bits;
   unsigned first_full_tile_x = DIV_ROUND_UP(x, TILE_WIDTH) * TILE_WIDTH;
   unsigned first_full_tile_y = DIV_ROUND_UP(y, TILE_HEIGHT) * TILE_HEIGHT;
   unsigned last_full_tile_x = ((x + w) / TILE_WIDTH) * TILE_WIDTH;
   unsigned last_full_tile_y = ((y + h) / TILE_HEIGHT) * TILE_HEIGHT;

   /* First, tile the top portion */

   unsigned orig_x = x, orig_y = y;

   if (first_full_tile_y != y) {
      unsigned dist = MIN2(first_full_tile_y - y, h);

      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
                                          x, y, w, dist,
                                          dst_stride, src_stride, desc, is_store);

      if (dist == h)
         return;

      y += dist;
      h -= dist;
   }

   /* Next, the bottom portion */
   if (last_full_tile_y != (y + h)) {
      unsigned dist = (y + h) - last_full_tile_y;

      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, last_full_tile_y),
                                          x, last_full_tile_y, w, dist,
                                          dst_stride, src_stride, desc, is_store);

      h -= dist;
   }

   /* The left portion */
   if (first_full_tile_x != x) {
      unsigned dist = MIN2(first_full_tile_x - x, w);

      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
                                          x, y, dist, h,
                                          dst_stride, src_stride, desc, is_store);

      if (dist == w)
         return;

      x += dist;
      w -= dist;
   }

   /* Finally, the right portion */
   if (last_full_tile_x != (x + w)) {
      unsigned dist = (x + w) - last_full_tile_x;

      panfrost_access_tiled_image_generic(dst, OFFSET(src, last_full_tile_x, y),
                                          last_full_tile_x, y, dist, h,
                                          dst_stride, src_stride, desc, is_store);

      w -= dist;
   }

   if (bpp == 8)
      panfrost_access_tiled_image_uint8_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
   else if (bpp == 16)
      panfrost_access_tiled_image_uint16_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
   else if (bpp == 32)
      panfrost_access_tiled_image_uint32_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
   else if (bpp == 64)
      panfrost_access_tiled_image_uint64_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
   else if (bpp == 128)
      panfrost_access_tiled_image_pan_uint128_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
}

void
panfrost_store_tiled_image(void *dst, const void *src,
                           unsigned x, unsigned y,
                           unsigned w, unsigned h,
                           uint32_t dst_stride,
                           uint32_t src_stride,
                           enum pipe_format format)
{
   panfrost_access_tiled_image(dst, (void *) src,
                               x, y, w, h,
                               dst_stride, src_stride, format, true);
}

void
panfrost_load_tiled_image(void *dst, const void *src,
                          unsigned x, unsigned y,
                          unsigned w, unsigned h,
                          uint32_t dst_stride,
                          uint32_t src_stride,
                          enum pipe_format format)
{
   panfrost_access_tiled_image((void *) src, dst,
                               x, y, w, h,
                               src_stride, dst_stride, format, false);
}

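/* Usage sketch (illustrative only, with made-up buffer names): storing a
 * linear staging buffer into a 16x16-tiled RGBA8 image. x/y/w/h are in
 * pixels and strides are in bytes; dst_stride is the byte stride of one
 * pixel row of the tiled image (e.g. aligned_width * 4 for RGBA8) and
 * src_stride that of the linear source.
 *
 *    panfrost_store_tiled_image(tiled_bo_map, staging,
 *                               x, y, w, h,
 *                               aligned_width * 4, w * 4,
 *                               PIPE_FORMAT_R8G8B8A8_UNORM);
 *
 * panfrost_load_tiled_image is the inverse, detiling into a linear buffer. */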