Path: blob/21.2-virgl/src/intel/isl/isl_tiled_memcpy.c
/*
 * Mesa 3-D graphics library
 *
 * Copyright 2012 Intel Corporation
 * Copyright 2013 Google
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Chad Versace <[email protected]>
 *    Frank Henigman <[email protected]>
 */

#include <string.h>

#include "util/macros.h"
#include "main/macros.h"

#include "isl_priv.h"

#if defined(__SSSE3__)
#include <tmmintrin.h>
#elif defined(__SSE2__)
#include <emmintrin.h>
#endif

#define FILE_DEBUG_FLAG DEBUG_TEXTURE

#define ALIGN_DOWN(a, b) ROUND_DOWN_TO(a, b)
#define ALIGN_UP(a, b) ALIGN(a, b)

/* Tile dimensions.  Width and span are in bytes, height is in pixels (i.e.
 * unitless).  A "span" is the largest number of bytes we can copy from linear
 * to tiled without needing to calculate a new destination address.
 */
static const uint32_t xtile_width = 512;
static const uint32_t xtile_height = 8;
static const uint32_t xtile_span = 64;
static const uint32_t ytile_width = 128;
static const uint32_t ytile_height = 32;
static const uint32_t ytile_span = 16;

static inline uint32_t
ror(uint32_t n, uint32_t d)
{
   return (n >> d) | (n << (32 - d));
}

// bswap32 already exists as a macro on some platforms (FreeBSD)
#ifndef bswap32
static inline uint32_t
bswap32(uint32_t n)
{
#if defined(HAVE___BUILTIN_BSWAP32)
   return __builtin_bswap32(n);
#else
   return (n >> 24) |
          ((n >> 8) & 0x0000ff00) |
          ((n << 8) & 0x00ff0000) |
          (n << 24);
#endif
}
#endif

/**
 * Copy RGBA to BGRA - swap R and B.
 */
static inline void *
rgba8_copy(void *dst, const void *src, size_t bytes)
{
   uint32_t *d = dst;
   uint32_t const *s = src;

   assert(bytes % 4 == 0);

   while (bytes >= 4) {
      *d = ror(bswap32(*s), 8);
      d += 1;
      s += 1;
      bytes -= 4;
   }
   return dst;
}
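
/* Editor's note on the swap above and the SIMD variants that follow (all of
 * this is derivable from the code itself): the scalar loop works one 32-bit
 * texel at a time; bswap32() reverses the byte order and ror(..., 8) rotates
 * the alpha byte back to the top, so a little-endian R8G8B8A8 texel comes out
 * as B8G8R8A8.  The SIMD paths apply the same byte-lane mapping
 * {R,G,B,A} -> {B,G,R,A} sixteen bytes at a time: the SSSE3 path with a
 * single pshufb shuffle mask, the SSE2 path by masking the G/A bytes out,
 * swapping the two 16-bit halves of each texel (which hold R and B in their
 * low bytes), and OR-ing the G/A bytes back in.
 */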

#ifdef __SSSE3__
static const uint8_t rgba8_permutation[16] =
   { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };

static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   _mm_store_si128(dst,
                   _mm_shuffle_epi8(_mm_loadu_si128(src),
                                    *(__m128i *)rgba8_permutation));
}

static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   _mm_storeu_si128(dst,
                    _mm_shuffle_epi8(_mm_load_si128(src),
                                     *(__m128i *)rgba8_permutation));
}

#elif defined(__SSE2__)
static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   __m128i srcreg, dstreg, agmask, ag, rb, br;

   agmask = _mm_set1_epi32(0xFF00FF00);
   srcreg = _mm_loadu_si128((__m128i *)src);

   rb = _mm_andnot_si128(agmask, srcreg);
   ag = _mm_and_si128(agmask, srcreg);
   br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
                            _MM_SHUFFLE(2, 3, 0, 1));
   dstreg = _mm_or_si128(ag, br);

   _mm_store_si128((__m128i *)dst, dstreg);
}

static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   __m128i srcreg, dstreg, agmask, ag, rb, br;

   agmask = _mm_set1_epi32(0xFF00FF00);
   srcreg = _mm_load_si128((__m128i *)src);

   rb = _mm_andnot_si128(agmask, srcreg);
   ag = _mm_and_si128(agmask, srcreg);
   br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
                            _MM_SHUFFLE(2, 3, 0, 1));
   dstreg = _mm_or_si128(ag, br);

   _mm_storeu_si128((__m128i *)dst, dstreg);
}
#endif

/**
 * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.
 */
static inline void *
rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
{
   assert(bytes == 0 || !(((uintptr_t)dst) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_dst(dst +  0, src +  0);
      rgba8_copy_16_aligned_dst(dst + 16, src + 16);
      rgba8_copy_16_aligned_dst(dst + 32, src + 32);
      rgba8_copy_16_aligned_dst(dst + 48, src + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_dst(dst, src);
      src += 16;
      dst += 16;
      bytes -= 16;
   }
#endif

   rgba8_copy(dst, src, bytes);

   return dst;
}

/**
 * Copy RGBA to BGRA - swap R and B, with the source 16-byte aligned.
 */
static inline void *
rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
{
   assert(bytes == 0 || !(((uintptr_t)src) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_src(dst +  0, src +  0);
      rgba8_copy_16_aligned_src(dst + 16, src + 16);
      rgba8_copy_16_aligned_src(dst + 32, src + 32);
      rgba8_copy_16_aligned_src(dst + 48, src + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_src(dst, src);
      src += 16;
      dst += 16;
      bytes -= 16;
   }
#endif

   rgba8_copy(dst, src, bytes);

   return dst;
}

/**
 * Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3).
 * These ranges are in bytes, i.e. pixels * bytes-per-pixel.
 * The first and last ranges must be shorter than a "span" (the longest linear
 * stretch within a tile) and the middle must equal a whole number of spans.
 * Ranges may be empty.  The region copied must land entirely within one tile.
 * 'dst' is the start of the tile and 'src' is the corresponding
 * address to copy from, though copying begins at (x0, y0).
 * To enable swizzling 'swizzle_bit' must be 1<<6, otherwise zero.
 * Swizzling flips bit 6 in the copy destination offset, when certain other
 * bits are set in it.
 */
typedef void (*tile_copy_fn)(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                             uint32_t y0, uint32_t y1,
                             char *dst, const char *src,
                             int32_t linear_pitch,
                             uint32_t swizzle_bit,
                             isl_memcpy_type copy_type);
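
/* Worked example of the three-range split (the split itself is computed by
 * linear_to_tiled()/tiled_to_linear() near the end of this file): with the
 * Y-tile span of 16 bytes, copying bytes [5, 53) of a row within a tile uses
 * x0..x3 = 5, 16, 48, 53 - an 11-byte unaligned head, two full 16-byte
 * spans, and a 5-byte tail.  The head is copied with the unaligned mem_copy;
 * the full spans and the tail (which starts on a span boundary) go through
 * mem_copy_align16.
 */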

/**
 * Copy texture data from linear to X tile layout.
 *
 * \copydoc tile_copy_fn
 *
 * The mem_copy parameters allow the user to specify an alternative mem_copy
 * function that, for instance, may do RGBA -> BGRA swizzling.  The first
 * function must handle any memory alignment while the second function must
 * only handle 16-byte alignment in whichever side (source or destination) is
 * tiled.
 */
static inline void
linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t src_pitch,
                 uint32_t swizzle_bit,
                 isl_mem_copy_fn mem_copy,
                 isl_mem_copy_fn mem_copy_align16)
{
   /* The copy destination offset for each range copied is the sum of
    * an X offset 'x0' or 'xo' and a Y offset 'yo.'
    */
   uint32_t xo, yo;

   src += (ptrdiff_t)y0 * src_pitch;

   for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
      /* Bits 9 and 10 of the copy destination offset control swizzling.
       * Only 'yo' contributes to those bits in the total offset,
       * so calculate 'swizzle' just once per row.
       * Move bits 9 and 10 three and four places respectively down
       * to bit 6 and xor them.
       */
      uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;

      mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0);

      for (xo = x1; xo < x2; xo += xtile_span) {
         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span);
      }

      mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);

      src += src_pitch;
   }
}

/**
 * Copy texture data from linear to Y tile layout.
 *
 * \copydoc tile_copy_fn
 */
static inline void
linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y3,
                 char *dst, const char *src,
                 int32_t src_pitch,
                 uint32_t swizzle_bit,
                 isl_mem_copy_fn mem_copy,
                 isl_mem_copy_fn mem_copy_align16)
{
   /* Y tiles consist of columns that are 'ytile_span' wide (and the same
    * height as the tile).  Thus the destination offset for (x,y) is the sum
    * of:
    *   (x % column_width)                    // position within column
    *   (x / column_width) * bytes_per_column // column number * bytes per column
    *   y * column_width
    *
    * The copy destination offset for each range copied is the sum of
    * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
    */
   const uint32_t column_width = ytile_span;
   const uint32_t bytes_per_column = column_width * ytile_height;
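
   /* For example, with 4-byte texels the byte at x = 20, y = 5 within the
    * tile lands at (20 % 16) + (20 / 16) * 512 + 5 * 16 = 4 + 512 + 80 = 596.
    *
    * The rows [y0, y3) are handled in three groups below: a head [y0, y1),
    * a 4-row-aligned middle [y1, y2) that is copied four rows per loop
    * iteration, and a tail [y2, y3).
    */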

   uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
   uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));

   uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
   uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;

   /* Bit 9 of the destination offset controls swizzling.
    * Only the X offset contributes to bit 9 of the total offset,
    * so swizzle can be calculated in advance for these X positions.
    * Move bit 9 three places down to bit 6.
    */
   uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
   uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;

   uint32_t x, yo;

   src += (ptrdiff_t)y0 * src_pitch;

   if (y0 != y1) {
      for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
         uint32_t xo = xo1;
         uint32_t swizzle = swizzle1;

         mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);

         /* Step by spans/columns.  As it happens, the swizzle bit flips
          * at each step so we don't need to calculate it explicitly.
          */
         for (x = x1; x < x2; x += ytile_span) {
            mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
            xo += bytes_per_column;
            swizzle ^= swizzle_bit;
         }

         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);

         src += src_pitch;
      }
   }

   for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
      uint32_t xo = xo1;
      uint32_t swizzle = swizzle1;

      if (x0 != x1) {
         mem_copy(dst + ((xo0 + yo + 0 * column_width) ^ swizzle0), src + x0 + 0 * src_pitch, x1 - x0);
         mem_copy(dst + ((xo0 + yo + 1 * column_width) ^ swizzle0), src + x0 + 1 * src_pitch, x1 - x0);
         mem_copy(dst + ((xo0 + yo + 2 * column_width) ^ swizzle0), src + x0 + 2 * src_pitch, x1 - x0);
         mem_copy(dst + ((xo0 + yo + 3 * column_width) ^ swizzle0), src + x0 + 3 * src_pitch, x1 - x0);
      }

      /* Step by spans/columns.  As it happens, the swizzle bit flips
       * at each step so we don't need to calculate it explicitly.
       */
      for (x = x1; x < x2; x += ytile_span) {
         mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x + 0 * src_pitch, ytile_span);
         mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x + 1 * src_pitch, ytile_span);
         mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x + 2 * src_pitch, ytile_span);
         mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x + 3 * src_pitch, ytile_span);
         xo += bytes_per_column;
         swizzle ^= swizzle_bit;
      }

      if (x2 != x3) {
         mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x2 + 0 * src_pitch, x3 - x2);
         mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x2 + 1 * src_pitch, x3 - x2);
         mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x2 + 2 * src_pitch, x3 - x2);
         mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x2 + 3 * src_pitch, x3 - x2);
      }

      src += 4 * src_pitch;
   }

   if (y2 != y3) {
      for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
         uint32_t xo = xo1;
         uint32_t swizzle = swizzle1;

         mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);

         /* Step by spans/columns.  As it happens, the swizzle bit flips
          * at each step so we don't need to calculate it explicitly.
          */
         for (x = x1; x < x2; x += ytile_span) {
            mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
            xo += bytes_per_column;
            swizzle ^= swizzle_bit;
         }

         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);

         src += src_pitch;
      }
   }
}
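
/* The two tiled-to-linear copiers below mirror linear_to_xtiled() and
 * linear_to_ytiled() above: the tile-offset arithmetic is identical, only
 * the roles of source and destination are exchanged.
 */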

/**
 * Copy texture data from X tile layout to linear.
 *
 * \copydoc tile_copy_fn
 */
static inline void
xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t dst_pitch,
                 uint32_t swizzle_bit,
                 isl_mem_copy_fn mem_copy,
                 isl_mem_copy_fn mem_copy_align16)
{
   /* The copy source offset for each range copied is the sum of
    * an X offset 'x0' or 'xo' and a Y offset 'yo.'
    */
   uint32_t xo, yo;

   dst += (ptrdiff_t)y0 * dst_pitch;

   for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
      /* Bits 9 and 10 of the copy source offset control swizzling.
       * Only 'yo' contributes to those bits in the total offset,
       * so calculate 'swizzle' just once per row.
       * Move bits 9 and 10 three and four places respectively down
       * to bit 6 and xor them.
       */
      uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;

      mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0);

      for (xo = x1; xo < x2; xo += xtile_span) {
         mem_copy_align16(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span);
      }

      mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);

      dst += dst_pitch;
   }
}

/**
 * Copy texture data from Y tile layout to linear.
 *
 * \copydoc tile_copy_fn
 */
static inline void
ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y3,
                 char *dst, const char *src,
                 int32_t dst_pitch,
                 uint32_t swizzle_bit,
                 isl_mem_copy_fn mem_copy,
                 isl_mem_copy_fn mem_copy_align16)
{
   /* Y tiles consist of columns that are 'ytile_span' wide (and the same
    * height as the tile).  Thus the source offset for (x,y) is the sum of:
    *   (x % column_width)                    // position within column
    *   (x / column_width) * bytes_per_column // column number * bytes per column
    *   y * column_width
    *
    * The copy source offset for each range copied is the sum of
    * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
    */
   const uint32_t column_width = ytile_span;
   const uint32_t bytes_per_column = column_width * ytile_height;

   uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
   uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));

   uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
   uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;

   /* Bit 9 of the source offset controls swizzling.
    * Only the X offset contributes to bit 9 of the total offset,
    * so swizzle can be calculated in advance for these X positions.
    * Move bit 9 three places down to bit 6.
    */
   uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
   uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;

   uint32_t x, yo;

   dst += (ptrdiff_t)y0 * dst_pitch;

   if (y0 != y1) {
      for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
         uint32_t xo = xo1;
         uint32_t swizzle = swizzle1;

         mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);

         /* Step by spans/columns.  As it happens, the swizzle bit flips
          * at each step so we don't need to calculate it explicitly.
          */
         for (x = x1; x < x2; x += ytile_span) {
            mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
            xo += bytes_per_column;
            swizzle ^= swizzle_bit;
         }

         mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);

         dst += dst_pitch;
      }
   }

   for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
      uint32_t xo = xo1;
      uint32_t swizzle = swizzle1;

      if (x0 != x1) {
         mem_copy(dst + x0 + 0 * dst_pitch, src + ((xo0 + yo + 0 * column_width) ^ swizzle0), x1 - x0);
         mem_copy(dst + x0 + 1 * dst_pitch, src + ((xo0 + yo + 1 * column_width) ^ swizzle0), x1 - x0);
         mem_copy(dst + x0 + 2 * dst_pitch, src + ((xo0 + yo + 2 * column_width) ^ swizzle0), x1 - x0);
         mem_copy(dst + x0 + 3 * dst_pitch, src + ((xo0 + yo + 3 * column_width) ^ swizzle0), x1 - x0);
      }

      /* Step by spans/columns.  As it happens, the swizzle bit flips
       * at each step so we don't need to calculate it explicitly.
       */
      for (x = x1; x < x2; x += ytile_span) {
         mem_copy_align16(dst + x + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), ytile_span);
         mem_copy_align16(dst + x + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), ytile_span);
         mem_copy_align16(dst + x + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), ytile_span);
         mem_copy_align16(dst + x + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), ytile_span);
         xo += bytes_per_column;
         swizzle ^= swizzle_bit;
      }

      if (x2 != x3) {
         mem_copy_align16(dst + x2 + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), x3 - x2);
         mem_copy_align16(dst + x2 + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), x3 - x2);
         mem_copy_align16(dst + x2 + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), x3 - x2);
         mem_copy_align16(dst + x2 + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), x3 - x2);
      }

      dst += 4 * dst_pitch;
   }

   if (y2 != y3) {
      for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
         uint32_t xo = xo1;
         uint32_t swizzle = swizzle1;

         mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);

         /* Step by spans/columns.  As it happens, the swizzle bit flips
          * at each step so we don't need to calculate it explicitly.
          */
         for (x = x1; x < x2; x += ytile_span) {
            mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
            xo += bytes_per_column;
            swizzle ^= swizzle_bit;
         }

         mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);

         dst += dst_pitch;
      }
   }
}
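
/* When this file is built with SSE4.1 (INLINE_SSE41), the tiled source can
 * be read with movntdqa streaming loads, which require 16-byte-aligned load
 * addresses and are typically used when reading back from write-combined
 * mappings.  The helper below therefore only streams the 16- and 64-byte
 * span-aligned copies; shorter head/tail ranges fall back to memcpy.
 */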

#if defined(INLINE_SSE41)
static ALWAYS_INLINE void *
_memcpy_streaming_load(void *dest, const void *src, size_t count)
{
   if (count == 16) {
      __m128i val = _mm_stream_load_si128((__m128i *)src);
      _mm_storeu_si128((__m128i *)dest, val);
      return dest;
   } else if (count == 64) {
      __m128i val0 = _mm_stream_load_si128(((__m128i *)src) + 0);
      __m128i val1 = _mm_stream_load_si128(((__m128i *)src) + 1);
      __m128i val2 = _mm_stream_load_si128(((__m128i *)src) + 2);
      __m128i val3 = _mm_stream_load_si128(((__m128i *)src) + 3);
      _mm_storeu_si128(((__m128i *)dest) + 0, val0);
      _mm_storeu_si128(((__m128i *)dest) + 1, val1);
      _mm_storeu_si128(((__m128i *)dest) + 2, val2);
      _mm_storeu_si128(((__m128i *)dest) + 3, val3);
      return dest;
   } else {
      assert(count < 64); /* and (count < 16) for ytiled */
      return memcpy(dest, src, count);
   }
}
#endif

static isl_mem_copy_fn
choose_copy_function(isl_memcpy_type copy_type)
{
   switch (copy_type) {
   case ISL_MEMCPY:
      return memcpy;
   case ISL_MEMCPY_BGRA8:
      return rgba8_copy;
   case ISL_MEMCPY_STREAMING_LOAD:
#if defined(INLINE_SSE41)
      return _memcpy_streaming_load;
#else
      unreachable("ISL_MEMCPY_STREAMING_LOAD requires sse4.1");
#endif
   case ISL_MEMCPY_INVALID:
      unreachable("invalid copy_type");
   }
   unreachable("unhandled copy_type");
   return NULL;
}
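
/* The *_faster wrappers below compare the pointer returned by
 * choose_copy_function() against memcpy, rgba8_copy and (under INLINE_SSE41)
 * _memcpy_streaming_load, then re-issue the call with compile-time-constant
 * copy functions - and, for whole tiles, constant bounds - so that the
 * FLATTEN attribute lets the compiler inline a copy loop specialized for
 * each common case.
 */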

/**
 * Copy texture data from linear to X tile layout, faster.
 *
 * Same as \ref linear_to_xtiled but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t src_pitch,
                        uint32_t swizzle_bit,
                        isl_memcpy_type copy_type)
{
   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);

   if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
      if (mem_copy == memcpy)
         return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit,
                                 memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   }
   linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                    dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
}

/**
 * Copy texture data from linear to Y tile layout, faster.
 *
 * Same as \ref linear_to_ytiled but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t src_pitch,
                        uint32_t swizzle_bit,
                        isl_memcpy_type copy_type)
{
   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);

   if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
      if (mem_copy == memcpy)
         return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   }
   linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                    dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
}

/**
 * Copy texture data from X tile layout to linear, faster.
 *
 * Same as \ref xtiled_to_linear but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t dst_pitch,
                        uint32_t swizzle_bit,
                        isl_memcpy_type copy_type)
{
   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);

   if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
      if (mem_copy == memcpy)
         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
#if defined(INLINE_SSE41)
      else if (mem_copy == _memcpy_streaming_load)
         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, dst_pitch, swizzle_bit,
                                 memcpy, _memcpy_streaming_load);
#endif
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
#if defined(INLINE_SSE41)
      else if (mem_copy == _memcpy_streaming_load)
         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit,
                                 memcpy, _memcpy_streaming_load);
#endif
      else
         unreachable("not reached");
   }
   xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                    dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
}

/**
 * Copy texture data from Y tile layout to linear, faster.
 *
 * Same as \ref ytiled_to_linear but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t dst_pitch,
                        uint32_t swizzle_bit,
                        isl_memcpy_type copy_type)
{
   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);

   if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
      if (mem_copy == memcpy)
         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
#if defined(INLINE_SSE41)
      else if (copy_type == ISL_MEMCPY_STREAMING_LOAD)
         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, dst_pitch, swizzle_bit,
                                 memcpy, _memcpy_streaming_load);
#endif
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
#if defined(INLINE_SSE41)
      else if (copy_type == ISL_MEMCPY_STREAMING_LOAD)
         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit,
                                 memcpy, _memcpy_streaming_load);
#endif
      else
         unreachable("not reached");
   }
   ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                    dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
}
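
/* Illustrative (hypothetical) use of the two drivers below; the buffer and
 * pitch names are placeholders, not part of this file.  To upload a
 * 16x16-pixel region whose top-left pixel is (8, 24) of a 4-byte-per-pixel
 * X-tiled surface, a caller would pass byte range xt1 = 8 * 4 = 32,
 * xt2 = (8 + 16) * 4 = 96, row range yt1 = 24, yt2 = 40, 'dst' pointing at
 * pixel (0, 0) of the tiled mapping and 'src' pointing at the first byte of
 * the region in the linear staging copy:
 *
 *    linear_to_tiled(32, 96, 24, 40, tiled_map, staging,
 *                    surf_row_pitch_B, staging_pitch_B,
 *                    has_bit6_swizzle, ISL_TILING_X, ISL_MEMCPY);
 */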

/**
 * Copy from linear to tiled texture.
 *
 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
 * pieces that do not cross tile boundaries and copy each piece with a tile
 * copy function (\ref tile_copy_fn).
 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
 * The Y range is in pixels (i.e. unitless).
 * 'dst' is the address of (0, 0) in the destination tiled texture.
 * 'src' is the address of (xt1, yt1) in the source linear texture.
 */
static void
linear_to_tiled(uint32_t xt1, uint32_t xt2,
                uint32_t yt1, uint32_t yt2,
                char *dst, const char *src,
                uint32_t dst_pitch, int32_t src_pitch,
                bool has_swizzling,
                enum isl_tiling tiling,
                isl_memcpy_type copy_type)
{
   tile_copy_fn tile_copy;
   uint32_t xt0, xt3;
   uint32_t yt0, yt3;
   uint32_t xt, yt;
   uint32_t tw, th, span;
   uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;

   if (tiling == ISL_TILING_X) {
      tw = xtile_width;
      th = xtile_height;
      span = xtile_span;
      tile_copy = linear_to_xtiled_faster;
   } else if (tiling == ISL_TILING_Y0) {
      tw = ytile_width;
      th = ytile_height;
      span = ytile_span;
      tile_copy = linear_to_ytiled_faster;
   } else {
      unreachable("unsupported tiling");
   }

   /* Round out to tile boundaries. */
   xt0 = ALIGN_DOWN(xt1, tw);
   xt3 = ALIGN_UP  (xt2, tw);
   yt0 = ALIGN_DOWN(yt1, th);
   yt3 = ALIGN_UP  (yt2, th);

   /* Loop over all tiles to which we have something to copy.
    * 'xt' and 'yt' are the origin of the destination tile, whether copying
    * a full or partial tile.
    * tile_copy() copies one tile or partial tile.
    * Looping x inside y is the faster memory access pattern.
    */
   for (yt = yt0; yt < yt3; yt += th) {
      for (xt = xt0; xt < xt3; xt += tw) {
         /* The area to update is [x0,x3) x [y0,y1).
          * May not want the whole tile, hence the min and max.
          */
         uint32_t x0 = MAX2(xt1, xt);
         uint32_t y0 = MAX2(yt1, yt);
         uint32_t x3 = MIN2(xt2, xt + tw);
         uint32_t y1 = MIN2(yt2, yt + th);

         /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
          * the middle interval is the longest span-aligned part.
          * The sub-ranges could be empty.
          */
         uint32_t x1, x2;
         x1 = ALIGN_UP(x0, span);
         if (x1 > x3)
            x1 = x2 = x3;
         else
            x2 = ALIGN_DOWN(x3, span);

         assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
         assert(x1 - x0 < span && x3 - x2 < span);
         assert(x3 - x0 <= tw);
         assert((x2 - x1) % span == 0);

         /* Translate by (xt,yt) for single-tile copier. */
         tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
                   y0-yt, y1-yt,
                   dst + (ptrdiff_t)xt * th + (ptrdiff_t)yt * dst_pitch,
                   src + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * src_pitch,
                   src_pitch,
                   swizzle_bit,
                   copy_type);
      }
   }
}

/**
 * Copy from tiled to linear texture.
 *
 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
 * pieces that do not cross tile boundaries and copy each piece with a tile
 * copy function (\ref tile_copy_fn).
 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
 * The Y range is in pixels (i.e. unitless).
 * 'dst' is the address of (xt1, yt1) in the destination linear texture.
 * 'src' is the address of (0, 0) in the source tiled texture.
 */
static void
tiled_to_linear(uint32_t xt1, uint32_t xt2,
                uint32_t yt1, uint32_t yt2,
                char *dst, const char *src,
                int32_t dst_pitch, uint32_t src_pitch,
                bool has_swizzling,
                enum isl_tiling tiling,
                isl_memcpy_type copy_type)
{
   tile_copy_fn tile_copy;
   uint32_t xt0, xt3;
   uint32_t yt0, yt3;
   uint32_t xt, yt;
   uint32_t tw, th, span;
   uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;

   if (tiling == ISL_TILING_X) {
      tw = xtile_width;
      th = xtile_height;
      span = xtile_span;
      tile_copy = xtiled_to_linear_faster;
   } else if (tiling == ISL_TILING_Y0) {
      tw = ytile_width;
      th = ytile_height;
      span = ytile_span;
      tile_copy = ytiled_to_linear_faster;
   } else {
      unreachable("unsupported tiling");
   }

#if defined(INLINE_SSE41)
   if (copy_type == ISL_MEMCPY_STREAMING_LOAD) {
      /* The hidden cacheline sized register used by movntdqa can apparently
       * give you stale data, so do an mfence to invalidate it.
       */
      _mm_mfence();
   }
#endif

   /* Round out to tile boundaries. */
   xt0 = ALIGN_DOWN(xt1, tw);
   xt3 = ALIGN_UP  (xt2, tw);
   yt0 = ALIGN_DOWN(yt1, th);
   yt3 = ALIGN_UP  (yt2, th);

   /* Loop over all tiles from which we have something to copy.
    * 'xt' and 'yt' are the origin of the source tile, whether copying
    * a full or partial tile.
    * tile_copy() copies one tile or partial tile.
    * Looping x inside y is the faster memory access pattern.
    */
   for (yt = yt0; yt < yt3; yt += th) {
      for (xt = xt0; xt < xt3; xt += tw) {
         /* The area to update is [x0,x3) x [y0,y1).
          * May not want the whole tile, hence the min and max.
          */
         uint32_t x0 = MAX2(xt1, xt);
         uint32_t y0 = MAX2(yt1, yt);
         uint32_t x3 = MIN2(xt2, xt + tw);
         uint32_t y1 = MIN2(yt2, yt + th);

         /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
          * the middle interval is the longest span-aligned part.
          * The sub-ranges could be empty.
          */
         uint32_t x1, x2;
         x1 = ALIGN_UP(x0, span);
         if (x1 > x3)
            x1 = x2 = x3;
         else
            x2 = ALIGN_DOWN(x3, span);

         assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
         assert(x1 - x0 < span && x3 - x2 < span);
         assert(x3 - x0 <= tw);
         assert((x2 - x1) % span == 0);

         /* Translate by (xt,yt) for single-tile copier. */
         tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
                   y0-yt, y1-yt,
                   dst + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * dst_pitch,
                   src + (ptrdiff_t)xt * th + (ptrdiff_t)yt * src_pitch,
                   dst_pitch,
                   swizzle_bit,
                   copy_type);
      }
   }
}