GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/intel/isl/isl_tiled_memcpy.c
/*
 * Mesa 3-D graphics library
 *
 * Copyright 2012 Intel Corporation
 * Copyright 2013 Google
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Chad Versace <[email protected]>
 *    Frank Henigman <[email protected]>
 */

#include <string.h>

#include "util/macros.h"
#include "main/macros.h"

#include "isl_priv.h"

#if defined(__SSSE3__)
#include <tmmintrin.h>
#elif defined(__SSE2__)
#include <emmintrin.h>
#endif

#define FILE_DEBUG_FLAG DEBUG_TEXTURE

#define ALIGN_DOWN(a, b) ROUND_DOWN_TO(a, b)
#define ALIGN_UP(a, b) ALIGN(a, b)

/* Tile dimensions. Width and span are in bytes, height is in pixels (i.e.
 * unitless). A "span" is the maximum number of bytes we can copy from linear
 * to tiled without needing to calculate a new destination address.
 */
static const uint32_t xtile_width = 512;
static const uint32_t xtile_height = 8;
static const uint32_t xtile_span = 64;
static const uint32_t ytile_width = 128;
static const uint32_t ytile_height = 32;
static const uint32_t ytile_span = 16;

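/* For orientation (editorial note, not from the original comments): both
 * layouts describe a 4 KB tile. An X tile is 8 rows of 512 bytes, so whole
 * rows are contiguous, but copies are limited to 64-byte spans, presumably so
 * the optional bit-6 address swizzle handled below cannot change in the
 * middle of a copy. A Y tile is 32 rows of 128 bytes stored as eight
 * 16-byte-wide columns, so only 16 consecutive bytes (one ytile_span) share a
 * contiguous destination run.
 */
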
static inline uint32_t
ror(uint32_t n, uint32_t d)
{
   return (n >> d) | (n << (32 - d));
}

// bswap32 already exists as a macro on some platforms (FreeBSD)
#ifndef bswap32
static inline uint32_t
bswap32(uint32_t n)
{
#if defined(HAVE___BUILTIN_BSWAP32)
   return __builtin_bswap32(n);
#else
   return (n >> 24) |
          ((n >> 8) & 0x0000ff00) |
          ((n << 8) & 0x00ff0000) |
          (n << 24);
#endif
}
#endif

/**
 * Copy RGBA to BGRA - swap R and B.
 */
static inline void *
rgba8_copy(void *dst, const void *src, size_t bytes)
{
   uint32_t *d = dst;
   uint32_t const *s = src;

   assert(bytes % 4 == 0);

   while (bytes >= 4) {
      *d = ror(bswap32(*s), 8);
      d += 1;
      s += 1;
      bytes -= 4;
   }
   return dst;
}
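
/* Worked example of the byte swizzle above (editorial note, little-endian):
 * a pixel stored in memory as the bytes R,G,B,A is read as the 32-bit value
 * 0xAABBGGRR; bswap32() turns it into 0xRRGGBBAA and ror(..., 8) into
 * 0xAARRGGBB, which is written back to memory as the bytes B,G,R,A.
 */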

#ifdef __SSSE3__
static const uint8_t rgba8_permutation[16] =
   { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };

static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   _mm_store_si128(dst,
                   _mm_shuffle_epi8(_mm_loadu_si128(src),
                                    *(__m128i *)rgba8_permutation));
}

static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   _mm_storeu_si128(dst,
                    _mm_shuffle_epi8(_mm_load_si128(src),
                                     *(__m128i *)rgba8_permutation));
}

#elif defined(__SSE2__)
static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   __m128i srcreg, dstreg, agmask, ag, rb, br;

   agmask = _mm_set1_epi32(0xFF00FF00);
   srcreg = _mm_loadu_si128((__m128i *)src);

   rb = _mm_andnot_si128(agmask, srcreg);
   ag = _mm_and_si128(agmask, srcreg);
   br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
                            _MM_SHUFFLE(2, 3, 0, 1));
   dstreg = _mm_or_si128(ag, br);

   _mm_store_si128((__m128i *)dst, dstreg);
}

static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   __m128i srcreg, dstreg, agmask, ag, rb, br;

   agmask = _mm_set1_epi32(0xFF00FF00);
   srcreg = _mm_load_si128((__m128i *)src);

   rb = _mm_andnot_si128(agmask, srcreg);
   ag = _mm_and_si128(agmask, srcreg);
   br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
                            _MM_SHUFFLE(2, 3, 0, 1));
   dstreg = _mm_or_si128(ag, br);

   _mm_storeu_si128((__m128i *)dst, dstreg);
}
#endif
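
/* Editorial note on the SSE2 fallback above: 0xFF00FF00 masks the alpha and
 * green bytes of each pixel, the andnot keeps the red and blue bytes, and the
 * two 16-bit shuffles with _MM_SHUFFLE(2, 3, 0, 1) swap adjacent 16-bit words
 * so red and blue trade places before the halves are OR'd back together.
 */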

/**
 * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.
 */
static inline void *
rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
{
   assert(bytes == 0 || !(((uintptr_t)dst) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_dst(dst + 0, src + 0);
      rgba8_copy_16_aligned_dst(dst + 16, src + 16);
      rgba8_copy_16_aligned_dst(dst + 32, src + 32);
      rgba8_copy_16_aligned_dst(dst + 48, src + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_dst(dst, src);
      src += 16;
      dst += 16;
      bytes -= 16;
   }
#endif

   rgba8_copy(dst, src, bytes);

   return dst;
}

/**
 * Copy RGBA to BGRA - swap R and B, with the source 16-byte aligned.
 */
static inline void *
rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
{
   assert(bytes == 0 || !(((uintptr_t)src) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_src(dst + 0, src + 0);
      rgba8_copy_16_aligned_src(dst + 16, src + 16);
      rgba8_copy_16_aligned_src(dst + 32, src + 32);
      rgba8_copy_16_aligned_src(dst + 48, src + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_src(dst, src);
      src += 16;
      dst += 16;
      bytes -= 16;
   }
#endif

   rgba8_copy(dst, src, bytes);

   return dst;
}
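
/* Editorial note: the bytes == 64 fast path above matches xtile_span, the
 * per-span copy size used by the X-tile copiers below; the Y-tile copiers
 * pass ytile_span (16 bytes) per call and are handled by the 16-byte loop.
 */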

/**
 * Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3).
 * These ranges are in bytes, i.e. pixels * bytes-per-pixel.
 * The first and last ranges must be shorter than a "span" (the longest linear
 * stretch within a tile) and the middle must equal a whole number of spans.
 * Ranges may be empty. The region copied must land entirely within one tile.
 * 'dst' is the start of the tile and 'src' is the corresponding
 * address to copy from, though copying begins at (x0, y0).
 * To enable swizzling, 'swizzle_bit' must be 1<<6; otherwise it must be zero.
 * Swizzling flips bit 6 in the copy destination offset when certain other
 * bits are set in it.
 */
typedef void (*tile_copy_fn)(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                             uint32_t y0, uint32_t y1,
                             char *dst, const char *src,
                             int32_t linear_pitch,
                             uint32_t swizzle_bit,
                             isl_memcpy_type copy_type);
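
/* Editorial example of the split above: copying bytes [100, 400) of a row
 * within an X tile (span = 64) gives x0 = 100, x1 = ALIGN_UP(100, 64) = 128,
 * x2 = ALIGN_DOWN(400, 64) = 384 and x3 = 400, i.e. a 28-byte head, four
 * whole 64-byte spans, and a 16-byte tail.
 */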

/**
 * Copy texture data from linear to X tile layout.
 *
 * \copydoc tile_copy_fn
 *
 * The mem_copy parameters allow the user to specify an alternative mem_copy
 * function that, for instance, may do RGBA -> BGRA swizzling. The first
 * function must handle any memory alignment while the second function must
 * only handle 16-byte alignment in whichever side (source or destination) is
 * tiled.
 */
static inline void
linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t src_pitch,
                 uint32_t swizzle_bit,
                 isl_mem_copy_fn mem_copy,
                 isl_mem_copy_fn mem_copy_align16)
{
   /* The copy destination offset for each range copied is the sum of
    * an X offset 'x0' or 'xo' and a Y offset 'yo.'
    */
   uint32_t xo, yo;

   src += (ptrdiff_t)y0 * src_pitch;

   for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
      /* Bits 9 and 10 of the copy destination offset control swizzling.
       * Only 'yo' contributes to those bits in the total offset,
       * so calculate 'swizzle' just once per row.
       * Move bits 9 and 10 three and four places respectively down
       * to bit 6 and xor them.
       */
      uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;

      mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0);

      for (xo = x1; xo < x2; xo += xtile_span) {
         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span);
      }

      mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);

      src += src_pitch;
   }
}
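
/* Editorial example, assuming bit-6 swizzling is enabled (swizzle_bit ==
 * 1 << 6): for the byte at x = 70 in row y = 1 of an X tile the plain offset
 * is yo + x = 512 + 70 = 582; bit 9 is set and bit 10 is clear, so bit 6 is
 * flipped and the byte actually lands at offset 582 ^ 64 = 518.
 */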

/**
 * Copy texture data from linear to Y tile layout.
 *
 * \copydoc tile_copy_fn
 */
static inline void
linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y3,
                 char *dst, const char *src,
                 int32_t src_pitch,
                 uint32_t swizzle_bit,
                 isl_mem_copy_fn mem_copy,
                 isl_mem_copy_fn mem_copy_align16)
{
   /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
    * as the tile). Thus the destination offset for (x,y) is the sum of:
    *   (x % column_width)                    // position within column
    *   (x / column_width) * bytes_per_column // column number * bytes per column
    *   y * column_width
    *
    * The copy destination offset for each range copied is the sum of
    * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
    */
   const uint32_t column_width = ytile_span;
   const uint32_t bytes_per_column = column_width * ytile_height;

   uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
   uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));

   uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
   uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;

   /* Bit 9 of the destination offset controls swizzling.
    * Only the X offset contributes to bit 9 of the total offset,
    * so swizzle can be calculated in advance for these X positions.
    * Move bit 9 three places down to bit 6.
    */
   uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
   uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;

   uint32_t x, yo;

   src += (ptrdiff_t)y0 * src_pitch;

   if (y0 != y1) {
      for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
         uint32_t xo = xo1;
         uint32_t swizzle = swizzle1;

         mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);

         /* Step by spans/columns. As it happens, the swizzle bit flips
          * at each step so we don't need to calculate it explicitly.
          */
         for (x = x1; x < x2; x += ytile_span) {
            mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
            xo += bytes_per_column;
            swizzle ^= swizzle_bit;
         }

         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);

         src += src_pitch;
      }
   }

   for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
      uint32_t xo = xo1;
      uint32_t swizzle = swizzle1;

      if (x0 != x1) {
         mem_copy(dst + ((xo0 + yo + 0 * column_width) ^ swizzle0), src + x0 + 0 * src_pitch, x1 - x0);
         mem_copy(dst + ((xo0 + yo + 1 * column_width) ^ swizzle0), src + x0 + 1 * src_pitch, x1 - x0);
         mem_copy(dst + ((xo0 + yo + 2 * column_width) ^ swizzle0), src + x0 + 2 * src_pitch, x1 - x0);
         mem_copy(dst + ((xo0 + yo + 3 * column_width) ^ swizzle0), src + x0 + 3 * src_pitch, x1 - x0);
      }

      /* Step by spans/columns. As it happens, the swizzle bit flips
       * at each step so we don't need to calculate it explicitly.
       */
      for (x = x1; x < x2; x += ytile_span) {
         mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x + 0 * src_pitch, ytile_span);
         mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x + 1 * src_pitch, ytile_span);
         mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x + 2 * src_pitch, ytile_span);
         mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x + 3 * src_pitch, ytile_span);
         xo += bytes_per_column;
         swizzle ^= swizzle_bit;
      }

      if (x2 != x3) {
         mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x2 + 0 * src_pitch, x3 - x2);
         mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x2 + 1 * src_pitch, x3 - x2);
         mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x2 + 2 * src_pitch, x3 - x2);
         mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x2 + 3 * src_pitch, x3 - x2);
      }

      src += 4 * src_pitch;
   }

   if (y2 != y3) {
      for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
         uint32_t xo = xo1;
         uint32_t swizzle = swizzle1;

         mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);

         /* Step by spans/columns. As it happens, the swizzle bit flips
          * at each step so we don't need to calculate it explicitly.
          */
         for (x = x1; x < x2; x += ytile_span) {
            mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
            xo += bytes_per_column;
            swizzle ^= swizzle_bit;
         }

         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);

         src += src_pitch;
      }
   }
}
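
/* Editorial example of the Y-tile addressing above: for byte x = 35 in row
 * y = 5 of a Y tile, column_width = 16 and bytes_per_column = 512, so the
 * offset is (35 % 16) + (35 / 16) * 512 + 5 * 16 = 3 + 1024 + 80 = 1107.
 */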

/**
 * Copy texture data from X tile layout to linear.
 *
 * \copydoc tile_copy_fn
 */
static inline void
xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t dst_pitch,
                 uint32_t swizzle_bit,
                 isl_mem_copy_fn mem_copy,
                 isl_mem_copy_fn mem_copy_align16)
{
   /* The copy source offset for each range copied is the sum of
    * an X offset 'x0' or 'xo' and a Y offset 'yo.'
    */
   uint32_t xo, yo;

   dst += (ptrdiff_t)y0 * dst_pitch;

   for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
      /* Bits 9 and 10 of the copy source offset control swizzling.
       * Only 'yo' contributes to those bits in the total offset,
       * so calculate 'swizzle' just once per row.
       * Move bits 9 and 10 three and four places respectively down
       * to bit 6 and xor them.
       */
      uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;

      mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0);

      for (xo = x1; xo < x2; xo += xtile_span) {
         mem_copy_align16(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span);
      }

      mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);

      dst += dst_pitch;
   }
}

/**
 * Copy texture data from Y tile layout to linear.
 *
 * \copydoc tile_copy_fn
 */
static inline void
ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y3,
                 char *dst, const char *src,
                 int32_t dst_pitch,
                 uint32_t swizzle_bit,
                 isl_mem_copy_fn mem_copy,
                 isl_mem_copy_fn mem_copy_align16)
{
   /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
    * as the tile). Thus the source offset for (x,y) is the sum of:
    *   (x % column_width)                    // position within column
    *   (x / column_width) * bytes_per_column // column number * bytes per column
    *   y * column_width
    *
    * The copy source offset for each range copied is the sum of
    * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
    */
   const uint32_t column_width = ytile_span;
   const uint32_t bytes_per_column = column_width * ytile_height;

   uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
   uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));

   uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
   uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;

   /* Bit 9 of the source offset controls swizzling.
    * Only the X offset contributes to bit 9 of the total offset,
    * so swizzle can be calculated in advance for these X positions.
    * Move bit 9 three places down to bit 6.
    */
   uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
   uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;

   uint32_t x, yo;

   dst += (ptrdiff_t)y0 * dst_pitch;

   if (y0 != y1) {
      for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
         uint32_t xo = xo1;
         uint32_t swizzle = swizzle1;

         mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);

         /* Step by spans/columns. As it happens, the swizzle bit flips
          * at each step so we don't need to calculate it explicitly.
          */
         for (x = x1; x < x2; x += ytile_span) {
            mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
            xo += bytes_per_column;
            swizzle ^= swizzle_bit;
         }

         mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);

         dst += dst_pitch;
      }
   }

   for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
      uint32_t xo = xo1;
      uint32_t swizzle = swizzle1;

      if (x0 != x1) {
         mem_copy(dst + x0 + 0 * dst_pitch, src + ((xo0 + yo + 0 * column_width) ^ swizzle0), x1 - x0);
         mem_copy(dst + x0 + 1 * dst_pitch, src + ((xo0 + yo + 1 * column_width) ^ swizzle0), x1 - x0);
         mem_copy(dst + x0 + 2 * dst_pitch, src + ((xo0 + yo + 2 * column_width) ^ swizzle0), x1 - x0);
         mem_copy(dst + x0 + 3 * dst_pitch, src + ((xo0 + yo + 3 * column_width) ^ swizzle0), x1 - x0);
      }

      /* Step by spans/columns. As it happens, the swizzle bit flips
       * at each step so we don't need to calculate it explicitly.
       */
      for (x = x1; x < x2; x += ytile_span) {
         mem_copy_align16(dst + x + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), ytile_span);
         mem_copy_align16(dst + x + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), ytile_span);
         mem_copy_align16(dst + x + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), ytile_span);
         mem_copy_align16(dst + x + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), ytile_span);
         xo += bytes_per_column;
         swizzle ^= swizzle_bit;
      }

      if (x2 != x3) {
         mem_copy_align16(dst + x2 + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), x3 - x2);
         mem_copy_align16(dst + x2 + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), x3 - x2);
         mem_copy_align16(dst + x2 + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), x3 - x2);
         mem_copy_align16(dst + x2 + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), x3 - x2);
      }

      dst += 4 * dst_pitch;
   }

   if (y2 != y3) {
      for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
         uint32_t xo = xo1;
         uint32_t swizzle = swizzle1;

         mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);

         /* Step by spans/columns. As it happens, the swizzle bit flips
          * at each step so we don't need to calculate it explicitly.
          */
         for (x = x1; x < x2; x += ytile_span) {
            mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
            xo += bytes_per_column;
            swizzle ^= swizzle_bit;
         }

         mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);

         dst += dst_pitch;
      }
   }
}

#if defined(INLINE_SSE41)
static ALWAYS_INLINE void *
_memcpy_streaming_load(void *dest, const void *src, size_t count)
{
   if (count == 16) {
      __m128i val = _mm_stream_load_si128((__m128i *)src);
      _mm_storeu_si128((__m128i *)dest, val);
      return dest;
   } else if (count == 64) {
      __m128i val0 = _mm_stream_load_si128(((__m128i *)src) + 0);
      __m128i val1 = _mm_stream_load_si128(((__m128i *)src) + 1);
      __m128i val2 = _mm_stream_load_si128(((__m128i *)src) + 2);
      __m128i val3 = _mm_stream_load_si128(((__m128i *)src) + 3);
      _mm_storeu_si128(((__m128i *)dest) + 0, val0);
      _mm_storeu_si128(((__m128i *)dest) + 1, val1);
      _mm_storeu_si128(((__m128i *)dest) + 2, val2);
      _mm_storeu_si128(((__m128i *)dest) + 3, val3);
      return dest;
   } else {
      assert(count < 64); /* and (count < 16) for ytiled */
      return memcpy(dest, src, count);
   }
}
#endif
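
/* Editorial note: _mm_stream_load_si128 is the SSE4.1 movntdqa instruction,
 * which is mainly useful for reading back from write-combined memory such as
 * a mapped tiled surface; hence the SSE4.1 guard here and the _mm_mfence()
 * before the streaming-load path in tiled_to_linear() below.
 */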

static isl_mem_copy_fn
choose_copy_function(isl_memcpy_type copy_type)
{
   switch (copy_type) {
   case ISL_MEMCPY:
      return memcpy;
   case ISL_MEMCPY_BGRA8:
      return rgba8_copy;
   case ISL_MEMCPY_STREAMING_LOAD:
#if defined(INLINE_SSE41)
      return _memcpy_streaming_load;
#else
      unreachable("ISL_MEMCPY_STREAMING_LOAD requires sse4.1");
#endif
   case ISL_MEMCPY_INVALID:
      unreachable("invalid copy_type");
   }
   unreachable("unhandled copy_type");
   return NULL;
}

/**
 * Copy texture data from linear to X tile layout, faster.
 *
 * Same as \ref linear_to_xtiled but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t src_pitch,
                        uint32_t swizzle_bit,
                        isl_memcpy_type copy_type)
{
   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);

   if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
      if (mem_copy == memcpy)
         return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit,
                                 memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   }
   linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                    dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
}

/**
 * Copy texture data from linear to Y tile layout, faster.
 *
 * Same as \ref linear_to_ytiled but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t src_pitch,
                        uint32_t swizzle_bit,
                        isl_memcpy_type copy_type)
{
   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);

   if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
      if (mem_copy == memcpy)
         return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   }
   linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                    dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
}

/**
 * Copy texture data from X tile layout to linear, faster.
 *
 * Same as \ref xtiled_to_linear but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t dst_pitch,
                        uint32_t swizzle_bit,
                        isl_memcpy_type copy_type)
{
   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);

   if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
      if (mem_copy == memcpy)
         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
#if defined(INLINE_SSE41)
      else if (mem_copy == _memcpy_streaming_load)
         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, dst_pitch, swizzle_bit,
                                 memcpy, _memcpy_streaming_load);
#endif
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
#if defined(INLINE_SSE41)
      else if (mem_copy == _memcpy_streaming_load)
         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit,
                                 memcpy, _memcpy_streaming_load);
#endif
      else
         unreachable("not reached");
   }
   xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                    dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
}

/**
 * Copy texture data from Y tile layout to linear, faster.
 *
 * Same as \ref ytiled_to_linear but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t dst_pitch,
                        uint32_t swizzle_bit,
                        isl_memcpy_type copy_type)
{
   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);

   if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
      if (mem_copy == memcpy)
         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
#if defined(INLINE_SSE41)
      else if (copy_type == ISL_MEMCPY_STREAMING_LOAD)
         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, dst_pitch, swizzle_bit,
                                 memcpy, _memcpy_streaming_load);
#endif
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
#if defined(INLINE_SSE41)
      else if (copy_type == ISL_MEMCPY_STREAMING_LOAD)
         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit,
                                 memcpy, _memcpy_streaming_load);
#endif
      else
         unreachable("not reached");
   }
   ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                    dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
}

/**
 * Copy from linear to tiled texture.
 *
 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
 * pieces that do not cross tile boundaries and copy each piece with a tile
 * copy function (\ref tile_copy_fn).
 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
 * The Y range is in pixels (i.e. unitless).
 * 'dst' is the address of (0, 0) in the destination tiled texture.
 * 'src' is the address of (xt1, yt1) in the source linear texture.
 */
static void
linear_to_tiled(uint32_t xt1, uint32_t xt2,
                uint32_t yt1, uint32_t yt2,
                char *dst, const char *src,
                uint32_t dst_pitch, int32_t src_pitch,
                bool has_swizzling,
                enum isl_tiling tiling,
                isl_memcpy_type copy_type)
{
   tile_copy_fn tile_copy;
   uint32_t xt0, xt3;
   uint32_t yt0, yt3;
   uint32_t xt, yt;
   uint32_t tw, th, span;
   uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;

   if (tiling == ISL_TILING_X) {
      tw = xtile_width;
      th = xtile_height;
      span = xtile_span;
      tile_copy = linear_to_xtiled_faster;
   } else if (tiling == ISL_TILING_Y0) {
      tw = ytile_width;
      th = ytile_height;
      span = ytile_span;
      tile_copy = linear_to_ytiled_faster;
   } else {
      unreachable("unsupported tiling");
   }

   /* Round out to tile boundaries. */
   xt0 = ALIGN_DOWN(xt1, tw);
   xt3 = ALIGN_UP (xt2, tw);
   yt0 = ALIGN_DOWN(yt1, th);
   yt3 = ALIGN_UP (yt2, th);

   /* Loop over all tiles to which we have something to copy.
    * 'xt' and 'yt' are the origin of the destination tile, whether copying
    * a full or partial tile.
    * tile_copy() copies one tile or partial tile.
    * Looping x inside y is the faster memory access pattern.
    */
   for (yt = yt0; yt < yt3; yt += th) {
      for (xt = xt0; xt < xt3; xt += tw) {
         /* The area to update is [x0,x3) x [y0,y1).
          * May not want the whole tile, hence the min and max.
          */
         uint32_t x0 = MAX2(xt1, xt);
         uint32_t y0 = MAX2(yt1, yt);
         uint32_t x3 = MIN2(xt2, xt + tw);
         uint32_t y1 = MIN2(yt2, yt + th);

         /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
          * the middle interval is the longest span-aligned part.
          * The sub-ranges could be empty.
          */
         uint32_t x1, x2;
         x1 = ALIGN_UP(x0, span);
         if (x1 > x3)
            x1 = x2 = x3;
         else
            x2 = ALIGN_DOWN(x3, span);

         assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
         assert(x1 - x0 < span && x3 - x2 < span);
         assert(x3 - x0 <= tw);
         assert((x2 - x1) % span == 0);

         /* Translate by (xt,yt) for single-tile copier. */
         tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
                   y0-yt, y1-yt,
                   dst + (ptrdiff_t)xt * th + (ptrdiff_t)yt * dst_pitch,
                   src + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * src_pitch,
                   src_pitch,
                   swizzle_bit,
                   copy_type);
      }
   }
}
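
/* Editorial example of the decomposition above: with X tiling (tw = 512), an
 * X range [xt1, xt2) = [100, 1500) rounds out to tile origins xt = 0, 512 and
 * 1024; the first tile copies the partial range [100, 512), the second the
 * whole range [512, 1024), and the third the partial range [1024, 1500), and
 * each of those is further split into spans by the x1/x2 computation above.
 */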

/**
 * Copy from tiled to linear texture.
 *
 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
 * pieces that do not cross tile boundaries and copy each piece with a tile
 * copy function (\ref tile_copy_fn).
 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
 * The Y range is in pixels (i.e. unitless).
 * 'dst' is the address of (xt1, yt1) in the destination linear texture.
 * 'src' is the address of (0, 0) in the source tiled texture.
 */
static void
tiled_to_linear(uint32_t xt1, uint32_t xt2,
                uint32_t yt1, uint32_t yt2,
                char *dst, const char *src,
                int32_t dst_pitch, uint32_t src_pitch,
                bool has_swizzling,
                enum isl_tiling tiling,
                isl_memcpy_type copy_type)
{
   tile_copy_fn tile_copy;
   uint32_t xt0, xt3;
   uint32_t yt0, yt3;
   uint32_t xt, yt;
   uint32_t tw, th, span;
   uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;

   if (tiling == ISL_TILING_X) {
      tw = xtile_width;
      th = xtile_height;
      span = xtile_span;
      tile_copy = xtiled_to_linear_faster;
   } else if (tiling == ISL_TILING_Y0) {
      tw = ytile_width;
      th = ytile_height;
      span = ytile_span;
      tile_copy = ytiled_to_linear_faster;
   } else {
      unreachable("unsupported tiling");
   }

#if defined(INLINE_SSE41)
   if (copy_type == ISL_MEMCPY_STREAMING_LOAD) {
      /* The hidden cacheline sized register used by movntdqa can apparently
       * give you stale data, so do an mfence to invalidate it.
       */
      _mm_mfence();
   }
#endif

   /* Round out to tile boundaries. */
   xt0 = ALIGN_DOWN(xt1, tw);
   xt3 = ALIGN_UP (xt2, tw);
   yt0 = ALIGN_DOWN(yt1, th);
   yt3 = ALIGN_UP (yt2, th);

   /* Loop over all tiles from which we have something to copy.
    * 'xt' and 'yt' are the origin of the source tile, whether copying
    * a full or partial tile.
    * tile_copy() copies one tile or partial tile.
    * Looping x inside y is the faster memory access pattern.
    */
   for (yt = yt0; yt < yt3; yt += th) {
      for (xt = xt0; xt < xt3; xt += tw) {
         /* The area to update is [x0,x3) x [y0,y1).
          * May not want the whole tile, hence the min and max.
          */
         uint32_t x0 = MAX2(xt1, xt);
         uint32_t y0 = MAX2(yt1, yt);
         uint32_t x3 = MIN2(xt2, xt + tw);
         uint32_t y1 = MIN2(yt2, yt + th);

         /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
          * the middle interval is the longest span-aligned part.
          * The sub-ranges could be empty.
          */
         uint32_t x1, x2;
         x1 = ALIGN_UP(x0, span);
         if (x1 > x3)
            x1 = x2 = x3;
         else
            x2 = ALIGN_DOWN(x3, span);

         assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
         assert(x1 - x0 < span && x3 - x2 < span);
         assert(x3 - x0 <= tw);
         assert((x2 - x1) % span == 0);

         /* Translate by (xt,yt) for single-tile copier. */
         tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
                   y0-yt, y1-yt,
                   dst + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * dst_pitch,
                   src + (ptrdiff_t)xt * th + (ptrdiff_t)yt * src_pitch,
                   dst_pitch,
                   swizzle_bit,
                   copy_type);
      }
   }
}