// Source: GitHub repository PojavLauncherTeam/mesa
// Path: blob/21.2-virgl/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
1
/****************************************************************************
2
* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
3
*
4
* Permission is hereby granted, free of charge, to any person obtaining a
5
* copy of this software and associated documentation files (the "Software"),
6
* to deal in the Software without restriction, including without limitation
7
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
* and/or sell copies of the Software, and to permit persons to whom the
9
* Software is furnished to do so, subject to the following conditions:
10
*
11
* The above copyright notice and this permission notice (including the next
12
* paragraph) shall be included in all copies or substantial portions of the
13
* Software.
14
*
15
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21
* IN THE SOFTWARE.
22
*
23
* @file StoreTile.h
24
*
25
* @brief Functionality for Store.
26
*
27
******************************************************************************/
28
#pragma once
29
30
#include "common/os.h"
#include "common/formats.h"
#include "core/context.h"
#include "core/rdtsc_core.h"
#include "core/format_conversion.h"

#include "memory/TilingFunctions.h"
#include "memory/Convert.h"
#include "memory/SurfaceState.h"
#include "core/multisample.h"

#include <array>
#include <cstring>
#include <sstream>
43
44
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
45
46
// Function pointer to different storing functions for color, depth, and stencil based on incoming formats.
47
typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t);
48
49
//////////////////////////////////////////////////////////////////////////
50
/// Store Raster Tile Function Tables.
51
//////////////////////////////////////////////////////////////////////////
52
extern PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
53
extern PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
54
extern PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
55
56
void InitStoreTilesTable_Linear_1();
57
void InitStoreTilesTable_Linear_2();
58
void InitStoreTilesTable_TileX_1();
59
void InitStoreTilesTable_TileX_2();
60
void InitStoreTilesTable_TileY_1();
61
void InitStoreTilesTable_TileY_2();
62
void InitStoreTilesTable_TileW();
63
void InitStoreTilesTable();
64
65
//////////////////////////////////////////////////////////////////////////
/// StorePixels
/// @brief Stores one raster tile of AOS pixels to its destination rows.
///        The primary template is deliberately unusable (Store is
///        deleted); only the explicit specializations may be called.
/// @tparam PixelSize - Bits per pixel of the destination format.
/// @tparam NumDests  - Number of destination pointers.
/// @param pSrc   - Pointer to source raster tile in SWR-Z pixel order.
/// @param ppDsts - Array of destination pointers.
//////////////////////////////////////////////////////////////////////////
template <size_t PixelSize, size_t NumDests>
struct StorePixels
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete;
};

namespace StorePixelsDetail
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Unswizzles pixel pairs from SWR-Z order into destination rows.
    ///        Destinations are consumed two at a time as (row 0, row 1).
    ///        Each such pair of destinations takes four consecutive source
    ///        pairs: pairs 0 and 2 go to row 0, pairs 1 and 3 go to row 1.
    ///        Copies go through std::memcpy so that neither the source nor
    ///        the destination has to be aligned to the pair size, and no
    ///        byte buffer is accessed through a wider integer type.
    /// @tparam PairBytes - Size in bytes of two adjacent pixels.
    /// @tparam NumDests  - Number of destination pointers (must be even).
    /// @param pSrc   - Source tile; NumDests * 2 * PairBytes bytes are read.
    /// @param ppDsts - Destination pointers; 2 * PairBytes bytes are
    ///                 written through each.
    //////////////////////////////////////////////////////////////////////////
    template <size_t PairBytes, size_t NumDests>
    inline void StorePairsUnswizzled(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
    {
        static_assert(NumDests % 2 == 0, "destination pointers come in (row 0, row 1) pairs");

        for (size_t col = 0; col < NumDests / 2; ++col)
        {
            const uint8_t* pQuad = pSrc + col * 4 * PairBytes;
            uint8_t*       pRow0 = ppDsts[2 * col + 0];
            uint8_t*       pRow1 = ppDsts[2 * col + 1];

            std::memcpy(pRow0,             pQuad + 0 * PairBytes, PairBytes);
            std::memcpy(pRow0 + PairBytes, pQuad + 2 * PairBytes, PairBytes);
            std::memcpy(pRow1,             pQuad + 1 * PairBytes, PairBytes);
            std::memcpy(pRow1 + PairBytes, pQuad + 3 * PairBytes, PairBytes);
        }
    }
} // namespace StorePixelsDetail

//////////////////////////////////////////////////////////////////////////
/// StorePixels (8-bit pixel specialization, two destinations)
/// @brief Stores 8 one-byte pixels: 4 bytes to each of two rows.
/// @param pSrc   - Pointer to source raster tile in SWR-Z pixel order.
/// @param ppDsts - ppDsts[0] receives source byte pairs 0 and 2,
///                 ppDsts[1] receives source byte pairs 1 and 3.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<8, 2>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
    {
        // Two 8-bit pixels per pair -> 2 bytes.
        StorePixelsDetail::StorePairsUnswizzled<2>(pSrc, ppDsts);
    }
};

//////////////////////////////////////////////////////////////////////////
/// StorePixels (8-bit pixel specialization, four destinations)
/// @brief Stores 16 one-byte pixels (16 source bytes): 4 bytes through
///        each of four destination pointers.
/// @param pSrc   - Pointer to source raster tile in SWR-Z pixel order.
/// @param ppDsts - In units of 2-byte source pairs: [0] <- 0,2  [1] <- 1,3
///                 [2] <- 4,6  [3] <- 5,7.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<8, 4>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    {
        StorePixelsDetail::StorePairsUnswizzled<2>(pSrc, ppDsts);
    }
};

//////////////////////////////////////////////////////////////////////////
/// StorePixels (16-bit pixel specialization, two destinations)
/// @brief Stores 8 two-byte pixels: 8 bytes to each of two rows.
/// @param pSrc   - Pointer to source raster tile in SWR-Z pixel order.
/// @param ppDsts - ppDsts[0] receives 4-byte source pairs 0 and 2,
///                 ppDsts[1] receives 4-byte source pairs 1 and 3.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<16, 2>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
    {
        // Two 16-bit pixels per pair -> 4 bytes.
        StorePixelsDetail::StorePairsUnswizzled<4>(pSrc, ppDsts);
    }
};

//////////////////////////////////////////////////////////////////////////
/// StorePixels (16-bit pixel specialization, four destinations)
/// @brief Stores 16 two-byte pixels (32 source bytes): 8 bytes through
///        each of four destination pointers.
/// @param pSrc   - Pointer to source raster tile in SWR-Z pixel order.
/// @param ppDsts - In units of 4-byte source pairs: [0] <- 0,2  [1] <- 1,3
///                 [2] <- 4,6  [3] <- 5,7.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<16, 4>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    {
        StorePixelsDetail::StorePairsUnswizzled<4>(pSrc, ppDsts);
    }
};
185
186
//////////////////////////////////////////////////////////////////////////
187
/// StorePixels (32-bit pixel specialization)
188
/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
189
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
190
/// @param ppDsts - Array of destination pointers. Each pointer is
191
/// to a single row of at most 16B.
192
/// @tparam NumDests - Number of destination pointers. Each pair of
193
/// pointers is for a 16-byte column of two rows.
194
//////////////////////////////////////////////////////////////////////////
195
template <>
196
struct StorePixels<32, 2>
197
{
198
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
199
{
200
// Each 4-pixel row is 16-bytes
201
simd4scalari *pZRow01 = (simd4scalari*)pSrc;
202
simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
203
simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);
204
205
simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
206
simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);
207
208
SIMD128::storeu_si((simd4scalari*)ppDsts[0], vRow00);
209
SIMD128::storeu_si((simd4scalari*)ppDsts[1], vRow10);
210
}
211
};
212
213
template <>
214
struct StorePixels<32, 4>
215
{
216
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
217
{
218
// 4 x 16 bytes = 64 bytes, 16 pixels
219
const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
220
221
simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
222
223
// Unswizzle from SWR-Z order
224
simd4scalari quad0 = SIMD128::load_si(&pSrc128[0]); // 0 1 2 3
225
simd4scalari quad1 = SIMD128::load_si(&pSrc128[1]); // 4 5 6 7
226
simd4scalari quad2 = SIMD128::load_si(&pSrc128[2]); // 8 9 A B
227
simd4scalari quad3 = SIMD128::load_si(&pSrc128[3]); // C D E F
228
229
SIMD128::storeu_si(ppDsts128[0], SIMD128::unpacklo_epi64(quad0, quad1)); // 0 1 4 5
230
SIMD128::storeu_si(ppDsts128[1], SIMD128::unpackhi_epi64(quad0, quad1)); // 2 3 6 7
231
SIMD128::storeu_si(ppDsts128[2], SIMD128::unpacklo_epi64(quad2, quad3)); // 8 9 C D
232
SIMD128::storeu_si(ppDsts128[3], SIMD128::unpackhi_epi64(quad2, quad3)); // A B E F
233
}
234
};
235
236
//////////////////////////////////////////////////////////////////////////
237
/// StorePixels (32-bit pixel specialization)
238
/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
239
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
240
/// @param ppDsts - Array of destination pointers. Each pointer is
241
/// to a single row of at most 16B.
242
/// @tparam NumDests - Number of destination pointers. Each pair of
243
/// pointers is for a 16-byte column of two rows.
244
//////////////////////////////////////////////////////////////////////////
245
template <>
246
struct StorePixels<64, 4>
247
{
248
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
249
{
250
// Each 4-pixel row is 32 bytes.
251
const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
252
253
// order of pointers match SWR-Z layout
254
simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
255
*pvDsts[0] = pPixSrc[0];
256
*pvDsts[1] = pPixSrc[1];
257
*pvDsts[2] = pPixSrc[2];
258
*pvDsts[3] = pPixSrc[3];
259
}
260
};
261
262
template <>
263
struct StorePixels<64, 8>
264
{
265
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
266
{
267
// 8 x 16 bytes = 128 bytes, 16 pixels
268
const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
269
270
simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
271
272
// order of pointers match SWR-Z layout
273
*ppDsts128[0] = pSrc128[0]; // 0 1
274
*ppDsts128[1] = pSrc128[1]; // 2 3
275
*ppDsts128[2] = pSrc128[2]; // 4 5
276
*ppDsts128[3] = pSrc128[3]; // 6 7
277
*ppDsts128[4] = pSrc128[4]; // 8 9
278
*ppDsts128[5] = pSrc128[5]; // A B
279
*ppDsts128[6] = pSrc128[6]; // C D
280
*ppDsts128[7] = pSrc128[7]; // E F
281
}
282
};
283
284
//////////////////////////////////////////////////////////////////////////
285
/// StorePixels (32-bit pixel specialization)
286
/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
287
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
288
/// @param ppDsts - Array of destination pointers. Each pointer is
289
/// to a single row of at most 16B.
290
/// @tparam NumDests - Number of destination pointers. Each pair of
291
/// pointers is for a 16-byte column of two rows.
292
//////////////////////////////////////////////////////////////////////////
293
template <>
294
struct StorePixels<128, 8>
295
{
296
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
297
{
298
// Each 4-pixel row is 64 bytes.
299
const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
300
301
// Unswizzle from SWR-Z order
302
simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
303
*pvDsts[0] = pPixSrc[0];
304
*pvDsts[1] = pPixSrc[2];
305
*pvDsts[2] = pPixSrc[1];
306
*pvDsts[3] = pPixSrc[3];
307
*pvDsts[4] = pPixSrc[4];
308
*pvDsts[5] = pPixSrc[6];
309
*pvDsts[6] = pPixSrc[5];
310
*pvDsts[7] = pPixSrc[7];
311
}
312
};
313
314
template <>
315
struct StorePixels<128, 16>
316
{
317
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
318
{
319
// 16 x 16 bytes = 256 bytes, 16 pixels
320
const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
321
322
simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
323
324
for (uint32_t i = 0; i < 16; i += 4)
325
{
326
*ppDsts128[i + 0] = pSrc128[i + 0];
327
*ppDsts128[i + 1] = pSrc128[i + 2];
328
*ppDsts128[i + 2] = pSrc128[i + 1];
329
*ppDsts128[i + 3] = pSrc128[i + 3];
330
}
331
}
332
};
333
334
//////////////////////////////////////////////////////////////////////////
335
/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
336
//////////////////////////////////////////////////////////////////////////
337
template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
338
struct ConvertPixelsSOAtoAOS
339
{
340
//////////////////////////////////////////////////////////////////////////
341
/// @brief Converts a SIMD from the Hot Tile to the destination format
342
/// and converts from SOA to AOS.
343
/// @param pSrc - Pointer to raster tile.
344
/// @param pDst - Pointer to destination surface or deswizzling buffer.
345
template <size_t NumDests>
346
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
347
{
348
static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
349
350
OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES] = {0};
351
OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES] = {0};
352
353
// Convert from SrcFormat --> DstFormat
354
simd16vector src;
355
LoadSOA<SrcFormat>(pSrc, src);
356
StoreSOA<DstFormat>(src, soaTile);
357
358
// Convert from SOA --> AOS
359
FormatTraits<DstFormat>::TransposeT::Transpose_simd16(soaTile, aosTile);
360
361
// Store data into destination
362
StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
363
}
364
};
365
366
//////////////////////////////////////////////////////////////////////////
367
/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
368
/// Specialization for no format conversion
369
//////////////////////////////////////////////////////////////////////////
370
template<SWR_FORMAT Format>
371
struct ConvertPixelsSOAtoAOS<Format, Format>
372
{
373
//////////////////////////////////////////////////////////////////////////
374
/// @brief Converts a SIMD from the Hot Tile to the destination format
375
/// and converts from SOA to AOS.
376
/// @param pSrc - Pointer to raster tile.
377
/// @param pDst - Pointer to destination surface or deswizzling buffer.
378
template <size_t NumDests>
379
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
380
{
381
static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
382
383
OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
384
385
// Convert from SOA --> AOS
386
FormatTraits<Format>::TransposeT::Transpose_simd16(pSrc, aosTile);
387
388
// Store data into destination
389
StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts);
390
}
391
};
392
393
//////////////////////////////////////////////////////////////////////////
394
/// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R6_UNORM
395
//////////////////////////////////////////////////////////////////////////
396
template<>
397
struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM >
398
{
399
//////////////////////////////////////////////////////////////////////////
400
/// @brief Converts a SIMD from the Hot Tile to the destination format
401
/// and converts from SOA to AOS.
402
/// @param pSrc - Pointer to raster tile.
403
/// @param pDst - Pointer to destination surface or deswizzling buffer.
404
template <size_t NumDests>
405
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
406
{
407
static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
408
static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
409
410
static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
411
412
OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
413
414
// Load hot-tile
415
simd16vector src, dst;
416
LoadSOA<SrcFormat>(pSrc, src);
417
418
// deswizzle
419
dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
420
dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
421
dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
422
423
// clamp
424
dst.x = Clamp<DstFormat>(dst.x, 0);
425
dst.y = Clamp<DstFormat>(dst.y, 1);
426
dst.z = Clamp<DstFormat>(dst.z, 2);
427
428
// normalize
429
dst.x = Normalize<DstFormat>(dst.x, 0);
430
dst.y = Normalize<DstFormat>(dst.y, 1);
431
dst.z = Normalize<DstFormat>(dst.z, 2);
432
433
// pack
434
simd16scalari packed = _simd16_castps_si(dst.x);
435
436
SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(0) == 5);
437
SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(1) == 6);
438
439
packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.y), 5));
440
packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.z), 5 + 6));
441
442
// pack low 16 bits of each 32 bit lane to low 128 bits of dst
443
uint32_t *pPacked = (uint32_t*)&packed;
444
uint16_t *pAosTile = (uint16_t*)&aosTile[0];
445
for (uint32_t t = 0; t < KNOB_SIMD16_WIDTH; ++t)
446
{
447
*pAosTile++ = *pPacked++;
448
}
449
450
// Store data into destination
451
StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
452
}
453
};
454
455
//////////////////////////////////////////////////////////////////////////
456
/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
457
//////////////////////////////////////////////////////////////////////////
458
template<>
459
struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
460
{
461
static const SWR_FORMAT SrcFormat = R32_FLOAT;
462
static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS;
463
464
//////////////////////////////////////////////////////////////////////////
465
/// @brief Converts a SIMD from the Hot Tile to the destination format
466
/// and converts from SOA to AOS.
467
/// @param pSrc - Pointer to raster tile.
468
/// @param pDst - Pointer to destination surface or deswizzling buffer.
469
template <size_t NumDests>
470
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
471
{
472
simd16scalar comp = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
473
474
// clamp
475
const simd16scalar zero = _simd16_setzero_ps();
476
const simd16scalar ones = _simd16_set1_ps(1.0f);
477
478
comp = _simd16_max_ps(comp, zero);
479
comp = _simd16_min_ps(comp, ones);
480
481
// normalize
482
comp = _simd16_mul_ps(comp, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
483
484
simd16scalari temp = _simd16_cvtps_epi32(comp);
485
486
// swizzle
487
temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
488
489
// merge/store data into destination but don't overwrite the X8 bits
490
simdscalari destlo = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]));
491
simdscalari desthi = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]));
492
493
simd16scalari dest = _simd16_setzero_si();
494
495
dest = _simd16_insert_si(dest, destlo, 0);
496
dest = _simd16_insert_si(dest, desthi, 1);
497
498
simd16scalari mask = _simd16_set1_epi32(0x00FFFFFF);
499
500
dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp));
501
502
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]), _simd16_extract_si(dest, 0));
503
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]), _simd16_extract_si(dest, 1));
504
}
505
};
506
507
template<SWR_FORMAT DstFormat>
508
INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
509
{
510
// swizzle rgba -> bgra while we load
511
simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
512
simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
513
simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
514
simd16scalar comp3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(3) * sizeof(simd16scalar))); // float32 aaaaaaaaaaaaaaaa
515
516
// clamp
517
const simd16scalar zero = _simd16_setzero_ps();
518
const simd16scalar ones = _simd16_set1_ps(1.0f);
519
520
comp0 = _simd16_max_ps(comp0, zero);
521
comp0 = _simd16_min_ps(comp0, ones);
522
523
comp1 = _simd16_max_ps(comp1, zero);
524
comp1 = _simd16_min_ps(comp1, ones);
525
526
comp2 = _simd16_max_ps(comp2, zero);
527
comp2 = _simd16_min_ps(comp2, ones);
528
529
comp3 = _simd16_max_ps(comp3, zero);
530
comp3 = _simd16_min_ps(comp3, ones);
531
532
// gamma-correct only rgb
533
if (FormatTraits<DstFormat>::isSRGB)
534
{
535
comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
536
comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
537
comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
538
}
539
540
// convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
541
comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
542
comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
543
comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
544
comp3 = _simd16_mul_ps(comp3, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
545
546
// moving to 16 wide integer vector types
547
simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
548
simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
549
simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
550
simd16scalari src3 = _simd16_cvtps_epi32(comp3); // padded byte aaaaaaaaaaaaaaaa
551
552
// SOA to AOS conversion
553
src1 = _simd16_slli_epi32(src1, 8);
554
src2 = _simd16_slli_epi32(src2, 16);
555
src3 = _simd16_slli_epi32(src3, 24);
556
557
simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), _simd16_or_si(src2, src3)); // 0 1 2 3 4 5 6 7 8 9 A B C D E F
558
559
// de-swizzle conversion
560
#if 1
561
simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
562
simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F
563
564
final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F
565
566
#else
567
final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
568
569
#endif
570
// store 8x2 memory order:
571
// row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
572
// row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
573
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
574
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
575
}
576
577
template<SWR_FORMAT DstFormat>
578
INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
579
{
580
static const uint32_t offset = sizeof(simdscalar);
581
582
// swizzle rgba -> bgra while we load
583
simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
584
simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
585
simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
586
simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa
587
588
// clamp
589
vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
590
vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
591
592
vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
593
vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
594
595
vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
596
vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
597
598
vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps());
599
vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f));
600
601
if (FormatTraits<DstFormat>::isSRGB)
602
{
603
// Gamma-correct only rgb
604
vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
605
vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
606
vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
607
}
608
609
// convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
610
vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
611
vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
612
vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
613
vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
614
615
// moving to 8 wide integer vector types
616
simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
617
simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
618
simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
619
simdscalari src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
620
621
#if KNOB_ARCH <= KNOB_ARCH_AVX
622
623
// splitting into two sets of 4 wide integer vector types
624
// because AVX doesn't have instructions to support this operation at 8 wide
625
simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
626
simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
627
simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
628
simd4scalari srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
629
630
simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
631
simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
632
simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
633
simd4scalari srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
634
635
srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
636
srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
637
srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
638
srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
639
srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
640
srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
641
642
srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
643
srcLo2 = SIMD128::or_si(srcLo2, srcLo3); // ab00ab00ab00ab00
644
645
srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
646
srcHi2 = SIMD128::or_si(srcHi2, srcHi3); // ab00ab00ab00ab00
647
648
srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // abgrabgrabgrabgr
649
srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // abgrabgrabgrabgr
650
651
// unpack into rows that get the tiling order correct
652
simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr
653
simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
654
655
simdscalari final = _mm256_castsi128_si256(vRow00);
656
final = _mm256_insertf128_si256(final, vRow10, 1);
657
658
#else
659
660
// logic is as above, only wider
661
src1 = _mm256_slli_si256(src1, 1);
662
src2 = _mm256_slli_si256(src2, 2);
663
src3 = _mm256_slli_si256(src3, 3);
664
665
src0 = _mm256_or_si256(src0, src1);
666
src2 = _mm256_or_si256(src2, src3);
667
668
simdscalari final = _mm256_or_si256(src0, src2);
669
670
// adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
671
final = _mm256_permute4x64_epi64(final, 0xD8);
672
#endif
673
674
_simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
675
}
676
677
template<SWR_FORMAT DstFormat>
678
INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
679
{
680
// swizzle rgba -> bgra while we load
681
simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
682
simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
683
simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
684
685
// clamp
686
const simd16scalar zero = _simd16_setzero_ps();
687
const simd16scalar ones = _simd16_set1_ps(1.0f);
688
689
comp0 = _simd16_max_ps(comp0, zero);
690
comp0 = _simd16_min_ps(comp0, ones);
691
692
comp1 = _simd16_max_ps(comp1, zero);
693
comp1 = _simd16_min_ps(comp1, ones);
694
695
comp2 = _simd16_max_ps(comp2, zero);
696
comp2 = _simd16_min_ps(comp2, ones);
697
698
// gamma-correct only rgb
699
if (FormatTraits<DstFormat>::isSRGB)
700
{
701
comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
702
comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
703
comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
704
}
705
706
// convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
707
comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
708
comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
709
comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
710
711
// moving to 16 wide integer vector types
712
simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
713
simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
714
simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
715
716
// SOA to AOS conversion
717
src1 = _simd16_slli_epi32(src1, 8);
718
src2 = _simd16_slli_epi32(src2, 16);
719
720
simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), src2); // 0 1 2 3 4 5 6 7 8 9 A B C D E F
721
722
// de-swizzle conversion
723
#if 1
724
simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
725
simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F
726
727
final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F
728
729
#else
730
final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
731
732
#endif
733
// store 8x2 memory order:
734
// row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
735
// row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
736
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
737
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
738
}
739
740
template<SWR_FORMAT DstFormat>
741
INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
742
{
743
static const uint32_t offset = sizeof(simdscalar);
744
745
// swizzle rgba -> bgra while we load
746
simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
747
simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
748
simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
749
// clamp
750
vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
751
vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
752
753
vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
754
vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
755
756
vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
757
vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
758
759
if (FormatTraits<DstFormat>::isSRGB)
760
{
761
// Gamma-correct only rgb
762
vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
763
vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
764
vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
765
}
766
767
// convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
768
vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
769
vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
770
vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
771
772
// moving to 8 wide integer vector types
773
simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
774
simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
775
simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
776
777
#if KNOB_ARCH <= KNOB_ARCH_AVX
778
779
// splitting into two sets of 4 wide integer vector types
780
// because AVX doesn't have instructions to support this operation at 8 wide
781
simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
782
simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
783
simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
784
785
simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
786
simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
787
simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
788
789
srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
790
srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
791
srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
792
srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
793
794
srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
795
796
srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
797
798
srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
799
srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
800
801
// unpack into rows that get the tiling order correct
802
simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
803
simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
804
805
simdscalari final = _mm256_castsi128_si256(vRow00);
806
final = _mm256_insertf128_si256(final, vRow10, 1);
807
808
#else
809
810
// logic is as above, only wider
811
src1 = _mm256_slli_si256(src1, 1);
812
src2 = _mm256_slli_si256(src2, 2);
813
814
src0 = _mm256_or_si256(src0, src1);
815
816
simdscalari final = _mm256_or_si256(src0, src2);
817
818
// adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
819
final = _mm256_permute4x64_epi64(final, 0xD8);
820
821
#endif
822
823
_simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
824
}
825
826
template<>
827
struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>
828
{
829
template <size_t NumDests>
830
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
831
{
832
FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
833
}
834
};
835
836
template<>
837
struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>
838
{
839
template <size_t NumDests>
840
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
841
{
842
FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
843
}
844
};
845
846
template<>
847
struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB >
848
{
849
template <size_t NumDests>
850
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
851
{
852
FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
853
}
854
};
855
856
template<>
857
struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB >
858
{
859
template <size_t NumDests>
860
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
861
{
862
FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
863
}
864
};
865
866
template<>
867
struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM >
868
{
869
template <size_t NumDests>
870
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
871
{
872
FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
873
}
874
};
875
876
template<>
877
struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM >
878
{
879
template <size_t NumDests>
880
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
881
{
882
FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
883
}
884
};
885
886
template<>
887
struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB >
888
{
889
template <size_t NumDests>
890
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
891
{
892
FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
893
}
894
};
895
896
template<>
897
struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB >
898
{
899
template <size_t NumDests>
900
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
901
{
902
FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
903
}
904
};
905
906
//////////////////////////////////////////////////////////////////////////
907
/// StoreRasterTile
908
//////////////////////////////////////////////////////////////////////////
909
template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
910
struct StoreRasterTile
911
{
912
//////////////////////////////////////////////////////////////////////////
913
/// @brief Retrieve color from hot tile source which is always float.
914
/// @param pSrc - Pointer to raster tile.
915
/// @param x, y - Coordinates to raster tile.
916
/// @param output - output color
917
INLINE static void GetSwizzledSrcColor(
918
uint8_t* pSrc,
919
uint32_t x, uint32_t y,
920
float outputColor[4])
921
{
922
typedef SimdTile_16<SrcFormat, DstFormat> SimdT;
923
924
SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc);
925
926
// Compute which simd tile we're accessing within 8x8 tile.
927
// i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
928
uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM);
929
930
SimdT *pSimdTile = &pSrcSimdTiles[simdIndex];
931
932
uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM);
933
934
pSimdTile->GetSwizzledColor(simdOffset, outputColor);
935
}
936
937
//////////////////////////////////////////////////////////////////////////
938
/// @brief Stores an 8x8 raster tile to the destination surface.
939
/// @param pSrc - Pointer to raster tile.
940
/// @param pDstSurface - Destination surface state
941
/// @param x, y - Coordinates to raster tile.
942
INLINE static void Store(
943
uint8_t *pSrc,
944
SWR_SURFACE_STATE* pDstSurface,
945
uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
946
{
947
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
948
uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
949
950
// For each raster tile pixel (rx, ry)
951
for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
952
{
953
for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
954
{
955
// Perform bounds checking.
956
if (((x + rx) < lodWidth) &&
957
((y + ry) < lodHeight))
958
{
959
float srcColor[4];
960
GetSwizzledSrcColor(pSrc, rx, ry, srcColor);
961
962
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
963
pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex,
964
sampleNum, pDstSurface->lod, pDstSurface);
965
{
966
ConvertPixelFromFloat<DstFormat>(pDst, srcColor);
967
}
968
}
969
}
970
}
971
}
972
973
//////////////////////////////////////////////////////////////////////////
974
/// @brief Resolves an 8x8 raster tile to the resolve destination surface.
975
/// @param pSrc - Pointer to raster tile.
976
/// @param pDstSurface - Destination surface state
977
/// @param x, y - Coordinates to raster tile.
978
/// @param sampleOffset - Offset between adjacent multisamples
979
INLINE static void Resolve(
980
uint8_t *pSrc,
981
SWR_SURFACE_STATE* pDstSurface,
982
uint32_t x, uint32_t y, uint32_t sampleOffset, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
983
{
984
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
985
uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
986
987
float oneOverNumSamples = 1.0f / pDstSurface->numSamples;
988
989
// For each raster tile pixel (rx, ry)
990
for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
991
{
992
for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
993
{
994
// Perform bounds checking.
995
if (((x + rx) < lodWidth) &&
996
((y + ry) < lodHeight))
997
{
998
// Sum across samples
999
float resolveColor[4] = {0};
1000
for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
1001
{
1002
float sampleColor[4] = {0};
1003
uint8_t *pSampleSrc = pSrc + sampleOffset * sampleNum;
1004
GetSwizzledSrcColor(pSampleSrc, rx, ry, sampleColor);
1005
resolveColor[0] += sampleColor[0];
1006
resolveColor[1] += sampleColor[1];
1007
resolveColor[2] += sampleColor[2];
1008
resolveColor[3] += sampleColor[3];
1009
}
1010
1011
// Divide by numSamples to average
1012
resolveColor[0] *= oneOverNumSamples;
1013
resolveColor[1] *= oneOverNumSamples;
1014
resolveColor[2] *= oneOverNumSamples;
1015
resolveColor[3] *= oneOverNumSamples;
1016
1017
// Use the resolve surface state
1018
SWR_SURFACE_STATE* pResolveSurface = (SWR_SURFACE_STATE*)pDstSurface->xpAuxBaseAddress;
1019
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
1020
pResolveSurface->arrayIndex + renderTargetArrayIndex, pResolveSurface->arrayIndex + renderTargetArrayIndex,
1021
0, pResolveSurface->lod, pResolveSurface);
1022
{
1023
ConvertPixelFromFloat<DstFormat>(pDst, resolveColor);
1024
}
1025
}
1026
}
1027
}
1028
}
1029
1030
};
1031
1032
template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1033
struct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat>
1034
{};
1035
1036
//////////////////////////////////////////////////////////////////////////
1037
/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp
1038
//////////////////////////////////////////////////////////////////////////
1039
template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1040
struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat>
1041
{
1042
typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile;
1043
static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1044
static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1045
1046
//////////////////////////////////////////////////////////////////////////
1047
/// @brief Stores an 8x8 raster tile to the destination surface.
1048
/// @param pSrc - Pointer to raster tile.
1049
/// @param pDstSurface - Destination surface state
1050
/// @param x, y - Coordinates to raster tile.
1051
INLINE static void Store(
1052
uint8_t *pSrc,
1053
SWR_SURFACE_STATE* pDstSurface,
1054
uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1055
{
1056
// Punt non-full tiles to generic store
1057
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1058
uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1059
1060
if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1061
{
1062
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1063
}
1064
1065
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1066
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1067
1068
const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1069
const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1070
1071
uint8_t* ppDsts[] =
1072
{
1073
pDst, // row 0, col 0
1074
pDst + pDstSurface->pitch, // row 1, col 0
1075
pDst + dx / 2, // row 0, col 1
1076
pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
1077
};
1078
1079
for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1080
{
1081
for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1082
{
1083
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1084
1085
pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1086
1087
ppDsts[0] += dx;
1088
ppDsts[1] += dx;
1089
ppDsts[2] += dx;
1090
ppDsts[3] += dx;
1091
}
1092
1093
ppDsts[0] += dy;
1094
ppDsts[1] += dy;
1095
ppDsts[2] += dy;
1096
ppDsts[3] += dy;
1097
}
1098
}
1099
};
1100
1101
//////////////////////////////////////////////////////////////////////////
1102
/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp
1103
//////////////////////////////////////////////////////////////////////////
1104
template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1105
struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat>
1106
{
1107
typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> GenericStoreTile;
1108
static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1109
static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1110
1111
//////////////////////////////////////////////////////////////////////////
1112
/// @brief Stores an 8x8 raster tile to the destination surface.
1113
/// @param pSrc - Pointer to raster tile.
1114
/// @param pDstSurface - Destination surface state
1115
/// @param x, y - Coordinates to raster tile.
1116
INLINE static void Store(
1117
uint8_t *pSrc,
1118
SWR_SURFACE_STATE* pDstSurface,
1119
uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1120
{
1121
// Punt non-full tiles to generic store
1122
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1123
uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1124
1125
if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1126
{
1127
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1128
}
1129
1130
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1131
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1132
1133
const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1134
const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1135
1136
uint8_t* ppDsts[] =
1137
{
1138
pDst, // row 0, col 0
1139
pDst + pDstSurface->pitch, // row 1, col 0
1140
pDst + dx / 2, // row 0, col 1
1141
pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
1142
};
1143
1144
for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1145
{
1146
for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1147
{
1148
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1149
1150
pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1151
1152
ppDsts[0] += dx;
1153
ppDsts[1] += dx;
1154
ppDsts[2] += dx;
1155
ppDsts[3] += dx;
1156
}
1157
1158
ppDsts[0] += dy;
1159
ppDsts[1] += dy;
1160
ppDsts[2] += dy;
1161
ppDsts[3] += dy;
1162
}
1163
}
1164
};
1165
1166
//////////////////////////////////////////////////////////////////////////
1167
/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp
1168
//////////////////////////////////////////////////////////////////////////
1169
template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1170
struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat>
1171
{
1172
typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> GenericStoreTile;
1173
static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1174
static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1175
1176
//////////////////////////////////////////////////////////////////////////
1177
/// @brief Stores an 8x8 raster tile to the destination surface.
1178
/// @param pSrc - Pointer to raster tile.
1179
/// @param pDstSurface - Destination surface state
1180
/// @param x, y - Coordinates to raster tile.
1181
INLINE static void Store(
1182
uint8_t *pSrc,
1183
SWR_SURFACE_STATE* pDstSurface,
1184
uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1185
{
1186
// Punt non-full tiles to generic store
1187
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1188
uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1189
1190
if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1191
{
1192
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1193
}
1194
1195
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1196
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1197
1198
const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1199
const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1200
1201
uint8_t* ppDsts[] =
1202
{
1203
pDst, // row 0, col 0
1204
pDst + pDstSurface->pitch, // row 1, col 0
1205
pDst + dx / 2, // row 0, col 1
1206
pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
1207
};
1208
1209
for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1210
{
1211
for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1212
{
1213
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1214
1215
pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1216
1217
ppDsts[0] += dx;
1218
ppDsts[1] += dx;
1219
ppDsts[2] += dx;
1220
ppDsts[3] += dx;
1221
}
1222
1223
ppDsts[0] += dy;
1224
ppDsts[1] += dy;
1225
ppDsts[2] += dy;
1226
ppDsts[3] += dy;
1227
}
1228
}
1229
};
1230
1231
//////////////////////////////////////////////////////////////////////////
1232
/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp
1233
//////////////////////////////////////////////////////////////////////////
1234
template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1235
struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat>
1236
{
1237
typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile;
1238
static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1239
static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1240
static const size_t MAX_DST_COLUMN_BYTES = 16;
1241
1242
//////////////////////////////////////////////////////////////////////////
1243
/// @brief Stores an 8x8 raster tile to the destination surface.
1244
/// @param pSrc - Pointer to raster tile.
1245
/// @param pDstSurface - Destination surface state
1246
/// @param x, y - Coordinates to raster tile.
1247
INLINE static void Store(
1248
uint8_t *pSrc,
1249
SWR_SURFACE_STATE* pDstSurface,
1250
uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1251
{
1252
// Punt non-full tiles to generic store
1253
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1254
uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1255
1256
if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1257
{
1258
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1259
}
1260
1261
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1262
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1263
1264
const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1265
const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
1266
1267
// we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1268
static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets");
1269
1270
uint8_t *ppDsts[] =
1271
{
1272
pDst, // row 0, col 0
1273
pDst + pDstSurface->pitch, // row 1, col 0
1274
pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
1275
pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
1276
pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2
1277
pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2
1278
pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3
1279
pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3 // row 1, col 3
1280
};
1281
1282
for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1283
{
1284
// Raster tile width is same as simd16 tile width
1285
static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1286
1287
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1288
1289
pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1290
1291
for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
1292
{
1293
ppDsts[i] += dy;
1294
}
1295
}
1296
}
1297
};
1298
1299
//////////////////////////////////////////////////////////////////////////
1300
/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp
1301
//////////////////////////////////////////////////////////////////////////
1302
template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1303
struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat>
1304
{
1305
typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile;
1306
static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1307
static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1308
static const size_t MAX_DST_COLUMN_BYTES = 16;
1309
1310
//////////////////////////////////////////////////////////////////////////
1311
/// @brief Stores an 8x8 raster tile to the destination surface.
1312
/// @param pSrc - Pointer to raster tile.
1313
/// @param pDstSurface - Destination surface state
1314
/// @param x, y - Coordinates to raster tile.
1315
INLINE static void Store(
1316
uint8_t *pSrc,
1317
SWR_SURFACE_STATE* pDstSurface,
1318
uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1319
{
1320
// Punt non-full tiles to generic store
1321
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1322
uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1323
1324
if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1325
{
1326
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1327
}
1328
1329
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1330
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1331
1332
const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1333
const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
1334
1335
// we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1336
static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets");
1337
1338
uint8_t* ppDsts[] =
1339
{
1340
pDst, // row 0, col 0
1341
pDst + pDstSurface->pitch, // row 1, col 0
1342
pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
1343
pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
1344
pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2
1345
pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2
1346
pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3
1347
pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3, // row 1, col 3
1348
pDst + MAX_DST_COLUMN_BYTES * 4, // row 0, col 4
1349
pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 4, // row 1, col 4
1350
pDst + MAX_DST_COLUMN_BYTES * 5, // row 0, col 5
1351
pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 5, // row 1, col 5
1352
pDst + MAX_DST_COLUMN_BYTES * 6, // row 0, col 6
1353
pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 6, // row 1, col 6
1354
pDst + MAX_DST_COLUMN_BYTES * 7, // row 0, col 7
1355
pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 7, // row 1, col 7
1356
};
1357
1358
for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1359
{
1360
// Raster tile width is same as simd16 tile width
1361
static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1362
1363
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1364
1365
pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1366
1367
for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
1368
{
1369
ppDsts[i] += dy;
1370
}
1371
}
1372
}
1373
};
1374
1375
//////////////////////////////////////////////////////////////////////////
1376
/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp
1377
//////////////////////////////////////////////////////////////////////////
1378
template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1379
struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat>
1380
{
1381
typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile;
1382
static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1383
1384
//////////////////////////////////////////////////////////////////////////
1385
/// @brief Stores an 8x8 raster tile to the destination surface.
1386
/// @param pSrc - Pointer to raster tile.
1387
/// @param pDstSurface - Destination surface state
1388
/// @param x, y - Coordinates to raster tile.
1389
INLINE static void Store(
1390
uint8_t *pSrc,
1391
SWR_SURFACE_STATE* pDstSurface,
1392
uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1393
{
1394
static const uint32_t DestRowWidthBytes = 16; // 16B rows
1395
1396
// Punt non-full tiles to generic store
1397
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1398
uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1399
1400
if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1401
{
1402
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1403
}
1404
1405
// TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1406
// We can compute the offsets to each column within the raster tile once and increment from these.
1407
// There will be 4 8x2 simd tiles in an 8x8 raster tile.
1408
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1409
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1410
1411
const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1412
1413
// The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1414
uint8_t *ppDsts[] =
1415
{
1416
pDst,
1417
pDst + DestRowWidthBytes,
1418
pDst + DestRowWidthBytes / 4,
1419
pDst + DestRowWidthBytes + DestRowWidthBytes / 4
1420
};
1421
1422
for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1423
{
1424
// Raster tile width is same as simd16 tile width
1425
static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1426
1427
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1428
1429
pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1430
1431
ppDsts[0] += dy;
1432
ppDsts[1] += dy;
1433
ppDsts[2] += dy;
1434
ppDsts[3] += dy;
1435
}
1436
}
1437
};
1438
1439
//////////////////////////////////////////////////////////////////////////
1440
/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp
1441
//////////////////////////////////////////////////////////////////////////
1442
template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1443
struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat>
1444
{
1445
typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile;
1446
static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1447
1448
//////////////////////////////////////////////////////////////////////////
1449
/// @brief Stores an 8x8 raster tile to the destination surface.
1450
/// @param pSrc - Pointer to raster tile.
1451
/// @param pDstSurface - Destination surface state
1452
/// @param x, y - Coordinates to raster tile.
1453
INLINE static void Store(
    uint8_t *pSrc,
    SWR_SURFACE_STATE* pDstSurface,
    uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
{
    // The fast path issues exactly one conversion per simd16-tile row, which
    // only covers the raster tile if both have the same width.
    static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");

    static const uint32_t kRowBytes = 16; // a TileY row is 16 bytes wide

    // Extent of the selected mip level, clamped so it never drops below 1.
    const uint32_t mipWidth  = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
    const uint32_t mipHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);

    // Raster tiles that poke past the right or bottom edge of the mip level
    // are handed to the generic store.
    const bool insideX = (x + KNOB_TILE_X_DIM <= mipWidth);
    const bool insideY = (y + KNOB_TILE_Y_DIM <= mipHeight);
    if (!insideX || !insideY)
    {
        GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
        return;
    }

    // Destination address of the raster tile. The same slice value is passed
    // for both slice arguments of ComputeSurfaceAddress.
    const auto slice = pDstSurface->arrayIndex + renderTargetArrayIndex;
    uint8_t* const pTileBase = (uint8_t*)ComputeSurfaceAddress<false, false>(
        x, y, slice, slice, sampleNum, pDstSurface->lod, pDstSurface);

    // Four output cursors handed to the converter, at byte offsets
    // 0, 16, 8 and 24 from the tile base.
    uint8_t* dstCursors[] =
    {
        pTileBase,
        pTileBase + kRowBytes,
        pTileBase + kRowBytes / 2,
        pTileBase + kRowBytes + kRowBytes / 2
    };

    // Bytes to skip in the destination after each simd16 tile.
    const uint32_t simdTileStride = SIMD16_TILE_Y_DIM * kRowBytes;

    // The source (hot tile) is consumed linearly, one simd16 tile at a time.
    for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD16_TILE_Y_DIM)
    {
        ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, dstCursors);
        pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;

        for (uint8_t*& pCursor : dstCursors)
        {
            pCursor += simdTileStride;
        }
    }
}
1501
};
1502
1503
//////////////////////////////////////////////////////////////////////////
1504
/// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp
1505
//////////////////////////////////////////////////////////////////////////
1506
template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1507
struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat>
1508
{
1509
typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1510
static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1511
static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1512
1513
//////////////////////////////////////////////////////////////////////////
1514
/// @brief Stores an 8x8 raster tile to the destination surface.
1515
/// @param pSrc - Pointer to raster tile.
1516
/// @param pDstSurface - Destination surface state
1517
/// @param x, y - Coordinates to raster tile.
1518
INLINE static void Store(
1519
uint8_t *pSrc,
1520
SWR_SURFACE_STATE* pDstSurface,
1521
uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1522
{
1523
static const uint32_t DestRowWidthBytes = 512; // 512B rows
1524
1525
// Punt non-full tiles to generic store
1526
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1527
uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1528
1529
if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1530
{
1531
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1532
}
1533
1534
// TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows.
1535
// We can compute the offsets to each column within the raster tile once and increment from these.
1536
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1537
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1538
1539
const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1540
const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1541
1542
uint8_t* ppDsts[] =
1543
{
1544
pDst, // row 0, col 0
1545
pDst + DestRowWidthBytes, // row 1, col 0
1546
pDst + dx / 2, // row 0, col 1
1547
pDst + DestRowWidthBytes + dx / 2 // row 1, col 1
1548
};
1549
1550
for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1551
{
1552
for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1553
{
1554
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1555
1556
pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1557
1558
ppDsts[0] += dx;
1559
ppDsts[1] += dx;
1560
ppDsts[2] += dx;
1561
ppDsts[3] += dx;
1562
}
1563
1564
ppDsts[0] += dy;
1565
ppDsts[1] += dy;
1566
ppDsts[2] += dy;
1567
ppDsts[3] += dy;
1568
}
1569
}
1570
};
1571
1572
//////////////////////////////////////////////////////////////////////////
1573
/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp
1574
//////////////////////////////////////////////////////////////////////////
1575
template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1576
struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat>
1577
{
1578
typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1579
static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1580
1581
//////////////////////////////////////////////////////////////////////////
1582
/// @brief Stores an 8x8 raster tile to the destination surface.
1583
/// @param pSrc - Pointer to raster tile.
1584
/// @param pDstSurface - Destination surface state
1585
/// @param x, y - Coordinates to raster tile.
1586
INLINE static void Store(
1587
uint8_t *pSrc,
1588
SWR_SURFACE_STATE* pDstSurface,
1589
uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1590
{
1591
static const uint32_t DestRowWidthBytes = 16; // 16B rows
1592
static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
1593
1594
// Punt non-full tiles to generic store
1595
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1596
uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1597
1598
if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1599
{
1600
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1601
}
1602
1603
// TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1604
// We can compute the offsets to each column within the raster tile once and increment from these.
1605
// There will be 4 8x2 simd tiles in an 8x8 raster tile.
1606
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1607
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1608
1609
// we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1610
const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1611
1612
// The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1613
uint8_t *ppDsts[] =
1614
{
1615
pDst, // row 0, col 0
1616
pDst + DestRowWidthBytes, // row 1, col 0
1617
pDst + DestColumnBytes, // row 0, col 1
1618
pDst + DestRowWidthBytes + DestColumnBytes // row 1, col 1
1619
};
1620
1621
for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1622
{
1623
// Raster tile width is same as simd16 tile width
1624
static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1625
1626
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1627
1628
pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1629
1630
ppDsts[0] += dy;
1631
ppDsts[1] += dy;
1632
ppDsts[2] += dy;
1633
ppDsts[3] += dy;
1634
}
1635
}
1636
};
1637
1638
//////////////////////////////////////////////////////////////////////////
1639
/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp
1640
//////////////////////////////////////////////////////////////////////////
1641
template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1642
struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat>
1643
{
1644
typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile;
1645
static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1646
1647
//////////////////////////////////////////////////////////////////////////
1648
/// @brief Stores an 8x8 raster tile to the destination surface.
1649
/// @param pSrc - Pointer to raster tile.
1650
/// @param pDstSurface - Destination surface state
1651
/// @param x, y - Coordinates to raster tile.
1652
INLINE static void Store(
1653
uint8_t *pSrc,
1654
SWR_SURFACE_STATE* pDstSurface,
1655
uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1656
{
1657
static const uint32_t DestRowWidthBytes = 16; // 16B rows
1658
static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
1659
1660
// Punt non-full tiles to generic store
1661
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1662
uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1663
1664
if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1665
{
1666
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1667
}
1668
1669
// TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1670
// We can compute the offsets to each column within the raster tile once and increment from these.
1671
// There will be 4 8x2 simd tiles in an 8x8 raster tile.
1672
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1673
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1674
1675
// we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1676
const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1677
1678
// The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1679
uint8_t *ppDsts[] =
1680
{
1681
pDst, // row 0, col 0
1682
pDst + DestRowWidthBytes, // row 1, col 0
1683
pDst + DestColumnBytes, // row 0, col 1
1684
pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1
1685
pDst + DestColumnBytes * 2, // row 0, col 2
1686
pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
1687
pDst + DestColumnBytes * 3, // row 0, col 3
1688
pDst + DestRowWidthBytes + DestColumnBytes * 3 // row 1, col 3
1689
};
1690
1691
for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1692
{
1693
// Raster tile width is same as simd16 tile width
1694
static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1695
1696
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1697
1698
pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1699
1700
for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
1701
{
1702
ppDsts[i] += dy;
1703
}
1704
}
1705
}
1706
};
1707
1708
//////////////////////////////////////////////////////////////////////////
1709
/// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp
1710
//////////////////////////////////////////////////////////////////////////
1711
template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1712
struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat>
1713
{
1714
typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> GenericStoreTile;
1715
static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1716
1717
//////////////////////////////////////////////////////////////////////////
1718
/// @brief Stores an 8x8 raster tile to the destination surface.
1719
/// @param pSrc - Pointer to raster tile.
1720
/// @param pDstSurface - Destination surface state
1721
/// @param x, y - Coordinates to raster tile.
1722
INLINE static void Store(
1723
uint8_t *pSrc,
1724
SWR_SURFACE_STATE* pDstSurface,
1725
uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1726
{
1727
static const uint32_t DestRowWidthBytes = 16; // 16B rows
1728
static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
1729
1730
// Punt non-full tiles to generic store
1731
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1732
uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1733
1734
if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1735
{
1736
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1737
}
1738
1739
// TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1740
// We can compute the offsets to each column within the raster tile once and increment from these.
1741
// There will be 4 8x2 simd tiles in an 8x8 raster tile.
1742
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1743
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1744
1745
// we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1746
const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1747
1748
// The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1749
uint8_t *ppDsts[] =
1750
{
1751
pDst, // row 0, col 0
1752
pDst + DestRowWidthBytes, // row 1, col 0
1753
pDst + DestColumnBytes, // row 0, col 1
1754
pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1
1755
pDst + DestColumnBytes * 2, // row 0, col 2
1756
pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
1757
pDst + DestColumnBytes * 3, // row 0, col 3
1758
pDst + DestRowWidthBytes + DestColumnBytes * 3, // row 1, col 3
1759
pDst + DestColumnBytes * 4, // row 0, col 4
1760
pDst + DestRowWidthBytes + DestColumnBytes * 4, // row 1, col 4
1761
pDst + DestColumnBytes * 5, // row 0, col 5
1762
pDst + DestRowWidthBytes + DestColumnBytes * 5, // row 1, col 5
1763
pDst + DestColumnBytes * 6, // row 0, col 6
1764
pDst + DestRowWidthBytes + DestColumnBytes * 6, // row 1, col 6
1765
pDst + DestColumnBytes * 7, // row 0, col 7
1766
pDst + DestRowWidthBytes + DestColumnBytes * 7 // row 1, col 7
1767
};
1768
1769
for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1770
{
1771
// Raster tile width is same as simd16 tile width
1772
static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1773
1774
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1775
1776
pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1777
1778
for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
1779
{
1780
ppDsts[i] += dy;
1781
}
1782
}
1783
}
1784
};
1785
1786
//////////////////////////////////////////////////////////////////////////
1787
/// StoreMacroTile - Stores a macro tile which consists of raster tiles.
1788
//////////////////////////////////////////////////////////////////////////
1789
template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1790
struct StoreMacroTile
1791
{
1792
//////////////////////////////////////////////////////////////////////////
1793
/// @brief Stores a macrotile to the destination surface using safe implementation.
1794
/// @param pSrc - Pointer to macro tile.
1795
/// @param pDstSurface - Destination surface state
1796
/// @param x, y - Coordinates to macro tile
1797
static void StoreGeneric(
1798
uint8_t *pSrcHotTile,
1799
SWR_SURFACE_STATE* pDstSurface,
1800
uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
1801
{
1802
PFN_STORE_TILES_INTERNAL pfnStore;
1803
pfnStore = StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
1804
1805
// Store each raster tile from the hot tile to the destination surface.
1806
for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
1807
{
1808
for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
1809
{
1810
for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
1811
{
1812
pfnStore(pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
1813
pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
1814
}
1815
}
1816
}
1817
1818
}
1819
1820
typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t);
1821
//////////////////////////////////////////////////////////////////////////
1822
/// @brief Stores a macrotile to the destination surface.
1823
/// @param pSrc - Pointer to macro tile.
1824
/// @param pDstSurface - Destination surface state
1825
/// @param x, y - Coordinates to macro tile
1826
static void Store(
1827
uint8_t *pSrcHotTile,
1828
SWR_SURFACE_STATE* pDstSurface,
1829
uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
1830
{
1831
PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES];
1832
1833
for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
1834
{
1835
size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>(
1836
0,
1837
0,
1838
pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces
1839
pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays
1840
sampleNum,
1841
pDstSurface->lod,
1842
pDstSurface);
1843
1844
// Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear
1845
bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) ||
1846
(pDstSurface->bInterleavedSamples);
1847
1848
pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
1849
}
1850
1851
// Save original for pSrcHotTile resolve.
1852
uint8_t *pResolveSrcHotTile = pSrcHotTile;
1853
1854
// Store each raster tile from the hot tile to the destination surface.
1855
for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
1856
{
1857
for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
1858
{
1859
for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
1860
{
1861
pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
1862
pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
1863
}
1864
}
1865
}
1866
1867
if (pDstSurface->xpAuxBaseAddress)
1868
{
1869
uint32_t sampleOffset = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
1870
// Store each raster tile from the hot tile to the destination surface.
1871
for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
1872
{
1873
for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
1874
{
1875
StoreRasterTile<TTraits, SrcFormat, DstFormat>::Resolve(pResolveSrcHotTile, pDstSurface, (x + col), (y + row), sampleOffset, renderTargetArrayIndex);
1876
pResolveSrcHotTile += sampleOffset * pDstSurface->numSamples;
1877
}
1878
}
1879
}
1880
}
1881
};
1882
1883
//////////////////////////////////////////////////////////////////////////
1884
/// InitStoreTilesTable - Helper for setting up the tables.
1885
template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
1886
void InitStoreTilesTableColor_Half1(
1887
PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT])
1888
{
1889
table[TTileMode][R32G32B32A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store;
1890
table[TTileMode][R32G32B32A32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store;
1891
table[TTileMode][R32G32B32A32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store;
1892
table[TTileMode][R32G32B32X32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store;
1893
table[TTileMode][R32G32B32A32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SSCALED>::Store;
1894
table[TTileMode][R32G32B32A32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_USCALED>::Store;
1895
table[TTileMode][R32G32B32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store;
1896
table[TTileMode][R32G32B32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store;
1897
table[TTileMode][R32G32B32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store;
1898
table[TTileMode][R32G32B32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SSCALED>::Store;
1899
table[TTileMode][R32G32B32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_USCALED>::Store;
1900
table[TTileMode][R16G16B16A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store;
1901
table[TTileMode][R16G16B16A16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store;
1902
table[TTileMode][R16G16B16A16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store;
1903
table[TTileMode][R16G16B16A16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store;
1904
table[TTileMode][R16G16B16A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store;
1905
table[TTileMode][R32G32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store;
1906
table[TTileMode][R32G32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store;
1907
table[TTileMode][R32G32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store;
1908
table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
1909
table[TTileMode][X32_TYPELESS_G8X24_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, X32_TYPELESS_G8X24_UINT>::Store;
1910
table[TTileMode][R16G16B16X16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store;
1911
table[TTileMode][R16G16B16X16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store;
1912
table[TTileMode][R16G16B16A16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SSCALED>::Store;
1913
table[TTileMode][R16G16B16A16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_USCALED>::Store;
1914
table[TTileMode][R32G32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SSCALED>::Store;
1915
table[TTileMode][R32G32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_USCALED>::Store;
1916
table[TTileMode][B8G8R8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store;
1917
table[TTileMode][B8G8R8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store;
1918
table[TTileMode][R10G10B10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric;
1919
table[TTileMode][R10G10B10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric;
1920
table[TTileMode][R10G10B10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric;
1921
table[TTileMode][R8G8B8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store;
1922
table[TTileMode][R8G8B8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store;
1923
table[TTileMode][R8G8B8A8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store;
1924
table[TTileMode][R8G8B8A8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store;
1925
table[TTileMode][R8G8B8A8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store;
1926
table[TTileMode][R16G16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store;
1927
table[TTileMode][R16G16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store;
1928
table[TTileMode][R16G16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store;
1929
table[TTileMode][R16G16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store;
1930
table[TTileMode][R16G16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store;
1931
table[TTileMode][B10G10R10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric;
1932
table[TTileMode][B10G10R10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric;
1933
table[TTileMode][R11G11B10_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric;
1934
table[TTileMode][R10G10B10_FLOAT_A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10_FLOAT_A2_UNORM>::StoreGeneric;
1935
table[TTileMode][R32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store;
1936
table[TTileMode][R32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store;
1937
table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store;
1938
table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreGeneric;
1939
table[TTileMode][X24_TYPELESS_G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, X24_TYPELESS_G8_UINT>::StoreGeneric;
1940
table[TTileMode][A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store;
1941
table[TTileMode][B8G8R8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store;
1942
table[TTileMode][B8G8R8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store;
1943
table[TTileMode][R8G8B8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store;
1944
table[TTileMode][R8G8B8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store;
1945
}
1946
1947
template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
1948
void InitStoreTilesTableColor_Half2(
1949
PFN_STORE_TILES(&table)[NumTileModesT][ArraySizeT])
1950
{
1951
table[TTileMode][R9G9B9E5_SHAREDEXP] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R9G9B9E5_SHAREDEXP>::StoreGeneric;
1952
table[TTileMode][B10G10R10X2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric;
1953
table[TTileMode][R10G10B10X2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10X2_USCALED>::StoreGeneric;
1954
table[TTileMode][R8G8B8A8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SSCALED>::Store;
1955
table[TTileMode][R8G8B8A8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_USCALED>::Store;
1956
table[TTileMode][R16G16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SSCALED>::Store;
1957
table[TTileMode][R16G16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_USCALED>::Store;
1958
table[TTileMode][R32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SSCALED>::Store;
1959
table[TTileMode][R32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_USCALED>::Store;
1960
table[TTileMode][B5G6R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store;
1961
table[TTileMode][B5G6R5_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric;
1962
table[TTileMode][B5G5R5A1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric;
1963
table[TTileMode][B5G5R5A1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric;
1964
table[TTileMode][B4G4R4A4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric;
1965
table[TTileMode][B4G4R4A4_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric;
1966
table[TTileMode][R8G8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store;
1967
table[TTileMode][R8G8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store;
1968
table[TTileMode][R8G8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store;
1969
table[TTileMode][R8G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store;
1970
table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store;
1971
table[TTileMode][R16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store;
1972
table[TTileMode][R16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store;
1973
table[TTileMode][R16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store;
1974
table[TTileMode][R16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store;
1975
table[TTileMode][A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store;
1976
table[TTileMode][A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store;
1977
table[TTileMode][B5G5R5X1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric;
1978
table[TTileMode][B5G5R5X1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric;
1979
table[TTileMode][R8G8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SSCALED>::Store;
1980
table[TTileMode][R8G8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_USCALED>::Store;
1981
table[TTileMode][R16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SSCALED>::Store;
1982
table[TTileMode][R16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_USCALED>::Store;
1983
table[TTileMode][A1B5G5R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A1B5G5R5_UNORM>::StoreGeneric;
1984
table[TTileMode][A4B4G4R4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A4B4G4R4_UNORM>::StoreGeneric;
1985
table[TTileMode][R8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store;
1986
table[TTileMode][R8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store;
1987
table[TTileMode][R8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SINT>::Store;
1988
table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store;
1989
table[TTileMode][A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store;
1990
table[TTileMode][R8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SSCALED>::Store;
1991
table[TTileMode][R8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_USCALED>::Store;
1992
table[TTileMode][R8G8B8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store;
1993
table[TTileMode][R8G8B8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store;
1994
table[TTileMode][R8G8B8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SSCALED>::Store;
1995
table[TTileMode][R8G8B8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_USCALED>::Store;
1996
table[TTileMode][R16G16B16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store;
1997
table[TTileMode][R16G16B16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store;
1998
table[TTileMode][R16G16B16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store;
1999
table[TTileMode][R16G16B16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SSCALED>::Store;
2000
table[TTileMode][R16G16B16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_USCALED>::Store;
2001
table[TTileMode][R8G8B8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store;
2002
table[TTileMode][R16G16B16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store;
2003
table[TTileMode][R16G16B16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store;
2004
table[TTileMode][R10G10B10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric;
2005
table[TTileMode][R10G10B10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_USCALED>::StoreGeneric;
2006
table[TTileMode][R10G10B10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SSCALED>::StoreGeneric;
2007
table[TTileMode][R10G10B10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric;
2008
table[TTileMode][B10G10R10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric;
2009
table[TTileMode][B10G10R10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_USCALED>::StoreGeneric;
2010
table[TTileMode][B10G10R10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SSCALED>::StoreGeneric;
2011
table[TTileMode][B10G10R10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric;
2012
table[TTileMode][B10G10R10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric;
2013
table[TTileMode][R8G8B8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store;
2014
table[TTileMode][R8G8B8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store;
2015
}
2016
2017
//////////////////////////////////////////////////////////////////////////
2018
/// InitStoreTilesTableDepth - Helper for setting up the depth store-tile table.
2019
template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
2020
void InitStoreTilesTableDepth(
2021
PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
2022
{
2023
table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32_FLOAT>::Store;
2024
table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
2025
table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store;
2026
table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32_FLOAT, R16_UNORM>::Store;
2027
}
2028
2029
template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
2030
void InitStoreTilesTableStencil(
2031
PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
2032
{
2033
table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R8_UINT, R8_UINT>::Store;
2034
}
2035
2036
2037
//////////////////////////////////////////////////////////////////////////
2038
/// @brief Deswizzles and stores a full hottile to a render surface
2039
/// @param hPrivateContext - Handle to private DC
2040
/// @param srcFormat - Format for hot tile.
2041
/// @param renderTargetIndex - Index to destination render target
2042
/// @param x, y - Coordinates to raster tile.
2043
/// @param pSrcHotTile - Pointer to Hot Tile
2044
void SwrStoreHotTileToSurface(
2045
HANDLE hWorkerPrivateData,
2046
SWR_SURFACE_STATE *pDstSurface,
2047
BucketManager* pBucketMgr,
2048
SWR_FORMAT srcFormat,
2049
SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
2050
uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex,
2051
uint8_t *pSrcHotTile);
2052
2053