Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/etcpak/DecodeRGB.cpp
9833 views
1
#include "DecodeRGB.hpp"
2
#include "Tables.hpp"
3
#include "Math.hpp"
4
5
#include <string.h>
6
7
#ifdef __ARM_NEON
8
# include <arm_neon.h>
9
#endif
10
11
#if defined __SSE4_1__ || defined __AVX2__ || defined _MSC_VER
12
# ifdef _MSC_VER
13
# include <intrin.h>
14
# include <Windows.h>
15
# define _bswap(x) _byteswap_ulong(x)
16
# define _bswap64(x) _byteswap_uint64(x)
17
# else
18
# include <x86intrin.h>
19
# endif
20
#endif
21
22
#ifndef _bswap
23
# define _bswap(x) __builtin_bswap32(x)
24
# define _bswap64(x) __builtin_bswap64(x)
25
#endif
26
27
// Distance values used by the T-mode and H-mode decoders in this file,
// indexed by the 3-bit codeword extracted from the block. The table is
// read-only, so it is declared const to keep it out of mutable storage.
static const uint8_t table59T58H[8] = { 3,6,11,16,23,32,41,64 };
28
29
namespace
30
{
31
32
// Stretches a 6-bit quantity to 8 bits: the value is moved up by two bits and
// its two most significant bits are copied into the vacated low bits.
// No masking happens here; the callers in this file pass values that were
// already limited to 6 bits.
static etcpak_force_inline int32_t expand6(uint32_t value)
{
    const uint32_t upper = value << 2;
    const uint32_t lower = value >> 4;
    return upper | lower;
}
36
37
// Stretches a 7-bit quantity to 8 bits: the value is moved up by one bit and
// its most significant bit is copied into the vacated low bit.
// No masking happens here; the callers in this file pass values that were
// already limited to 7 bits.
static etcpak_force_inline int32_t expand7(uint32_t value)
{
    const uint32_t upper = value << 1;
    const uint32_t lower = value >> 6;
    return upper | lower;
}
41
42
// Decodes a block stored in T mode into a 4x4 tile of opaque pixels.
//   block - 64 block bits as handed over by DecodeRGBPart: colour fields in
//           the low 32 bits, per-pixel selectors in the high 32 bits
//   dst   - first pixel of the destination tile
//   w     - row pitch of dst, counted in pixels
// Every written pixel has red in bits 0-7, green in bits 8-15, blue in bits
// 16-23 and 0xFF in bits 24-31.
static etcpak_force_inline void DecodeT( uint64_t block, uint32_t* dst, uint32_t w )
{
    // The first colour's red nibble is stored as two 2-bit pieces; the 0x1B
    // mask keeps bits 0-1 and 3-4 of the shifted value.
    const auto redPieces = ( block >> 24 ) & 0x1B;
    const auto redTop = ( redPieces >> 3 ) & 0x3;
    const auto redBottom = redPieces & 0x3;
    const auto greenA = ( block >> 20 ) & 0xF;
    const auto blueA = ( block >> 16 ) & 0xF;

    const auto redB = ( block >> 12 ) & 0xF;
    const auto greenB = ( block >> 8 ) & 0xF;
    const auto blueB = ( block >> 4 ) & 0xF;

    // Widen each 4-bit channel to 8 bits by repeating the nibble.
    const auto rA = ( ( redTop << 6 ) | ( redBottom << 4 ) | ( redTop << 2 ) | redBottom );
    const auto gA = ( greenA << 4 ) | greenA;
    const auto bA = ( blueA << 4 ) | blueA;

    const auto rB = ( redB << 4 ) | redB;
    const auto gB = ( greenB << 4 ) | greenB;
    const auto bB = ( blueB << 4 ) | blueB;

    // 3-bit distance index: block bits 3-2 followed by block bit 0.
    const auto distIdx = ( ( ( block >> 2 ) & 0x3 ) << 1 ) | ( block & 0x1 );
    const auto dist = table59T58H[distIdx];

    // Second colour pushed up and down by the distance, clamped per channel.
    const auto rPlus = clampu8( rB + dist );
    const auto gPlus = clampu8( gB + dist );
    const auto bPlus = clampu8( bB + dist );

    const auto rMinus = clampu8( rB - dist );
    const auto gMinus = clampu8( gB - dist );
    const auto bMinus = clampu8( bB - dist );

    // Palette order: first colour, second + dist, second, second - dist.
    const uint32_t palette[4] = {
        uint32_t( rA | ( gA << 8 ) | ( bA << 16 ) | 0xFF000000 ),
        uint32_t( rPlus | ( gPlus << 8 ) | ( bPlus << 16 ) | 0xFF000000 ),
        uint32_t( rB | ( gB << 8 ) | ( bB << 16 ) | 0xFF000000 ),
        uint32_t( rMinus | ( gMinus << 8 ) | ( bMinus << 16 ) | 0xFF000000 )
    };

    const uint32_t selectors = ( block >> 32 ) & 0xFFFFFFFF;
    for( uint8_t row = 0; row < 4; row++ )
    {
        for( uint8_t col = 0; col < 4; col++ )
        {
            // Each 2-bit selector is split over two 16-bit halves: the low
            // bit sits at position row + col*4, the high bit 16 positions up.
            const int bit = row + col * 4;
            const uint8_t hiBit = ( selectors >> ( bit + 16 ) ) & 0x1;
            const uint8_t loBit = ( selectors >> bit ) & 0x1;
            const uint8_t pick = ( hiBit << 1 ) | loBit;
            dst[row * w + col] = palette[pick];
        }
    }
}
92
93
// Decodes a block stored in T mode into a 4x4 tile, taking each pixel's
// alpha from a separate 64-bit alpha word.
//   block - 64 colour block bits as handed over by DecodeRGBAPart
//   alpha - alpha word: bits 56-63 base value, bits 52-55 multiplier,
//           bits 48-51 row selector into g_alpha, bits 0-47 sixteen 3-bit
//           per-pixel selectors
//   dst   - first pixel of the destination tile
//   w     - row pitch of dst, counted in pixels
// Every written pixel has red in bits 0-7, green in bits 8-15, blue in bits
// 16-23 and the decoded alpha in bits 24-31.
static etcpak_force_inline void DecodeTAlpha( uint64_t block, uint64_t alpha, uint32_t* dst, uint32_t w )
{
    // The first colour's red nibble is stored as two 2-bit pieces; the 0x1B
    // mask keeps bits 0-1 and 3-4 of the shifted value.
    const auto redPieces = ( block >> 24 ) & 0x1B;
    const auto redTop = ( redPieces >> 3 ) & 0x3;
    const auto redBottom = redPieces & 0x3;
    const auto greenA = ( block >> 20 ) & 0xF;
    const auto blueA = ( block >> 16 ) & 0xF;

    const auto redB = ( block >> 12 ) & 0xF;
    const auto greenB = ( block >> 8 ) & 0xF;
    const auto blueB = ( block >> 4 ) & 0xF;

    // Widen each 4-bit channel to 8 bits by repeating the nibble.
    const auto rA = ( ( redTop << 6 ) | ( redBottom << 4 ) | ( redTop << 2 ) | redBottom );
    const auto gA = ( greenA << 4 ) | greenA;
    const auto bA = ( blueA << 4 ) | blueA;

    const auto rB = ( redB << 4 ) | redB;
    const auto gB = ( greenB << 4 ) | greenB;
    const auto bB = ( blueB << 4 ) | blueB;

    // 3-bit distance index: block bits 3-2 followed by block bit 0.
    const auto distIdx = ( ( ( block >> 2 ) & 0x3 ) << 1 ) | ( block & 0x1 );
    const auto dist = table59T58H[distIdx];

    // Header fields of the alpha word.
    const int32_t alphaBase = alpha >> 56;
    const int32_t alphaMul = ( alpha >> 52 ) & 0xF;
    const auto alphaRow = g_alpha[( alpha >> 48 ) & 0xF];

    // Second colour pushed up and down by the distance, clamped per channel.
    const auto rPlus = clampu8( rB + dist );
    const auto gPlus = clampu8( gB + dist );
    const auto bPlus = clampu8( bB + dist );

    const auto rMinus = clampu8( rB - dist );
    const auto gMinus = clampu8( gB - dist );
    const auto bMinus = clampu8( bB - dist );

    // Palette order: first colour, second + dist, second, second - dist.
    // The top byte is left clear so alpha can be OR-ed in per pixel.
    const uint32_t palette[4] = {
        uint32_t( rA | ( gA << 8 ) | ( bA << 16 ) ),
        uint32_t( rPlus | ( gPlus << 8 ) | ( bPlus << 16 ) ),
        uint32_t( rB | ( gB << 8 ) | ( bB << 16 ) ),
        uint32_t( rMinus | ( gMinus << 8 ) | ( bMinus << 16 ) )
    };

    const uint32_t selectors = ( block >> 32 ) & 0xFFFFFFFF;
    for( uint8_t row = 0; row < 4; row++ )
    {
        for( uint8_t col = 0; col < 4; col++ )
        {
            // Each 2-bit colour selector is split over two 16-bit halves: the
            // low bit sits at position row + col*4, the high bit 16 bits up.
            const int bit = row + col * 4;
            const uint8_t hiBit = ( selectors >> ( bit + 16 ) ) & 0x1;
            const uint8_t loBit = ( selectors >> bit ) & 0x1;
            const uint8_t pick = ( hiBit << 1 ) | loBit;

            // 3-bit alpha selector for this pixel, then base + modifier * multiplier.
            const auto alphaMod = alphaRow[( alpha >> ( 45 - row * 3 - col * 12 ) ) & 0x7];
            const uint32_t a = clampu8( alphaBase + alphaMod * alphaMul );
            dst[row * w + col] = palette[pick] | ( a << 24 );
        }
    }
}
149
150
// Decodes a block stored in H mode into a 4x4 tile of opaque pixels.
//   block - 64 block bits as handed over by DecodeRGBPart: colour fields in
//           the low 32 bits, per-pixel selectors in the high 32 bits
//   dst   - first pixel of the destination tile
//   w     - row pitch of dst, counted in pixels
// Every written pixel has red in bits 0-7, green in bits 8-15, blue in bits
// 16-23 and 0xFF in bits 24-31.
static etcpak_force_inline void DecodeH( uint64_t block, uint32_t* dst, uint32_t w )
{
    const uint32_t selectors = ( block >> 32 ) & 0xFFFFFFFF;

    // First colour: green and blue nibbles are each stored in two pieces.
    const auto redA4 = ( block >> 27 ) & 0xF;
    const auto greenA4 = ( ( block >> 20 ) & 0x1 ) | ( ( ( block >> 24 ) & 0x7 ) << 1 );
    const auto blueA4 = ( ( block >> 15 ) & 0x7 ) | ( ( ( block >> 19 ) & 0x1 ) << 3 );

    // Second colour: three contiguous nibbles.
    const auto redB4 = ( block >> 11 ) & 0xF;
    const auto greenB4 = ( block >> 7 ) & 0xF;
    const auto blueB4 = ( block >> 3 ) & 0xF;

    // Widen each 4-bit channel to 8 bits by repeating the nibble.
    const auto rA = ( redA4 << 4 ) | redA4;
    const auto gA = ( greenA4 << 4 ) | greenA4;
    const auto bA = ( blueA4 << 4 ) | blueA4;

    const auto rB = ( redB4 << 4 ) | redB4;
    const auto gB = ( greenB4 << 4 ) | greenB4;
    const auto bB = ( blueB4 << 4 ) | blueB4;

    // Distance index: block bit 2 stays in place, block bit 0 moves to bit 1,
    // and the lowest bit is set when the first colour's packed 12-bit value
    // is greater than or equal to the second's.
    const auto packedA = ( redA4 << 8 ) | ( greenA4 << 4 ) | ( blueA4 << 0 );
    const auto packedB = ( block >> 3 ) & ( ( 1 << 12 ) - 1 );
    const auto orderBit = ( packedA >= packedB ) ? 1 : 0;
    const auto distIdx = ( ( ( block & 0x1 ) << 1 ) | ( ( block & 0x4 ) ) ) | orderBit;
    const auto dist = table59T58H[distIdx];

    // Palette order: first + dist, first - dist, second + dist, second - dist.
    // Alpha is added when the pixel is written.
    const uint32_t palette[] = {
        uint32_t( clampu8( rA + dist ) | ( clampu8( gA + dist ) << 8 ) | ( clampu8( bA + dist ) << 16 ) ),
        uint32_t( clampu8( rA - dist ) | ( clampu8( gA - dist ) << 8 ) | ( clampu8( bA - dist ) << 16 ) ),
        uint32_t( clampu8( rB + dist ) | ( clampu8( gB + dist ) << 8 ) | ( clampu8( bB + dist ) << 16 ) ),
        uint32_t( clampu8( rB - dist ) | ( clampu8( gB - dist ) << 8 ) | ( clampu8( bB - dist ) << 16 ) )
    };

    for( uint8_t row = 0; row < 4; row++ )
    {
        for( uint8_t col = 0; col < 4; col++ )
        {
            // Each 2-bit selector is split over two 16-bit halves: the low
            // bit sits at position row + col*4, the high bit 16 positions up.
            const int bit = row + col * 4;
            const uint8_t hiBit = ( selectors >> ( bit + 16 ) ) & 0x1;
            const uint8_t loBit = ( selectors >> bit ) & 0x1;
            const uint8_t pick = ( hiBit << 1 ) | loBit;
            dst[row * w + col] = palette[pick] | 0xFF000000;
        }
    }
}
192
193
// Decodes a block stored in H mode into a 4x4 tile, taking each pixel's
// alpha from a separate 64-bit alpha word.
//   block - 64 colour block bits as handed over by DecodeRGBAPart
//   alpha - alpha word: bits 56-63 base value, bits 52-55 multiplier,
//           bits 48-51 row selector into g_alpha, bits 0-47 sixteen 3-bit
//           per-pixel selectors
//   dst   - first pixel of the destination tile
//   w     - row pitch of dst, counted in pixels
// Every written pixel has red in bits 0-7, green in bits 8-15, blue in bits
// 16-23 and the decoded alpha in bits 24-31.
static etcpak_force_inline void DecodeHAlpha( uint64_t block, uint64_t alpha, uint32_t* dst, uint32_t w )
{
    const uint32_t selectors = ( block >> 32 ) & 0xFFFFFFFF;

    // First colour: green and blue nibbles are each stored in two pieces.
    const auto redA4 = ( block >> 27 ) & 0xF;
    const auto greenA4 = ( ( block >> 20 ) & 0x1 ) | ( ( ( block >> 24 ) & 0x7 ) << 1 );
    const auto blueA4 = ( ( block >> 15 ) & 0x7 ) | ( ( ( block >> 19 ) & 0x1 ) << 3 );

    // Second colour: three contiguous nibbles.
    const auto redB4 = ( block >> 11 ) & 0xF;
    const auto greenB4 = ( block >> 7 ) & 0xF;
    const auto blueB4 = ( block >> 3 ) & 0xF;

    // Widen each 4-bit channel to 8 bits by repeating the nibble.
    const auto rA = ( redA4 << 4 ) | redA4;
    const auto gA = ( greenA4 << 4 ) | greenA4;
    const auto bA = ( blueA4 << 4 ) | blueA4;

    const auto rB = ( redB4 << 4 ) | redB4;
    const auto gB = ( greenB4 << 4 ) | greenB4;
    const auto bB = ( blueB4 << 4 ) | blueB4;

    // Distance index: block bit 2 stays in place, block bit 0 moves to bit 1,
    // and the lowest bit is set when the first colour's packed 12-bit value
    // is greater than or equal to the second's.
    const auto packedA = ( redA4 << 8 ) | ( greenA4 << 4 ) | ( blueA4 << 0 );
    const auto packedB = ( block >> 3 ) & ( ( 1 << 12 ) - 1 );
    const auto orderBit = ( packedA >= packedB ) ? 1 : 0;
    const auto distIdx = ( ( ( block & 0x1 ) << 1 ) | ( ( block & 0x4 ) ) ) | orderBit;
    const auto dist = table59T58H[distIdx];

    // Header fields of the alpha word.
    const int32_t alphaBase = alpha >> 56;
    const int32_t alphaMul = ( alpha >> 52 ) & 0xF;
    const auto alphaRow = g_alpha[(alpha >> 48) & 0xF];

    // Palette order: first + dist, first - dist, second + dist, second - dist.
    // The top byte is left clear so alpha can be OR-ed in per pixel.
    const uint32_t palette[] = {
        uint32_t( clampu8( rA + dist ) | ( clampu8( gA + dist ) << 8 ) | ( clampu8( bA + dist ) << 16 ) ),
        uint32_t( clampu8( rA - dist ) | ( clampu8( gA - dist ) << 8 ) | ( clampu8( bA - dist ) << 16 ) ),
        uint32_t( clampu8( rB + dist ) | ( clampu8( gB + dist ) << 8 ) | ( clampu8( bB + dist ) << 16 ) ),
        uint32_t( clampu8( rB - dist ) | ( clampu8( gB - dist ) << 8 ) | ( clampu8( bB - dist ) << 16 ) )
    };

    for( uint8_t row = 0; row < 4; row++ )
    {
        for( uint8_t col = 0; col < 4; col++ )
        {
            // Each 2-bit colour selector is split over two 16-bit halves: the
            // low bit sits at position row + col*4, the high bit 16 bits up.
            const int bit = row + col * 4;
            const uint8_t hiBit = ( selectors >> ( bit + 16 ) ) & 0x1;
            const uint8_t loBit = ( selectors >> bit ) & 0x1;
            const uint8_t pick = ( hiBit << 1 ) | loBit;

            // 3-bit alpha selector for this pixel, then base + modifier * multiplier.
            const auto alphaMod = alphaRow[( alpha >> ( 45 - row * 3 - col * 12) ) & 0x7];
            const uint32_t a = clampu8( alphaBase + alphaMod * alphaMul );
            dst[row * w + col] = palette[pick] | ( a << 24 );
        }
    }
}
241
242
// Decodes a block stored in planar mode into a 4x4 tile of opaque pixels.
// Three colours are read from the block (named "o", "h" and "v" below) and
// every pixel (i, j) is computed per channel as
//     ( i * (h - o) + j * (v - o) + 4 * o + 2 ) >> 2
// and limited to the 0..255 range.
//   block - 64 block bits as handed over by DecodeRGBPart
//   dst   - first pixel of the destination tile
//   w     - row pitch of dst, counted in pixels
// Every written pixel has red in bits 0-7, green in bits 8-15, blue in bits
// 16-23 and 0xFF in bits 24-31. The NEON, AVX2 and SSE4.1 paths evaluate the
// same formula incrementally in 16-bit lanes.
static etcpak_force_inline void DecodePlanar( uint64_t block, uint32_t* dst, uint32_t w )
{
    // "v" colour and the blue/green parts of "h" live in the upper 32 bits.
    const auto vB = expand6((block >> 32) & 0x3F);
    const auto vG = expand7((block >> 38) & 0x7F);
    const auto vR = expand6((block >> 45) & 0x3F);

    const auto hB = expand6((block >> 51) & 0x3F);
    const auto hG = expand7((block >> 57) & 0x7F);

    // Red of "h" is assembled from bit 0 and bits 2-6 of the lower word.
    const auto hRlow = block & 0x01;
    const auto hRhigh = ((block >> 2) & 0x1F) << 1;
    const auto hR = expand6(hRlow | hRhigh);

    // Blue of "o" is assembled from three pieces, green from two.
    const auto oBa = (block >> 7) & 0x07;
    const auto oBb = ((block >> 11) & 0x3) << 3;
    const auto oBc = ((block >> 16) & 0x1) << 5;
    const auto oB = expand6(oBa | oBb | oBc);
    const auto oGa = (block >> 17) & 0x3F;
    const auto oGb = ((block >> 24) & 0x01) << 6;
    const auto oG = expand7(oGa | oGb);
    const auto oR = expand6((block >> 25) & 0x3F);

    // Per-channel steps along a row (dh*) and along a column (dv*), plus the
    // value of pixel (0,0) before the final shift.
    const int32_t dhR = hR - oR;
    const int32_t dhG = hG - oG;
    const int32_t dhB = hB - oB;
    const int32_t dvR = vR - oR;
    const int32_t dvG = vG - oG;
    const int32_t dvB = vB - oB;
    const int32_t baseR = 4 * oR + 2;
    const int32_t baseG = 4 * oG + 2;
    const int32_t baseB = 4 * oB + 2;

#ifdef __ARM_NEON
    // Lanes 0-2 carry R, G, B; lane 3 carries the alpha seed 0xFFF, which the
    // saturating narrowing shift turns into 0xFF.
    uint64_t lanes = uint64_t(uint16_t(dhR)) | ( uint64_t(uint16_t(dhG)) << 16 ) | ( uint64_t(uint16_t(dhB)) << 32 );
    int16x8_t stepX = vreinterpretq_s16_u64( vdupq_n_u64( lanes ) );
    // After four horizontal steps, this brings the accumulator to the start of the next row.
    lanes = uint64_t(uint16_t( dvR - 4 * dhR )) | ( uint64_t(uint16_t( dvG - 4 * dhG )) << 16 ) | ( uint64_t(uint16_t( dvB - 4 * dhB )) << 32 );
    int16x8_t stepY = vreinterpretq_s16_u64( vdupq_n_u64( lanes ) );
    lanes = uint64_t(baseR) | ( uint64_t(baseG) << 16 ) | ( uint64_t(baseB) << 32 ) | ( uint64_t(0xFFF) << 48 );
    int16x8_t acc = vreinterpretq_s16_u64( vdupq_n_u64( lanes ) );

    for( int j=0; j<4; j++ )
    {
        for( int i=0; i<4; i++ )
        {
            uint8x8_t px = vqshrun_n_s16( acc, 2 );
            vst1_lane_u32( dst+j*w+i, vreinterpret_u32_u8( px ), 0 );
            acc = vaddq_s16( acc, stepX );
        }
        acc = vaddq_s16( acc, stepY );
    }
#elif defined __AVX2__
    // One 256-bit register holds a whole row (four pixels of four lanes), so
    // only the vertical step has to be applied between rows.
    __m256i stepY = _mm256_setr_epi16( dvR, dvG, dvB, 0, dvR, dvG, dvB, 0, dvR, dvG, dvB, 0, dvR, dvG, dvB, 0 );
    __m256i acc = _mm256_setr_epi16( baseR, baseG, baseB, 0xFFF, baseR+dhR, baseG+dhG, baseB+dhB, 0xFFF, baseR+2*dhR, baseG+2*dhG, baseB+2*dhB, 0xFFF, baseR+3*dhR, baseG+3*dhG, baseB+3*dhB, 0xFFF );

    for( int j=0; j<4; j++ )
    {
        __m256i shifted = _mm256_srai_epi16( acc, 2 );
        __m128i packed = _mm_packus_epi16( _mm256_castsi256_si128( shifted ), _mm256_extracti128_si256( shifted, 1 ) );
        _mm_storeu_si128( (__m128i*)(dst+j*w), packed );
        acc = _mm256_add_epi16( acc, stepY );
    }
#elif defined __SSE4_1__
    __m128i stepX = _mm_setr_epi16( dhR, dhG, dhB, 0, 0, 0, 0, 0 );
    // After four horizontal steps, this brings the accumulator to the start of the next row.
    __m128i stepY = _mm_setr_epi16( dvR - 4 * dhR, dvG - 4 * dhG, dvB - 4 * dhB, 0, 0, 0, 0, 0 );
    __m128i acc = _mm_setr_epi16( baseR, baseG, baseB, 0xFFF, 0, 0, 0, 0 );

    for( int j=0; j<4; j++ )
    {
        for( int i=0; i<4; i++ )
        {
            __m128i shifted = _mm_srai_epi16( acc, 2 );
            __m128i packed = _mm_packus_epi16( shifted, shifted );
            dst[j*w+i] = _mm_cvtsi128_si32( packed );
            acc = _mm_add_epi16( acc, stepX );
        }
        acc = _mm_add_epi16( acc, stepY );
    }
#else
    for( int j=0; j<4; j++ )
    {
        for( int i=0; i<4; i++ )
        {
            const uint32_t r = (i * dhR + j * dvR + baseR) >> 2;
            const uint32_t g = (i * dhG + j * dvG + baseG) >> 2;
            const uint32_t b = (i * dhB + j * dvB + baseB) >> 2;
            if( ( ( r | g | b ) & ~0xFF ) != 0 )
            {
                // At least one channel left the 0..255 range.
                const auto rc = clampu8( r );
                const auto gc = clampu8( g );
                const auto bc = clampu8( b );
                dst[j*w+i] = rc | ( gc << 8 ) | ( bc << 16 ) | 0xFF000000;
                continue;
            }
            dst[j*w+i] = r | ( g << 8 ) | ( b << 16 ) | 0xFF000000;
        }
    }
#endif
}
339
340
// Decodes a block stored in planar mode into a 4x4 tile, taking each pixel's
// alpha from a separate 64-bit alpha word. Three colours are read from the
// block (named "o", "h" and "v" below) and every pixel (i, j) is computed per
// channel as
//     ( i * (h - o) + j * (v - o) + 4 * o + 2 ) >> 2
// and limited to the 0..255 range.
//   block - 64 colour block bits as handed over by DecodeRGBAPart
//   alpha - alpha word: bits 56-63 base value, bits 52-55 multiplier,
//           bits 48-51 row selector into g_alpha, bits 0-47 sixteen 3-bit
//           per-pixel selectors
//   dst   - first pixel of the destination tile
//   w     - row pitch of dst, counted in pixels
// Every written pixel has red in bits 0-7, green in bits 8-15, blue in bits
// 16-23 and the decoded alpha in bits 24-31.
static etcpak_force_inline void DecodePlanarAlpha( uint64_t block, uint64_t alpha, uint32_t* dst, uint32_t w )
{
    // "v" colour and the blue/green parts of "h" live in the upper 32 bits.
    const auto vB = expand6((block >> 32) & 0x3F);
    const auto vG = expand7((block >> 38) & 0x7F);
    const auto vR = expand6((block >> 45) & 0x3F);

    const auto hB = expand6((block >> 51) & 0x3F);
    const auto hG = expand7((block >> 57) & 0x7F);

    // Red of "h" is assembled from bit 0 and bits 2-6 of the lower word.
    const auto hRlow = block & 0x01;
    const auto hRhigh = ((block >> 2) & 0x1F) << 1;
    const auto hR = expand6(hRlow | hRhigh);

    // Blue of "o" is assembled from three pieces, green from two.
    const auto oBa = (block >> 7) & 0x07;
    const auto oBb = ((block >> 11) & 0x3) << 3;
    const auto oBc = ((block >> 16) & 0x1) << 5;
    const auto oB = expand6(oBa | oBb | oBc);
    const auto oGa = (block >> 17) & 0x3F;
    const auto oGb = ((block >> 24) & 0x01) << 6;
    const auto oG = expand7(oGa | oGb);
    const auto oR = expand6((block >> 25) & 0x3F);

    // Header fields of the alpha word.
    const int32_t alphaBase = alpha >> 56;
    const int32_t alphaMul = ( alpha >> 52 ) & 0xF;
    const auto alphaRow = g_alpha[( alpha >> 48 ) & 0xF];

    // Per-channel steps along a row (dh*) and along a column (dv*), plus the
    // value of pixel (0,0) before the final shift.
    const int32_t dhR = hR - oR;
    const int32_t dhG = hG - oG;
    const int32_t dhB = hB - oB;
    const int32_t dvR = vR - oR;
    const int32_t dvG = vG - oG;
    const int32_t dvB = vB - oB;
    const int32_t baseR = 4 * oR + 2;
    const int32_t baseG = 4 * oG + 2;
    const int32_t baseB = 4 * oB + 2;

#ifdef __ARM_NEON
    // Lanes 0-2 carry R, G, B; lane 3 stays zero so alpha can be OR-ed in.
    uint64_t lanes = uint64_t(uint16_t(dhR)) | ( uint64_t(uint16_t(dhG)) << 16 ) | ( uint64_t(uint16_t(dhB)) << 32 );
    int16x8_t stepX = vreinterpretq_s16_u64( vdupq_n_u64( lanes ) );
    // After four horizontal steps, this brings the accumulator to the start of the next row.
    lanes = uint64_t(uint16_t( dvR - 4 * dhR )) | ( uint64_t(uint16_t( dvG - 4 * dhG )) << 16 ) | ( uint64_t(uint16_t( dvB - 4 * dhB )) << 32 );
    int16x8_t stepY = vreinterpretq_s16_u64( vdupq_n_u64( lanes ) );
    lanes = uint64_t(baseR) | ( uint64_t(baseG) << 16 ) | ( uint64_t(baseB) << 32 );
    int16x8_t acc = vreinterpretq_s16_u64( vdupq_n_u64( lanes ) );

    for( int j=0; j<4; j++ )
    {
        for( int i=0; i<4; i++ )
        {
            const auto alphaMod = alphaRow[(alpha >> ( 45 - j*3 - i*12 )) & 0x7];
            const uint32_t a = clampu8( alphaBase + alphaMod * alphaMul );
            uint8x8_t px = vqshrun_n_s16( acc, 2 );
            dst[j*w+i] = vget_lane_u32( vreinterpret_u32_u8( px ), 0 ) | ( a << 24 );
            acc = vaddq_s16( acc, stepX );
        }
        acc = vaddq_s16( acc, stepY );
    }
#elif defined __SSE4_1__
    // Lanes 0-2 carry R, G, B; lane 3 stays zero so alpha can be OR-ed in.
    __m128i stepX = _mm_setr_epi16( dhR, dhG, dhB, 0, 0, 0, 0, 0 );
    // After four horizontal steps, this brings the accumulator to the start of the next row.
    __m128i stepY = _mm_setr_epi16( dvR - 4 * dhR, dvG - 4 * dhG, dvB - 4 * dhB, 0, 0, 0, 0, 0 );
    __m128i acc = _mm_setr_epi16( baseR, baseG, baseB, 0, 0, 0, 0, 0 );

    for( int j=0; j<4; j++ )
    {
        for( int i=0; i<4; i++ )
        {
            const auto alphaMod = alphaRow[(alpha >> ( 45 - j*3 - i*12 )) & 0x7];
            const uint32_t a = clampu8( alphaBase + alphaMod * alphaMul );
            __m128i shifted = _mm_srai_epi16( acc, 2 );
            __m128i packed = _mm_packus_epi16( shifted, shifted );
            dst[j*w+i] = _mm_cvtsi128_si32( packed ) | ( a << 24 );
            acc = _mm_add_epi16( acc, stepX );
        }
        acc = _mm_add_epi16( acc, stepY );
    }
#else
    for( int j = 0; j < 4; j++ )
    {
        for( int i = 0; i < 4; i++ )
        {
            const uint32_t r = (i * dhR + j * dvR + baseR) >> 2;
            const uint32_t g = (i * dhG + j * dvG + baseG) >> 2;
            const uint32_t b = (i * dhB + j * dvB + baseB) >> 2;
            const auto alphaMod = alphaRow[(alpha >> ( 45 - j*3 - i*12 )) & 0x7];
            const uint32_t a = clampu8( alphaBase + alphaMod * alphaMul );
            if( ( ( r | g | b ) & ~0xFF ) != 0 )
            {
                // At least one channel left the 0..255 range.
                const auto rc = clampu8( r );
                const auto gc = clampu8( g );
                const auto bc = clampu8( b );
                dst[j*w+i] = rc | ( gc << 8 ) | ( bc << 16 ) | ( a << 24 );
                continue;
            }
            dst[j*w+i] = r | ( g << 8 ) | ( b << 16 ) | ( a << 24 );
        }
    }
#endif
}
429
}
430
431
// Reverses the byte order inside each 32-bit half of a 64-bit value. The two
// halves themselves stay where they are in memory.
//   d - value to convert
// Returns the converted value.
static etcpak_force_inline uint64_t ConvertByteOrder( uint64_t d )
{
    uint32_t halves[2];
    memcpy( halves, &d, sizeof( halves ) );
    for( auto& half : halves )
    {
        half = _bswap( half );
    }
    uint64_t converted;
    memcpy( &converted, halves, sizeof( converted ) );
    return converted;
}
440
441
// Decodes one 64-bit colour block into a 4x4 tile of opaque pixels.
//   d   - block bits exactly as loaded from memory; they are run through
//         ConvertByteOrder before any field is read
//   dst - first pixel of the destination tile
//   w   - row pitch of dst, counted in pixels
// Every written pixel has red in bits 0-7, green in bits 8-15, blue in bits
// 16-23 and 0xFF in bits 24-31.
//
// When bit 1 of the converted block is set, the block holds a 5-bit base
// colour plus a signed 3-bit delta per channel; a red, green or blue sum
// outside 0..31 hands the block to DecodeT, DecodeH or DecodePlanar
// respectively. Otherwise the block holds two independent 4-bit colours.
static etcpak_force_inline void DecodeRGBPart( uint64_t d, uint32_t* dst, uint32_t w )
{
    d = ConvertByteOrder( d );

    // 8-bit base colours of the two halves of the tile.
    uint32_t br[2], bg[2], bb[2];

    if( d & 0x2 )
    {
        const uint32_t r0 = ( d & 0xF8000000 ) >> 27;
        const uint32_t g0 = ( d & 0x00F80000 ) >> 19;
        const uint32_t b0 = ( d & 0x0000F800 ) >> 11;

        // Sign-extend the three 3-bit deltas. The left shift is performed on
        // an unsigned value: left-shifting a negative signed integer is
        // undefined behaviour before C++20, and bit 31 of the block is set
        // whenever r0 >= 16.
        const uint32_t low = uint32_t( d );
        const int32_t dr = int32_t( low << 5 ) >> 29;
        const int32_t dg = int32_t( low << 13 ) >> 29;
        const int32_t db = int32_t( low << 21 ) >> 29;

        const int32_t r1 = int32_t(r0) + dr;
        const int32_t g1 = int32_t(g0) + dg;
        const int32_t b1 = int32_t(b0) + db;

        // T mode
        if ( (r1 < 0) || (r1 > 31) )
        {
            DecodeT( d, dst, w );
            return;
        }

        // H mode
        if ( (g1 < 0) || (g1 > 31) )
        {
            DecodeH( d, dst, w );
            return;
        }

        // P mode
        if( (b1 < 0) || (b1 > 31) )
        {
            DecodePlanar( d, dst, w );
            return;
        }

        // Widen 5-bit channels to 8 bits by repeating the top three bits.
        br[0] = ( r0 << 3 ) | ( r0 >> 2 );
        br[1] = ( r1 << 3 ) | ( r1 >> 2 );
        bg[0] = ( g0 << 3 ) | ( g0 >> 2 );
        bg[1] = ( g1 << 3 ) | ( g1 >> 2 );
        bb[0] = ( b0 << 3 ) | ( b0 >> 2 );
        bb[1] = ( b1 << 3 ) | ( b1 >> 2 );
    }
    else
    {
        // Two independent 4-bit colours, widened by repeating the nibble.
        br[0] = ( ( d & 0xF0000000 ) >> 24 ) | ( ( d & 0xF0000000 ) >> 28 );
        br[1] = ( ( d & 0x0F000000 ) >> 20 ) | ( ( d & 0x0F000000 ) >> 24 );
        bg[0] = ( ( d & 0x00F00000 ) >> 16 ) | ( ( d & 0x00F00000 ) >> 20 );
        bg[1] = ( ( d & 0x000F0000 ) >> 12 ) | ( ( d & 0x000F0000 ) >> 16 );
        bb[0] = ( ( d & 0x0000F000 ) >> 8 ) | ( ( d & 0x0000F000 ) >> 12 );
        bb[1] = ( ( d & 0x00000F00 ) >> 4 ) | ( ( d & 0x00000F00 ) >> 8 );
    }

    // Modifier-table row for each half of the tile.
    unsigned int tcw[2];
    tcw[0] = ( d & 0xE0 ) >> 5;
    tcw[1] = ( d & 0x1C ) >> 2;

    // The per-pixel selectors arrive as two 16-bit planes. Spread each plane
    // so its bits occupy every other position, then interleave them so that
    // idx holds sixteen 2-bit selectors, consumed from the low end.
    uint32_t lowPlane = ( d >> 32 ) & 0xFFFF;
    uint32_t highPlane = ( d >> 48 );

    lowPlane = ( lowPlane | ( lowPlane << 8 ) ) & 0x00FF00FF;
    lowPlane = ( lowPlane | ( lowPlane << 4 ) ) & 0x0F0F0F0F;
    lowPlane = ( lowPlane | ( lowPlane << 2 ) ) & 0x33333333;
    lowPlane = ( lowPlane | ( lowPlane << 1 ) ) & 0x55555555;

    highPlane = ( highPlane | ( highPlane << 8 ) ) & 0x00FF00FF;
    highPlane = ( highPlane | ( highPlane << 4 ) ) & 0x0F0F0F0F;
    highPlane = ( highPlane | ( highPlane << 2 ) ) & 0x33333333;
    highPlane = ( highPlane | ( highPlane << 1 ) ) & 0x55555555;

    uint32_t idx = lowPlane | ( highPlane << 1 );

    if( d & 0x1 )
    {
        // Bit 0 set: the base colour and table row change with the row j
        // (rows 0-1 use entry 0, rows 2-3 use entry 1).
        for( int i=0; i<4; i++ )
        {
            for( int j=0; j<4; j++ )
            {
                const auto mod = g_table[tcw[j/2]][idx & 0x3];
                const auto r = br[j/2] + mod;
                const auto g = bg[j/2] + mod;
                const auto b = bb[j/2] + mod;
                if( ( ( r | g | b ) & ~0xFF ) == 0 )
                {
                    dst[j*w+i] = r | ( g << 8 ) | ( b << 16 ) | 0xFF000000;
                }
                else
                {
                    // At least one channel left the 0..255 range.
                    const auto rc = clampu8( r );
                    const auto gc = clampu8( g );
                    const auto bc = clampu8( b );
                    dst[j*w+i] = rc | ( gc << 8 ) | ( bc << 16 ) | 0xFF000000;
                }
                idx >>= 2;
            }
        }
    }
    else
    {
        // Bit 0 clear: the base colour and table row change with the column i
        // (columns 0-1 use entry 0, columns 2-3 use entry 1).
        for( int i=0; i<4; i++ )
        {
            const auto tbl = g_table[tcw[i/2]];
            const auto cr = br[i/2];
            const auto cg = bg[i/2];
            const auto cb = bb[i/2];

            for( int j=0; j<4; j++ )
            {
                const auto mod = tbl[idx & 0x3];
                const auto r = cr + mod;
                const auto g = cg + mod;
                const auto b = cb + mod;
                if( ( ( r | g | b ) & ~0xFF ) == 0 )
                {
                    dst[j*w+i] = r | ( g << 8 ) | ( b << 16 ) | 0xFF000000;
                }
                else
                {
                    // At least one channel left the 0..255 range.
                    const auto rc = clampu8( r );
                    const auto gc = clampu8( g );
                    const auto bc = clampu8( b );
                    dst[j*w+i] = rc | ( gc << 8 ) | ( bc << 16 ) | 0xFF000000;
                }
                idx >>= 2;
            }
        }
    }
}
576
577
// Decodes one 64-bit colour block together with one 64-bit alpha block into
// a 4x4 tile of pixels.
//   d     - colour block bits exactly as loaded from memory; they are run
//           through ConvertByteOrder before any field is read
//   alpha - alpha block bits exactly as loaded from memory; they are
//           byte-swapped as one 64-bit value before any field is read
//   dst   - first pixel of the destination tile
//   w     - row pitch of dst, counted in pixels
// Every written pixel has red in bits 0-7, green in bits 8-15, blue in bits
// 16-23 and the decoded alpha in bits 24-31.
//
// When bit 1 of the converted colour block is set, the block holds a 5-bit
// base colour plus a signed 3-bit delta per channel; a red, green or blue
// sum outside 0..31 hands the block to DecodeTAlpha, DecodeHAlpha or
// DecodePlanarAlpha respectively. Otherwise the block holds two independent
// 4-bit colours.
static etcpak_force_inline void DecodeRGBAPart( uint64_t d, uint64_t alpha, uint32_t* dst, uint32_t w )
{
    d = ConvertByteOrder( d );
    alpha = _bswap64( alpha );

    // 8-bit base colours of the two halves of the tile.
    uint32_t br[2], bg[2], bb[2];

    if( d & 0x2 )
    {
        const uint32_t r0 = ( d & 0xF8000000 ) >> 27;
        const uint32_t g0 = ( d & 0x00F80000 ) >> 19;
        const uint32_t b0 = ( d & 0x0000F800 ) >> 11;

        // Sign-extend the three 3-bit deltas. The left shift is performed on
        // an unsigned value: left-shifting a negative signed integer is
        // undefined behaviour before C++20, and bit 31 of the block is set
        // whenever r0 >= 16.
        const uint32_t low = uint32_t( d );
        const int32_t dr = int32_t( low << 5 ) >> 29;
        const int32_t dg = int32_t( low << 13 ) >> 29;
        const int32_t db = int32_t( low << 21 ) >> 29;

        const int32_t r1 = int32_t(r0) + dr;
        const int32_t g1 = int32_t(g0) + dg;
        const int32_t b1 = int32_t(b0) + db;

        // T mode
        if ( (r1 < 0) || (r1 > 31) )
        {
            DecodeTAlpha( d, alpha, dst, w );
            return;
        }

        // H mode
        if ( (g1 < 0) || (g1 > 31) )
        {
            DecodeHAlpha( d, alpha, dst, w );
            return;
        }

        // P mode
        if ( (b1 < 0) || (b1 > 31) )
        {
            DecodePlanarAlpha( d, alpha, dst, w );
            return;
        }

        // Widen 5-bit channels to 8 bits by repeating the top three bits.
        br[0] = ( r0 << 3 ) | ( r0 >> 2 );
        br[1] = ( r1 << 3 ) | ( r1 >> 2 );
        bg[0] = ( g0 << 3 ) | ( g0 >> 2 );
        bg[1] = ( g1 << 3 ) | ( g1 >> 2 );
        bb[0] = ( b0 << 3 ) | ( b0 >> 2 );
        bb[1] = ( b1 << 3 ) | ( b1 >> 2 );
    }
    else
    {
        // Two independent 4-bit colours, widened by repeating the nibble.
        br[0] = ( ( d & 0xF0000000 ) >> 24 ) | ( ( d & 0xF0000000 ) >> 28 );
        br[1] = ( ( d & 0x0F000000 ) >> 20 ) | ( ( d & 0x0F000000 ) >> 24 );
        bg[0] = ( ( d & 0x00F00000 ) >> 16 ) | ( ( d & 0x00F00000 ) >> 20 );
        bg[1] = ( ( d & 0x000F0000 ) >> 12 ) | ( ( d & 0x000F0000 ) >> 16 );
        bb[0] = ( ( d & 0x0000F000 ) >> 8 ) | ( ( d & 0x0000F000 ) >> 12 );
        bb[1] = ( ( d & 0x00000F00 ) >> 4 ) | ( ( d & 0x00000F00 ) >> 8 );
    }

    // Modifier-table row for each half of the tile.
    unsigned int tcw[2];
    tcw[0] = ( d & 0xE0 ) >> 5;
    tcw[1] = ( d & 0x1C ) >> 2;

    // The per-pixel selectors arrive as two 16-bit planes. Spread each plane
    // so its bits occupy every other position, then interleave them so that
    // idx holds sixteen 2-bit selectors, consumed from the low end.
    uint32_t lowPlane = ( d >> 32 ) & 0xFFFF;
    uint32_t highPlane = ( d >> 48 );

    lowPlane = ( lowPlane | ( lowPlane << 8 ) ) & 0x00FF00FF;
    lowPlane = ( lowPlane | ( lowPlane << 4 ) ) & 0x0F0F0F0F;
    lowPlane = ( lowPlane | ( lowPlane << 2 ) ) & 0x33333333;
    lowPlane = ( lowPlane | ( lowPlane << 1 ) ) & 0x55555555;

    highPlane = ( highPlane | ( highPlane << 8 ) ) & 0x00FF00FF;
    highPlane = ( highPlane | ( highPlane << 4 ) ) & 0x0F0F0F0F;
    highPlane = ( highPlane | ( highPlane << 2 ) ) & 0x33333333;
    highPlane = ( highPlane | ( highPlane << 1 ) ) & 0x55555555;

    uint32_t idx = lowPlane | ( highPlane << 1 );

    // Header fields of the alpha word: base value, multiplier, g_alpha row.
    const int32_t base = alpha >> 56;
    const int32_t mul = ( alpha >> 52 ) & 0xF;
    const auto atbl = g_alpha[( alpha >> 48 ) & 0xF];

    if( d & 0x1 )
    {
        // Bit 0 set: the base colour and table row change with the row j
        // (rows 0-1 use entry 0, rows 2-3 use entry 1).
        for( int i=0; i<4; i++ )
        {
            for( int j=0; j<4; j++ )
            {
                const auto mod = g_table[tcw[j/2]][idx & 0x3];
                const auto r = br[j/2] + mod;
                const auto g = bg[j/2] + mod;
                const auto b = bb[j/2] + mod;
                // 3-bit alpha selector for this pixel, then base + modifier * multiplier.
                const auto amod = atbl[(alpha >> ( 45 - j*3 - i*12 )) & 0x7];
                const uint32_t a = clampu8( base + amod * mul );
                if( ( ( r | g | b ) & ~0xFF ) == 0 )
                {
                    dst[j*w+i] = r | ( g << 8 ) | ( b << 16 ) | ( a << 24 );
                }
                else
                {
                    // At least one channel left the 0..255 range.
                    const auto rc = clampu8( r );
                    const auto gc = clampu8( g );
                    const auto bc = clampu8( b );
                    dst[j*w+i] = rc | ( gc << 8 ) | ( bc << 16 ) | ( a << 24 );
                }
                idx >>= 2;
            }
        }
    }
    else
    {
        // Bit 0 clear: the base colour and table row change with the column i
        // (columns 0-1 use entry 0, columns 2-3 use entry 1).
        for( int i=0; i<4; i++ )
        {
            const auto tbl = g_table[tcw[i/2]];
            const auto cr = br[i/2];
            const auto cg = bg[i/2];
            const auto cb = bb[i/2];

            for( int j=0; j<4; j++ )
            {
                const auto mod = tbl[idx & 0x3];
                const auto r = cr + mod;
                const auto g = cg + mod;
                const auto b = cb + mod;
                // 3-bit alpha selector for this pixel, then base + modifier * multiplier.
                const auto amod = atbl[(alpha >> ( 45 - j*3 - i*12 )) & 0x7];
                const uint32_t a = clampu8( base + amod * mul );
                if( ( ( r | g | b ) & ~0xFF ) == 0 )
                {
                    dst[j*w+i] = r | ( g << 8 ) | ( b << 16 ) | ( a << 24 );
                }
                else
                {
                    // At least one channel left the 0..255 range.
                    const auto rc = clampu8( r );
                    const auto gc = clampu8( g );
                    const auto bc = clampu8( b );
                    dst[j*w+i] = rc | ( gc << 8 ) | ( bc << 16 ) | ( a << 24 );
                }
                idx >>= 2;
            }
        }
    }
}
721
722
// Decodes one 64-bit single-channel block into a 4x4 tile of pixels.
//   r   - block bits exactly as loaded from memory; they are byte-swapped as
//         one 64-bit value before any field is read
//   dst - first pixel of the destination tile
//   w   - row pitch of dst, counted in pixels
// Every written pixel has the decoded value in bits 0-7, zero in bits 8-23
// and 0xFF in bits 24-31.
static etcpak_force_inline void DecodeRPart( uint64_t r, uint32_t* dst, uint32_t w )
{
    const uint64_t bits = _bswap64( r );

    // Header: bits 56-63 base (scaled by 8 and offset by 4), bits 52-55
    // multiplier index into g_alpha11Mul, bits 48-51 row of g_alpha.
    const int32_t center = ( bits >> 56 )*8+4;
    const int32_t mulIdx = ( bits >> 52 ) & 0xF;
    const auto modifiers = g_alpha[( bits >> 48 ) & 0xF];
    const auto scale = g_alpha11Mul[mulIdx];

    // Sixteen 3-bit selectors follow, column by column, starting at bit 45;
    // pixel (i, j) sits at bit 45 - j*3 - i*12.
    int shift = 45;
    for( int i=0; i<4; i++ )
    {
        for( int j=0; j<4; j++ )
        {
            const auto step = modifiers[(bits >> shift) & 0x7];
            const uint32_t value = clampu8( ( center + step * scale )/8 );
            dst[j*w+i] = value | 0xFF000000;
            shift -= 3;
        }
    }
}
740
741
// Decodes two 64-bit single-channel blocks into a 4x4 tile of pixels, one
// block per output channel.
//   r   - block for the first channel, exactly as loaded from memory
//   g   - block for the second channel, exactly as loaded from memory
//   dst - first pixel of the destination tile
//   w   - row pitch of dst, counted in pixels
// Both blocks are byte-swapped as 64-bit values before any field is read.
// Every written pixel has the first channel in bits 0-7, the second channel
// in bits 8-15, zero in bits 16-23 and 0xFF in bits 24-31.
static etcpak_force_inline void DecodeRGPart( uint64_t r, uint64_t g, uint32_t* dst, uint32_t w )
{
    const uint64_t rBits = _bswap64( r );
    const uint64_t gBits = _bswap64( g );

    // Header of each block: bits 56-63 base (scaled by 8 and offset by 4),
    // bits 52-55 multiplier index into g_alpha11Mul, bits 48-51 row of g_alpha.
    const int32_t rCenter = ( rBits >> 56 )*8+4;
    const int32_t rMulIdx = ( rBits >> 52 ) & 0xF;
    const auto rModifiers = g_alpha[( rBits >> 48 ) & 0xF];
    const auto rScale = g_alpha11Mul[rMulIdx];

    const int32_t gCenter = ( gBits >> 56 )*8+4;
    const int32_t gMulIdx = ( gBits >> 52 ) & 0xF;
    const auto gModifiers = g_alpha[( gBits >> 48 ) & 0xF];
    const auto gScale = g_alpha11Mul[gMulIdx];

    // Sixteen 3-bit selectors follow, column by column, starting at bit 45;
    // pixel (i, j) sits at bit 45 - j*3 - i*12.
    int shift = 45;
    for( int i=0; i<4; i++ )
    {
        for( int j=0; j<4; j++ )
        {
            const auto rStep = rModifiers[(rBits >> shift) & 0x7];
            const uint32_t rValue = clampu8( ( rCenter + rStep * rScale )/8 );

            const auto gStep = gModifiers[(gBits >> shift) & 0x7];
            const uint32_t gValue = clampu8( ( gCenter + gStep * gScale )/8 );

            dst[j*w+i] = rValue | (gValue << 8) | 0xFF000000;
            shift -= 3;
        }
    }
}
768
769
void DecodeRBlock( const void* src, void* dst, size_t width )
770
{
771
uint64_t* srcPtr = (uint64_t*)src;
772
uint64_t r = *srcPtr++;
773
DecodeRPart( r, (uint32_t*)dst, width );
774
}
775
776
void DecodeRGBlock( const void* src, void* dst, size_t width )
777
{
778
uint64_t* srcPtr = (uint64_t*)src;
779
uint64_t r = *srcPtr++;
780
uint64_t g = *srcPtr++;
781
DecodeRGPart( r, g, (uint32_t*)dst, width );
782
}
783
784
void DecodeRGBBlock( const void* src, void* dst, size_t width )
785
{
786
uint64_t* srcPtr = (uint64_t*)src;
787
uint64_t d = *srcPtr++;
788
DecodeRGBPart( d, (uint32_t*)dst, width );
789
}
790
791
void DecodeRGBABlock( const void* src, void* dst, size_t width )
792
{
793
uint64_t* srcPtr = (uint64_t*)src;
794
uint64_t a = *srcPtr++;
795
uint64_t d = *srcPtr++;
796
DecodeRGBAPart( d, a, (uint32_t*)dst, width );
797
}
798
799