GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/etcpak/ProcessDxtc.cpp
#include "Dither.hpp"
#include "ForceInline.hpp"
#include "ProcessDxtc.hpp"

#include <assert.h>
#include <stdint.h>
#include <string.h>

#ifdef __ARM_NEON
#  include <arm_neon.h>
#endif

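// An AVX build implies SSE4.1 support, so define the macro manually when it is
// missing (chiefly relevant for MSVC, which defines __AVX__ but never __SSE4_1__)
// to enable the SSE4.1 code paths below.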
#if defined __AVX__ && !defined __SSE4_1__
#  define __SSE4_1__
#endif

#if defined __SSE4_1__ || defined __AVX2__
#  ifdef _MSC_VER
#    include <intrin.h>
#  else
#    include <x86intrin.h>
#    ifndef _mm256_cvtsi256_si32
#      define _mm256_cvtsi256_si32( v ) ( _mm_cvtsi128_si32( _mm256_castsi256_si128( v ) ) )
#    endif
#  endif
#endif

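// Quantize 8-bit RGB to a packed RGB565 value (5 bits red, 6 green, 5 blue). The
// second overload takes a pixel already packed as 0x00BBGGRR in a 32-bit word.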
static etcpak_force_inline uint16_t to565( uint8_t r, uint8_t g, uint8_t b )
{
    return ( ( r & 0xF8 ) << 8 ) | ( ( g & 0xFC ) << 3 ) | ( b >> 3 );
}

static etcpak_force_inline uint16_t to565( uint32_t c )
{
    return
        ( ( c & 0xF80000 ) >> 19 ) |
        ( ( c & 0x00FC00 ) >> 5 ) |
        ( ( c & 0x0000F8 ) << 8 );
}

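// Remaps a byte of four packed 2-bit selectors from the internal ordering produced
// by the kernels below (0 = nearest the min endpoint .. 3 = nearest the max) to the
// BC1 on-disk ordering (0 = color0, 1 = color1, 2/3 = the interpolants), fixing a
// whole block row per lookup.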
static const uint8_t DxtcIndexTable[256] = {
    85, 87, 86, 84, 93, 95, 94, 92, 89, 91, 90, 88, 81, 83, 82, 80,
    117, 119, 118, 116, 125, 127, 126, 124, 121, 123, 122, 120, 113, 115, 114, 112,
    101, 103, 102, 100, 109, 111, 110, 108, 105, 107, 106, 104, 97, 99, 98, 96,
    69, 71, 70, 68, 77, 79, 78, 76, 73, 75, 74, 72, 65, 67, 66, 64,
    213, 215, 214, 212, 221, 223, 222, 220, 217, 219, 218, 216, 209, 211, 210, 208,
    245, 247, 246, 244, 253, 255, 254, 252, 249, 251, 250, 248, 241, 243, 242, 240,
    229, 231, 230, 228, 237, 239, 238, 236, 233, 235, 234, 232, 225, 227, 226, 224,
    197, 199, 198, 196, 205, 207, 206, 204, 201, 203, 202, 200, 193, 195, 194, 192,
    149, 151, 150, 148, 157, 159, 158, 156, 153, 155, 154, 152, 145, 147, 146, 144,
    181, 183, 182, 180, 189, 191, 190, 188, 185, 187, 186, 184, 177, 179, 178, 176,
    165, 167, 166, 164, 173, 175, 174, 172, 169, 171, 170, 168, 161, 163, 162, 160,
    133, 135, 134, 132, 141, 143, 142, 140, 137, 139, 138, 136, 129, 131, 130, 128,
    21, 23, 22, 20, 29, 31, 30, 28, 25, 27, 26, 24, 17, 19, 18, 16,
    53, 55, 54, 52, 61, 63, 62, 60, 57, 59, 58, 56, 49, 51, 50, 48,
    37, 39, 38, 36, 45, 47, 46, 44, 41, 43, 42, 40, 33, 35, 34, 32,
    5, 7, 6, 4, 13, 15, 14, 12, 9, 11, 10, 8, 1, 3, 2, 0
};

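// Remaps a byte holding two packed 3-bit alpha selectors (internal min..max scale)
// to the BC3/BC4 numbering, where selector 0 picks the max endpoint and 1 the min.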
static const uint8_t AlphaIndexTable_SSE[64] = {
    9, 15, 14, 13, 12, 11, 10, 8, 57, 63, 62, 61, 60, 59, 58, 56,
    49, 55, 54, 53, 52, 51, 50, 48, 41, 47, 46, 45, 44, 43, 42, 40,
    33, 39, 38, 37, 36, 35, 34, 32, 25, 31, 30, 29, 28, 27, 26, 24,
    17, 23, 22, 21, 20, 19, 18, 16, 1, 7, 6, 5, 4, 3, 2, 0,
};

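// Fixed-point reciprocals: DivTable[i] ~= min( 0xFFFF, ( 4 << 16 ) / ( i + 1 ) ).
// Indexed by the block's summed per-channel range (up to 255*3), so that
// ( dist * DivTable[range] ) >> 16 turns a pixel's distance from the minimum into
// a 2-bit selector 0..3 without a division.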
static const uint16_t DivTable[255*3+1] = {
    0xffff, 0xffff, 0xffff, 0xffff, 0xcccc, 0xaaaa, 0x9249, 0x8000, 0x71c7, 0x6666, 0x5d17, 0x5555, 0x4ec4, 0x4924, 0x4444, 0x4000,
    0x3c3c, 0x38e3, 0x35e5, 0x3333, 0x30c3, 0x2e8b, 0x2c85, 0x2aaa, 0x28f5, 0x2762, 0x25ed, 0x2492, 0x234f, 0x2222, 0x2108, 0x2000,
    0x1f07, 0x1e1e, 0x1d41, 0x1c71, 0x1bac, 0x1af2, 0x1a41, 0x1999, 0x18f9, 0x1861, 0x17d0, 0x1745, 0x16c1, 0x1642, 0x15c9, 0x1555,
    0x14e5, 0x147a, 0x1414, 0x13b1, 0x1352, 0x12f6, 0x129e, 0x1249, 0x11f7, 0x11a7, 0x115b, 0x1111, 0x10c9, 0x1084, 0x1041, 0x1000,
    0x0fc0, 0x0f83, 0x0f48, 0x0f0f, 0x0ed7, 0x0ea0, 0x0e6c, 0x0e38, 0x0e07, 0x0dd6, 0x0da7, 0x0d79, 0x0d4c, 0x0d20, 0x0cf6, 0x0ccc,
    0x0ca4, 0x0c7c, 0x0c56, 0x0c30, 0x0c0c, 0x0be8, 0x0bc5, 0x0ba2, 0x0b81, 0x0b60, 0x0b40, 0x0b21, 0x0b02, 0x0ae4, 0x0ac7, 0x0aaa,
    0x0a8e, 0x0a72, 0x0a57, 0x0a3d, 0x0a23, 0x0a0a, 0x09f1, 0x09d8, 0x09c0, 0x09a9, 0x0991, 0x097b, 0x0964, 0x094f, 0x0939, 0x0924,
    0x090f, 0x08fb, 0x08e7, 0x08d3, 0x08c0, 0x08ad, 0x089a, 0x0888, 0x0876, 0x0864, 0x0853, 0x0842, 0x0831, 0x0820, 0x0810, 0x0800,
    0x07f0, 0x07e0, 0x07d1, 0x07c1, 0x07b3, 0x07a4, 0x0795, 0x0787, 0x0779, 0x076b, 0x075d, 0x0750, 0x0743, 0x0736, 0x0729, 0x071c,
    0x070f, 0x0703, 0x06f7, 0x06eb, 0x06df, 0x06d3, 0x06c8, 0x06bc, 0x06b1, 0x06a6, 0x069b, 0x0690, 0x0685, 0x067b, 0x0670, 0x0666,
    0x065c, 0x0652, 0x0648, 0x063e, 0x0634, 0x062b, 0x0621, 0x0618, 0x060f, 0x0606, 0x05fd, 0x05f4, 0x05eb, 0x05e2, 0x05d9, 0x05d1,
    0x05c9, 0x05c0, 0x05b8, 0x05b0, 0x05a8, 0x05a0, 0x0598, 0x0590, 0x0588, 0x0581, 0x0579, 0x0572, 0x056b, 0x0563, 0x055c, 0x0555,
    0x054e, 0x0547, 0x0540, 0x0539, 0x0532, 0x052b, 0x0525, 0x051e, 0x0518, 0x0511, 0x050b, 0x0505, 0x04fe, 0x04f8, 0x04f2, 0x04ec,
    0x04e6, 0x04e0, 0x04da, 0x04d4, 0x04ce, 0x04c8, 0x04c3, 0x04bd, 0x04b8, 0x04b2, 0x04ad, 0x04a7, 0x04a2, 0x049c, 0x0497, 0x0492,
    0x048d, 0x0487, 0x0482, 0x047d, 0x0478, 0x0473, 0x046e, 0x0469, 0x0465, 0x0460, 0x045b, 0x0456, 0x0452, 0x044d, 0x0448, 0x0444,
    0x043f, 0x043b, 0x0436, 0x0432, 0x042d, 0x0429, 0x0425, 0x0421, 0x041c, 0x0418, 0x0414, 0x0410, 0x040c, 0x0408, 0x0404, 0x0400,
    0x03fc, 0x03f8, 0x03f4, 0x03f0, 0x03ec, 0x03e8, 0x03e4, 0x03e0, 0x03dd, 0x03d9, 0x03d5, 0x03d2, 0x03ce, 0x03ca, 0x03c7, 0x03c3,
    0x03c0, 0x03bc, 0x03b9, 0x03b5, 0x03b2, 0x03ae, 0x03ab, 0x03a8, 0x03a4, 0x03a1, 0x039e, 0x039b, 0x0397, 0x0394, 0x0391, 0x038e,
    0x038b, 0x0387, 0x0384, 0x0381, 0x037e, 0x037b, 0x0378, 0x0375, 0x0372, 0x036f, 0x036c, 0x0369, 0x0366, 0x0364, 0x0361, 0x035e,
    0x035b, 0x0358, 0x0355, 0x0353, 0x0350, 0x034d, 0x034a, 0x0348, 0x0345, 0x0342, 0x0340, 0x033d, 0x033a, 0x0338, 0x0335, 0x0333,
    0x0330, 0x032e, 0x032b, 0x0329, 0x0326, 0x0324, 0x0321, 0x031f, 0x031c, 0x031a, 0x0317, 0x0315, 0x0313, 0x0310, 0x030e, 0x030c,
    0x0309, 0x0307, 0x0305, 0x0303, 0x0300, 0x02fe, 0x02fc, 0x02fa, 0x02f7, 0x02f5, 0x02f3, 0x02f1, 0x02ef, 0x02ec, 0x02ea, 0x02e8,
    0x02e6, 0x02e4, 0x02e2, 0x02e0, 0x02de, 0x02dc, 0x02da, 0x02d8, 0x02d6, 0x02d4, 0x02d2, 0x02d0, 0x02ce, 0x02cc, 0x02ca, 0x02c8,
    0x02c6, 0x02c4, 0x02c2, 0x02c0, 0x02be, 0x02bc, 0x02bb, 0x02b9, 0x02b7, 0x02b5, 0x02b3, 0x02b1, 0x02b0, 0x02ae, 0x02ac, 0x02aa,
    0x02a8, 0x02a7, 0x02a5, 0x02a3, 0x02a1, 0x02a0, 0x029e, 0x029c, 0x029b, 0x0299, 0x0297, 0x0295, 0x0294, 0x0292, 0x0291, 0x028f,
    0x028d, 0x028c, 0x028a, 0x0288, 0x0287, 0x0285, 0x0284, 0x0282, 0x0280, 0x027f, 0x027d, 0x027c, 0x027a, 0x0279, 0x0277, 0x0276,
    0x0274, 0x0273, 0x0271, 0x0270, 0x026e, 0x026d, 0x026b, 0x026a, 0x0268, 0x0267, 0x0265, 0x0264, 0x0263, 0x0261, 0x0260, 0x025e,
    0x025d, 0x025c, 0x025a, 0x0259, 0x0257, 0x0256, 0x0255, 0x0253, 0x0252, 0x0251, 0x024f, 0x024e, 0x024d, 0x024b, 0x024a, 0x0249,
    0x0247, 0x0246, 0x0245, 0x0243, 0x0242, 0x0241, 0x0240, 0x023e, 0x023d, 0x023c, 0x023b, 0x0239, 0x0238, 0x0237, 0x0236, 0x0234,
    0x0233, 0x0232, 0x0231, 0x0230, 0x022e, 0x022d, 0x022c, 0x022b, 0x022a, 0x0229, 0x0227, 0x0226, 0x0225, 0x0224, 0x0223, 0x0222,
    0x0220, 0x021f, 0x021e, 0x021d, 0x021c, 0x021b, 0x021a, 0x0219, 0x0218, 0x0216, 0x0215, 0x0214, 0x0213, 0x0212, 0x0211, 0x0210,
    0x020f, 0x020e, 0x020d, 0x020c, 0x020b, 0x020a, 0x0209, 0x0208, 0x0207, 0x0206, 0x0205, 0x0204, 0x0203, 0x0202, 0x0201, 0x0200,
    0x01ff, 0x01fe, 0x01fd, 0x01fc, 0x01fb, 0x01fa, 0x01f9, 0x01f8, 0x01f7, 0x01f6, 0x01f5, 0x01f4, 0x01f3, 0x01f2, 0x01f1, 0x01f0,
    0x01ef, 0x01ee, 0x01ed, 0x01ec, 0x01eb, 0x01ea, 0x01e9, 0x01e9, 0x01e8, 0x01e7, 0x01e6, 0x01e5, 0x01e4, 0x01e3, 0x01e2, 0x01e1,
    0x01e0, 0x01e0, 0x01df, 0x01de, 0x01dd, 0x01dc, 0x01db, 0x01da, 0x01da, 0x01d9, 0x01d8, 0x01d7, 0x01d6, 0x01d5, 0x01d4, 0x01d4,
    0x01d3, 0x01d2, 0x01d1, 0x01d0, 0x01cf, 0x01cf, 0x01ce, 0x01cd, 0x01cc, 0x01cb, 0x01cb, 0x01ca, 0x01c9, 0x01c8, 0x01c7, 0x01c7,
    0x01c6, 0x01c5, 0x01c4, 0x01c3, 0x01c3, 0x01c2, 0x01c1, 0x01c0, 0x01c0, 0x01bf, 0x01be, 0x01bd, 0x01bd, 0x01bc, 0x01bb, 0x01ba,
    0x01ba, 0x01b9, 0x01b8, 0x01b7, 0x01b7, 0x01b6, 0x01b5, 0x01b4, 0x01b4, 0x01b3, 0x01b2, 0x01b2, 0x01b1, 0x01b0, 0x01af, 0x01af,
    0x01ae, 0x01ad, 0x01ad, 0x01ac, 0x01ab, 0x01aa, 0x01aa, 0x01a9, 0x01a8, 0x01a8, 0x01a7, 0x01a6, 0x01a6, 0x01a5, 0x01a4, 0x01a4,
    0x01a3, 0x01a2, 0x01a2, 0x01a1, 0x01a0, 0x01a0, 0x019f, 0x019e, 0x019e, 0x019d, 0x019c, 0x019c, 0x019b, 0x019a, 0x019a, 0x0199,
    0x0198, 0x0198, 0x0197, 0x0197, 0x0196, 0x0195, 0x0195, 0x0194, 0x0193, 0x0193, 0x0192, 0x0192, 0x0191, 0x0190, 0x0190, 0x018f,
    0x018f, 0x018e, 0x018d, 0x018d, 0x018c, 0x018b, 0x018b, 0x018a, 0x018a, 0x0189, 0x0189, 0x0188, 0x0187, 0x0187, 0x0186, 0x0186,
    0x0185, 0x0184, 0x0184, 0x0183, 0x0183, 0x0182, 0x0182, 0x0181, 0x0180, 0x0180, 0x017f, 0x017f, 0x017e, 0x017e, 0x017d, 0x017d,
    0x017c, 0x017b, 0x017b, 0x017a, 0x017a, 0x0179, 0x0179, 0x0178, 0x0178, 0x0177, 0x0177, 0x0176, 0x0175, 0x0175, 0x0174, 0x0174,
    0x0173, 0x0173, 0x0172, 0x0172, 0x0171, 0x0171, 0x0170, 0x0170, 0x016f, 0x016f, 0x016e, 0x016e, 0x016d, 0x016d, 0x016c, 0x016c,
    0x016b, 0x016b, 0x016a, 0x016a, 0x0169, 0x0169, 0x0168, 0x0168, 0x0167, 0x0167, 0x0166, 0x0166, 0x0165, 0x0165, 0x0164, 0x0164,
    0x0163, 0x0163, 0x0162, 0x0162, 0x0161, 0x0161, 0x0160, 0x0160, 0x015f, 0x015f, 0x015e, 0x015e, 0x015d, 0x015d, 0x015d, 0x015c,
    0x015c, 0x015b, 0x015b, 0x015a, 0x015a, 0x0159, 0x0159, 0x0158, 0x0158, 0x0158, 0x0157, 0x0157, 0x0156, 0x0156
};
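
// NEON variant, halved because vqdmulhq_s16 doubles the product:
// DivTableNEON[i] ~= ( 2 << 16 ) / ( i + 1 ). The first 17 entries are zero, so
// blocks with a very small range quantize every pixel to selector 0.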
static const uint16_t DivTableNEON[255*3+1] = {
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x1c71, 0x1af2, 0x1999, 0x1861, 0x1745, 0x1642, 0x1555, 0x147a, 0x13b1, 0x12f6, 0x1249, 0x11a7, 0x1111, 0x1084, 0x1000,
    0x0f83, 0x0f0f, 0x0ea0, 0x0e38, 0x0dd6, 0x0d79, 0x0d20, 0x0ccc, 0x0c7c, 0x0c30, 0x0be8, 0x0ba2, 0x0b60, 0x0b21, 0x0ae4, 0x0aaa,
    0x0a72, 0x0a3d, 0x0a0a, 0x09d8, 0x09a9, 0x097b, 0x094f, 0x0924, 0x08fb, 0x08d3, 0x08ad, 0x0888, 0x0864, 0x0842, 0x0820, 0x0800,
    0x07e0, 0x07c1, 0x07a4, 0x0787, 0x076b, 0x0750, 0x0736, 0x071c, 0x0703, 0x06eb, 0x06d3, 0x06bc, 0x06a6, 0x0690, 0x067b, 0x0666,
    0x0652, 0x063e, 0x062b, 0x0618, 0x0606, 0x05f4, 0x05e2, 0x05d1, 0x05c0, 0x05b0, 0x05a0, 0x0590, 0x0581, 0x0572, 0x0563, 0x0555,
    0x0547, 0x0539, 0x052b, 0x051e, 0x0511, 0x0505, 0x04f8, 0x04ec, 0x04e0, 0x04d4, 0x04c8, 0x04bd, 0x04b2, 0x04a7, 0x049c, 0x0492,
    0x0487, 0x047d, 0x0473, 0x0469, 0x0460, 0x0456, 0x044d, 0x0444, 0x043b, 0x0432, 0x0429, 0x0421, 0x0418, 0x0410, 0x0408, 0x0400,
    0x03f8, 0x03f0, 0x03e8, 0x03e0, 0x03d9, 0x03d2, 0x03ca, 0x03c3, 0x03bc, 0x03b5, 0x03ae, 0x03a8, 0x03a1, 0x039b, 0x0394, 0x038e,
    0x0387, 0x0381, 0x037b, 0x0375, 0x036f, 0x0369, 0x0364, 0x035e, 0x0358, 0x0353, 0x034d, 0x0348, 0x0342, 0x033d, 0x0338, 0x0333,
    0x032e, 0x0329, 0x0324, 0x031f, 0x031a, 0x0315, 0x0310, 0x030c, 0x0307, 0x0303, 0x02fe, 0x02fa, 0x02f5, 0x02f1, 0x02ec, 0x02e8,
    0x02e4, 0x02e0, 0x02dc, 0x02d8, 0x02d4, 0x02d0, 0x02cc, 0x02c8, 0x02c4, 0x02c0, 0x02bc, 0x02b9, 0x02b5, 0x02b1, 0x02ae, 0x02aa,
    0x02a7, 0x02a3, 0x02a0, 0x029c, 0x0299, 0x0295, 0x0292, 0x028f, 0x028c, 0x0288, 0x0285, 0x0282, 0x027f, 0x027c, 0x0279, 0x0276,
    0x0273, 0x0270, 0x026d, 0x026a, 0x0267, 0x0264, 0x0261, 0x025e, 0x025c, 0x0259, 0x0256, 0x0253, 0x0251, 0x024e, 0x024b, 0x0249,
    0x0246, 0x0243, 0x0241, 0x023e, 0x023c, 0x0239, 0x0237, 0x0234, 0x0232, 0x0230, 0x022d, 0x022b, 0x0229, 0x0226, 0x0224, 0x0222,
    0x021f, 0x021d, 0x021b, 0x0219, 0x0216, 0x0214, 0x0212, 0x0210, 0x020e, 0x020c, 0x020a, 0x0208, 0x0206, 0x0204, 0x0202, 0x0200,
    0x01fe, 0x01fc, 0x01fa, 0x01f8, 0x01f6, 0x01f4, 0x01f2, 0x01f0, 0x01ee, 0x01ec, 0x01ea, 0x01e9, 0x01e7, 0x01e5, 0x01e3, 0x01e1,
    0x01e0, 0x01de, 0x01dc, 0x01da, 0x01d9, 0x01d7, 0x01d5, 0x01d4, 0x01d2, 0x01d0, 0x01cf, 0x01cd, 0x01cb, 0x01ca, 0x01c8, 0x01c7,
    0x01c5, 0x01c3, 0x01c2, 0x01c0, 0x01bf, 0x01bd, 0x01bc, 0x01ba, 0x01b9, 0x01b7, 0x01b6, 0x01b4, 0x01b3, 0x01b2, 0x01b0, 0x01af,
    0x01ad, 0x01ac, 0x01aa, 0x01a9, 0x01a8, 0x01a6, 0x01a5, 0x01a4, 0x01a2, 0x01a1, 0x01a0, 0x019e, 0x019d, 0x019c, 0x019a, 0x0199,
    0x0198, 0x0197, 0x0195, 0x0194, 0x0193, 0x0192, 0x0190, 0x018f, 0x018e, 0x018d, 0x018b, 0x018a, 0x0189, 0x0188, 0x0187, 0x0186,
    0x0184, 0x0183, 0x0182, 0x0181, 0x0180, 0x017f, 0x017e, 0x017d, 0x017b, 0x017a, 0x0179, 0x0178, 0x0177, 0x0176, 0x0175, 0x0174,
    0x0173, 0x0172, 0x0171, 0x0170, 0x016f, 0x016e, 0x016d, 0x016c, 0x016b, 0x016a, 0x0169, 0x0168, 0x0167, 0x0166, 0x0165, 0x0164,
    0x0163, 0x0162, 0x0161, 0x0160, 0x015f, 0x015e, 0x015d, 0x015c, 0x015b, 0x015a, 0x0159, 0x0158, 0x0158, 0x0157, 0x0156, 0x0155,
    0x0154, 0x0153, 0x0152, 0x0151, 0x0150, 0x0150, 0x014f, 0x014e, 0x014d, 0x014c, 0x014b, 0x014a, 0x014a, 0x0149, 0x0148, 0x0147,
    0x0146, 0x0146, 0x0145, 0x0144, 0x0143, 0x0142, 0x0142, 0x0141, 0x0140, 0x013f, 0x013e, 0x013e, 0x013d, 0x013c, 0x013b, 0x013b,
    0x013a, 0x0139, 0x0138, 0x0138, 0x0137, 0x0136, 0x0135, 0x0135, 0x0134, 0x0133, 0x0132, 0x0132, 0x0131, 0x0130, 0x0130, 0x012f,
    0x012e, 0x012e, 0x012d, 0x012c, 0x012b, 0x012b, 0x012a, 0x0129, 0x0129, 0x0128, 0x0127, 0x0127, 0x0126, 0x0125, 0x0125, 0x0124,
    0x0123, 0x0123, 0x0122, 0x0121, 0x0121, 0x0120, 0x0120, 0x011f, 0x011e, 0x011e, 0x011d, 0x011c, 0x011c, 0x011b, 0x011b, 0x011a,
    0x0119, 0x0119, 0x0118, 0x0118, 0x0117, 0x0116, 0x0116, 0x0115, 0x0115, 0x0114, 0x0113, 0x0113, 0x0112, 0x0112, 0x0111, 0x0111,
    0x0110, 0x010f, 0x010f, 0x010e, 0x010e, 0x010d, 0x010d, 0x010c, 0x010c, 0x010b, 0x010a, 0x010a, 0x0109, 0x0109, 0x0108, 0x0108,
    0x0107, 0x0107, 0x0106, 0x0106, 0x0105, 0x0105, 0x0104, 0x0104, 0x0103, 0x0103, 0x0102, 0x0102, 0x0101, 0x0101, 0x0100, 0x0100,
    0x00ff, 0x00ff, 0x00fe, 0x00fe, 0x00fd, 0x00fd, 0x00fc, 0x00fc, 0x00fb, 0x00fb, 0x00fa, 0x00fa, 0x00f9, 0x00f9, 0x00f8, 0x00f8,
    0x00f7, 0x00f7, 0x00f6, 0x00f6, 0x00f5, 0x00f5, 0x00f4, 0x00f4, 0x00f4, 0x00f3, 0x00f3, 0x00f2, 0x00f2, 0x00f1, 0x00f1, 0x00f0,
    0x00f0, 0x00f0, 0x00ef, 0x00ef, 0x00ee, 0x00ee, 0x00ed, 0x00ed, 0x00ed, 0x00ec, 0x00ec, 0x00eb, 0x00eb, 0x00ea, 0x00ea, 0x00ea,
    0x00e9, 0x00e9, 0x00e8, 0x00e8, 0x00e7, 0x00e7, 0x00e7, 0x00e6, 0x00e6, 0x00e5, 0x00e5, 0x00e5, 0x00e4, 0x00e4, 0x00e3, 0x00e3,
    0x00e3, 0x00e2, 0x00e2, 0x00e1, 0x00e1, 0x00e1, 0x00e0, 0x00e0, 0x00e0, 0x00df, 0x00df, 0x00de, 0x00de, 0x00de, 0x00dd, 0x00dd,
    0x00dd, 0x00dc, 0x00dc, 0x00db, 0x00db, 0x00db, 0x00da, 0x00da, 0x00da, 0x00d9, 0x00d9, 0x00d9, 0x00d8, 0x00d8, 0x00d7, 0x00d7,
    0x00d7, 0x00d6, 0x00d6, 0x00d6, 0x00d5, 0x00d5, 0x00d5, 0x00d4, 0x00d4, 0x00d4, 0x00d3, 0x00d3, 0x00d3, 0x00d2, 0x00d2, 0x00d2,
    0x00d1, 0x00d1, 0x00d1, 0x00d0, 0x00d0, 0x00d0, 0x00cf, 0x00cf, 0x00cf, 0x00ce, 0x00ce, 0x00ce, 0x00cd, 0x00cd, 0x00cd, 0x00cc,
    0x00cc, 0x00cc, 0x00cb, 0x00cb, 0x00cb, 0x00ca, 0x00ca, 0x00ca, 0x00c9, 0x00c9, 0x00c9, 0x00c9, 0x00c8, 0x00c8, 0x00c8, 0x00c7,
    0x00c7, 0x00c7, 0x00c6, 0x00c6, 0x00c6, 0x00c5, 0x00c5, 0x00c5, 0x00c5, 0x00c4, 0x00c4, 0x00c4, 0x00c3, 0x00c3, 0x00c3, 0x00c3,
    0x00c2, 0x00c2, 0x00c2, 0x00c1, 0x00c1, 0x00c1, 0x00c1, 0x00c0, 0x00c0, 0x00c0, 0x00bf, 0x00bf, 0x00bf, 0x00bf, 0x00be, 0x00be,
    0x00be, 0x00bd, 0x00bd, 0x00bd, 0x00bd, 0x00bc, 0x00bc, 0x00bc, 0x00bc, 0x00bb, 0x00bb, 0x00bb, 0x00ba, 0x00ba, 0x00ba, 0x00ba,
    0x00b9, 0x00b9, 0x00b9, 0x00b9, 0x00b8, 0x00b8, 0x00b8, 0x00b8, 0x00b7, 0x00b7, 0x00b7, 0x00b7, 0x00b6, 0x00b6, 0x00b6, 0x00b6,
    0x00b5, 0x00b5, 0x00b5, 0x00b5, 0x00b4, 0x00b4, 0x00b4, 0x00b4, 0x00b3, 0x00b3, 0x00b3, 0x00b3, 0x00b2, 0x00b2, 0x00b2, 0x00b2,
    0x00b1, 0x00b1, 0x00b1, 0x00b1, 0x00b0, 0x00b0, 0x00b0, 0x00b0, 0x00af, 0x00af, 0x00af, 0x00af, 0x00ae, 0x00ae, 0x00ae, 0x00ae,
    0x00ae, 0x00ad, 0x00ad, 0x00ad, 0x00ad, 0x00ac, 0x00ac, 0x00ac, 0x00ac, 0x00ac, 0x00ab, 0x00ab, 0x00ab, 0x00ab,
};

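// Single-channel variant for the alpha kernels:
// DivTableAlpha[i] ~= min( 0xFFFF, ( 8 << 16 ) / ( i + 1 ) ), yielding 3-bit
// selectors 0..7.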
static const uint16_t DivTableAlpha[256] = {
    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xe38e, 0xcccc, 0xba2e, 0xaaaa, 0x9d89, 0x9249, 0x8888, 0x8000,
    0x7878, 0x71c7, 0x6bca, 0x6666, 0x6186, 0x5d17, 0x590b, 0x5555, 0x51eb, 0x4ec4, 0x4bda, 0x4924, 0x469e, 0x4444, 0x4210, 0x4000,
    0x3e0f, 0x3c3c, 0x3a83, 0x38e3, 0x3759, 0x35e5, 0x3483, 0x3333, 0x31f3, 0x30c3, 0x2fa0, 0x2e8b, 0x2d82, 0x2c85, 0x2b93, 0x2aaa,
    0x29cb, 0x28f5, 0x2828, 0x2762, 0x26a4, 0x25ed, 0x253c, 0x2492, 0x23ee, 0x234f, 0x22b6, 0x2222, 0x2192, 0x2108, 0x2082, 0x2000,
    0x1f81, 0x1f07, 0x1e91, 0x1e1e, 0x1dae, 0x1d41, 0x1cd8, 0x1c71, 0x1c0e, 0x1bac, 0x1b4e, 0x1af2, 0x1a98, 0x1a41, 0x19ec, 0x1999,
    0x1948, 0x18f9, 0x18ac, 0x1861, 0x1818, 0x17d0, 0x178a, 0x1745, 0x1702, 0x16c1, 0x1681, 0x1642, 0x1605, 0x15c9, 0x158e, 0x1555,
    0x151d, 0x14e5, 0x14af, 0x147a, 0x1446, 0x1414, 0x13e2, 0x13b1, 0x1381, 0x1352, 0x1323, 0x12f6, 0x12c9, 0x129e, 0x1273, 0x1249,
    0x121f, 0x11f7, 0x11cf, 0x11a7, 0x1181, 0x115b, 0x1135, 0x1111, 0x10ec, 0x10c9, 0x10a6, 0x1084, 0x1062, 0x1041, 0x1020, 0x1000,
    0x0fe0, 0x0fc0, 0x0fa2, 0x0f83, 0x0f66, 0x0f48, 0x0f2b, 0x0f0f, 0x0ef2, 0x0ed7, 0x0ebb, 0x0ea0, 0x0e86, 0x0e6c, 0x0e52, 0x0e38,
    0x0e1f, 0x0e07, 0x0dee, 0x0dd6, 0x0dbe, 0x0da7, 0x0d90, 0x0d79, 0x0d62, 0x0d4c, 0x0d36, 0x0d20, 0x0d0b, 0x0cf6, 0x0ce1, 0x0ccc,
    0x0cb8, 0x0ca4, 0x0c90, 0x0c7c, 0x0c69, 0x0c56, 0x0c43, 0x0c30, 0x0c1e, 0x0c0c, 0x0bfa, 0x0be8, 0x0bd6, 0x0bc5, 0x0bb3, 0x0ba2,
    0x0b92, 0x0b81, 0x0b70, 0x0b60, 0x0b50, 0x0b40, 0x0b30, 0x0b21, 0x0b11, 0x0b02, 0x0af3, 0x0ae4, 0x0ad6, 0x0ac7, 0x0ab8, 0x0aaa,
    0x0a9c, 0x0a8e, 0x0a80, 0x0a72, 0x0a65, 0x0a57, 0x0a4a, 0x0a3d, 0x0a30, 0x0a23, 0x0a16, 0x0a0a, 0x09fd, 0x09f1, 0x09e4, 0x09d8,
    0x09cc, 0x09c0, 0x09b4, 0x09a9, 0x099d, 0x0991, 0x0986, 0x097b, 0x0970, 0x0964, 0x095a, 0x094f, 0x0944, 0x0939, 0x092f, 0x0924,
    0x091a, 0x090f, 0x0905, 0x08fb, 0x08f1, 0x08e7, 0x08dd, 0x08d3, 0x08ca, 0x08c0, 0x08b7, 0x08ad, 0x08a4, 0x089a, 0x0891, 0x0888,
    0x087f, 0x0876, 0x086d, 0x0864, 0x085b, 0x0853, 0x084a, 0x0842, 0x0839, 0x0831, 0x0828, 0x0820, 0x0818, 0x0810, 0x0808, 0x0800,
};

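// Encode one 4x4 RGBA block (64 bytes) as a BC1 color block: two 565 endpoints in
// the low 32 bits (max first, min above it), sixteen 2-bit selectors in the high 32.
// Every path does a range fit: find the per-channel min/max box, inset it by 1/16th
// of its extent, then project each pixel's summed distance from the minimum onto the
// box diagonal to pick a selector. Selectors are left in the internal ordering;
// callers remap them through DxtcIndexTable.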
static etcpak_force_inline uint64_t ProcessRGB( const uint8_t* src )
{
#ifdef __SSE4_1__
    __m128i px0 = _mm_loadu_si128(((__m128i*)src) + 0);
    __m128i px1 = _mm_loadu_si128(((__m128i*)src) + 1);
    __m128i px2 = _mm_loadu_si128(((__m128i*)src) + 2);
    __m128i px3 = _mm_loadu_si128(((__m128i*)src) + 3);

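    // Solid-color early out: mask each pixel to the bits that survive 565
    // quantization (0xF8FCF8) and check that all 16 match the first pixel.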
    __m128i smask = _mm_set1_epi32( 0xF8FCF8 );
    __m128i sd0 = _mm_and_si128( px0, smask );
    __m128i sd1 = _mm_and_si128( px1, smask );
    __m128i sd2 = _mm_and_si128( px2, smask );
    __m128i sd3 = _mm_and_si128( px3, smask );

    __m128i sc = _mm_shuffle_epi32(sd0, _MM_SHUFFLE(0, 0, 0, 0));

    __m128i sc0 = _mm_cmpeq_epi8(sd0, sc);
    __m128i sc1 = _mm_cmpeq_epi8(sd1, sc);
    __m128i sc2 = _mm_cmpeq_epi8(sd2, sc);
    __m128i sc3 = _mm_cmpeq_epi8(sd3, sc);

    __m128i sm0 = _mm_and_si128(sc0, sc1);
    __m128i sm1 = _mm_and_si128(sc2, sc3);
    __m128i sm = _mm_and_si128(sm0, sm1);

    if( _mm_testc_si128(sm, _mm_set1_epi32(-1)) )
    {
        uint32_t c;
        memcpy( &c, src, 4 );
        return uint64_t( to565( c ) ) << 16;
    }

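    // Componentwise min/max over all 16 pixels, then a horizontal reduction so each
    // 32-bit lane of rmin/rmax holds the block-wide per-channel extremes.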
    __m128i min0 = _mm_min_epu8( px0, px1 );
    __m128i min1 = _mm_min_epu8( px2, px3 );
    __m128i min2 = _mm_min_epu8( min0, min1 );

    __m128i max0 = _mm_max_epu8( px0, px1 );
    __m128i max1 = _mm_max_epu8( px2, px3 );
    __m128i max2 = _mm_max_epu8( max0, max1 );

    __m128i min3 = _mm_shuffle_epi32( min2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
    __m128i max3 = _mm_shuffle_epi32( max2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
    __m128i min4 = _mm_min_epu8( min2, min3 );
    __m128i max4 = _mm_max_epu8( max2, max3 );

    __m128i min5 = _mm_shuffle_epi32( min4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m128i max5 = _mm_shuffle_epi32( max4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m128i rmin = _mm_min_epu8( min4, min5 );
    __m128i rmax = _mm_max_epu8( max4, max5 );

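    // The SAD over the replicated extremes yields twice the summed per-channel
    // delta (hence the >> 1); that range picks the reciprocal and sizes the inset.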
    __m128i range1 = _mm_subs_epu8( rmax, rmin );
    __m128i range2 = _mm_sad_epu8( rmax, rmin );

    uint32_t vrange = _mm_cvtsi128_si32( range2 ) >> 1;
    __m128i range = _mm_set1_epi16( DivTable[vrange] );

    __m128i inset1 = _mm_srli_epi16( range1, 4 );
    __m128i inset = _mm_and_si128( inset1, _mm_set1_epi8( 0xF ) );
    __m128i min = _mm_adds_epu8( rmin, inset );
    __m128i max = _mm_subs_epu8( rmax, inset );

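    // Sum each pixel's distance from the box minimum across channels, scale it into
    // a 2-bit selector with the reciprocal multiply, then pack four selectors per
    // row into one byte and gather the four bytes into the low dword.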
    __m128i c0 = _mm_subs_epu8( px0, rmin );
    __m128i c1 = _mm_subs_epu8( px1, rmin );
    __m128i c2 = _mm_subs_epu8( px2, rmin );
    __m128i c3 = _mm_subs_epu8( px3, rmin );

    __m128i is0 = _mm_maddubs_epi16( c0, _mm_set1_epi8( 1 ) );
    __m128i is1 = _mm_maddubs_epi16( c1, _mm_set1_epi8( 1 ) );
    __m128i is2 = _mm_maddubs_epi16( c2, _mm_set1_epi8( 1 ) );
    __m128i is3 = _mm_maddubs_epi16( c3, _mm_set1_epi8( 1 ) );

    __m128i s0 = _mm_hadd_epi16( is0, is1 );
    __m128i s1 = _mm_hadd_epi16( is2, is3 );

    __m128i m0 = _mm_mulhi_epu16( s0, range );
    __m128i m1 = _mm_mulhi_epu16( s1, range );

    __m128i p0 = _mm_packus_epi16( m0, m1 );

    __m128i p1 = _mm_or_si128( _mm_srai_epi32( p0, 6 ), _mm_srai_epi32( p0, 12 ) );
    __m128i p2 = _mm_or_si128( _mm_srai_epi32( p0, 18 ), p0 );
    __m128i p3 = _mm_or_si128( p1, p2 );
    __m128i p = _mm_shuffle_epi8( p3, _mm_set1_epi32( 0x0C080400 ) );

    uint32_t vmin = _mm_cvtsi128_si32( min );
    uint32_t vmax = _mm_cvtsi128_si32( max );
    uint32_t vp = _mm_cvtsi128_si32( p );

    return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( vp ) << 32 ) );
#elif defined __ARM_NEON
#  ifdef __aarch64__
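    // AArch64 path: vld4q_u8 deinterleaves the block into per-channel registers and
    // the across-vector vmaxvq/vminvq reductions give the extremes directly. No
    // solid-color early out is attempted on this path.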
    uint8x16x4_t px = vld4q_u8( src );

    uint8x16_t lr = px.val[0];
    uint8x16_t lg = px.val[1];
    uint8x16_t lb = px.val[2];

    uint8_t rmaxr = vmaxvq_u8( lr );
    uint8_t rmaxg = vmaxvq_u8( lg );
    uint8_t rmaxb = vmaxvq_u8( lb );

    uint8_t rminr = vminvq_u8( lr );
    uint8_t rming = vminvq_u8( lg );
    uint8_t rminb = vminvq_u8( lb );

    int rr = rmaxr - rminr;
    int rg = rmaxg - rming;
    int rb = rmaxb - rminb;

    int vrange1 = rr + rg + rb;
    uint16_t vrange2 = DivTableNEON[vrange1];

    uint8_t insetr = rr >> 4;
    uint8_t insetg = rg >> 4;
    uint8_t insetb = rb >> 4;

    uint8_t minr = rminr + insetr;
    uint8_t ming = rming + insetg;
    uint8_t minb = rminb + insetb;

    uint8_t maxr = rmaxr - insetr;
    uint8_t maxg = rmaxg - insetg;
    uint8_t maxb = rmaxb - insetb;

    uint8x16_t cr = vsubq_u8( lr, vdupq_n_u8( rminr ) );
    uint8x16_t cg = vsubq_u8( lg, vdupq_n_u8( rming ) );
    uint8x16_t cb = vsubq_u8( lb, vdupq_n_u8( rminb ) );

    uint16x8_t is0l = vaddl_u8( vget_low_u8( cr ), vget_low_u8( cg ) );
    uint16x8_t is0h = vaddl_u8( vget_high_u8( cr ), vget_high_u8( cg ) );
    uint16x8_t is1l = vaddw_u8( is0l, vget_low_u8( cb ) );
    uint16x8_t is1h = vaddw_u8( is0h, vget_high_u8( cb ) );

    int16x8_t range = vdupq_n_s16( vrange2 );
    uint16x8_t m0 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( is1l ), range ) );
    uint16x8_t m1 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( is1h ), range ) );

    uint8x8_t p00 = vmovn_u16( m0 );
    uint8x8_t p01 = vmovn_u16( m1 );
    uint8x16_t p0 = vcombine_u8( p00, p01 );

    uint32x4_t p1 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 6 ), vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 12 ) );
    uint32x4_t p2 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 18 ), vreinterpretq_u32_u8( p0 ) );
    uint32x4_t p3 = vaddq_u32( p1, p2 );

    uint16x4x2_t p4 = vuzp_u16( vget_low_u16( vreinterpretq_u16_u32( p3 ) ), vget_high_u16( vreinterpretq_u16_u32( p3 ) ) );
    uint8x8x2_t p = vuzp_u8( vreinterpret_u8_u16( p4.val[0] ), vreinterpret_u8_u16( p4.val[0] ) );

    uint32_t vp;
    vst1_lane_u32( &vp, vreinterpret_u32_u8( p.val[0] ), 0 );

    return uint64_t( ( uint64_t( to565( minr, ming, minb ) ) << 16 ) | to565( maxr, maxg, maxb ) | ( uint64_t( vp ) << 32 ) );
#  else
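    // 32-bit NEON path: no across-vector vminv/vmaxv here, so the extremes are
    // reduced with shuffles as in the SSE path, and the reciprocal is computed
    // directly instead of through DivTableNEON.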
    uint32x4_t px0 = vld1q_u32( (uint32_t*)src );
    uint32x4_t px1 = vld1q_u32( (uint32_t*)src + 4 );
    uint32x4_t px2 = vld1q_u32( (uint32_t*)src + 8 );
    uint32x4_t px3 = vld1q_u32( (uint32_t*)src + 12 );

    uint32x4_t smask = vdupq_n_u32( 0xF8FCF8 );
    uint32x4_t sd0 = vandq_u32( smask, px0 );
    uint32x4_t sd1 = vandq_u32( smask, px1 );
    uint32x4_t sd2 = vandq_u32( smask, px2 );
    uint32x4_t sd3 = vandq_u32( smask, px3 );

    uint32x4_t sc = vdupq_n_u32( sd0[0] );

    uint32x4_t sc0 = vceqq_u32( sd0, sc );
    uint32x4_t sc1 = vceqq_u32( sd1, sc );
    uint32x4_t sc2 = vceqq_u32( sd2, sc );
    uint32x4_t sc3 = vceqq_u32( sd3, sc );

    uint32x4_t sm0 = vandq_u32( sc0, sc1 );
    uint32x4_t sm1 = vandq_u32( sc2, sc3 );
    int64x2_t sm = vreinterpretq_s64_u32( vandq_u32( sm0, sm1 ) );

    if( sm[0] == -1 && sm[1] == -1 )
    {
        return uint64_t( to565( src[0], src[1], src[2] ) ) << 16;
    }

    uint32x4_t mask = vdupq_n_u32( 0xFFFFFF );
    uint8x16_t l0 = vreinterpretq_u8_u32( vandq_u32( mask, px0 ) );
    uint8x16_t l1 = vreinterpretq_u8_u32( vandq_u32( mask, px1 ) );
    uint8x16_t l2 = vreinterpretq_u8_u32( vandq_u32( mask, px2 ) );
    uint8x16_t l3 = vreinterpretq_u8_u32( vandq_u32( mask, px3 ) );

    uint8x16_t min0 = vminq_u8( l0, l1 );
    uint8x16_t min1 = vminq_u8( l2, l3 );
    uint8x16_t min2 = vminq_u8( min0, min1 );

    uint8x16_t max0 = vmaxq_u8( l0, l1 );
    uint8x16_t max1 = vmaxq_u8( l2, l3 );
    uint8x16_t max2 = vmaxq_u8( max0, max1 );

    uint8x16_t min3 = vreinterpretq_u8_u32( vrev64q_u32( vreinterpretq_u32_u8( min2 ) ) );
    uint8x16_t max3 = vreinterpretq_u8_u32( vrev64q_u32( vreinterpretq_u32_u8( max2 ) ) );

    uint8x16_t min4 = vminq_u8( min2, min3 );
    uint8x16_t max4 = vmaxq_u8( max2, max3 );

    uint8x16_t min5 = vcombine_u8( vget_high_u8( min4 ), vget_low_u8( min4 ) );
    uint8x16_t max5 = vcombine_u8( vget_high_u8( max4 ), vget_low_u8( max4 ) );

    uint8x16_t rmin = vminq_u8( min4, min5 );
    uint8x16_t rmax = vmaxq_u8( max4, max5 );

    uint8x16_t range1 = vsubq_u8( rmax, rmin );
    uint8x8_t range2 = vget_low_u8( range1 );
    uint8x8x2_t range3 = vzip_u8( range2, vdup_n_u8( 0 ) );
    uint16x4_t range4 = vreinterpret_u16_u8( range3.val[0] );

    uint16_t vrange1;
    uint16x4_t range5 = vpadd_u16( range4, range4 );
    uint16x4_t range6 = vpadd_u16( range5, range5 );
    vst1_lane_u16( &vrange1, range6, 0 );

    uint32_t vrange2 = ( 2 << 16 ) / uint32_t( vrange1 + 1 );
    uint16x8_t range = vdupq_n_u16( vrange2 );

    uint8x16_t inset = vshrq_n_u8( range1, 4 );
    uint8x16_t min = vaddq_u8( rmin, inset );
    uint8x16_t max = vsubq_u8( rmax, inset );

    uint8x16_t c0 = vsubq_u8( l0, rmin );
    uint8x16_t c1 = vsubq_u8( l1, rmin );
    uint8x16_t c2 = vsubq_u8( l2, rmin );
    uint8x16_t c3 = vsubq_u8( l3, rmin );

    uint16x8_t is0 = vpaddlq_u8( c0 );
    uint16x8_t is1 = vpaddlq_u8( c1 );
    uint16x8_t is2 = vpaddlq_u8( c2 );
    uint16x8_t is3 = vpaddlq_u8( c3 );

    uint16x4_t is4 = vpadd_u16( vget_low_u16( is0 ), vget_high_u16( is0 ) );
    uint16x4_t is5 = vpadd_u16( vget_low_u16( is1 ), vget_high_u16( is1 ) );
    uint16x4_t is6 = vpadd_u16( vget_low_u16( is2 ), vget_high_u16( is2 ) );
    uint16x4_t is7 = vpadd_u16( vget_low_u16( is3 ), vget_high_u16( is3 ) );

    uint16x8_t s0 = vcombine_u16( is4, is5 );
    uint16x8_t s1 = vcombine_u16( is6, is7 );

    uint16x8_t m0 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( s0 ), vreinterpretq_s16_u16( range ) ) );
    uint16x8_t m1 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( s1 ), vreinterpretq_s16_u16( range ) ) );

    uint8x8_t p00 = vmovn_u16( m0 );
    uint8x8_t p01 = vmovn_u16( m1 );
    uint8x16_t p0 = vcombine_u8( p00, p01 );

    uint32x4_t p1 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 6 ), vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 12 ) );
    uint32x4_t p2 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 18 ), vreinterpretq_u32_u8( p0 ) );
    uint32x4_t p3 = vaddq_u32( p1, p2 );

    uint16x4x2_t p4 = vuzp_u16( vget_low_u16( vreinterpretq_u16_u32( p3 ) ), vget_high_u16( vreinterpretq_u16_u32( p3 ) ) );
    uint8x8x2_t p = vuzp_u8( vreinterpret_u8_u16( p4.val[0] ), vreinterpret_u8_u16( p4.val[0] ) );

    uint32_t vmin, vmax, vp;
    vst1q_lane_u32( &vmin, vreinterpretq_u32_u8( min ), 0 );
    vst1q_lane_u32( &vmax, vreinterpretq_u32_u8( max ), 0 );
    vst1_lane_u32( &vp, vreinterpret_u32_u8( p.val[0] ), 0 );

    return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( vp ) << 32 ) );
#  endif
#else
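    // Portable scalar fallback: same solid-color check, min/max box, inset and
    // selector quantization, one pixel at a time.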
    uint32_t ref;
    memcpy( &ref, src, 4 );
    uint32_t refMask = ref & 0xF8FCF8;
    auto stmp = src + 4;
    for( int i=1; i<16; i++ )
    {
        uint32_t px;
        memcpy( &px, stmp, 4 );
        if( ( px & 0xF8FCF8 ) != refMask ) break;
        stmp += 4;
    }
    if( stmp == src + 64 )
    {
        return uint64_t( to565( ref ) ) << 16;
    }

    uint8_t min[3] = { src[0], src[1], src[2] };
    uint8_t max[3] = { src[0], src[1], src[2] };
    auto tmp = src + 4;
    for( int i=1; i<16; i++ )
    {
        for( int j=0; j<3; j++ )
        {
            if( tmp[j] < min[j] ) min[j] = tmp[j];
            else if( tmp[j] > max[j] ) max[j] = tmp[j];
        }
        tmp += 4;
    }

    const uint32_t range = DivTable[max[0] - min[0] + max[1] - min[1] + max[2] - min[2]];
    const uint32_t rmin = min[0] + min[1] + min[2];
    for( int i=0; i<3; i++ )
    {
        const uint8_t inset = ( max[i] - min[i] ) >> 4;
        min[i] += inset;
        max[i] -= inset;
    }

    uint32_t data = 0;
    for( int i=0; i<16; i++ )
    {
        const uint32_t c = src[0] + src[1] + src[2] - rmin;
        const uint8_t idx = ( c * range ) >> 16;
        data |= idx << (i*2);
        src += 4;
    }

    return uint64_t( ( uint64_t( to565( min[0], min[1], min[2] ) ) << 16 ) | to565( max[0], max[1], max[2] ) | ( uint64_t( data ) << 32 ) );
#endif
}

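// AVX2 variant: encodes two horizontally adjacent 4x4 blocks at once, one per
// 128-bit lane, and writes both 8-byte BC1 blocks to dst. Per-lane solid-color
// blocks are handled by masking, and unlike ProcessRGB this path remaps the
// selectors through DxtcIndexTable itself before storing.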
#ifdef __AVX2__
static etcpak_force_inline void ProcessRGB_AVX( const uint8_t* src, char*& dst )
{
    __m256i px0 = _mm256_loadu_si256(((__m256i*)src) + 0);
    __m256i px1 = _mm256_loadu_si256(((__m256i*)src) + 1);
    __m256i px2 = _mm256_loadu_si256(((__m256i*)src) + 2);
    __m256i px3 = _mm256_loadu_si256(((__m256i*)src) + 3);

    __m256i smask = _mm256_set1_epi32( 0xF8FCF8 );
    __m256i sd0 = _mm256_and_si256( px0, smask );
    __m256i sd1 = _mm256_and_si256( px1, smask );
    __m256i sd2 = _mm256_and_si256( px2, smask );
    __m256i sd3 = _mm256_and_si256( px3, smask );

    __m256i sc = _mm256_shuffle_epi32(sd0, _MM_SHUFFLE(0, 0, 0, 0));

    __m256i sc0 = _mm256_cmpeq_epi8(sd0, sc);
    __m256i sc1 = _mm256_cmpeq_epi8(sd1, sc);
    __m256i sc2 = _mm256_cmpeq_epi8(sd2, sc);
    __m256i sc3 = _mm256_cmpeq_epi8(sd3, sc);

    __m256i sm0 = _mm256_and_si256(sc0, sc1);
    __m256i sm1 = _mm256_and_si256(sc2, sc3);
    __m256i sm = _mm256_and_si256(sm0, sm1);

    const int64_t solid0 = 1 - _mm_testc_si128( _mm256_castsi256_si128( sm ), _mm_set1_epi32( -1 ) );
    const int64_t solid1 = 1 - _mm_testc_si128( _mm256_extracti128_si256( sm, 1 ), _mm_set1_epi32( -1 ) );

    if( solid0 + solid1 == 0 )
    {
        const auto c0 = uint64_t( to565( src[0], src[1], src[2] ) );
        const auto c1 = uint64_t( to565( src[16], src[17], src[18] ) );
        memcpy( dst, &c0, 8 );
        memcpy( dst+8, &c1, 8 );
        dst += 16;
        return;
    }

    __m256i min0 = _mm256_min_epu8( px0, px1 );
    __m256i min1 = _mm256_min_epu8( px2, px3 );
    __m256i min2 = _mm256_min_epu8( min0, min1 );

    __m256i max0 = _mm256_max_epu8( px0, px1 );
    __m256i max1 = _mm256_max_epu8( px2, px3 );
    __m256i max2 = _mm256_max_epu8( max0, max1 );

    __m256i min3 = _mm256_shuffle_epi32( min2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
    __m256i max3 = _mm256_shuffle_epi32( max2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
    __m256i min4 = _mm256_min_epu8( min2, min3 );
    __m256i max4 = _mm256_max_epu8( max2, max3 );

    __m256i min5 = _mm256_shuffle_epi32( min4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m256i max5 = _mm256_shuffle_epi32( max4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m256i rmin = _mm256_min_epu8( min4, min5 );
    __m256i rmax = _mm256_max_epu8( max4, max5 );

    __m256i range1 = _mm256_subs_epu8( rmax, rmin );
    __m256i range2 = _mm256_sad_epu8( rmax, rmin );

    uint16_t vrange0 = DivTable[_mm256_cvtsi256_si32( range2 ) >> 1];
    uint16_t vrange1 = DivTable[_mm256_extract_epi16( range2, 8 ) >> 1];
    __m256i range00 = _mm256_set1_epi16( vrange0 );
    __m256i range = _mm256_inserti128_si256( range00, _mm_set1_epi16( vrange1 ), 1 );

    __m256i inset1 = _mm256_srli_epi16( range1, 4 );
    __m256i inset = _mm256_and_si256( inset1, _mm256_set1_epi8( 0xF ) );
    __m256i min = _mm256_adds_epu8( rmin, inset );
    __m256i max = _mm256_subs_epu8( rmax, inset );

    __m256i c0 = _mm256_subs_epu8( px0, rmin );
    __m256i c1 = _mm256_subs_epu8( px1, rmin );
    __m256i c2 = _mm256_subs_epu8( px2, rmin );
    __m256i c3 = _mm256_subs_epu8( px3, rmin );

    __m256i is0 = _mm256_maddubs_epi16( c0, _mm256_set1_epi8( 1 ) );
    __m256i is1 = _mm256_maddubs_epi16( c1, _mm256_set1_epi8( 1 ) );
    __m256i is2 = _mm256_maddubs_epi16( c2, _mm256_set1_epi8( 1 ) );
    __m256i is3 = _mm256_maddubs_epi16( c3, _mm256_set1_epi8( 1 ) );

    __m256i s0 = _mm256_hadd_epi16( is0, is1 );
    __m256i s1 = _mm256_hadd_epi16( is2, is3 );

    __m256i m0 = _mm256_mulhi_epu16( s0, range );
    __m256i m1 = _mm256_mulhi_epu16( s1, range );

    __m256i p0 = _mm256_packus_epi16( m0, m1 );

    __m256i p1 = _mm256_or_si256( _mm256_srai_epi32( p0, 6 ), _mm256_srai_epi32( p0, 12 ) );
    __m256i p2 = _mm256_or_si256( _mm256_srai_epi32( p0, 18 ), p0 );
    __m256i p3 = _mm256_or_si256( p1, p2 );
    __m256i p = _mm256_shuffle_epi8( p3, _mm256_set1_epi32( 0x0C080400 ) );

    __m256i mm0 = _mm256_unpacklo_epi8( _mm256_setzero_si256(), min );
    __m256i mm1 = _mm256_unpacklo_epi8( _mm256_setzero_si256(), max );
    __m256i mm2 = _mm256_unpacklo_epi64( mm1, mm0 );
    __m256i mmr = _mm256_slli_epi64( _mm256_srli_epi64( mm2, 11 ), 11 );
    __m256i mmg = _mm256_slli_epi64( _mm256_srli_epi64( mm2, 26 ), 5 );
    __m256i mmb = _mm256_srli_epi64( _mm256_slli_epi64( mm2, 16 ), 59 );
    __m256i mm3 = _mm256_or_si256( mmr, mmg );
    __m256i mm4 = _mm256_or_si256( mm3, mmb );
    __m256i mm5 = _mm256_shuffle_epi8( mm4, _mm256_set1_epi32( 0x09080100 ) );

    __m256i d0 = _mm256_unpacklo_epi32( mm5, p );
    __m256i d1 = _mm256_permute4x64_epi64( d0, _MM_SHUFFLE( 3, 2, 2, 0 ) );
    __m128i d2 = _mm256_castsi256_si128( d1 );

    __m128i mask = _mm_set_epi64x( 0xFFFF0000 | -solid1, 0xFFFF0000 | -solid0 );
    __m128i d3 = _mm_and_si128( d2, mask );
    _mm_storeu_si128( (__m128i*)dst, d3 );

    for( int j=4; j<8; j++ ) dst[j] = (char)DxtcIndexTable[(uint8_t)dst[j]];
    for( int j=12; j<16; j++ ) dst[j] = (char)DxtcIndexTable[(uint8_t)dst[j]];

    dst += 16;
}
#endif

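// Remaps an internal 3-bit alpha selector (0 = min .. 7 = max) to the BC3/BC4
// numbering used in the 8-alpha mode (0 = alpha0/max, 1 = alpha1/min).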
static const uint8_t AlphaIndexTable[8] = { 1, 7, 6, 5, 4, 3, 2, 0 };

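// Encode one 4x4 single-channel block (16 bytes) as a BC3 alpha / BC4 payload:
// max and min endpoints in the low 16 bits, sixteen 3-bit selectors above them.
// A solid block collapses to just the repeated value.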
static etcpak_force_inline uint64_t ProcessAlpha( const uint8_t* src )
{
    uint8_t solid8 = *src;
    uint16_t solid16 = uint16_t( solid8 ) | ( uint16_t( solid8 ) << 8 );
    uint32_t solid32 = uint32_t( solid16 ) | ( uint32_t( solid16 ) << 16 );
    uint64_t solid64 = uint64_t( solid32 ) | ( uint64_t( solid32 ) << 32 );
    if( memcmp( src, &solid64, 8 ) == 0 && memcmp( src+8, &solid64, 8 ) == 0 )
    {
        return solid8;
    }

    uint8_t min = src[0];
    uint8_t max = min;
    for( int i=1; i<16; i++ )
    {
        const auto v = src[i];
        if( v > max ) max = v;
        else if( v < min ) min = v;
    }

    uint32_t range = ( 8 << 13 ) / ( 1 + max - min );
    uint64_t data = 0;
    for( int i=0; i<16; i++ )
    {
        uint8_t a = src[i] - min;
        uint64_t idx = AlphaIndexTable[( a * range ) >> 13];
        data |= idx << (i*3);
    }

    return max | ( min << 8 ) | ( data << 16 );
}

#ifdef __SSE4_1__
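// Same BC1 range fit as ProcessRGB, but taking the four block rows as registers so
// CompressBc3 can reuse the loads already done for the alpha channel; alpha bytes
// are masked off before the fit.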
static etcpak_force_inline uint64_t ProcessRGB_SSE( __m128i px0, __m128i px1, __m128i px2, __m128i px3 )
{
    __m128i smask = _mm_set1_epi32( 0xF8FCF8 );
    __m128i sd0 = _mm_and_si128( px0, smask );
    __m128i sd1 = _mm_and_si128( px1, smask );
    __m128i sd2 = _mm_and_si128( px2, smask );
    __m128i sd3 = _mm_and_si128( px3, smask );

    __m128i sc = _mm_shuffle_epi32(sd0, _MM_SHUFFLE(0, 0, 0, 0));

    __m128i sc0 = _mm_cmpeq_epi8(sd0, sc);
    __m128i sc1 = _mm_cmpeq_epi8(sd1, sc);
    __m128i sc2 = _mm_cmpeq_epi8(sd2, sc);
    __m128i sc3 = _mm_cmpeq_epi8(sd3, sc);

    __m128i sm0 = _mm_and_si128(sc0, sc1);
    __m128i sm1 = _mm_and_si128(sc2, sc3);
    __m128i sm = _mm_and_si128(sm0, sm1);

    if( _mm_testc_si128(sm, _mm_set1_epi32(-1)) )
    {
        return uint64_t( to565( _mm_cvtsi128_si32( px0 ) ) ) << 16;
    }

    px0 = _mm_and_si128( px0, _mm_set1_epi32( 0xFFFFFF ) );
    px1 = _mm_and_si128( px1, _mm_set1_epi32( 0xFFFFFF ) );
    px2 = _mm_and_si128( px2, _mm_set1_epi32( 0xFFFFFF ) );
    px3 = _mm_and_si128( px3, _mm_set1_epi32( 0xFFFFFF ) );

    __m128i min0 = _mm_min_epu8( px0, px1 );
    __m128i min1 = _mm_min_epu8( px2, px3 );
    __m128i min2 = _mm_min_epu8( min0, min1 );

    __m128i max0 = _mm_max_epu8( px0, px1 );
    __m128i max1 = _mm_max_epu8( px2, px3 );
    __m128i max2 = _mm_max_epu8( max0, max1 );

    __m128i min3 = _mm_shuffle_epi32( min2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
    __m128i max3 = _mm_shuffle_epi32( max2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
    __m128i min4 = _mm_min_epu8( min2, min3 );
    __m128i max4 = _mm_max_epu8( max2, max3 );

    __m128i min5 = _mm_shuffle_epi32( min4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m128i max5 = _mm_shuffle_epi32( max4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m128i rmin = _mm_min_epu8( min4, min5 );
    __m128i rmax = _mm_max_epu8( max4, max5 );

    __m128i range1 = _mm_subs_epu8( rmax, rmin );
    __m128i range2 = _mm_sad_epu8( rmax, rmin );

    uint32_t vrange = _mm_cvtsi128_si32( range2 ) >> 1;
    __m128i range = _mm_set1_epi16( DivTable[vrange] );

    __m128i inset1 = _mm_srli_epi16( range1, 4 );
    __m128i inset = _mm_and_si128( inset1, _mm_set1_epi8( 0xF ) );
    __m128i min = _mm_adds_epu8( rmin, inset );
    __m128i max = _mm_subs_epu8( rmax, inset );

    __m128i c0 = _mm_subs_epu8( px0, rmin );
    __m128i c1 = _mm_subs_epu8( px1, rmin );
    __m128i c2 = _mm_subs_epu8( px2, rmin );
    __m128i c3 = _mm_subs_epu8( px3, rmin );

    __m128i is0 = _mm_maddubs_epi16( c0, _mm_set1_epi8( 1 ) );
    __m128i is1 = _mm_maddubs_epi16( c1, _mm_set1_epi8( 1 ) );
    __m128i is2 = _mm_maddubs_epi16( c2, _mm_set1_epi8( 1 ) );
    __m128i is3 = _mm_maddubs_epi16( c3, _mm_set1_epi8( 1 ) );

    __m128i s0 = _mm_hadd_epi16( is0, is1 );
    __m128i s1 = _mm_hadd_epi16( is2, is3 );

    __m128i m0 = _mm_mulhi_epu16( s0, range );
    __m128i m1 = _mm_mulhi_epu16( s1, range );

    __m128i p0 = _mm_packus_epi16( m0, m1 );

    __m128i p1 = _mm_or_si128( _mm_srai_epi32( p0, 6 ), _mm_srai_epi32( p0, 12 ) );
    __m128i p2 = _mm_or_si128( _mm_srai_epi32( p0, 18 ), p0 );
    __m128i p3 = _mm_or_si128( p1, p2 );
    __m128i p = _mm_shuffle_epi8( p3, _mm_set1_epi32( 0x0C080400 ) );

    uint32_t vmin = _mm_cvtsi128_si32( min );
    uint32_t vmax = _mm_cvtsi128_si32( max );
    uint32_t vp = _mm_cvtsi128_si32( p );

    return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( vp ) << 32 ) );
}

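// BC4 / BC3-alpha kernel for 16 single-channel bytes in one register: byte-rotate
// reductions find min/max, DivTableAlpha scales distances into 3-bit selectors, and
// AlphaIndexTable_SSE remaps them pairwise to the on-disk ordering.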
static etcpak_force_inline uint64_t ProcessOneChannel_SSE( __m128i a )
{
    __m128i solidCmp = _mm_shuffle_epi8( a, _mm_setzero_si128() );
    __m128i cmpRes = _mm_cmpeq_epi8( a, solidCmp );
    if( _mm_testc_si128( cmpRes, _mm_set1_epi32( -1 ) ) )
    {
        return _mm_cvtsi128_si32( a ) & 0xFF;
    }

    __m128i a1 = _mm_shuffle_epi32( a, _MM_SHUFFLE( 2, 3, 0, 1 ) );
    __m128i max1 = _mm_max_epu8( a, a1 );
    __m128i min1 = _mm_min_epu8( a, a1 );
    __m128i amax2 = _mm_shuffle_epi32( max1, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m128i amin2 = _mm_shuffle_epi32( min1, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m128i max2 = _mm_max_epu8( max1, amax2 );
    __m128i min2 = _mm_min_epu8( min1, amin2 );
    __m128i amax3 = _mm_alignr_epi8( max2, max2, 2 );
    __m128i amin3 = _mm_alignr_epi8( min2, min2, 2 );
    __m128i max3 = _mm_max_epu8( max2, amax3 );
    __m128i min3 = _mm_min_epu8( min2, amin3 );
    __m128i amax4 = _mm_alignr_epi8( max3, max3, 1 );
    __m128i amin4 = _mm_alignr_epi8( min3, min3, 1 );
    __m128i max = _mm_max_epu8( max3, amax4 );
    __m128i min = _mm_min_epu8( min3, amin4 );
    __m128i minmax = _mm_unpacklo_epi8( max, min );

    __m128i r = _mm_sub_epi8( max, min );
    int range = _mm_cvtsi128_si32( r ) & 0xFF;
    __m128i rv = _mm_set1_epi16( DivTableAlpha[range] );

    __m128i v = _mm_sub_epi8( a, min );

    __m128i lo16 = _mm_unpacklo_epi8( v, _mm_setzero_si128() );
    __m128i hi16 = _mm_unpackhi_epi8( v, _mm_setzero_si128() );

    __m128i lomul = _mm_mulhi_epu16( lo16, rv );
    __m128i himul = _mm_mulhi_epu16( hi16, rv );

    __m128i p0 = _mm_packus_epi16( lomul, himul );
    __m128i p1 = _mm_or_si128( _mm_and_si128( p0, _mm_set1_epi16( 0x3F ) ), _mm_srai_epi16( _mm_and_si128( p0, _mm_set1_epi16( 0x3F00 ) ), 5 ) );
    __m128i p2 = _mm_packus_epi16( p1, p1 );

    uint64_t pi = _mm_cvtsi128_si64( p2 );
    uint64_t data = 0;
    for( int i=0; i<8; i++ )
    {
        uint64_t idx = AlphaIndexTable_SSE[(pi>>(i*8)) & 0x3F];
        data |= idx << (i*6);
    }
    return (uint64_t)(uint16_t)_mm_cvtsi128_si32( minmax ) | ( data << 16 );
}

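// Gather the 16 alpha bytes (offset 3 of every RGBA pixel) from the four row
// registers into a single vector, then reuse the one-channel kernel.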
static etcpak_force_inline uint64_t ProcessAlpha_SSE( __m128i px0, __m128i px1, __m128i px2, __m128i px3 )
{
    __m128i mask = _mm_setr_epi32( 0x0f0b0703, -1, -1, -1 );

    __m128i m0 = _mm_shuffle_epi8( px0, mask );
    __m128i m1 = _mm_shuffle_epi8( px1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
    __m128i m2 = _mm_shuffle_epi8( px2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
    __m128i m3 = _mm_shuffle_epi8( px3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
    __m128i m4 = _mm_or_si128( m0, m1 );
    __m128i m5 = _mm_or_si128( m2, m3 );
    __m128i a = _mm_or_si128( m4, m5 );

    return ProcessOneChannel_SSE( a );
}
#endif

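// Public entry points. Each walks the image in 4x4 blocks: copy or load one block,
// advance four pixels, and jump down a block row after every width/4 blocks. width
// is the row stride in pixels and appears to be assumed a multiple of four; blocks
// is the total number of 4x4 blocks to emit.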
void CompressBc1( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
{
#ifdef __AVX2__
    if( width%8 == 0 )
    {
        blocks /= 2;
        uint32_t buf[8*4];
        int i = 0;
        char* dst8 = (char*)dst;

        do
        {
            auto tmp = (char*)buf;
            memcpy( tmp, src + width * 0, 8*4 );
            memcpy( tmp + 8*4, src + width * 1, 8*4 );
            memcpy( tmp + 16*4, src + width * 2, 8*4 );
            memcpy( tmp + 24*4, src + width * 3, 8*4 );
            src += 8;
            if( ++i == width/8 )
            {
                src += width * 3;
                i = 0;
            }

            ProcessRGB_AVX( (uint8_t*)buf, dst8 );
        }
        while( --blocks );
    }
    else
#endif
    {
        uint32_t buf[4*4];
        int i = 0;

        auto ptr = dst;
        do
        {
            auto tmp = (char*)buf;
            memcpy( tmp, src + width * 0, 4*4 );
            memcpy( tmp + 4*4, src + width * 1, 4*4 );
            memcpy( tmp + 8*4, src + width * 2, 4*4 );
            memcpy( tmp + 12*4, src + width * 3, 4*4 );
            src += 4;
            if( ++i == width/4 )
            {
                src += width * 3;
                i = 0;
            }

            const auto c = ProcessRGB( (uint8_t*)buf );
            uint8_t fix[8];
            memcpy( fix, &c, 8 );
            for( int j=4; j<8; j++ ) fix[j] = DxtcIndexTable[fix[j]];
            memcpy( ptr, fix, sizeof( uint64_t ) );
            ptr++;
        }
        while( --blocks );
    }
}

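// As CompressBc1, but runs each block through Dither() (see Dither.hpp) before
// encoding, trading a little noise for less banding in smooth gradients.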
void CompressBc1Dither( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
{
    uint32_t buf[4*4];
    int i = 0;

    auto ptr = dst;
    do
    {
        auto tmp = (char*)buf;
        memcpy( tmp, src + width * 0, 4*4 );
        memcpy( tmp + 4*4, src + width * 1, 4*4 );
        memcpy( tmp + 8*4, src + width * 2, 4*4 );
        memcpy( tmp + 12*4, src + width * 3, 4*4 );
        src += 4;
        if( ++i == width/4 )
        {
            src += width * 3;
            i = 0;
        }

        Dither( (uint8_t*)buf );

        const auto c = ProcessRGB( (uint8_t*)buf );
        uint8_t fix[8];
        memcpy( fix, &c, 8 );
        for( int j=4; j<8; j++ ) fix[j] = DxtcIndexTable[fix[j]];
        memcpy( ptr, fix, sizeof( uint64_t ) );
        ptr++;
    }
    while( --blocks );
}

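// BC3: every 16-byte output block is a BC4-style alpha payload followed by a BC1
// color payload. The SSE path feeds the same four row registers to both kernels;
// the scalar path first splits the block into separate alpha and RGB buffers.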
void CompressBc3( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
{
    int i = 0;
    auto ptr = dst;
    do
    {
#ifdef __SSE4_1__
        __m128i px0 = _mm_loadu_si128( (__m128i*)( src + width * 0 ) );
        __m128i px1 = _mm_loadu_si128( (__m128i*)( src + width * 1 ) );
        __m128i px2 = _mm_loadu_si128( (__m128i*)( src + width * 2 ) );
        __m128i px3 = _mm_loadu_si128( (__m128i*)( src + width * 3 ) );

        src += 4;
        if( ++i == width/4 )
        {
            src += width * 3;
            i = 0;
        }

        *ptr++ = ProcessAlpha_SSE( px0, px1, px2, px3 );

        const auto c = ProcessRGB_SSE( px0, px1, px2, px3 );
        uint8_t fix[8];
        memcpy( fix, &c, 8 );
        for( int j=4; j<8; j++ ) fix[j] = DxtcIndexTable[fix[j]];
        memcpy( ptr, fix, sizeof( uint64_t ) );
        ptr++;
#else
        uint32_t rgba[4*4];
        uint8_t alpha[4*4];

        auto tmp = (char*)rgba;
        memcpy( tmp, src + width * 0, 4*4 );
        memcpy( tmp + 4*4, src + width * 1, 4*4 );
        memcpy( tmp + 8*4, src + width * 2, 4*4 );
        memcpy( tmp + 12*4, src + width * 3, 4*4 );
        src += 4;
        if( ++i == width/4 )
        {
            src += width * 3;
            i = 0;
        }

        for( int i=0; i<16; i++ )
        {
            alpha[i] = rgba[i] >> 24;
            rgba[i] &= 0xFFFFFF;
        }
        *ptr++ = ProcessAlpha( alpha );

        const auto c = ProcessRGB( (uint8_t*)rgba );
        uint8_t fix[8];
        memcpy( fix, &c, 8 );
        for( int j=4; j<8; j++ ) fix[j] = DxtcIndexTable[fix[j]];
        memcpy( ptr, fix, sizeof( uint64_t ) );
        ptr++;
#endif
    }
    while( --blocks );
}

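// BC4: single-channel compression of the red channel (byte 0 of every pixel).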
void CompressBc4( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
{
    int i = 0;
    auto ptr = dst;
    do
    {
#ifdef __SSE4_1__
        __m128i px0 = _mm_loadu_si128( (__m128i*)( src + width * 0 ) );
        __m128i px1 = _mm_loadu_si128( (__m128i*)( src + width * 1 ) );
        __m128i px2 = _mm_loadu_si128( (__m128i*)( src + width * 2 ) );
        __m128i px3 = _mm_loadu_si128( (__m128i*)( src + width * 3 ) );

        src += 4;
        if( ++i == width/4 )
        {
            src += width * 3;
            i = 0;
        }

        __m128i mask = _mm_setr_epi32( 0x0c080400, -1, -1, -1 );

        __m128i m0 = _mm_shuffle_epi8( px0, mask );
        __m128i m1 = _mm_shuffle_epi8( px1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
        __m128i m2 = _mm_shuffle_epi8( px2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
        __m128i m3 = _mm_shuffle_epi8( px3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
        __m128i m4 = _mm_or_si128( m0, m1 );
        __m128i m5 = _mm_or_si128( m2, m3 );

        *ptr++ = ProcessOneChannel_SSE( _mm_or_si128( m4, m5 ) );
#else
        uint8_t r[4*4];
        auto rgba = src;
        for( int i=0; i<4; i++ )
        {
            r[i*4] = rgba[0] & 0xff;
            r[i*4+1] = rgba[1] & 0xff;
            r[i*4+2] = rgba[2] & 0xff;
            r[i*4+3] = rgba[3] & 0xff;

            rgba += width;
        }

        src += 4;
        if( ++i == width/4 )
        {
            src += width * 3;
            i = 0;
        }

        *ptr++ = ProcessAlpha( r );
#endif
    } while( --blocks );
}

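// BC5: two independent single-channel payloads per block, red (byte 0) then
// green (byte 1).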
void CompressBc5( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
{
    int i = 0;
    auto ptr = dst;
    do
    {
#ifdef __SSE4_1__
        __m128i px0 = _mm_loadu_si128( (__m128i*)( src + width * 0 ) );
        __m128i px1 = _mm_loadu_si128( (__m128i*)( src + width * 1 ) );
        __m128i px2 = _mm_loadu_si128( (__m128i*)( src + width * 2 ) );
        __m128i px3 = _mm_loadu_si128( (__m128i*)( src + width * 3 ) );

        src += 4;
        if( ++i == width/4 )
        {
            src += width*3;
            i = 0;
        }

        __m128i mask = _mm_setr_epi32( 0x0c080400, -1, -1, -1 );

        __m128i m0 = _mm_shuffle_epi8( px0, mask );
        __m128i m1 = _mm_shuffle_epi8( px1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
        __m128i m2 = _mm_shuffle_epi8( px2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
        __m128i m3 = _mm_shuffle_epi8( px3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
        __m128i m4 = _mm_or_si128( m0, m1 );
        __m128i m5 = _mm_or_si128( m2, m3 );

        *ptr++ = ProcessOneChannel_SSE( _mm_or_si128( m4, m5 ) );

        mask = _mm_setr_epi32( 0x0d090501, -1, -1, -1 );

        m0 = _mm_shuffle_epi8( px0, mask );
        m1 = _mm_shuffle_epi8( px1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
        m2 = _mm_shuffle_epi8( px2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
        m3 = _mm_shuffle_epi8( px3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
        m4 = _mm_or_si128( m0, m1 );
        m5 = _mm_or_si128( m2, m3 );

        *ptr++ = ProcessOneChannel_SSE( _mm_or_si128( m4, m5 ) );
#else
        uint8_t rg[4*4*2];
        auto rgba = src;
        for( int i=0; i<4; i++ )
        {
            rg[i*4] = rgba[0] & 0xff;
            rg[i*4+1] = rgba[1] & 0xff;
            rg[i*4+2] = rgba[2] & 0xff;
            rg[i*4+3] = rgba[3] & 0xff;

            rg[16+i*4] = (rgba[0] & 0xff00) >> 8;
            rg[16+i*4+1] = (rgba[1] & 0xff00) >> 8;
            rg[16+i*4+2] = (rgba[2] & 0xff00) >> 8;
            rg[16+i*4+3] = (rgba[3] & 0xff00) >> 8;

            rgba += width;
        }

        src += 4;
        if( ++i == width/4 )
        {
            src += width*3;
            i = 0;
        }

        *ptr++ = ProcessAlpha( rg );
        *ptr++ = ProcessAlpha( &rg[16] );
#endif
    } while( --blocks );
}