// Source: GitHub repository godotengine/godot, path blob/master/thirdparty/basis_universal/transcoder/basisu_astc_helpers.h
// basisu_astc_helpers.h
// Be sure to define BASISU_ASTC_HELPERS_IMPLEMENTATION somewhere to get the implementation, otherwise you only get the header.
#pragma once
#ifndef BASISU_ASTC_HELPERS_HEADER
#define BASISU_ASTC_HELPERS_HEADER

#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <fenv.h>
#include <string.h>
#include <assert.h>

namespace astc_helpers
{
	// Fundamental ASTC limits used throughout this module.
	const uint32_t MAX_WEIGHT_VALUE = 64; // grid texel weights must range from [0,64]
	const uint32_t MIN_GRID_DIM = 2; // the minimum dimension of a block's weight grid
	const uint32_t MIN_BLOCK_DIM = 4, MAX_BLOCK_DIM = 12; // the valid block dimensions in texels
	const uint32_t MAX_GRID_WEIGHTS = 64; // a block may have a maximum of 64 weight grid values
	// # of endpoint values for CEM 11 and CEM 7 (these agree with get_num_cem_values(11) and get_num_cem_values(7)).
	const uint32_t NUM_MODE11_ENDPOINTS = 6, NUM_MODE7_ENDPOINTS = 4;

	// Table of the supported block sizes, one pair of dimensions per row (defined in the implementation section).
	static const uint32_t NUM_ASTC_BLOCK_SIZES = 14;
	extern const uint8_t g_astc_block_sizes[NUM_ASTC_BLOCK_SIZES][2];

	// The Color Endpoint Modes (CEM's).
	// The numeric values are the 4-bit CEM codes. (cem >> 2) selects the class, and each class uses
	// 2 + 2 * (cem >> 2) endpoint values (see get_num_cem_values()).
	enum cems
	{
		CEM_LDR_LUM_DIRECT = 0,
		CEM_LDR_LUM_BASE_PLUS_OFS = 1,
		CEM_HDR_LUM_LARGE_RANGE = 2,
		CEM_HDR_LUM_SMALL_RANGE = 3,
		CEM_LDR_LUM_ALPHA_DIRECT = 4,
		CEM_LDR_LUM_ALPHA_BASE_PLUS_OFS = 5,
		CEM_LDR_RGB_BASE_SCALE = 6,
		CEM_HDR_RGB_BASE_SCALE = 7,
		CEM_LDR_RGB_DIRECT = 8,
		CEM_LDR_RGB_BASE_PLUS_OFFSET = 9,
		CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A = 10,
		CEM_HDR_RGB = 11,
		CEM_LDR_RGBA_DIRECT = 12,
		CEM_LDR_RGBA_BASE_PLUS_OFFSET = 13,
		CEM_HDR_RGB_LDR_ALPHA = 14,
		CEM_HDR_RGB_HDR_ALPHA = 15
	};

	// All Bounded Integer Sequence Coding (BISE or ISE) ranges.
	// Each enumerator's value is the ISE range index; its name gives the number of levels in that range.
	// Weights: Ranges [0,11] are valid.
	// Endpoints: Ranges [4,20] are valid.
	enum bise_levels
	{
		BISE_2_LEVELS = 0,
		BISE_3_LEVELS = 1,
		BISE_4_LEVELS = 2,
		BISE_5_LEVELS = 3,
		BISE_6_LEVELS = 4,
		BISE_8_LEVELS = 5,
		BISE_10_LEVELS = 6,
		BISE_12_LEVELS = 7,
		BISE_16_LEVELS = 8,
		BISE_20_LEVELS = 9,
		BISE_24_LEVELS = 10,
		BISE_32_LEVELS = 11,
		BISE_40_LEVELS = 12,
		BISE_48_LEVELS = 13,
		BISE_64_LEVELS = 14,
		BISE_80_LEVELS = 15,
		BISE_96_LEVELS = 16,
		BISE_128_LEVELS = 17,
		BISE_160_LEVELS = 18,
		BISE_192_LEVELS = 19,
		BISE_256_LEVELS = 20
	};

	const uint32_t TOTAL_ISE_RANGES = 21;
73

74
	// Valid endpoint ISE ranges
75
	const uint32_t FIRST_VALID_ENDPOINT_ISE_RANGE = BISE_6_LEVELS; // 4
76
	const uint32_t LAST_VALID_ENDPOINT_ISE_RANGE = BISE_256_LEVELS; // 20
77
	const uint32_t TOTAL_ENDPOINT_ISE_RANGES = LAST_VALID_ENDPOINT_ISE_RANGE - FIRST_VALID_ENDPOINT_ISE_RANGE + 1;
78

79
	// Valid weight ISE ranges
80
	const uint32_t FIRST_VALID_WEIGHT_ISE_RANGE = BISE_2_LEVELS; // 0
81
	const uint32_t LAST_VALID_WEIGHT_ISE_RANGE = BISE_32_LEVELS; // 11
82
	const uint32_t TOTAL_WEIGHT_ISE_RANGES = LAST_VALID_WEIGHT_ISE_RANGE - FIRST_VALID_WEIGHT_ISE_RANGE + 1;
83

84
	// The ISE range table.
85
	extern const int8_t g_ise_range_table[TOTAL_ISE_RANGES][3]; // 0=bits (0 to 8), 1=trits (0 or 1), 2=quints (0 or 1)
86

87
	// Possible Color Component Select values, used in dual plane mode.
	// The CCS component will be interpolated using the 2nd weight plane.
	// The value (0-3) is the index of the channel that uses the 2nd plane: 0=R, 1=G, 2=B, 3=A.
	enum ccs
	{
		CCS_GBA_R = 0,
		CCS_RBA_G = 1,
		CCS_RGA_B = 2,
		CCS_RGB_A = 3
	};

	// A physical (packed) ASTC block: 4 x 32-bit words = 128 bits.
	struct astc_block
	{
		uint32_t m_vals[4];
	};

	const uint32_t MAX_PARTITIONS = 4;				// Max # of partitions or subsets for single plane mode
	const uint32_t MAX_DUAL_PLANE_PARTITIONS = 3;	// Max # of partitions or subsets for dual plane mode
	const uint32_t NUM_PARTITION_PATTERNS = 1024;	// Total # of partition pattern seeds (10-bits)
	const uint32_t MAX_ENDPOINTS = 18;				// Maximum # of endpoint values in a block

	struct log_astc_block
108
	{
109
		bool m_error_flag;
110
		
111
		bool m_solid_color_flag_ldr, m_solid_color_flag_hdr;
112

113
		uint8_t m_user_mode;					// user defined value, not used in this module
114
		
115
		// Rest is only valid if !m_solid_color_flag_ldr && !m_solid_color_flag_hdr
116
		uint8_t m_grid_width, m_grid_height;	// weight grid dimensions, not the dimension of the block
117
		
118
		bool m_dual_plane;
119

120
		uint8_t m_weight_ise_range;				// 0-11
121
		uint8_t m_endpoint_ise_range;			// 4-20, this is actually inferred from the size of the other config bits+weights, but this is here for checking
122

123
		uint8_t m_color_component_selector;	// 0-3, controls which channel uses the 2nd (odd) weights, only used in dual plane mode
124

125
		uint8_t m_num_partitions;				// or the # of subsets, 1-4 (1-3 if dual plane mode)
126
		uint16_t m_partition_id;				// 10-bits, must be 0 if m_num_partitions==1
127
		
128
		uint8_t m_color_endpoint_modes[MAX_PARTITIONS]; // each subset's CEM's
129
		
130
		union
131
		{
132
			// ISE weight grid values. In dual plane mode, the order is p0,p1,  p0,p1,  etc.
133
			uint8_t m_weights[MAX_GRID_WEIGHTS];
134
			uint16_t m_solid_color[4];
135
		};
136
		
137
		// ISE endpoint values
138
		// Endpoint order examples:
139
		// 1 subset LA : LL0 LH0 AL0 AH0
140
		// 1 subset RGB : RL0 RH0 GL0 GH0 BL0 BH0
141
		// 1 subset RGBA : RL0 RH0 GL0 GH0 BL0 BH0 AL0 AH0
142
		// 2 subset LA : LL0 LH0 AL0 AH0 LL1 LH1 AL1 AH1
143
		// 2 subset RGB : RL0 RH0 GL0 GH0 BL0 BH0 RL1 RH1 GL1 GH1 BL1 BH1
144
		// 2 subset RGBA : RL0 RH0 GL0 GH0 BL0 BH0 AL0 AH0 RL1 RH1 GL1 GH1 BL1 BH1 AL1 AH1
145
		uint8_t m_endpoints[MAX_ENDPOINTS];
146
				
147
		void clear()
148
		{
149
			memset(this, 0, sizeof(*this));
150
		}
151
	};
152

153
	// Debug-only range check over the half-open interval [l, h): asserts (l <= v) && (v < h), then returns v unchanged.
	// The (void) casts keep the parameters "used" when assert() compiles away under NDEBUG.
	inline int bounds_check(int v, int l, int h) { (void)v; (void)l; (void)h; assert(v >= l && v < h); return v; }
	inline uint32_t bounds_check(uint32_t v, uint32_t l, uint32_t h) { (void)v; (void)l; (void)h; assert(v >= l && v < h); return v; }

	// Extracts the bit field [low, high] (both inclusive, bit 0 = LSB) from val, right-aligned.
	// A full 32-bit field (low=0, high=31) is supported.
	inline uint32_t get_bits(uint32_t val, int low, int high)
	{
		const int num_bits = (high - low) + 1;
		assert((num_bits >= 1) && (num_bits <= 32));
		// Shifting a 32-bit value by a negative amount or by >= 32 is undefined.
		assert((low >= 0) && (low <= 31));

		val >>= low;
		// (1u << 32) is undefined, so only mask when the field is narrower than the word.
		if (num_bits != 32)
			val &= ((1u << num_bits) - 1);

		return val;
	}

	// Returns the number of levels in the given ISE range.
170
	inline uint32_t get_ise_levels(uint32_t ise_range) 
171
	{ 
172
		assert(ise_range < TOTAL_ISE_RANGES);
173
		return (1 + 2 * g_ise_range_table[ise_range][1] + 4 * g_ise_range_table[ise_range][2]) << g_ise_range_table[ise_range][0];
174
	}
175

176
	inline int get_ise_sequence_bits(int count, int range)
177
	{
178
		// See 18.22 Data Size Determination - note this will be <= the # of bits actually written by encode_bise(). (It's magic.)
179
		int total_bits = g_ise_range_table[range][0] * count;
180
		total_bits += (g_ise_range_table[range][1] * 8 * count + 4) / 5;
181
		total_bits += (g_ise_range_table[range][2] * 7 * count + 2) / 3;
182
		return total_bits;
183
	}
184
		
185
	inline uint32_t weight_interpolate(uint32_t l, uint32_t h, uint32_t w)
186
	{
187
		assert(w <= MAX_WEIGHT_VALUE);
188
		return (l * (64 - w) + h * w + 32) >> 6;
189
	}
190

191
	// Packs num_vals ISE symbols from pSrc_vals using BISE for the given ISE range, starting at bit offset bit_pos.
	// The encoded bits are OR'd into pDst[0..3], so the destination bits should already be zero.
	// If pStats is non-null, the number of bits written is added to *pStats.
	void encode_bise(uint32_t* pDst, const uint8_t* pSrc_vals, uint32_t bit_pos, int num_vals, int range, uint32_t *pStats = nullptr);

	// Bit counters filled in by the packing functions. Starts zeroed; the packers add to the counters.
	struct pack_stats
	{
		uint32_t m_header_bits;
		uint32_t m_endpoint_bits;
		uint32_t m_weight_bits;

		inline pack_stats() { clear(); }
		// Resets all counters to 0.
		inline void clear() { memset(this, 0, sizeof(*this)); }
	};

	// Packs a logical to physical ASTC block. Note this does not validate the block's dimensions (use is_valid_block_size()), just the grid dimensions.
204
	bool pack_astc_block(astc_block &phys_block, const log_astc_block& log_block, int* pExpected_endpoint_range = nullptr, pack_stats *pStats = nullptr);
205

206
	// Pack LDR void extent (really solid color) blocks. For LDR, pass in (val | (val << 8)) for each component.
207
	void pack_void_extent_ldr(astc_block& blk, uint16_t r, uint16_t g, uint16_t b, uint16_t a, pack_stats *pStats = nullptr);
208

209
	// Pack HDR void extent (16-bit values are FP16/half floats - no NaN/Inf's)
210
	void pack_void_extent_hdr(astc_block& blk, uint16_t rh, uint16_t gh, uint16_t bh, uint16_t ah, pack_stats* pStats = nullptr);
211

212
	// These helpers are all quite slow, but are useful for table preparation.

	// Dequantizes ISE encoded endpoint val to [0,255]
	// NOTE(review): the trailing "4-11" disagrees with the valid endpoint ranges declared above (4-20) - confirm against the implementation.
	uint32_t dequant_bise_endpoint(uint32_t val, uint32_t ise_range); // ISE ranges 4-11

	// Dequantizes ISE encoded weight val to [0,64]
	// NOTE(review): the trailing "0-10" disagrees with the valid weight ranges declared above (0-11) - confirm against the implementation.
	uint32_t dequant_bise_weight(uint32_t val, uint32_t ise_range); // ISE ranges 0-10

	// Presumably return the ISE symbol whose dequantized value is nearest to v for the given range - confirm against the implementation.
	uint32_t find_nearest_bise_endpoint(int v, uint32_t ise_range);
	uint32_t find_nearest_bise_weight(int v, uint32_t ise_range);

	// Fills the caller-provided quantization tables for one ISE range.
	// dequant_tables::init() passes nullptr for pISE_to_rank/pRank_to_ISE when the rank tables aren't wanted.
	void create_quant_tables(
		uint8_t* pVal_to_ise,	// [0-255] or [0-64] value to nearest ISE symbol, array size is [256] or [65]
		uint8_t* pISE_to_val,	// ASTC encoded ISE symbol to [0,255] or [0,64] value, [levels]
		uint8_t* pISE_to_rank,	// returns the level rank index given an ISE symbol, [levels]
		uint8_t* pRank_to_ISE,  // returns the ISE symbol given a level rank, inverse of pISE_to_rank, [levels]
		uint32_t ise_range,		// ise range, [4,20] for endpoints, [0,11] for weights
		bool weight_flag);		// false if block endpoints, true if weights

	// True if the CEM is LDR.
232
	bool is_cem_ldr(uint32_t mode);
233
	inline bool is_cem_hdr(uint32_t mode) { return !is_cem_ldr(mode); }
234

235
	// True if the passed in dimensions are a valid ASTC block size. There are 14 supported configs, from 4x4 (8bpp) to 12x12 (.89bpp).
236
	bool is_valid_block_size(uint32_t w, uint32_t h);
237

238
	bool block_has_any_hdr_cems(const log_astc_block& log_blk);
239
	bool block_has_any_ldr_cems(const log_astc_block& log_blk);
240
	
241
	// Returns the # of endpoint values for the given CEM (0-15):
	// 2 for CEMs 0-3, 4 for CEMs 4-7, 6 for CEMs 8-11 and 8 for CEMs 12-15.
	inline uint32_t get_num_cem_values(uint32_t cem) { assert(cem <= 15); return 2 + 2 * (cem >> 2); }

	struct dequant_table
245
	{
246
		basisu::vector<uint8_t> m_val_to_ise;	// [0-255] or [0-64] value to nearest ISE symbol, array size is [256] or [65]
247
		basisu::vector<uint8_t> m_ISE_to_val;	// ASTC encoded ISE symbol to [0,255] or [0,64] value, [levels]
248
		basisu::vector<uint8_t> m_ISE_to_rank;	// returns the level rank index given an ISE symbol, [levels]
249
		basisu::vector<uint8_t> m_rank_to_ISE;  // returns the ISE symbol given a level rank, inverse of pISE_to_rank, [levels]		
250

251
		void init(bool weight_flag, uint32_t num_levels, bool init_rank_tabs)
252
		{
253
			m_val_to_ise.resize(weight_flag ? (MAX_WEIGHT_VALUE + 1) : 256);
254
			m_ISE_to_val.resize(num_levels);
255
			if (init_rank_tabs)
256
			{
257
				m_ISE_to_rank.resize(num_levels);
258
				m_rank_to_ISE.resize(num_levels);
259
			}
260
		}
261
	};
262

263
	struct dequant_tables
264
	{
265
		dequant_table m_weights[TOTAL_WEIGHT_ISE_RANGES];
266
		dequant_table m_endpoints[TOTAL_ENDPOINT_ISE_RANGES];
267

268
		const dequant_table& get_weight_tab(uint32_t range) const
269
		{
270
			assert((range >= FIRST_VALID_WEIGHT_ISE_RANGE) && (range <= LAST_VALID_WEIGHT_ISE_RANGE));
271
			return m_weights[range - FIRST_VALID_WEIGHT_ISE_RANGE];
272
		}
273

274
		dequant_table& get_weight_tab(uint32_t range)
275
		{
276
			assert((range >= FIRST_VALID_WEIGHT_ISE_RANGE) && (range <= LAST_VALID_WEIGHT_ISE_RANGE));
277
			return m_weights[range - FIRST_VALID_WEIGHT_ISE_RANGE];
278
		}
279

280
		const dequant_table& get_endpoint_tab(uint32_t range) const
281
		{
282
			assert((range >= FIRST_VALID_ENDPOINT_ISE_RANGE) && (range <= LAST_VALID_ENDPOINT_ISE_RANGE));
283
			return m_endpoints[range - FIRST_VALID_ENDPOINT_ISE_RANGE];
284
		}
285

286
		dequant_table& get_endpoint_tab(uint32_t range)
287
		{
288
			assert((range >= FIRST_VALID_ENDPOINT_ISE_RANGE) && (range <= LAST_VALID_ENDPOINT_ISE_RANGE));
289
			return m_endpoints[range - FIRST_VALID_ENDPOINT_ISE_RANGE];
290
		}
291

292
		void init(bool init_rank_tabs)
293
		{
294
			for (uint32_t range = FIRST_VALID_WEIGHT_ISE_RANGE; range <= LAST_VALID_WEIGHT_ISE_RANGE; range++)
295
			{
296
				const uint32_t num_levels = get_ise_levels(range);
297
				dequant_table& tab = get_weight_tab(range);
298

299
				tab.init(true, num_levels, init_rank_tabs);
300

301
				create_quant_tables(tab.m_val_to_ise.data(), tab.m_ISE_to_val.data(), init_rank_tabs ? tab.m_ISE_to_rank.data() : nullptr, init_rank_tabs ? tab.m_rank_to_ISE.data() : nullptr, range, true);
302
			}
303

304
			for (uint32_t range = FIRST_VALID_ENDPOINT_ISE_RANGE; range <= LAST_VALID_ENDPOINT_ISE_RANGE; range++)
305
			{
306
				const uint32_t num_levels = get_ise_levels(range);
307
				dequant_table& tab = get_endpoint_tab(range);
308

309
				tab.init(false, num_levels, init_rank_tabs);
310

311
				create_quant_tables(tab.m_val_to_ise.data(), tab.m_ISE_to_val.data(), init_rank_tabs ? tab.m_ISE_to_rank.data() : nullptr, init_rank_tabs ? tab.m_rank_to_ISE.data() : nullptr, range, false);
312
			}
313
		}
314
	};
315

316
	// Global table set; presumably populated by init_tables() - confirm against the implementation.
	extern dequant_tables g_dequant_tables;
	void init_tables(bool init_rank_tabs);

	// One bilinear sample used when upsampling a weight grid (see compute_upsample_weights()).
	struct weighted_sample
	{
		uint8_t m_src_x;
		uint8_t m_src_y;
		uint8_t m_weights[2][2]; // [y][x], scaled by 16, round by adding 8
	};

	void compute_upsample_weights(
327
		int block_width, int block_height,
328
		int weight_grid_width, int weight_grid_height,
329
		weighted_sample* pWeights); // there will be block_width * block_height bilinear samples
330

331
	void upsample_weight_grid(
332
		uint32_t bx, uint32_t by,		// destination/to dimension
333
		uint32_t wx, uint32_t wy,		// source/from dimension
334
		const uint8_t* pSrc_weights,	// these are dequantized [0,64] weights, NOT ISE symbols, [wy][wx]
335
		uint8_t* pDst_weights);			// [by][bx]
336
		
337
	// Procedurally returns the texel partition/subset index given the block coordinate and config.
338
	int compute_texel_partition(uint32_t seedIn, uint32_t xIn, uint32_t yIn, uint32_t zIn, int num_partitions, bool small_block);
339
		
340
	void blue_contract(
341
		int r, int g, int b, int a,
342
		int& dr, int& dg, int& db, int& da);
343

344
	void bit_transfer_signed(int& a, int& b);
345

346
	void decode_endpoint(uint32_t cem_index, int (*pEndpoints)[2], const uint8_t* pE);
347

348
	// A half float stored as its raw 16-bit pattern.
	typedef uint16_t half_float;
	// NOTE(review): toward_zero presumably selects round-toward-zero - confirm against the implementation.
	half_float float_to_half(float val, bool toward_zero);
	float half_to_float(half_float hval);

	// Converts a 16-bit qlog16 value k (0-0xFFFF) to a half float bit pattern.
	// The top 5 bits of k become the result's bits 10 and up; the low 11 bits are remapped
	// piecewise-linearly (3 segments) and reduced to the low 10 bits.
	// Notes:
	// qlog16_to_half(half_to_qlog16(half_val_as_int)) == half_val_as_int (is lossless)
	// However, this is not lossless in the general sense.
	inline half_float qlog16_to_half(int k)
	{
		assert((k >= 0) && (k <= 0xFFFF));

		int E = (k & 0xF800) >> 11;
		int M = k & 0x7FF;

		int Mt;
		if (M < 512)
			Mt = 3 * M;
		else if (M >= 1536)
			Mt = 5 * M - 2048;
		else
			Mt = 4 * M - 512;

		return (half_float)((E << 10) + (Mt >> 3));
	}

	// 0xff80 == 65408; presumably the largest value representable in RGB9E5 - confirm against the EXT_texture_shared_exponent spec.
	const int MAX_RGB9E5 = 0xff80;
	// Conversions between three floats and a packed 32-bit RGB 9E5 (shared exponent) value.
	void unpack_rgb9e5(uint32_t packed, float& r, float& g, float& b);
	uint32_t pack_rgb9e5(float r, float g, float b);

	// Output pixel format selector for decode_block().
	enum decode_mode
	{
		cDecodeModeSRGB8 = 0,	// returns uint8_t's, not valid on HDR blocks
		cDecodeModeLDR8 = 1,	// returns uint8_t's, not valid on HDR blocks
		cDecodeModeHDR16 = 2,   // returns uint16_t's (half floats), valid on all LDR/HDR blocks
		cDecodeModeRGB9E5 = 3	// returns uint32_t's, packed as RGB 9E5 (shared exponent), see https://registry.khronos.org/OpenGL/extensions/EXT/EXT_texture_shared_exponent.txt
	};

	// Decodes logical block to output pixels.
386
	// pPixels must point to either 32-bit pixel values (SRGB8/LDR8/9E5) or 64-bit pixel values (HDR16)
387
	bool decode_block(const log_astc_block& log_blk, void* pPixels, uint32_t blk_width, uint32_t blk_height, decode_mode dec_mode);
388

389
	void decode_bise(uint32_t ise_range, uint8_t* pVals, uint32_t num_vals, const uint8_t *pBits128, uint32_t bit_ofs);
390

391
	// Unpack a physical ASTC encoded GPU texture block to a logical block description.
392
	bool unpack_block(const void* pASTC_block, log_astc_block& log_blk, uint32_t blk_width, uint32_t blk_height);
393
					
394
} // namespace astc_helpers
395

396
#endif // BASISU_ASTC_HELPERS_HEADER
397

398
//------------------------------------------------------------------
399

400
#ifdef BASISU_ASTC_HELPERS_IMPLEMENTATION
401

402
namespace astc_helpers
403
{
404
	// Minimal min/max helpers. On a tie, both return b.
	template<typename T> inline T my_min(T a, T b) { return (a < b) ? a : b; }
	template<typename T> inline T my_max(T a, T b) { return (a > b) ? a : b; }

	// The 14 supported ASTC block sizes, one pair of dimensions per row.
	const uint8_t g_astc_block_sizes[NUM_ASTC_BLOCK_SIZES][2] = {
		{ 4, 4 }, { 5, 4 }, { 5, 5 }, { 6, 5 },
		{ 6, 6 }, { 8, 5 }, { 8, 6 }, { 10, 5 },
		{ 10, 6 }, { 8, 8 }, { 10, 8 }, { 10, 10 },
		{ 12, 10 }, { 12, 12 }
	};

	// Per ISE range: { # of plain bits, 1 if the range uses a trit, 1 if the range uses a quint }.
	// The number of levels is (1 + 2*trits + 4*quints) << bits (see get_ise_levels()).
	const int8_t g_ise_range_table[TOTAL_ISE_RANGES][3] =
	{
		//b  t  q
		//2  3  5	 // rng  ise_index	notes
		{ 1, 0, 0 }, // 0..1 0
		{ 0, 1, 0 }, // 0..2 1
		{ 2, 0, 0 }, // 0..3 2
		{ 0, 0, 1 }, // 0..4 3
		{ 1, 1, 0 }, // 0..5 4			min endpoint ISE index
		{ 3, 0, 0 }, // 0..7 5
		{ 1, 0, 1 }, // 0..9 6
		{ 2, 1, 0 }, // 0..11 7
		{ 4, 0, 0 }, // 0..15 8
		{ 2, 0, 1 }, // 0..19 9
		{ 3, 1, 0 }, // 0..23 10
		{ 5, 0, 0 }, // 0..31 11		max weight ISE index
		{ 3, 0, 1 }, // 0..39 12
		{ 4, 1, 0 }, // 0..47 13
		{ 6, 0, 0 }, // 0..63 14
		{ 4, 0, 1 }, // 0..79 15
		{ 5, 1, 0 }, // 0..95 16
		{ 7, 0, 0 }, // 0..127 17
		{ 5, 0, 1 }, // 0..159 18
		{ 6, 1, 0 }, // 0..191 19
		{ 8, 0, 0 }, // 0..255 20
	};

	// OR's a 'codesize'-bit (0-9) code into the byte buffer at bit offset 'bit_offset', then advances bit_offset.
	// A codesize of 0 is a no-op. The code is not masked, so it must already fit in 'codesize' bits.
	// At most two bytes are touched: 9 bits starting at bit 7 of a byte end at bit 15.
	static inline void astc_set_bits_1_to_9(uint32_t* pDst, uint32_t& bit_offset, uint32_t code, uint32_t codesize)
	{
		uint8_t* pBuf = reinterpret_cast<uint8_t*>(pDst);

		assert(codesize <= 9);
		if (codesize)
		{
			uint32_t byte_bit_offset = bit_offset & 7;
			uint32_t val = code << byte_bit_offset;

			uint32_t index = bit_offset >> 3;
			pBuf[index] |= (uint8_t)val;

			// Spill into the next byte only if the code crosses the byte boundary.
			if (codesize > (8 - byte_bit_offset))
				pBuf[index + 1] |= (uint8_t)(val >> 8);

			bit_offset += codesize;
		}
	}

	// Extracts the bit field [low, high] (both inclusive, bit 0 = LSB) from bits, right-aligned.
	// The mask is built with unsigned arithmetic and a full 32-bit field is special-cased (as in get_bits()),
	// because (1 << 31) overflows a signed int and a shift by 32 is undefined.
	static inline uint32_t astc_extract_bits(uint32_t bits, int low, int high)
	{
		const int num_bits = high - low + 1;
		assert((low >= 0) && (low <= 31));
		assert((num_bits >= 0) && (num_bits <= 32));

		const uint32_t mask = (num_bits >= 32) ? 0xFFFFFFFFu : ((1u << num_bits) - 1u);
		return (bits >> low) & mask;
	}

	// Writes bits to output in an endian safe way.
	// OR's the low 'total_bits' (0-31) bits of 'value' into the buffer at bit offset 'bit_pos', one byte at a time,
	// and advances bit_pos by total_bits. The destination bits should already be zero.
	static inline void astc_set_bits(uint32_t* pOutput, uint32_t& bit_pos, uint32_t value, uint32_t total_bits)
	{
		assert(total_bits <= 31);
		assert(value < (1u << total_bits));

		uint8_t* pBytes = reinterpret_cast<uint8_t*>(pOutput);

		while (total_bits)
		{
			// Never write past the end of the current byte in a single step.
			const uint32_t bit_in_byte = bit_pos & 7;
			const uint32_t room = 8 - bit_in_byte;
			const uint32_t bits_to_write = (total_bits < room) ? total_bits : room;

			// The uint8_t cast drops any bits that belong to the following bytes.
			pBytes[bit_pos >> 3] |= static_cast<uint8_t>(value << bit_in_byte);

			bit_pos += bits_to_write;
			total_bits -= bits_to_write;
			value >>= bits_to_write;
		}
	}

	// Maps a packed group of 3 quints (q0 + 5*q1 + 25*q2, 0-124) to its 7-bit encoding. Used by astc_encode_quints().
	static const uint8_t g_astc_quint_encode[125] =
	{
		0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 16, 17, 18, 19, 20, 24, 25, 26, 27, 28, 5, 13, 21, 29, 6, 32, 33, 34, 35, 36, 40, 41, 42, 43, 44, 48, 49, 50, 51, 52, 56, 57,
		58, 59, 60, 37, 45, 53, 61, 14, 64, 65, 66, 67, 68, 72, 73, 74, 75, 76, 80, 81, 82, 83, 84, 88, 89, 90, 91, 92, 69, 77, 85, 93, 22, 96, 97, 98, 99, 100, 104,
		105, 106, 107, 108, 112, 113, 114, 115, 116, 120, 121, 122, 123, 124, 101, 109, 117, 125, 30, 102, 103, 70, 71, 38, 110, 111, 78, 79, 46, 118, 119, 86, 87, 54,
		126, 127, 94, 95, 62, 39, 47, 55, 63, 7 /*31 - results in the same decode as 7*/
	};

	// Encodes 3 values to output, usable for any range that uses quints and bits
495
	static inline void astc_encode_quints(uint32_t* pOutput, const uint8_t* pValues, uint32_t& bit_pos, int n, uint32_t* pStats)
496
	{
497
		// First extract the quints and the bits from the 3 input values
498
		int quints = 0, bits[3];
499
		const uint32_t bit_mask = (1 << n) - 1;
500
		for (int i = 0; i < 3; i++)
501
		{
502
			static const int s_muls[3] = { 1, 5, 25 };
503

504
			const int t = pValues[i] >> n;
505

506
			quints += t * s_muls[i];
507
			bits[i] = pValues[i] & bit_mask;
508
		}
509

510
		// Encode the quints, by inverting the bit manipulations done by the decoder, converting 3 quints into 7-bits.
511
		// See https://www.khronos.org/registry/DataFormat/specs/1.2/dataformat.1.2.html#astc-integer-sequence-encoding
512

513
		assert(quints < 125);
514
		const int T = g_astc_quint_encode[quints];
515

516
		// Now interleave the 7 encoded quint bits with the bits to form the encoded output. See table 95-96.
517
		astc_set_bits(pOutput, bit_pos, bits[0] | (astc_extract_bits(T, 0, 2) << n) | (bits[1] << (3 + n)) | (astc_extract_bits(T, 3, 4) << (3 + n * 2)) |
518
			(bits[2] << (5 + n * 2)) | (astc_extract_bits(T, 5, 6) << (5 + n * 3)), 7 + n * 3);
519

520
		if (pStats)
521
			*pStats += n * 3 + 7;
522
	}
523

524
	// Maps a packed group of 5 trits (t0 + 3*t1 + 9*t2 + 27*t3 + 81*t4, 0-242) to its 8-bit encoding. Used by astc_encode_trits().
	static const uint8_t g_astc_trit_encode[243] = { 0, 1, 2, 4, 5, 6, 8, 9, 10, 16, 17, 18, 20, 21, 22, 24, 25, 26, 3, 7, 11, 19, 23, 27, 12, 13, 14, 32, 33, 34, 36, 37, 38, 40, 41, 42, 48, 49, 50, 52, 53, 54, 56, 57, 58, 35, 39,
		43, 51, 55, 59, 44, 45, 46, 64, 65, 66, 68, 69, 70, 72, 73, 74, 80, 81, 82, 84, 85, 86, 88, 89, 90, 67, 71, 75, 83, 87, 91, 76, 77, 78, 128, 129, 130, 132, 133, 134, 136, 137, 138, 144, 145, 146, 148, 149, 150, 152, 153, 154,
		131, 135, 139, 147, 151, 155, 140, 141, 142, 160, 161, 162, 164, 165, 166, 168, 169, 170, 176, 177, 178, 180, 181, 182, 184, 185, 186, 163, 167, 171, 179, 183, 187, 172, 173, 174, 192, 193, 194, 196, 197, 198, 200, 201, 202,
		208, 209, 210, 212, 213, 214, 216, 217, 218, 195, 199, 203, 211, 215, 219, 204, 205, 206, 96, 97, 98, 100, 101, 102, 104, 105, 106, 112, 113, 114, 116, 117, 118, 120, 121, 122, 99, 103, 107, 115, 119, 123, 108, 109, 110, 224,
		225, 226, 228, 229, 230, 232, 233, 234, 240, 241, 242, 244, 245, 246, 248, 249, 250, 227, 231, 235, 243, 247, 251, 236, 237, 238, 28, 29, 30, 60, 61, 62, 92, 93, 94, 156, 157, 158, 188, 189, 190, 220, 221, 222, 31, 63, 95, 159,
		191, 223, 124, 125, 126 };

	// Encodes 5 values to output, usable for any range that uses trits and bits
532
	static void astc_encode_trits(uint32_t* pOutput, const uint8_t* pValues, uint32_t& bit_pos, int n, uint32_t *pStats)
533
	{
534
		// First extract the trits and the bits from the 5 input values
535
		int trits = 0, bits[5];
536
		const uint32_t bit_mask = (1 << n) - 1;
537
		for (int i = 0; i < 5; i++)
538
		{
539
			static const int s_muls[5] = { 1, 3, 9, 27, 81 };
540

541
			const int t = pValues[i] >> n;
542

543
			trits += t * s_muls[i];
544
			bits[i] = pValues[i] & bit_mask;
545
		}
546

547
		// Encode the trits, by inverting the bit manipulations done by the decoder, converting 5 trits into 8-bits.
548
		// See https://www.khronos.org/registry/DataFormat/specs/1.2/dataformat.1.2.html#astc-integer-sequence-encoding
549

550
		assert(trits < 243);
551
		const int T = g_astc_trit_encode[trits];
552

553
		// Now interleave the 8 encoded trit bits with the bits to form the encoded output. See table 94.
554
		astc_set_bits(pOutput, bit_pos, bits[0] | (astc_extract_bits(T, 0, 1) << n) | (bits[1] << (2 + n)), n * 2 + 2);
555
		
556
		astc_set_bits(pOutput, bit_pos, astc_extract_bits(T, 2, 3) | (bits[2] << 2) | (astc_extract_bits(T, 4, 4) << (2 + n)) | (bits[3] << (3 + n)) | (astc_extract_bits(T, 5, 6) << (3 + n * 2)) |
557
			(bits[4] << (5 + n * 2)) | (astc_extract_bits(T, 7, 7) << (5 + n * 3)), n * 3 + 6);
558
		
559
		if (pStats)
560
			*pStats += n * 5 + 8;
561
	}
562

563
	// Packs values using ASTC's BISE to output buffer.
564
	void encode_bise(uint32_t* pDst, const uint8_t* pSrc_vals, uint32_t bit_pos, int num_vals, int range, uint32_t *pStats)
565
	{
566
		uint32_t temp[5] = { 0 };
567

568
		const int num_bits = g_ise_range_table[range][0];
569

570
		int group_size = 0;
571
		if (g_ise_range_table[range][1])
572
			group_size = 5;
573
		else if (g_ise_range_table[range][2])
574
			group_size = 3;
575

576
#ifndef NDEBUG
577
		const uint32_t num_levels = get_ise_levels(range);
578
		for (int i = 0; i < num_vals; i++)
579
		{
580
			assert(pSrc_vals[i] < num_levels);
581
		}
582
#endif
583

584
		if (group_size)
585
		{
586
			// Range has trits or quints - pack each group of 5 or 3 values 
587
			const int total_groups = (group_size == 5) ? ((num_vals + 4) / 5) : ((num_vals + 2) / 3);
588

589
			for (int group_index = 0; group_index < total_groups; group_index++)
590
			{
591
				uint8_t vals[5] = { 0 };
592

593
				const int limit = my_min(group_size, num_vals - group_index * group_size);
594
				for (int i = 0; i < limit; i++)
595
					vals[i] = pSrc_vals[group_index * group_size + i];
596

597
				// Note this always writes a group of 3 or 5 bits values, even for incomplete groups. So it can write more than needed. 
598
				// get_ise_sequence_bits() returns the # of bits that must be written for proper decoding.
599
				if (group_size == 5)
600
					astc_encode_trits(temp, vals, bit_pos, num_bits, pStats);
601
				else
602
					astc_encode_quints(temp, vals, bit_pos, num_bits, pStats);
603
			}
604
		}
605
		else
606
		{
607
			for (int i = 0; i < num_vals; i++)
608
				astc_set_bits_1_to_9(temp, bit_pos, pSrc_vals[i], num_bits);
609

610
			if (pStats)
611
				*pStats += num_vals * num_bits;
612
		}
613

614
		pDst[0] |= temp[0]; pDst[1] |= temp[1];
615
		pDst[2] |= temp[2]; pDst[3] |= temp[3];
616
	}
617

618
	// Reverses the order of all 32 bits (bit 0 <-> bit 31, bit 1 <-> bit 30, etc.)
	// by swapping successively smaller groups: halves, bytes, nibbles, bit pairs, then single bits.
	inline uint32_t rev_dword(uint32_t bits)
	{
		uint32_t v = (bits << 16) | (bits >> 16);
		v = ((v & 0x00ff00ff) << 8) | ((v & 0xff00ff00) >> 8);
		v = ((v & 0x0f0f0f0f) << 4) | ((v & 0xf0f0f0f0) >> 4);
		v = ((v & 0x33333333) << 2) | ((v & 0xcccccccc) >> 2);
		v = ((v & 0x55555555) << 1) | ((v & 0xaaaaaaaa) >> 1);
		return v;
	}

	// True if value fits in an unsigned field of num_bits bits, i.e. 0 <= value < 2^num_bits.
	static inline bool is_packable(int value, int num_bits) { assert((num_bits >= 1) && (num_bits < 31)); return (value >= 0) && (value < (1 << num_bits)); }

	static bool get_config_bits(const log_astc_block &log_block, uint32_t &config_bits)
629
	{
630
		config_bits = 0;
631

632
		const int W = log_block.m_grid_width, H = log_block.m_grid_height;
633

634
		const uint32_t P = log_block.m_weight_ise_range >= 6; // high precision
635
		const uint32_t Dp_P = (log_block.m_dual_plane << 1) | P; // pack dual plane+high precision bits
636
		
637
		// See Tables 81-82
638
		// Compute p from weight range
639
		uint32_t p = 2 + log_block.m_weight_ise_range - (P ? 6 : 0);
640
		
641
		// Rearrange p's bits to p0 p2 p1
642
		p = (p >> 1) + ((p & 1) << 2);
643
		
644
		// Try encoding each row of table 82.
645

646
		// W+4 H+2
647
		if (is_packable(W - 4, 2) && is_packable(H - 2, 2))
648
		{
649
			config_bits = (Dp_P << 9) | ((W - 4) << 7) | ((H - 2) << 5) | ((p & 4) << 2) | (p & 3);
650
			return true;
651
		}
652

653
		// W+8 H+2
654
		if (is_packable(W - 8, 2) && is_packable(H - 2, 2))
655
		{
656
			config_bits = (Dp_P << 9) | ((W - 8) << 7) | ((H - 2) << 5) | ((p & 4) << 2) | 4 | (p & 3);
657
			return true;
658
		}
659

660
		// W+2 H+8
661
		if (is_packable(W - 2, 2) && is_packable(H - 8, 2))
662
		{
663
			config_bits = (Dp_P << 9) | ((H - 8) << 7) | ((W - 2) << 5) | ((p & 4) << 2) | 8 | (p & 3);
664
			return true;
665
		}
666

667
		// W+2 H+6
668
		if (is_packable(W - 2, 2) && is_packable(H - 6, 1))
669
		{
670
			config_bits = (Dp_P << 9) | ((H - 6) << 7) | ((W - 2) << 5) | ((p & 4) << 2) | 12 | (p & 3);
671
			return true;
672
		}
673

674
		// W+2 H+2
675
		if (is_packable(W - 2, 1) && is_packable(H - 2, 2))
676
		{
677
			config_bits = (Dp_P << 9) | ((W) << 7) | ((H - 2) << 5) | ((p & 4) << 2) | 12 | (p & 3);
678
			return true;
679
		}
680
				
681
		// 12 H+2
682
		if ((W == 12) && is_packable(H - 2, 2))
683
		{
684
			config_bits = (Dp_P << 9) | ((H - 2) << 5) | (p << 2);
685
			return true;
686
		}
687

688
		// W+2 12
689
		if ((H == 12) && is_packable(W - 2, 2))
690
		{
691
			config_bits = (Dp_P << 9) | (1 << 7) | ((W - 2) << 5) | (p << 2);
692
			return true;
693
		}
694

695
		// 6 10
696
		if ((W == 6) && (H == 10))
697
		{
698
			config_bits = (Dp_P << 9) | (3 << 7) | (p << 2);
699
			return true;
700
		}
701

702
		// 10 6
703
		if ((W == 10) && (H == 6))
704
		{
705
			config_bits = (Dp_P << 9) | (0b1101 << 5) | (p << 2);
706
			return true;
707
		}
708
				
709
		// W+6 H+6 (no dual plane or high prec)
710
		if ((!Dp_P) && is_packable(W - 6, 2) && is_packable(H - 6, 2))
711
		{
712
			config_bits = ((H - 6) << 9) | 256 | ((W - 6) << 5) | (p << 2);
713
			return true;
714
		}
715

716
		// Failed: unsupported weight grid dimensions or config.
717
		return false;
718
	}
719

720
	bool pack_astc_block(astc_block& phys_block, const log_astc_block& log_block, int* pExpected_endpoint_range, pack_stats *pStats)
721
	{
722
		memset(&phys_block, 0, sizeof(phys_block));
723

724
		if (pExpected_endpoint_range)
725
			*pExpected_endpoint_range = -1;
726

727
		assert(!log_block.m_error_flag);
728
		if (log_block.m_error_flag)
729
			return false;
730
				
731
		if (log_block.m_solid_color_flag_ldr)
732
		{
733
			pack_void_extent_ldr(phys_block, log_block.m_solid_color[0], log_block.m_solid_color[1], log_block.m_solid_color[2], log_block.m_solid_color[3], pStats);
734
			return true;
735
		}
736
		else if (log_block.m_solid_color_flag_hdr)
737
		{
738
			pack_void_extent_hdr(phys_block, log_block.m_solid_color[0], log_block.m_solid_color[1], log_block.m_solid_color[2], log_block.m_solid_color[3], pStats);
739
			return true;
740
		}
741
				
742
		if ((log_block.m_num_partitions < 1) || (log_block.m_num_partitions > MAX_PARTITIONS))
743
			return false;
744

745
		// Max usable weight range is 11
746
		if (log_block.m_weight_ise_range > LAST_VALID_WEIGHT_ISE_RANGE)
747
			return false;
748

749
		// See 23.24 Illegal Encodings, [0,5] is the minimum ISE encoding for endpoints
750
		if ((log_block.m_endpoint_ise_range < FIRST_VALID_ENDPOINT_ISE_RANGE) || (log_block.m_endpoint_ise_range > LAST_VALID_ENDPOINT_ISE_RANGE))
751
			return false;
752

753
		if (log_block.m_color_component_selector > 3)
754
			return false;
755

756
		// TODO: sanity check grid width/height vs. block's physical width/height
757
				
758
		uint32_t config_bits = 0;
759
		if (!get_config_bits(log_block, config_bits))
760
			return false;
761

762
		uint32_t bit_pos = 0;
763
		astc_set_bits(&phys_block.m_vals[0], bit_pos, config_bits, 11);
764
		if (pStats)
765
			pStats->m_header_bits += 11;
766

767
		const uint32_t total_grid_weights = (log_block.m_dual_plane ? 2 : 1) * (log_block.m_grid_width * log_block.m_grid_height);
768
		const uint32_t total_weight_bits = get_ise_sequence_bits(total_grid_weights, log_block.m_weight_ise_range);
769

770
		// 18.24 Illegal Encodings
771
		if ((!total_grid_weights) || (total_grid_weights > MAX_GRID_WEIGHTS) || (total_weight_bits < 24) || (total_weight_bits > 96))
772
			return false;
773

774
		uint32_t total_extra_bits = 0;
775

776
		astc_set_bits(&phys_block.m_vals[0], bit_pos, log_block.m_num_partitions - 1, 2);
777
		if (pStats)
778
			pStats->m_header_bits += 2;
779

780
		if (log_block.m_num_partitions > 1)
781
		{
782
			if (log_block.m_partition_id >= NUM_PARTITION_PATTERNS)
783
				return false;
784

785
			astc_set_bits(&phys_block.m_vals[0], bit_pos, log_block.m_partition_id, 10);
786
			if (pStats)
787
				pStats->m_header_bits += 10;
788

789
			uint32_t highest_cem = 0, lowest_cem = UINT32_MAX;
790
			for (uint32_t j = 0; j < log_block.m_num_partitions; j++)
791
			{
792
				highest_cem = my_max<uint32_t>(highest_cem, log_block.m_color_endpoint_modes[j]);
793
				lowest_cem = my_min<uint32_t>(lowest_cem, log_block.m_color_endpoint_modes[j]);
794
			}
795

796
			if (highest_cem > 15)
797
				return false;
798
			
799
			// Ensure CEM range is contiguous
800
			if (((highest_cem >> 2) > (1 + (lowest_cem >> 2))))
801
				return false;
802

803
			// See tables 79/80
804
			uint32_t encoded_cem = log_block.m_color_endpoint_modes[0] << 2;
805
			if (lowest_cem != highest_cem)
806
			{
807
				encoded_cem = my_min<uint32_t>(3, 1 + (lowest_cem >> 2));
808

809
				// See tables at 23.11 Color Endpoint Mode
810
				for (uint32_t j = 0; j < log_block.m_num_partitions; j++)
811
				{
812
					const int M = log_block.m_color_endpoint_modes[j] & 3;
813
					
814
					const int C = (log_block.m_color_endpoint_modes[j] >> 2) - ((encoded_cem & 3) - 1);
815
					if ((C & 1) != C)
816
						return false;
817

818
					encoded_cem |= (C << (2 + j)) | (M << (2 + log_block.m_num_partitions + 2 * j));
819
				}
820

821
				total_extra_bits = 3 * log_block.m_num_partitions - 4;
822

823
				if ((total_weight_bits + total_extra_bits) > 128)
824
					return false;
825

826
				uint32_t cem_bit_pos = 128 - total_weight_bits - total_extra_bits;
827
				astc_set_bits(&phys_block.m_vals[0], cem_bit_pos, encoded_cem >> 6, total_extra_bits);
828
				if (pStats)
829
					pStats->m_header_bits += total_extra_bits;
830
			}
831

832
			astc_set_bits(&phys_block.m_vals[0], bit_pos, encoded_cem & 0x3f, 6);
833
			if (pStats)
834
				pStats->m_header_bits += 6;
835
		}
836
		else
837
		{
838
			if (log_block.m_partition_id)
839
				return false;
840
			if (log_block.m_color_endpoint_modes[0] > 15)
841
				return false;
842

843
			astc_set_bits(&phys_block.m_vals[0], bit_pos, log_block.m_color_endpoint_modes[0], 4);
844
			if (pStats)
845
				pStats->m_header_bits += 4;
846
		}
847

848
		if (log_block.m_dual_plane)
849
		{
850
			if (log_block.m_num_partitions > 3)
851
				return false;
852

853
			total_extra_bits += 2;
854
			
855
			uint32_t ccs_bit_pos = 128 - (int)total_weight_bits - (int)total_extra_bits;
856
			astc_set_bits(&phys_block.m_vals[0], ccs_bit_pos, log_block.m_color_component_selector, 2);
857
			if (pStats)
858
				pStats->m_header_bits += 2;
859
		}
860

861
		const uint32_t total_config_bits = bit_pos + total_extra_bits;
862
		const int num_remaining_bits = 128 - (int)total_config_bits - (int)total_weight_bits;
863
		if (num_remaining_bits < 0)
864
			return false;
865

866
		uint32_t total_cem_vals = 0;
867
		for (uint32_t j = 0; j < log_block.m_num_partitions; j++)
868
			total_cem_vals += 2 + 2 * (log_block.m_color_endpoint_modes[j] >> 2);
869

870
		if (total_cem_vals > MAX_ENDPOINTS)
871
			return false;
872

873
		int endpoint_ise_range = -1;
874
		for (int k = 20; k > 0; k--)
875
		{
876
			int bits = get_ise_sequence_bits(total_cem_vals, k);
877
			if (bits <= num_remaining_bits)
878
			{
879
				endpoint_ise_range = k;
880
				break;
881
			}
882
		}
883

884
		// See 23.24 Illegal Encodings, [0,5] is the minimum ISE encoding for endpoints
885
		if (endpoint_ise_range < (int)FIRST_VALID_ENDPOINT_ISE_RANGE)
886
			return false;
887

888
		// Ensure the caller utilized the right endpoint ISE range.
889
		if ((int)log_block.m_endpoint_ise_range != endpoint_ise_range)
890
		{
891
			if (pExpected_endpoint_range)
892
				*pExpected_endpoint_range = endpoint_ise_range;
893
			return false;
894
		}
895

896
		if (pStats)
897
		{
898
			pStats->m_endpoint_bits += get_ise_sequence_bits(total_cem_vals, endpoint_ise_range);
899
			pStats->m_weight_bits += get_ise_sequence_bits(total_grid_weights, log_block.m_weight_ise_range);
900
		}
901

902
		// Pack endpoints forwards
903
		encode_bise(&phys_block.m_vals[0], log_block.m_endpoints, bit_pos, total_cem_vals, endpoint_ise_range);
904
		
905
		// Pack weights backwards
906
		uint32_t weight_data[4] = { 0 };
907
		encode_bise(weight_data, log_block.m_weights, 0, total_grid_weights, log_block.m_weight_ise_range);
908

909
		for (uint32_t i = 0; i < 4; i++)
910
			phys_block.m_vals[i] |= rev_dword(weight_data[3 - i]);
911

912
		return true;
913
	}
914

915
	// Expands a num_src_bits-wide value to num_dst_bits bits by repeating its bit pattern, starting
	// at the most significant end of the destination. The last copy is truncated (its low bits are
	// dropped) when num_dst_bits is not a multiple of num_src_bits.
	// Example: the 3-bit value 0b101 scaled to 8 bits gives 0b10110110.
	// Requires 0 < num_src_bits <= num_dst_bits, and src must fit in num_src_bits bits.
	static inline uint32_t bit_replication_scale(uint32_t src, int num_src_bits, int num_dst_bits)
	{
		assert(num_src_bits > 0); // a zero step would never terminate the loop below
		assert(num_src_bits <= num_dst_bits);
		assert((src & ((1u << num_src_bits) - 1u)) == src);

		uint32_t dst = 0;

		// shift is the bit position of the current copy's LSB; a negative shift means the copy only
		// partially fits, so its upper bits are placed at the bottom of dst.
		for (int shift = num_dst_bits - num_src_bits; shift > -num_src_bits; shift -= num_src_bits)
			dst |= (shift >= 0) ? (src << shift) : (src >> -shift);

		return dst;
	}
926

927
	uint32_t dequant_bise_endpoint(uint32_t val, uint32_t ise_range)
928
	{
929
		assert((ise_range >= FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_range <= LAST_VALID_ENDPOINT_ISE_RANGE));
930
		assert(val < get_ise_levels(ise_range));
931

932
		uint32_t u = 0;
933

934
		switch (ise_range)
935
		{
936
		case 5:
937
		{
938
			u = bit_replication_scale(val, 3, 8);
939
			break;
940
		}
941
		case 8:
942
		{
943
			u = bit_replication_scale(val, 4, 8);
944
			break;
945
		}
946
		case 11:
947
		{
948
			u = bit_replication_scale(val, 5, 8);
949
			break;
950
		}
951
		case 14:
952
		{
953
			u = bit_replication_scale(val, 6, 8);
954
			break;
955
		}
956
		case 17:
957
		{
958
			u = bit_replication_scale(val, 7, 8);
959
			break;
960
		}
961
		case 20:
962
		{
963
			u = val;
964
			break;
965
		}
966
		case 4:
967
		case 6:
968
		case 7:
969
		case 9:
970
		case 10:
971
		case 12:
972
		case 13:
973
		case 15:
974
		case 16:
975
		case 18:
976
		case 19:
977
		{
978
			const uint32_t num_bits = g_ise_range_table[ise_range][0];
979
			const uint32_t num_trits = g_ise_range_table[ise_range][1]; BASISU_NOTE_UNUSED(num_trits);
980
			const uint32_t num_quints = g_ise_range_table[ise_range][2]; BASISU_NOTE_UNUSED(num_quints);
981

982
			// compute Table 103 row index
983
			const int range_index = (num_bits * 2 + (num_quints ? 1 : 0)) - 2;
984

985
			assert(range_index >= 0 && range_index <= 10);
986

987
			uint32_t bits = val & ((1 << num_bits) - 1);
988
			uint32_t tval = val >> num_bits;
989

990
			assert(tval < (num_trits ? 3U : 5U));
991

992
			uint32_t a = bits & 1;
993
			uint32_t b = (bits >> 1) & 1;
994
			uint32_t c = (bits >> 2) & 1;
995
			uint32_t d = (bits >> 3) & 1;
996
			uint32_t e = (bits >> 4) & 1;
997
			uint32_t f = (bits >> 5) & 1;
998

999
			uint32_t A = a ? 511 : 0;
1000
			uint32_t B = 0;
1001

1002
			switch (range_index)
1003
			{
1004
			case 2:
1005
			{
1006
				// 876543210
1007
				// b000b0bb0
1008
				B = (b << 1) | (b << 2) | (b << 4) | (b << 8);
1009
				break;
1010
			}
1011
			case 3:
1012
			{
1013
				// 876543210
1014
				// b0000bb00
1015
				B = (b << 2) | (b << 3) | (b << 8);
1016
				break;
1017
			}
1018
			case 4:
1019
			{
1020
				// 876543210
1021
				// cb000cbcb
1022
				B = b | (c << 1) | (b << 2) | (c << 3) | (b << 7) | (c << 8);
1023
				break;
1024
			}
1025
			case 5:
1026
			{
1027
				// 876543210
1028
				// cb0000cbc
1029
				B = c | (b << 1) | (c << 2) | (b << 7) | (c << 8);
1030
				break;
1031
			}
1032
			case 6:
1033
			{
1034
				// 876543210
1035
				// dcb000dcb
1036
				B = b | (c << 1) | (d << 2) | (b << 6) | (c << 7) | (d << 8);
1037
				break;
1038
			}
1039
			case 7:
1040
			{
1041
				// 876543210
1042
				// dcb0000dc
1043
				B = c | (d << 1) | (b << 6) | (c << 7) | (d << 8);
1044
				break;
1045
			}
1046
			case 8:
1047
			{
1048
				// 876543210
1049
				// edcb000ed
1050
				B = d | (e << 1) | (b << 5) | (c << 6) | (d << 7) | (e << 8);
1051
				break;
1052
			}
1053
			case 9:
1054
			{
1055
				// 876543210
1056
				// edcb0000e
1057
				B = e | (b << 5) | (c << 6) | (d << 7) | (e << 8);
1058
				break;
1059
			}
1060
			case 10:
1061
			{
1062
				// 876543210
1063
				// fedcb000f
1064
				B = f | (b << 4) | (c << 5) | (d << 6) | (e << 7) | (f << 8);
1065
				break;
1066
			}
1067
			default:
1068
				break;
1069
			}
1070

1071
			static uint8_t C_vals[11] = { 204, 113, 93, 54, 44, 26, 22, 13, 11, 6, 5 };
1072
			uint32_t C = C_vals[range_index];
1073
			uint32_t D = tval;
1074

1075
			u = D * C + B;
1076
			u = u ^ A;
1077
			u = (A & 0x80) | (u >> 2);
1078

1079
			break;
1080
		}
1081
		default:
1082
		{
1083
			assert(0);
1084
			break;
1085
		}
1086
		}
1087

1088
		return u;
1089
	}
1090

1091
	uint32_t dequant_bise_weight(uint32_t val, uint32_t ise_range)
1092
	{
1093
		assert(val < get_ise_levels(ise_range));
1094

1095
		uint32_t u = 0;
1096
		switch (ise_range)
1097
		{
1098
		case 0: 
1099
		{
1100
			u = val ? 63 : 0;
1101
			break;
1102
		}
1103
		case 1: // 0-2 
1104
		{
1105
			const uint8_t s_tab_0_2[3] = { 0, 32, 63 };
1106
			u = s_tab_0_2[val];
1107
			break;
1108
		}
1109
		case 2: // 0-3
1110
		{
1111
			u = bit_replication_scale(val, 2, 6);
1112
			break;
1113
		}
1114
		case 3: // 0-4
1115
		{
1116
			const uint8_t s_tab_0_4[5] = { 0, 16, 32, 47, 63 };
1117
			u = s_tab_0_4[val];
1118
			break;
1119
		}
1120
		case 5: // 0-7
1121
		{
1122
			u = bit_replication_scale(val, 3, 6);
1123
			break;
1124
		}
1125
		case 8: // 0-15
1126
		{
1127
			u = bit_replication_scale(val, 4, 6);
1128
			break;
1129
		}
1130
		case 11: // 0-31
1131
		{
1132
			u = bit_replication_scale(val, 5, 6);
1133
			break;
1134
		}
1135
		case 4: // 0-5
1136
		case 6: // 0-9
1137
		case 7: // 0-11
1138
		case 9: // 0-19
1139
		case 10: // 0-23
1140
		{
1141
			const uint32_t num_bits = g_ise_range_table[ise_range][0];
1142
			const uint32_t num_trits = g_ise_range_table[ise_range][1]; BASISU_NOTE_UNUSED(num_trits);
1143
			const uint32_t num_quints = g_ise_range_table[ise_range][2]; BASISU_NOTE_UNUSED(num_quints);
1144
			
1145
			// compute Table 103 row index
1146
			const int range_index = num_bits * 2 + (num_quints ? 1 : 0);
1147

1148
			// Extract bits and tris/quints from value
1149
			const uint32_t bits = val & ((1u << num_bits) - 1);
1150
			const uint32_t D = val >> num_bits;
1151

1152
			assert(D < (num_trits ? 3U : 5U));
1153

1154
			// Now dequantize
1155
			// See Table 103. ASTC weight unquantization parameters
1156
			static const uint32_t C_table[5] = { 50, 28, 23, 13, 11 };
1157
					
1158
			const uint32_t a = bits & 1, b = (bits >> 1) & 1, c = (bits >> 2) & 1;
1159

1160
			const uint32_t A = (a == 0) ? 0 : 0x7F;
1161
						
1162
			uint32_t B = 0;
1163
			if (range_index == 4)
1164
				B = ((b << 6) | (b << 2) | (b << 0));
1165
			else if (range_index == 5)
1166
				B = ((b << 6) | (b << 1));
1167
			else if (range_index == 6)
1168
				B = ((c << 6) | (b << 5) | (c << 1) | (b << 0));
1169

1170
			const uint32_t C = C_table[range_index - 2];
1171

1172
			u = D * C + B;
1173
			u = u ^ A;
1174
			u = (A & 0x20) | (u >> 2);
1175
			break;
1176
		}
1177
		default:
1178
			assert(0);
1179
			break;
1180
		}
1181

1182
		if (u > 32)
1183
			u++;
1184

1185
		return u;
1186
	}
1187

1188
	// Returns the nearest ISE symbol given a [0,255] endpoint value.
1189
	uint32_t find_nearest_bise_endpoint(int v, uint32_t ise_range)
1190
	{
1191
		assert(ise_range >= FIRST_VALID_ENDPOINT_ISE_RANGE && ise_range <= LAST_VALID_ENDPOINT_ISE_RANGE);
1192

1193
		const uint32_t total_levels = get_ise_levels(ise_range);
1194
		int best_e = INT_MAX, best_index = 0;
1195
		for (uint32_t i = 0; i < total_levels; i++)
1196
		{
1197
			const int qv = dequant_bise_endpoint(i, ise_range);
1198
			int e = labs(v - qv);
1199
			if (e < best_e)
1200
			{
1201
				best_e = e;
1202
				best_index = i;
1203
				if (!best_e)
1204
					break;
1205
			}
1206
		}
1207
		return best_index;
1208
	}
1209

1210
	// Returns the nearest ISE weight given a [0,64] endpoint value.
1211
	uint32_t find_nearest_bise_weight(int v, uint32_t ise_range)
1212
	{
1213
		assert(ise_range >= FIRST_VALID_WEIGHT_ISE_RANGE && ise_range <= LAST_VALID_WEIGHT_ISE_RANGE);
1214
		assert(v <= (int)MAX_WEIGHT_VALUE);
1215

1216
		const uint32_t total_levels = get_ise_levels(ise_range);
1217
		int best_e = INT_MAX, best_index = 0;
1218
		for (uint32_t i = 0; i < total_levels; i++)
1219
		{
1220
			const int qv = dequant_bise_weight(i, ise_range);
1221
			int e = labs(v - qv);
1222
			if (e < best_e)
1223
			{
1224
				best_e = e;
1225
				best_index = i;
1226
				if (!best_e)
1227
					break;
1228
			}
1229
		}
1230
		return best_index;
1231
	}
1232

1233
	void create_quant_tables(
1234
		uint8_t* pVal_to_ise,	// [0-255] or [0-64] value to nearest ISE symbol, array size is [256] or [65]
1235
		uint8_t* pISE_to_val,	// ASTC encoded ISE symbol to [0,255] or [0,64] value, [levels]
1236
		uint8_t* pISE_to_rank,	// returns the level rank index given an ISE symbol, [levels]
1237
		uint8_t* pRank_to_ISE,  // returns the ISE symbol given a level rank, inverse of pISE_to_rank, [levels]
1238
		uint32_t ise_range,		// ise range, [4,20] for endpoints, [0,11] for weights
1239
		bool weight_flag)		// false if block endpoints, true if weights
1240
	{
1241
		const uint32_t num_dequant_vals = weight_flag ? (MAX_WEIGHT_VALUE + 1) : 256;
1242

1243
		for (uint32_t i = 0; i < num_dequant_vals; i++)
1244
		{
1245
			uint32_t bise_index = weight_flag ? astc_helpers::find_nearest_bise_weight(i, ise_range) : astc_helpers::find_nearest_bise_endpoint(i, ise_range);
1246

1247
			if (pVal_to_ise)
1248
				pVal_to_ise[i] = (uint8_t)bise_index;
1249

1250
			if (pISE_to_val)
1251
				pISE_to_val[bise_index] = weight_flag ? (uint8_t)astc_helpers::dequant_bise_weight(bise_index, ise_range) : (uint8_t)astc_helpers::dequant_bise_endpoint(bise_index, ise_range);
1252
		}
1253

1254
		if (pISE_to_rank || pRank_to_ISE)
1255
		{
1256
			const uint32_t num_levels = get_ise_levels(ise_range);
1257

1258
			if (!g_ise_range_table[ise_range][1] && !g_ise_range_table[ise_range][2])
1259
			{
1260
				// Only bits
1261
				for (uint32_t i = 0; i < num_levels; i++)
1262
				{
1263
					if (pISE_to_rank)
1264
						pISE_to_rank[i] = (uint8_t)i;
1265

1266
					if (pRank_to_ISE)
1267
						pRank_to_ISE[i] = (uint8_t)i;
1268
				}
1269
			}
1270
			else
1271
			{
1272
				// Range has trits or quints
1273
				uint32_t vals[256];
1274
				for (uint32_t i = 0; i < num_levels; i++)
1275
				{
1276
					uint32_t v = weight_flag ? astc_helpers::dequant_bise_weight(i, ise_range) : astc_helpers::dequant_bise_endpoint(i, ise_range);
1277
					
1278
					// Low=ISE value
1279
					// High=dequantized value
1280
					vals[i] = (v << 16) | i;
1281
				}
1282
				
1283
				// Sorts by dequantized value
1284
				std::sort(vals, vals + num_levels);
1285
				
1286
				for (uint32_t rank = 0; rank < num_levels; rank++)
1287
				{
1288
					uint32_t ise_val = (uint8_t)vals[rank];
1289

1290
					if (pISE_to_rank)
1291
						pISE_to_rank[ise_val] = (uint8_t)rank;
1292
					
1293
					if (pRank_to_ISE)
1294
						pRank_to_ISE[rank] = (uint8_t)ise_val;
1295
				}
1296
			}
1297
		}
1298
	}
1299

1300
	// Writes a 16-byte void-extent (solid color) block into blk.
	//   rh, gh, bh, ah: 16-bit channel values, stored little-endian in bytes 8..15 in R, G, B, A
	//                   order (presumably UNORM16 for this LDR variant - confirm against the
	//                   ASTC specification).
	//   pStats:         optional; when non-null its m_header_bits is increased by 128.
	// All bytes are first set to 0xFF, then bytes 0 and 1 receive the void-extent header. This
	// variant differs from pack_void_extent_hdr() only in byte 1, where bit 1 (block bit 9) is
	// cleared.
	void pack_void_extent_ldr(astc_block &blk, uint16_t rh, uint16_t gh, uint16_t bh, uint16_t ah, pack_stats* pStats)
	{
		uint8_t* pDst = (uint8_t*)&blk.m_vals[0];
		memset(pDst, 0xFF, 16);

		pDst[0] = 0b11111100;
		pDst[1] = 0b11111101; // block bit 9 cleared

		pDst[8] = (uint8_t)rh;
		pDst[9] = (uint8_t)(rh >> 8);
		pDst[10] = (uint8_t)gh;
		pDst[11] = (uint8_t)(gh >> 8);
		pDst[12] = (uint8_t)bh;
		pDst[13] = (uint8_t)(bh >> 8);
		pDst[14] = (uint8_t)ah;
		pDst[15] = (uint8_t)(ah >> 8);

		if (pStats)
			pStats->m_header_bits += 128;
	}
1320

1321
	// rh-ah are half-floats
1322
	void pack_void_extent_hdr(astc_block& blk, uint16_t rh, uint16_t gh, uint16_t bh, uint16_t ah, pack_stats *pStats) 
1323
	{
1324
		uint8_t* pDst = (uint8_t*)&blk.m_vals[0];
1325
		memset(pDst, 0xFF, 16);
1326

1327
		pDst[0] = 0b11111100;
1328
		
1329
		pDst[8] = (uint8_t)rh;
1330
		pDst[9] = (uint8_t)(rh >> 8);
1331
		pDst[10] = (uint8_t)gh;
1332
		pDst[11] = (uint8_t)(gh >> 8);
1333
		pDst[12] = (uint8_t)bh;
1334
		pDst[13] = (uint8_t)(bh >> 8);
1335
		pDst[14] = (uint8_t)ah;
1336
		pDst[15] = (uint8_t)(ah >> 8);
1337

1338
		if (pStats)
1339
			pStats->m_header_bits += 128;
1340
	}
1341
		
1342
	bool is_cem_ldr(uint32_t mode)
1343
	{
1344
		switch (mode)
1345
		{
1346
		case CEM_LDR_LUM_DIRECT:
1347
		case CEM_LDR_LUM_BASE_PLUS_OFS:
1348
		case CEM_LDR_LUM_ALPHA_DIRECT:
1349
		case CEM_LDR_LUM_ALPHA_BASE_PLUS_OFS:
1350
		case CEM_LDR_RGB_BASE_SCALE:
1351
		case CEM_LDR_RGB_DIRECT:
1352
		case CEM_LDR_RGB_BASE_PLUS_OFFSET:
1353
		case CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A:
1354
		case CEM_LDR_RGBA_DIRECT:
1355
		case CEM_LDR_RGBA_BASE_PLUS_OFFSET:
1356
			return true;
1357
		default:
1358
			break;
1359
		}
1360
	
1361
		return false;
1362
	}
1363

1364
	bool is_valid_block_size(uint32_t w, uint32_t h)
1365
	{
1366
		assert((w >= MIN_BLOCK_DIM) && (w <= MAX_BLOCK_DIM));
1367
		assert((h >= MIN_BLOCK_DIM) && (h <= MAX_BLOCK_DIM));
1368

1369
#define SIZECHK(x, y) if ((w == (x)) && (h == (y))) return true;
1370
		SIZECHK(4, 4);
1371
		SIZECHK(5, 4);
1372

1373
		SIZECHK(5, 5);
1374

1375
		SIZECHK(6, 5);
1376
		SIZECHK(6, 6);
1377

1378
		SIZECHK(8, 5);
1379
		SIZECHK(8, 6);
1380
		SIZECHK(10, 5);
1381
		SIZECHK(10, 6);
1382

1383
		SIZECHK(8, 8);
1384
		SIZECHK(10, 8);
1385
		SIZECHK(10, 10);
1386

1387
		SIZECHK(12, 10);
1388
		SIZECHK(12, 12);
1389
#undef SIZECHK
1390

1391
		return false;
1392
	}
1393

1394
	bool block_has_any_hdr_cems(const log_astc_block& log_blk)
1395
	{
1396
		assert((log_blk.m_num_partitions >= 1) && (log_blk.m_num_partitions <= MAX_PARTITIONS));
1397

1398
		for (uint32_t i = 0; i < log_blk.m_num_partitions; i++)
1399
			if (is_cem_hdr(log_blk.m_color_endpoint_modes[i]))
1400
				return true;
1401

1402
		return false;
1403
	}
1404

1405
	bool block_has_any_ldr_cems(const log_astc_block& log_blk)
1406
	{
1407
		assert((log_blk.m_num_partitions >= 1) && (log_blk.m_num_partitions <= MAX_PARTITIONS));
1408

1409
		for (uint32_t i = 0; i < log_blk.m_num_partitions; i++)
1410
			if (!is_cem_hdr(log_blk.m_color_endpoint_modes[i]))
1411
				return true;
1412

1413
		return false;
1414
	}
1415
		
1416
	dequant_tables g_dequant_tables;
1417

1418
	void precompute_texel_partitions_4x4();
1419
	void precompute_texel_partitions_6x6();
1420

1421
	void init_tables(bool init_rank_tabs)
1422
	{
1423
		g_dequant_tables.init(init_rank_tabs);
1424
		
1425
		precompute_texel_partitions_4x4();
1426
		precompute_texel_partitions_6x6();
1427
	}
1428
		
1429
	void compute_upsample_weights(
1430
		int block_width, int block_height,
1431
		int weight_grid_width, int weight_grid_height,
1432
		weighted_sample* pWeights) // there will be block_width * block_height bilinear samples
1433
	{
1434
		const uint32_t scaleX = (1024 + block_width / 2) / (block_width - 1);
1435
		const uint32_t scaleY = (1024 + block_height / 2) / (block_height - 1);
1436

1437
		for (int texelY = 0; texelY < block_height; texelY++)
1438
		{
1439
			for (int texelX = 0; texelX < block_width; texelX++)
1440
			{
1441
				const uint32_t gX = (scaleX * texelX * (weight_grid_width - 1) + 32) >> 6;
1442
				const uint32_t gY = (scaleY * texelY * (weight_grid_height - 1) + 32) >> 6;
1443
				const uint32_t jX = gX >> 4;
1444
				const uint32_t jY = gY >> 4;
1445
				const uint32_t fX = gX & 0xf;
1446
				const uint32_t fY = gY & 0xf;
1447
				const uint32_t w11 = (fX * fY + 8) >> 4;
1448
				const uint32_t w10 = fY - w11;
1449
				const uint32_t w01 = fX - w11;
1450
				const uint32_t w00 = 16 - fX - fY + w11;
1451

1452
				weighted_sample& s = pWeights[texelX + texelY * block_width];
1453
				s.m_src_x = (uint8_t)jX;
1454
				s.m_src_y = (uint8_t)jY;
1455
				s.m_weights[0][0] = (uint8_t)w00;
1456
				s.m_weights[0][1] = (uint8_t)w01;
1457
				s.m_weights[1][0] = (uint8_t)w10;
1458
				s.m_weights[1][1] = (uint8_t)w11;
1459
			}
1460
		}
1461
	}
1462

1463
	// Should be dequantized [0,64] weights
1464
	void upsample_weight_grid(
1465
		uint32_t bx, uint32_t by,		// destination/to dimension
1466
		uint32_t wx, uint32_t wy,		// source/from dimension
1467
		const uint8_t* pSrc_weights,	// these are dequantized [0,64] weights, NOT ISE symbols, [wy][wx]
1468
		uint8_t* pDst_weights)			// [by][bx]
1469
	{
1470
		assert((bx >= 2) && (by >= 2) && (bx <= 12) && (by <= 12));
1471
		assert((wx >= 2) && (wy >= 2) && (wx <= bx) && (wy <= by));
1472

1473
		const uint32_t total_src_weights = wx * wy;
1474
		const uint32_t total_dst_weights = bx * by;
1475

1476
		if (total_src_weights == total_dst_weights)
1477
		{
1478
			memcpy(pDst_weights, pSrc_weights, total_src_weights);
1479
			return;
1480
		}
1481

1482
		weighted_sample weights[12 * 12];
1483
		compute_upsample_weights(bx, by, wx, wy, weights);
1484

1485
		const weighted_sample* pS = weights;
1486

1487
		for (uint32_t y = 0; y < by; y++)
1488
		{
1489
			for (uint32_t x = 0; x < bx; x++, ++pS)
1490
			{
1491
				const uint32_t w00 = pS->m_weights[0][0];
1492
				const uint32_t w01 = pS->m_weights[0][1];
1493
				const uint32_t w10 = pS->m_weights[1][0];
1494
				const uint32_t w11 = pS->m_weights[1][1];
1495

1496
				assert(w00 || w01 || w10 || w11);
1497

1498
				const uint32_t sx = pS->m_src_x, sy = pS->m_src_y;
1499

1500
				uint32_t total = 8;
1501
				if (w00) total += pSrc_weights[bounds_check(sx + sy * wx, 0U, total_src_weights)] * w00;
1502
				if (w01) total += pSrc_weights[bounds_check(sx + 1 + sy * wx, 0U, total_src_weights)] * w01;
1503
				if (w10) total += pSrc_weights[bounds_check(sx + (sy + 1) * wx, 0U, total_src_weights)] * w10;
1504
				if (w11) total += pSrc_weights[bounds_check(sx + 1 + (sy + 1) * wx, 0U, total_src_weights)] * w11;
1505

1506
				pDst_weights[x + y * bx] = (uint8_t)(total >> 4);
1507
			}
1508
		}
1509
	}
1510

1511
	// 32-bit integer mixing function used to derive the partition seeds in
	// compute_texel_partition(). All arithmetic is unsigned and wraps modulo 2^32.
	// hash52(0) is 0.
	inline uint32_t hash52(uint32_t v)
	{
		uint32_t p = v;

		p ^= p >> 15;
		p -= p << 17;
		p += p << 7;
		p += p << 4;

		p ^= p >> 5;
		p += p << 16;
		p ^= p >> 7;
		p ^= p >> 3;

		p ^= p << 6;
		p ^= p >> 17;

		return p;
	}
1519

1520
	// small_block = num_blk_pixels < 31
1521
	int compute_texel_partition(uint32_t seedIn, uint32_t xIn, uint32_t yIn, uint32_t zIn, int num_partitions, bool small_block)
1522
	{
1523
		assert(zIn == 0);
1524

1525
		const uint32_t  x = small_block ? xIn << 1 : xIn;
1526
		const uint32_t  y = small_block ? yIn << 1 : yIn;
1527
		const uint32_t  z = small_block ? zIn << 1 : zIn;
1528
		const uint32_t  seed = seedIn + 1024 * (num_partitions - 1);
1529
		const uint32_t  rnum = hash52(seed);
1530

1531
		uint8_t         seed1 = (uint8_t)(rnum & 0xf);
1532
		uint8_t         seed2 = (uint8_t)((rnum >> 4) & 0xf);
1533
		uint8_t         seed3 = (uint8_t)((rnum >> 8) & 0xf);
1534
		uint8_t         seed4 = (uint8_t)((rnum >> 12) & 0xf);
1535
		uint8_t         seed5 = (uint8_t)((rnum >> 16) & 0xf);
1536
		uint8_t         seed6 = (uint8_t)((rnum >> 20) & 0xf);
1537
		uint8_t         seed7 = (uint8_t)((rnum >> 24) & 0xf);
1538
		uint8_t         seed8 = (uint8_t)((rnum >> 28) & 0xf);
1539
		uint8_t         seed9 = (uint8_t)((rnum >> 18) & 0xf);
1540
		uint8_t         seed10 = (uint8_t)((rnum >> 22) & 0xf);
1541
		uint8_t         seed11 = (uint8_t)((rnum >> 26) & 0xf);
1542
		uint8_t         seed12 = (uint8_t)(((rnum >> 30) | (rnum << 2)) & 0xf);
1543

1544
		seed1 = (uint8_t)(seed1 * seed1);
1545
		seed2 = (uint8_t)(seed2 * seed2);
1546
		seed3 = (uint8_t)(seed3 * seed3);
1547
		seed4 = (uint8_t)(seed4 * seed4);
1548
		seed5 = (uint8_t)(seed5 * seed5);
1549
		seed6 = (uint8_t)(seed6 * seed6);
1550
		seed7 = (uint8_t)(seed7 * seed7);
1551
		seed8 = (uint8_t)(seed8 * seed8);
1552
		seed9 = (uint8_t)(seed9 * seed9);
1553
		seed10 = (uint8_t)(seed10 * seed10);
1554
		seed11 = (uint8_t)(seed11 * seed11);
1555
		seed12 = (uint8_t)(seed12 * seed12);
1556

1557
		const int shA = (seed & 2) != 0 ? 4 : 5;
1558
		const int shB = (num_partitions == 3) ? 6 : 5;
1559
		const int sh1 = (seed & 1) != 0 ? shA : shB;
1560
		const int sh2 = (seed & 1) != 0 ? shB : shA;
1561
		const int sh3 = (seed & 0x10) != 0 ? sh1 : sh2;
1562

1563
		seed1 = (uint8_t)(seed1 >> sh1);
1564
		seed2 = (uint8_t)(seed2 >> sh2);
1565
		seed3 = (uint8_t)(seed3 >> sh1);
1566
		seed4 = (uint8_t)(seed4 >> sh2);
1567
		seed5 = (uint8_t)(seed5 >> sh1);
1568
		seed6 = (uint8_t)(seed6 >> sh2);
1569
		seed7 = (uint8_t)(seed7 >> sh1);
1570
		seed8 = (uint8_t)(seed8 >> sh2);
1571
		seed9 = (uint8_t)(seed9 >> sh3);
1572
		seed10 = (uint8_t)(seed10 >> sh3);
1573
		seed11 = (uint8_t)(seed11 >> sh3);
1574
		seed12 = (uint8_t)(seed12 >> sh3);
1575

1576
		const int a = 0x3f & (seed1 * x + seed2 * y + seed11 * z + (rnum >> 14));
1577
		const int b = 0x3f & (seed3 * x + seed4 * y + seed12 * z + (rnum >> 10));
1578
		const int c = (num_partitions >= 3) ? 0x3f & (seed5 * x + seed6 * y + seed9 * z + (rnum >> 6)) : 0;
1579
		const int d = (num_partitions >= 4) ? 0x3f & (seed7 * x + seed8 * y + seed10 * z + (rnum >> 2)) : 0;
1580

1581
		return (a >= b && a >= c && a >= d) ? 0
1582
			: (b >= c && b >= d) ? 1
1583
			: (c >= d) ? 2
1584
			: 3;
1585
	}
1586

1587
	// 4x4 blocks, indexed [seed][0] for 2 subsets and [seed][1] for 3 subsets.
	// Each entry packs all 16 texels at 2 bits per texel; texel (x, y) sits at bit x * 2 + y * 8.
	static uint32_t g_texel_partitions_4x4[1024][2];

	// 6x6 blocks, indexed [seed][x + y * 6].
	// Each byte holds both patterns: 2 subsets in the low 4 bits, 3 subsets in the high 4 bits.
	static uint8_t g_texel_partitions_6x6[1024][6 * 6];
1592

1593
	void precompute_texel_partitions_4x4()
1594
	{
1595
		for (uint32_t p = 0; p < 1024; p++)
1596
		{
1597
			uint32_t v2 = 0, v3 = 0;
1598

1599
			for (uint32_t y = 0; y < 4; y++)
1600
			{
1601
				for (uint32_t x = 0; x < 4; x++)
1602
				{
1603
					const uint32_t shift = x * 2 + y * 8;
1604
					v2 |= (compute_texel_partition(p, x, y, 0, 2, true) << shift);
1605
					v3 |= (compute_texel_partition(p, x, y, 0, 3, true) << shift);
1606
				}
1607
			}
1608

1609
			g_texel_partitions_4x4[p][0] = v2;
1610
			g_texel_partitions_4x4[p][1] = v3;
1611
		}
1612
	}
1613

1614
	void precompute_texel_partitions_6x6()
1615
	{
1616
		for (uint32_t p = 0; p < 1024; p++)
1617
		{
1618
			for (uint32_t y = 0; y < 6; y++)
1619
			{
1620
				for (uint32_t x = 0; x < 6; x++)
1621
				{
1622
					const uint32_t p2 = compute_texel_partition(p, x, y, 0, 2, false);
1623
					const uint32_t p3 = compute_texel_partition(p, x, y, 0, 3, false);
1624
					
1625
					assert((p2 <= 1) && (p3 <= 2));
1626
					g_texel_partitions_6x6[p][x + y * 6] = (uint8_t)((p3 << 4) | p2);
1627
				}
1628
			}
1629
		}
1630
	}
1631

1632
	static inline int get_precompute_texel_partitions_4x4(uint32_t seed, uint32_t x, uint32_t y, uint32_t num_partitions)
1633
	{
1634
		assert(g_texel_partitions_4x4[1][0]);
1635
		assert(seed < 1024);
1636
		assert((x <= 3) && (y <= 3));
1637
		assert((num_partitions >= 2) && (num_partitions <= 3));
1638
	
1639
		const uint32_t shift = x * 2 + y * 8;
1640
		return (g_texel_partitions_4x4[seed][num_partitions - 2] >> shift) & 3;
1641
	}
1642

1643
	static inline int get_precompute_texel_partitions_6x6(uint32_t seed, uint32_t x, uint32_t y, uint32_t num_partitions)
1644
	{
1645
		assert(g_texel_partitions_6x6[0][0]);
1646
		assert(seed < 1024);
1647
		assert((x <= 5) && (y <= 5));
1648
		assert((num_partitions >= 2) && (num_partitions <= 3));
1649

1650
		const uint32_t shift = (num_partitions == 3) ? 4 : 0;
1651
		return (g_texel_partitions_6x6[seed][x + y * 6] >> shift) & 3;
1652
	}
1653

1654
	// Blue contraction: red and green are each averaged with blue (sum shifted right by one),
	// while blue and alpha are passed through unchanged. Results are written to dr/dg/db/da.
	void blue_contract(
		int r, int g, int b, int a,
		int &dr, int &dg, int &db, int &da)
	{
		dr = (r + b) >> 1;
		dg = (g + b) >> 1;
		db = b;
		da = a;
	}
1663

1664
	// Moves bit 7 of a into bit 7 of b (after shifting b right by one), then turns the remaining
	// bits 6..1 of a into a signed 6-bit value in [-32, 31].
	// Both arguments are modified in place.
	inline void bit_transfer_signed(int& a, int& b)
	{
		b >>= 1;
		b |= (a & 0x80);

		a >>= 1;
		a &= 0x3F;

		// Bit 5 is the sign bit of the 6-bit field.
		if ((a & 0x20) != 0)
			a -= 0x40;
	}
1673

1674
	// Returns a limited to the range [l, h]. The lower bound is tested first, so when l > h any
	// a below l yields l.
	static inline int clamp(int a, int l, int h)
	{
		if (a < l)
			a = l;
		else if (a > h)
			a = h;
		return a;
	}
1682

1683
	// Returns a limited to the range [l, h]. The lower bound is tested first.
	// A NaN input is returned unchanged because both comparisons are false for NaN.
	static inline float clampf(float a, float l, float h)
	{
		if (a < l)
			a = l;
		else if (a > h)
			a = h;
		return a;
	}
1691

1692
	// Interprets the low num_src_bits bits of src as a two's complement number and returns it as
	// an int. Bits of src above the field are ignored.
	//   num_src_bits: width of the field, in [2, 31].
	inline int sign_extend(int src, int num_src_bits)
	{
		assert((num_src_bits >= 2) && (num_src_bits <= 31));

		// Work in unsigned arithmetic: with num_src_bits == 31 the signed expression
		// (1 << 31) - 1 would overflow.
		const uint32_t field_mask = (1u << num_src_bits) - 1u;
		const uint32_t sign_bit = 1u << (num_src_bits - 1);
		const uint32_t bits = (uint32_t)src;

		if (bits & sign_bit)
			return (int)(bits | ~field_mask); // negative: fill everything above the field with ones

		return (int)(bits & field_mask); // non-negative: clear everything above the field
	}
1702

1703
	// endpoints is [4][2]
1704
	void decode_endpoint(uint32_t cem_index, int (*pEndpoints)[2], const uint8_t *pE)
1705
	{
1706
		assert(cem_index <= CEM_HDR_RGB_HDR_ALPHA);
1707

1708
		int v0 = pE[0], v1 = pE[1];
1709

1710
		int& e0_r = pEndpoints[0][0], &e0_g = pEndpoints[1][0], &e0_b = pEndpoints[2][0], &e0_a = pEndpoints[3][0];
1711
		int& e1_r = pEndpoints[0][1], &e1_g = pEndpoints[1][1], &e1_b = pEndpoints[2][1], &e1_a = pEndpoints[3][1];
1712

1713
		switch (cem_index)
1714
		{
1715
		case CEM_LDR_LUM_DIRECT:
1716
		{
1717
			e0_r = v0; e1_r = v1;
1718
			e0_g = v0; e1_g = v1;
1719
			e0_b = v0; e1_b = v1;
1720
			e0_a = 0xFF; e1_a = 0xFF;
1721
			break;
1722
		}
1723
		case CEM_LDR_LUM_BASE_PLUS_OFS:
1724
		{
1725
			int l0 = (v0 >> 2) | (v1 & 0xc0);
1726
			int l1 = l0 + (v1 & 0x3f);
1727

1728
			if (l1 > 0xFF)
1729
				l1 = 0xFF;
1730

1731
			e0_r = l0; e1_r = l1;
1732
			e0_g = l0; e1_g = l1;
1733
			e0_b = l0; e1_b = l1;
1734
			e0_a = 0xFF; e1_a = 0xFF;
1735
			break;
1736
		}
1737
		case CEM_LDR_LUM_ALPHA_DIRECT:
1738
		{
1739
			int v2 = pE[2], v3 = pE[3];
1740

1741
			e0_r = v0; e1_r = v1;
1742
			e0_g = v0; e1_g = v1;
1743
			e0_b = v0; e1_b = v1;
1744
			e0_a = v2; e1_a = v3;
1745
			break;
1746
		}
1747
		case CEM_LDR_LUM_ALPHA_BASE_PLUS_OFS:
1748
		{
1749
			int v2 = pE[2], v3 = pE[3];
1750

1751
			bit_transfer_signed(v1, v0);
1752
			bit_transfer_signed(v3, v2);
1753

1754
			e0_r = v0; e1_r = v0 + v1;
1755
			e0_g = v0; e1_g = v0 + v1;
1756
			e0_b = v0; e1_b = v0 + v1;
1757
			e0_a = v2; e1_a = v2 + v3;
1758

1759
			for (uint32_t c = 0; c < 4; c++)
1760
			{
1761
				pEndpoints[c][0] = clamp(pEndpoints[c][0], 0, 255);
1762
				pEndpoints[c][1] = clamp(pEndpoints[c][1], 0, 255);
1763
			}
1764

1765
			break;
1766
		}
1767
		case CEM_LDR_RGB_BASE_SCALE:
1768
		{
1769
			int v2 = pE[2], v3 = pE[3];
1770

1771
			e0_r = (v0 * v3) >> 8; e1_r = v0;
1772
			e0_g = (v1 * v3) >> 8; e1_g = v1;
1773
			e0_b = (v2 * v3) >> 8; e1_b = v2;
1774
			e0_a = 0xFF; e1_a = 0xFF;
1775

1776
			break;
1777
		}
1778
		case CEM_LDR_RGB_DIRECT:
1779
		{
1780
			int v2 = pE[2], v3 = pE[3], v4 = pE[4], v5 = pE[5];
1781

1782
			if ((v1 + v3 + v5) >= (v0 + v2 + v4))
1783
			{
1784
				e0_r = v0; e1_r = v1;
1785
				e0_g = v2; e1_g = v3;
1786
				e0_b = v4; e1_b = v5;
1787
				e0_a = 0xFF; e1_a = 0xFF;
1788
			}
1789
			else
1790
			{
1791
				blue_contract(v1, v3, v5, 0xFF, e0_r, e0_g, e0_b, e0_a);
1792
				blue_contract(v0, v2, v4, 0xFF, e1_r, e1_g, e1_b, e1_a);
1793
			}
1794

1795
			break;
1796
		}
1797
		case CEM_LDR_RGB_BASE_PLUS_OFFSET:
1798
		{
1799
			int v2 = pE[2], v3 = pE[3], v4 = pE[4], v5 = pE[5];
1800

1801
			bit_transfer_signed(v1, v0);
1802
			bit_transfer_signed(v3, v2);
1803
			bit_transfer_signed(v5, v4);
1804

1805
			if ((v1 + v3 + v5) >= 0)
1806
			{
1807
				e0_r = v0; e1_r = v0 + v1;
1808
				e0_g = v2; e1_g = v2 + v3;
1809
				e0_b = v4; e1_b = v4 + v5;
1810
				e0_a = 0xFF; e1_a = 0xFF;
1811
			}
1812
			else
1813
			{
1814
				blue_contract(v0 + v1, v2 + v3, v4 + v5, 0xFF, e0_r, e0_g, e0_b, e0_a);
1815
				blue_contract(v0, v2, v4, 0xFF, e1_r, e1_g, e1_b, e1_a);
1816
			}
1817

1818
			for (uint32_t c = 0; c < 4; c++)
1819
			{
1820
				pEndpoints[c][0] = clamp(pEndpoints[c][0], 0, 255);
1821
				pEndpoints[c][1] = clamp(pEndpoints[c][1], 0, 255);
1822
			}
1823

1824
			break;
1825
		}
1826
		case CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A:
1827
		{
1828
			int v2 = pE[2], v3 = pE[3], v4 = pE[4], v5 = pE[5];
1829

1830
			e0_r = (v0 * v3) >> 8; e1_r = v0;
1831
			e0_g = (v1 * v3) >> 8; e1_g = v1;
1832
			e0_b = (v2 * v3) >> 8; e1_b = v2;
1833
			e0_a = v4; e1_a = v5;
1834

1835
			break;
1836
		}
1837
		case CEM_LDR_RGBA_DIRECT:
1838
		{
1839
			int v2 = pE[2], v3 = pE[3], v4 = pE[4], v5 = pE[5], v6 = pE[6], v7 = pE[7];
1840

1841
			if ((v1 + v3 + v5) >= (v0 + v2 + v4))
1842
			{
1843
				e0_r = v0; e1_r = v1;
1844
				e0_g = v2; e1_g = v3;
1845
				e0_b = v4; e1_b = v5;
1846
				e0_a = v6; e1_a = v7;
1847
			}
1848
			else
1849
			{
1850
				blue_contract(v1, v3, v5, v7, e0_r, e0_g, e0_b, e0_a);
1851
				blue_contract(v0, v2, v4, v6, e1_r, e1_g, e1_b, e1_a);
1852
			}
1853

1854
			break;
1855
		}
1856
		case CEM_LDR_RGBA_BASE_PLUS_OFFSET:
1857
		{
1858
			int v2 = pE[2], v3 = pE[3], v4 = pE[4], v5 = pE[5], v6 = pE[6], v7 = pE[7];
1859

1860
			bit_transfer_signed(v1, v0);
1861
			bit_transfer_signed(v3, v2);
1862
			bit_transfer_signed(v5, v4);
1863
			bit_transfer_signed(v7, v6);
1864

1865
			if ((v1 + v3 + v5) >= 0)
1866
			{
1867
				e0_r = v0; e1_r = v0 + v1;
1868
				e0_g = v2; e1_g = v2 + v3;
1869
				e0_b = v4; e1_b = v4 + v5;
1870
				e0_a = v6; e1_a = v6 + v7;
1871
			}
1872
			else
1873
			{
1874
				blue_contract(v0 + v1, v2 + v3, v4 + v5, v6 + v7, e0_r, e0_g, e0_b, e0_a);
1875
				blue_contract(v0, v2, v4, v6, e1_r, e1_g, e1_b, e1_a);
1876
			}
1877

1878
			for (uint32_t c = 0; c < 4; c++)
1879
			{
1880
				pEndpoints[c][0] = clamp(pEndpoints[c][0], 0, 255);
1881
				pEndpoints[c][1] = clamp(pEndpoints[c][1], 0, 255);
1882
			}
1883

1884
			break;
1885
		}
1886
		case CEM_HDR_LUM_LARGE_RANGE:
1887
		{
1888
			int y0, y1;
1889
			if (v1 >= v0)
1890
			{
1891
				y0 = (v0 << 4);
1892
				y1 = (v1 << 4);
1893
			}
1894
			else
1895
			{
1896
				y0 = (v1 << 4) + 8;
1897
				y1 = (v0 << 4) - 8;
1898
			}
1899

1900
			e0_r = y0; e1_r = y1;
1901
			e0_g = y0; e1_g = y1;
1902
			e0_b = y0; e1_b = y1;
1903
			e0_a = 0x780; e1_a = 0x780;
1904
						
1905
			break;
1906
		}
1907
		case CEM_HDR_LUM_SMALL_RANGE:
1908
		{
1909
			int y0, y1, d;
1910

1911
			if ((v0 & 0x80) != 0)
1912
			{
1913
				y0 = ((v1 & 0xE0) << 4) | ((v0 & 0x7F) << 2);
1914
				d = (v1 & 0x1F) << 2;
1915
			}
1916
			else
1917
			{
1918
				y0 = ((v1 & 0xF0) << 4) | ((v0 & 0x7F) << 1);
1919
				d = (v1 & 0x0F) << 1;
1920
			}
1921
						
1922
			y1 = y0 + d;
1923
			if (y1 > 0xFFF) 
1924
				y1 = 0xFFF;
1925
						
1926
			e0_r = y0; e1_r = y1;
1927
			e0_g = y0; e1_g = y1;
1928
			e0_b = y0; e1_b = y1;
1929
			e0_a = 0x780; e1_a = 0x780;
1930

1931
			break;
1932
		}
1933
		case CEM_HDR_RGB_BASE_SCALE:
1934
		{
1935
			int v2 = pE[2], v3 = pE[3];
1936
						
1937
			int modeval = ((v0 & 0xC0) >> 6) | ((v1 & 0x80) >> 5) | ((v2 & 0x80) >> 4);
1938
			
1939
			int majcomp, mode;
1940
			if ((modeval & 0xC) != 0xC) 
1941
			{
1942
				majcomp = modeval >> 2; 
1943
				mode = modeval & 3;
1944
			}
1945
			else if (modeval != 0xF) 
1946
			{
1947
				majcomp = modeval & 3;  
1948
				mode = 4;
1949
			}
1950
			else 
1951
			{
1952
				majcomp = 0; 
1953
				mode = 5;
1954
			}
1955

1956
			int red = v0 & 0x3f; 
1957
			int green = v1 & 0x1f;
1958
			int blue = v2 & 0x1f; 
1959
			int scale = v3 & 0x1f;
1960

1961
			int x0 = (v1 >> 6) & 1; 
1962
			int x1 = (v1 >> 5) & 1; 
1963
			int x2 = (v2 >> 6) & 1;
1964
			int x3 = (v2 >> 5) & 1; 
1965
			int x4 = (v3 >> 7) & 1; 
1966
			int x5 = (v3 >> 6) & 1;
1967
			int x6 = (v3 >> 5) & 1;
1968

1969
			int ohm = 1 << mode;
1970
			if (ohm & 0x30) green |= x0 << 6;
1971
			if (ohm & 0x3A) green |= x1 << 5;
1972
			if (ohm & 0x30) blue |= x2 << 6;
1973
			if (ohm & 0x3A) blue |= x3 << 5;
1974
			if (ohm & 0x3D) scale |= x6 << 5;
1975
			if (ohm & 0x2D) scale |= x5 << 6;
1976
			if (ohm & 0x04) scale |= x4 << 7;
1977
			if (ohm & 0x3B) red |= x4 << 6;
1978
			if (ohm & 0x04) red |= x3 << 6;
1979
			if (ohm & 0x10) red |= x5 << 7;
1980
			if (ohm & 0x0F) red |= x2 << 7;
1981
			if (ohm & 0x05) red |= x1 << 8;
1982
			if (ohm & 0x0A) red |= x0 << 8;
1983
			if (ohm & 0x05) red |= x0 << 9;
1984
			if (ohm & 0x02) red |= x6 << 9;
1985
			if (ohm & 0x01) red |= x3 << 10;
1986
			if (ohm & 0x02) red |= x5 << 10;
1987

1988
			static const int s_shamts[6] = { 1,1,2,3,4,5 };
1989
			
1990
			const int shamt = s_shamts[mode];
1991
			red <<= shamt; 
1992
			green <<= shamt; 
1993
			blue <<= shamt; 
1994
			scale <<= shamt;
1995

1996
			if (mode != 5) 
1997
			{ 
1998
				green = red - green; 
1999
				blue = red - blue; 
2000
			}
2001

2002
			if (majcomp == 1) 
2003
				std::swap(red, green);
2004

2005
			if (majcomp == 2) 
2006
				std::swap(red, blue);
2007
						
2008
			e1_r = clamp(red, 0, 0xFFF);
2009
			e1_g = clamp(green, 0, 0xFFF);
2010
			e1_b = clamp(blue, 0, 0xFFF);
2011
			e1_a = 0x780;
2012

2013
			e0_r = clamp(red - scale, 0, 0xFFF);
2014
			e0_g = clamp(green - scale, 0, 0xFFF);
2015
			e0_b = clamp(blue - scale, 0, 0xFFF);
2016
			e0_a = 0x780;
2017

2018
			break;
2019
		}
2020
		case CEM_HDR_RGB_HDR_ALPHA:
2021
		case CEM_HDR_RGB_LDR_ALPHA:
2022
		case CEM_HDR_RGB:
2023
		{
2024
			int v2 = pE[2], v3 = pE[3], v4 = pE[4], v5 = pE[5];
2025

2026
			int majcomp = ((v4 & 0x80) >> 7) | ((v5 & 0x80) >> 6);
2027

2028
			e0_a = 0x780;
2029
			e1_a = 0x780;
2030

2031
			if (majcomp == 3) 
2032
			{
2033
				e0_r = v0 << 4;
2034
				e0_g = v2 << 4;
2035
				e0_b = (v4 & 0x7f) << 5;
2036

2037
				e1_r = v1 << 4;
2038
				e1_g = v3 << 4;
2039
				e1_b = (v5 & 0x7f) << 5;
2040
			}
2041
			else
2042
			{
2043
				int mode = ((v1 & 0x80) >> 7) | ((v2 & 0x80) >> 6) | ((v3 & 0x80) >> 5);
2044
				int va = v0 | ((v1 & 0x40) << 2);
2045
				int vb0 = v2 & 0x3f;
2046
				int vb1 = v3 & 0x3f;
2047
				int vc = v1 & 0x3f;
2048
				int vd0 = v4 & 0x7f;
2049
				int vd1 = v5 & 0x7f;
2050

2051
				static const int s_dbitstab[8] = { 7,6,7,6,5,6,5,6 };
2052
				vd0 = sign_extend(vd0, s_dbitstab[mode]);
2053
				vd1 = sign_extend(vd1, s_dbitstab[mode]);
2054

2055
				int x0 = (v2 >> 6) & 1;
2056
				int x1 = (v3 >> 6) & 1;
2057
				int x2 = (v4 >> 6) & 1;
2058
				int x3 = (v5 >> 6) & 1;
2059
				int x4 = (v4 >> 5) & 1;
2060
				int x5 = (v5 >> 5) & 1;
2061

2062
				int ohm = 1 << mode;
2063
				if (ohm & 0xA4) va |= x0 << 9;
2064
				if (ohm & 0x08) va |= x2 << 9;
2065
				if (ohm & 0x50) va |= x4 << 9;
2066
				if (ohm & 0x50) va |= x5 << 10;
2067
				if (ohm & 0xA0) va |= x1 << 10;
2068
				if (ohm & 0xC0) va |= x2 << 11;
2069
				if (ohm & 0x04) vc |= x1 << 6;
2070
				if (ohm & 0xE8) vc |= x3 << 6;
2071
				if (ohm & 0x20) vc |= x2 << 7;
2072
				if (ohm & 0x5B) vb0 |= x0 << 6;
2073
				if (ohm & 0x5B) vb1 |= x1 << 6;
2074
				if (ohm & 0x12) vb0 |= x2 << 7;
2075
				if (ohm & 0x12) vb1 |= x3 << 7;
2076

2077
				int shamt = (mode >> 1) ^ 3;
2078
				va  = (uint32_t)va  << shamt;
2079
				vb0 = (uint32_t)vb0 << shamt;
2080
				vb1 = (uint32_t)vb1 << shamt;
2081
				vc  = (uint32_t)vc  << shamt;
2082
				vd0 = (uint32_t)vd0 << shamt;
2083
				vd1 = (uint32_t)vd1 << shamt;
2084

2085
				e1_r = clamp(va, 0, 0xFFF);
2086
				e1_g = clamp(va - vb0, 0, 0xFFF);
2087
				e1_b = clamp(va - vb1, 0, 0xFFF);
2088

2089
				e0_r = clamp(va - vc, 0, 0xFFF);
2090
				e0_g = clamp(va - vb0 - vc - vd0, 0, 0xFFF);
2091
				e0_b = clamp(va - vb1 - vc - vd1, 0, 0xFFF);
2092

2093
				if (majcomp == 1)
2094
				{
2095
					std::swap(e0_r, e0_g);
2096
					std::swap(e1_r, e1_g);
2097
				}
2098
				else if (majcomp == 2)
2099
				{
2100
					std::swap(e0_r, e0_b);
2101
					std::swap(e1_r, e1_b);
2102
				}
2103
			}
2104

2105
			if (cem_index == CEM_HDR_RGB_LDR_ALPHA)
2106
			{
2107
				int v6 = pE[6], v7 = pE[7];
2108

2109
				e0_a = v6;
2110
				e1_a = v7;
2111
			}
2112
			else if (cem_index == CEM_HDR_RGB_HDR_ALPHA)
2113
			{
2114
				int v6 = pE[6], v7 = pE[7];
2115

2116
				// Extract mode bits
2117
				int mode = ((v6 >> 7) & 1) | ((v7 >> 6) & 2);
2118
				v6 &= 0x7F;
2119
				v7 &= 0x7F;
2120

2121
				if (mode == 3)
2122
				{
2123
					e0_a = v6 << 5;
2124
					e1_a = v7 << 5;
2125
				}
2126
				else
2127
				{
2128
					v6 |= (v7 << (mode + 1)) & 0x780;
2129
					v7 &= (0x3F >> mode);
2130
					v7 ^= (0x20 >> mode);
2131
					v7 -= (0x20 >> mode);
2132
					v6 <<= (4 - mode); 
2133
					v7 <<= (4 - mode);
2134

2135
					v7 += v6;
2136
					v7 = clamp(v7, 0, 0xFFF);
2137
					e0_a = v6; 
2138
					e1_a = v7;
2139
				}
2140
			}
2141

2142
			break;
2143
		}
2144
		default:
2145
		{
2146
			assert(0);
2147
			for (uint32_t c = 0; c < 4; c++)
2148
			{
2149
				pEndpoints[c][0] = 0;
2150
				pEndpoints[c][1] = 0;
2151
			}
2152
			break;
2153
		}
2154
		}
2155
	}
2156
		
2157
	// Returns true when the field extracted by get_bits(v, 10, 14) equals 31.
	// For a half float that is the 5-bit exponent field being all ones, which is how +/-Inf and NaN are encoded.
	static inline bool is_half_inf_or_nan(half_float v)
	{
		const bool exponent_all_ones = (get_bits(v, 10, 14) == 31);
		return exponent_all_ones;
	}
2161

2162
	// This float->half conversion matches how "F32TO16" works on Intel GPU's.
2163
	half_float float_to_half(float val, bool toward_zero)
2164
	{
2165
		union { float f; int32_t i; uint32_t u; } fi = { val };
2166
		const int flt_m = fi.i & 0x7FFFFF, flt_e = (fi.i >> 23) & 0xFF, flt_s = (fi.i >> 31) & 0x1;
2167
		int s = flt_s, e = 0, m = 0;
2168

2169
		// inf/NaN
2170
		if (flt_e == 0xff)
2171
		{
2172
			e = 31;
2173
			if (flt_m != 0) // NaN
2174
				m = 1;
2175
		}
2176
		// not zero or denormal
2177
		else if (flt_e != 0)
2178
		{
2179
			int new_exp = flt_e - 127;
2180
			if (new_exp > 15)
2181
				e = 31;
2182
			else if (new_exp < -14)
2183
			{
2184
				if (toward_zero)
2185
					m = (int)truncf((1 << 24) * fabsf(fi.f));
2186
				else
2187
					m = lrintf((1 << 24) * fabsf(fi.f));
2188
			}
2189
			else
2190
			{
2191
				e = new_exp + 15;
2192
				if (toward_zero)
2193
					m = (int)truncf((float)flt_m * (1.0f / (float)(1 << 13)));
2194
				else
2195
					m = lrintf((float)flt_m * (1.0f / (float)(1 << 13)));
2196
			}
2197
		}
2198

2199
		assert((0 <= m) && (m <= 1024));
2200
		if (m == 1024)
2201
		{
2202
			e++;
2203
			m = 0;
2204
		}
2205

2206
		assert((s >= 0) && (s <= 1));
2207
		assert((e >= 0) && (e <= 31));
2208
		assert((m >= 0) && (m <= 1023));
2209

2210
		half_float result = (half_float)((s << 15) | (e << 10) | m);
2211
		return result;
2212
	}
2213

2214
	// Expands half-float bits to a 32-bit float.
	// Signed zeros, denormals, infinities and NaNs are all handled; a NaN keeps its sign and its
	// 10 mantissa bits are placed at the top of the float mantissa.
	float half_to_float(half_float hval)
	{
		union { float f; uint32_t u; } out = { 0 };

		const uint32_t h = (uint32_t)hval;
		const uint32_t sign = (h >> 15) & 1;
		uint32_t exponent = (h >> 10) & 0x1F;
		uint32_t mantissa = h & 0x3FF;

		if (exponent == 31)
		{
			// +/- INF when the mantissa is 0 (the shifted mantissa contributes nothing), otherwise +/- NaN
			out.u = (sign << 31) | 0x7f800000 | (mantissa << 13);
			return out.f;
		}

		if (exponent == 0)
		{
			if (mantissa == 0)
			{
				// +- 0
				out.u = sign << 31;
				return out.f;
			}

			// Denormalized: shift the mantissa up until the implicit bit (bit 10) is set.
			// exponent is unsigned and wraps below zero here; adding the bias below brings it back into range.
			for (; (mantissa & 0x00000400) == 0; --exponent)
				mantissa <<= 1;

			++exponent;
			mantissa &= ~0x00000400;
		}

		// Re-bias the exponent from half (15) to float (127) and widen the mantissa from 10 to 23 bits.
		exponent += (127 - 15);
		mantissa <<= 13;

		assert(sign <= 1);
		assert(mantissa <= 0x7FFFFF);
		assert(exponent <= 255);

		out.u = mantissa | (exponent << 23) | (sign << 31);
		return out.f;
	}
2269
		
2270
	// Constants describing the shared-exponent RGB9E5 texel format (three 9-bit mantissas plus one 5-bit exponent).
	// See https://registry.khronos.org/OpenGL/extensions/EXT/EXT_texture_shared_exponent.txt
2271
	// Field widths, the exponent bias, and the largest biased exponent value that is considered valid.
	const int RGB9E5_EXPONENT_BITS = 5, RGB9E5_MANTISSA_BITS = 9, RGB9E5_EXP_BIAS = 15, RGB9E5_MAX_VALID_BIASED_EXP = 31;
2272
	// Largest unbiased exponent: 31 - 15 = 16.
	const int MAX_RGB9E5_EXP = (RGB9E5_MAX_VALID_BIASED_EXP - RGB9E5_EXP_BIAS);
2273
	// Number of distinct mantissa values: 1 << 9 = 512.
	const int RGB9E5_MANTISSA_VALUES = (1 << RGB9E5_MANTISSA_BITS);
2274
	// Largest mantissa value: 511.
	const int MAX_RGB9E5_MANTISSA = (RGB9E5_MANTISSA_VALUES - 1);
2275
	// Disabled definition. pack_rgb9e5() below still references MAX_RGB9E5, so it is presumably defined elsewhere in this file - TODO confirm.
	//const int MAX_RGB9E5 = (int)(((float)MAX_RGB9E5_MANTISSA) / RGB9E5_MANTISSA_VALUES * (1 << MAX_RGB9E5_EXP));
2276
	// NOTE(review): the expression evaluates to (1/512)/32768 = 2^-24, and the (int) cast truncates that to 0,
	// so this constant is always 0. It looks like a float was intended - confirm before relying on it.
	const int EPSILON_RGB9E5 = (int)((1.0f / (float)RGB9E5_MANTISSA_VALUES) / (float)(1 << RGB9E5_EXP_BIAS));
2277
		
2278
	void unpack_rgb9e5(uint32_t packed, float& r, float& g, float& b)
2279
	{
2280
		int x = packed & 511;
2281
		int y = (packed >> 9) & 511;
2282
		int z = (packed >> 18) & 511;
2283
		int w = (packed >> 27) & 31;
2284

2285
		const float scale = powf(2.0f, static_cast<float>(w - RGB9E5_EXP_BIAS - RGB9E5_MANTISSA_BITS));
2286

2287
		r = x * scale;
2288
		g = y * scale;
2289
		b = z * scale;
2290
	}
2291
			
2292
	// Returns the unbiased IEEE-754 exponent of x (its 8-bit exponent field minus 127); the sign bit is ignored.
	// floor_log2 is not correct for the denorm and zero values (they return -127), but we are going to do a max of
	// this value with the minimum rgb9e5 exponent that will hide these problem cases.
	static inline int floor_log2(float x)
	{
		union
		{
			unsigned int raw;
			float value;
		} f;

		f.value = x;

		// Extract the float exponent field, then convert to int BEFORE removing the bias so the subtraction is
		// done in signed arithmetic instead of relying on an out-of-range unsigned -> int conversion.
		const int biased_exp = (int)((f.raw >> 23) & 0xFF);
		return biased_exp - 127;
	}
2306

2307
	// Returns the larger of two ints (b when they compare equal).
	static inline int maximumi(int a, int b)
	{
		if (a > b)
			return a;

		return b;
	}
2308
	// Returns a when a > b, otherwise b (so b is also returned whenever the comparison is false, e.g. if a is NaN).
	static inline float maximumf(float a, float b)
	{
		if (a > b)
			return a;

		return b;
	}
2309

2310
	uint32_t pack_rgb9e5(float r, float g, float b)
2311
	{
2312
		r = clampf(r, 0.0f, MAX_RGB9E5);
2313
		g = clampf(g, 0.0f, MAX_RGB9E5);
2314
		b = clampf(b, 0.0f, MAX_RGB9E5);
2315

2316
		float maxrgb = maximumf(maximumf(r, g), b);
2317
		int exp_shared = maximumi(-RGB9E5_EXP_BIAS - 1, floor_log2(maxrgb)) + 1 + RGB9E5_EXP_BIAS;
2318
		assert((exp_shared >= 0) && (exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP));
2319

2320
		float denom = powf(2.0f, (float)(exp_shared - RGB9E5_EXP_BIAS - RGB9E5_MANTISSA_BITS));
2321

2322
		int maxm = (int)floorf((maxrgb / denom) + 0.5f);
2323
		if (maxm == (MAX_RGB9E5_MANTISSA + 1))
2324
		{
2325
			denom *= 2;
2326
			exp_shared += 1;
2327
			assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP);
2328
		}
2329
		else 
2330
		{
2331
			assert(maxm <= MAX_RGB9E5_MANTISSA);
2332
		}
2333

2334
		int rm = (int)floorf((r / denom) + 0.5f);
2335
		int gm = (int)floorf((g / denom) + 0.5f);
2336
		int bm = (int)floorf((b / denom) + 0.5f);
2337

2338
		assert((rm >= 0) && (rm <= MAX_RGB9E5_MANTISSA));
2339
		assert((gm >= 0) && (gm <= MAX_RGB9E5_MANTISSA));
2340
		assert((bm >= 0) && (bm <= MAX_RGB9E5_MANTISSA));
2341
		
2342
		return rm | (gm << 9) | (bm << 18) | (exp_shared << 27);
2343
	}
2344

2345
	// Counts the leading zero bits of x when viewed as a 17-bit value (bit 16 is the most significant bit).
	// Returns 17 for x == 0. x is expected to fit in 17 bits (asserted); higher bits are masked off.
	static inline int clz17(uint32_t x)
	{
		assert(x <= 0x1FFFF);
		x &= 0x1FFFF;

		if (x == 0)
			return 17;

		// Walk a single-bit probe down from bit 16 until it hits the highest set bit.
		int num_leading_zeros = 0;
		for (uint32_t probe = 0x10000; (x & probe) == 0; probe >>= 1u)
			num_leading_zeros++;

		return num_leading_zeros;
	}
2362

2363
	static inline uint32_t pack_rgb9e5_ldr_astc(int Cr, int Cg, int Cb)
2364
	{
2365
		int lz = clz17(Cr | Cg | Cb | 1);
2366
		if (Cr == 65535) { Cr = 65536; lz = 0; }
2367
		if (Cg == 65535) { Cg = 65536; lz = 0; }
2368
		if (Cb == 65535) { Cb = 65536; lz = 0; }
2369
		Cr <<= lz; Cg <<= lz; Cb <<= lz;
2370
		Cr = (Cr >> 8) & 0x1FF;
2371
		Cg = (Cg >> 8) & 0x1FF;
2372
		Cb = (Cb >> 8) & 0x1FF;
2373
		uint32_t exponent = 16 - lz;
2374
		uint32_t texel = (exponent << 27) | (Cb << 18) | (Cg << 9) | Cr;
2375
		return texel;
2376
	}
2377

2378
	// Packs three half-float bit patterns (passed as ints) into an RGB9E5 texel using integer math only.
	// Inputs above 0x7c00 are replaced with 0 and exactly 0x7c00 with 0x7bff before packing.
	// The channel with the largest exponent field selects the shared exponent (ties prefer R, then G);
	// the other channels' mantissas are shifted right by the difference in exponents.
	static inline uint32_t pack_rgb9e5_hdr_astc(int Cr, int Cg, int Cb)
	{
		int half_bits[3] = { Cr, Cg, Cb };
		int exp_field[3]; // raw 5-bit exponent fields
		int exp_eff[3];   // exponent fields with 0 (denormal) treated as 1

		for (int c = 0; c < 3; c++)
		{
			if (half_bits[c] > 0x7c00)
				half_bits[c] = 0;
			else if (half_bits[c] == 0x7c00)
				half_bits[c] = 0x7bff;

			exp_field[c] = (half_bits[c] >> 10) & 0x1F;
			exp_eff[c] = (exp_field[c] == 0) ? 1 : exp_field[c];
		}

		const int any_bit9 = ((half_bits[0] | half_bits[1] | half_bits[2]) & 0x200) >> 9;
		const int any_exp = exp_field[0] | exp_field[1] | exp_field[2];

		uint32_t shift[3], expo;

		if (any_exp == 0)
		{
			// All channels are zero/denormal: exponent and shifts depend only on whether any mantissa bit 9 is set.
			expo = shift[0] = shift[1] = shift[2] = any_bit9;
		}
		else
		{
			// Pick the dominant channel.
			int dom;
			if ((exp_field[0] >= exp_field[1]) && (exp_field[0] >= exp_field[2]))
				dom = 0;
			else if (exp_field[1] >= exp_field[2])
				dom = 1;
			else
				dom = 2;

			expo = exp_eff[dom] + 1;

			// The dominant channel gets a shift of 2; the others are shifted further by their exponent deficit.
			for (int c = 0; c < 3; c++)
				shift[c] = exp_eff[dom] - exp_eff[c] + 2;
		}

		int mant[3];
		for (int c = 0; c < 3; c++)
		{
			// 10 stored mantissa bits plus the implicit leading one for normal values.
			const int full_mant = (half_bits[c] & 0x3FF) | ((exp_field[c] == 0) ? 0 : 0x400);
			mant[c] = (full_mant >> shift[c]) & 0x1FF;
		}

		return (expo << 27) | (mant[2] << 18) | (mant[1] << 9) | (mant[0] << 0);
	}
2429
		
2430
	// Important: pPixels is either 32-bit/texel or 64-bit/texel.
2431
	bool decode_block(const log_astc_block& log_blk, void* pPixels, uint32_t blk_width, uint32_t blk_height, decode_mode dec_mode)
2432
	{
2433
		assert(is_valid_block_size(blk_width, blk_height));
2434
				
2435
		assert(g_dequant_tables.m_endpoints[0].m_ISE_to_val.size());
2436
		if (!g_dequant_tables.m_endpoints[0].m_ISE_to_val.size())
2437
			return false;
2438

2439
		const uint32_t num_blk_pixels = blk_width * blk_height;
2440
		
2441
		// Write block error color
2442
		if (dec_mode == cDecodeModeHDR16)
2443
		{
2444
			// NaN's
2445
			memset(pPixels, 0xFF, num_blk_pixels * sizeof(half_float) * 4);
2446
		}
2447
		else if (dec_mode == cDecodeModeRGB9E5)
2448
		{
2449
			const uint32_t purple_9e5 = pack_rgb9e5(1.0f, 0.0f, 1.0f);
2450

2451
			for (uint32_t i = 0; i < num_blk_pixels; i++)
2452
				((uint32_t*)pPixels)[i] = purple_9e5;
2453
		}
2454
		else
2455
		{
2456
			for (uint32_t i = 0; i < num_blk_pixels; i++)
2457
				((uint32_t*)pPixels)[i] = 0xFFFF00FF;
2458
		}
2459

2460
		if (log_blk.m_error_flag)
2461
		{
2462
			// Should this return false? It's not an invalid logical block config, though.
2463
			return false;
2464
		}
2465

2466
		// Handle solid color blocks
2467
		if (log_blk.m_solid_color_flag_ldr)
2468
		{
2469
			// LDR solid block
2470
			if (dec_mode == cDecodeModeHDR16)
2471
			{
2472
				// Convert LDR pixels to half-float
2473
				half_float h[4];
2474
				for (uint32_t c = 0; c < 4; c++)
2475
					h[c] = (log_blk.m_solid_color[c] == 0xFFFF) ? 0x3C00 : float_to_half((float)log_blk.m_solid_color[c] * (1.0f / 65536.0f), true);
2476

2477
				for (uint32_t i = 0; i < num_blk_pixels; i++)
2478
					memcpy((uint16_t*)pPixels + i * 4, h, sizeof(half_float) * 4);
2479
			}
2480
			else if (dec_mode == cDecodeModeRGB9E5)
2481
			{
2482
				float r = (log_blk.m_solid_color[0] == 0xFFFF) ? 1.0f : ((float)log_blk.m_solid_color[0] * (1.0f / 65536.0f));
2483
				float g = (log_blk.m_solid_color[1] == 0xFFFF) ? 1.0f : ((float)log_blk.m_solid_color[1] * (1.0f / 65536.0f));
2484
				float b = (log_blk.m_solid_color[2] == 0xFFFF) ? 1.0f : ((float)log_blk.m_solid_color[2] * (1.0f / 65536.0f));
2485

2486
				const uint32_t packed = pack_rgb9e5(r, g, b);
2487

2488
				for (uint32_t i = 0; i < num_blk_pixels; i++)
2489
					((uint32_t*)pPixels)[i] = packed;
2490
			}
2491
			else
2492
			{
2493
				// Convert LDR pixels to 8-bits
2494
				for (uint32_t i = 0; i < num_blk_pixels; i++)
2495
					for (uint32_t c = 0; c < 4; c++)
2496
						((uint8_t*)pPixels)[i * 4 + c] = (log_blk.m_solid_color[c] >> 8);
2497
			}
2498

2499
			return true;
2500
		}
2501
		else if (log_blk.m_solid_color_flag_hdr)
2502
		{
2503
			// HDR solid block, decode mode must be half-float or RGB9E5
2504
			if (dec_mode == cDecodeModeHDR16)
2505
			{
2506
				for (uint32_t i = 0; i < num_blk_pixels; i++)
2507
					memcpy((uint16_t*)pPixels + i * 4, log_blk.m_solid_color, sizeof(half_float) * 4);
2508
			}
2509
			else if (dec_mode == cDecodeModeRGB9E5)
2510
			{
2511
				float r = half_to_float(log_blk.m_solid_color[0]);
2512
				float g = half_to_float(log_blk.m_solid_color[1]);
2513
				float b = half_to_float(log_blk.m_solid_color[2]);
2514
				
2515
				const uint32_t packed = pack_rgb9e5(r, g, b);
2516

2517
				for (uint32_t i = 0; i < num_blk_pixels; i++)
2518
					((uint32_t*)pPixels)[i] = packed;
2519
			}
2520
			else
2521
			{
2522
				return false;
2523
			}
2524

2525
			return true;
2526
		}
2527
						
2528
		// Sanity check block's config
2529
		if ((log_blk.m_grid_width < 2) || (log_blk.m_grid_height < 2))
2530
			return false;
2531
		if ((log_blk.m_grid_width > blk_width) || (log_blk.m_grid_height > blk_height))
2532
			return false;
2533

2534
		if ((log_blk.m_endpoint_ise_range < FIRST_VALID_ENDPOINT_ISE_RANGE) || (log_blk.m_endpoint_ise_range > LAST_VALID_ENDPOINT_ISE_RANGE))
2535
			return false;
2536
		if ((log_blk.m_weight_ise_range < FIRST_VALID_WEIGHT_ISE_RANGE) || (log_blk.m_weight_ise_range > LAST_VALID_WEIGHT_ISE_RANGE))
2537
			return false;
2538
		if ((log_blk.m_num_partitions < 1) || (log_blk.m_num_partitions > MAX_PARTITIONS))
2539
			return false;
2540
		if ((log_blk.m_dual_plane) && (log_blk.m_num_partitions > MAX_DUAL_PLANE_PARTITIONS))
2541
			return false;
2542
		if (log_blk.m_partition_id >= NUM_PARTITION_PATTERNS)
2543
			return false;
2544
		if ((log_blk.m_num_partitions == 1) && (log_blk.m_partition_id > 0))
2545
			return false;
2546
		if (log_blk.m_color_component_selector > 3)
2547
			return false;
2548

2549
		const uint32_t total_endpoint_levels = get_ise_levels(log_blk.m_endpoint_ise_range);
2550
		const uint32_t total_weight_levels = get_ise_levels(log_blk.m_weight_ise_range);
2551
				
2552
		bool is_ldr_endpoints[MAX_PARTITIONS];
2553

2554
		// Check CEM's
2555
		uint32_t total_cem_vals = 0;
2556
		for (uint32_t i = 0; i < log_blk.m_num_partitions; i++)
2557
		{
2558
			if (log_blk.m_color_endpoint_modes[i] > 15)
2559
				return false;
2560

2561
			total_cem_vals += get_num_cem_values(log_blk.m_color_endpoint_modes[i]);
2562
			
2563
			is_ldr_endpoints[i] = is_cem_ldr(log_blk.m_color_endpoint_modes[i]);
2564
		}
2565

2566
		if (total_cem_vals > MAX_ENDPOINTS)
2567
			return false;
2568

2569
		const dequant_table& endpoint_dequant_tab = g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range);
2570
		const uint8_t* pEndpoint_dequant = endpoint_dequant_tab.m_ISE_to_val.data();
2571

2572
		// Dequantized endpoints to [0,255]
2573
		uint8_t dequantized_endpoints[MAX_ENDPOINTS];
2574
		for (uint32_t i = 0; i < total_cem_vals; i++)
2575
		{
2576
			if (log_blk.m_endpoints[i] >= total_endpoint_levels)
2577
				return false;
2578
			dequantized_endpoints[i] = pEndpoint_dequant[log_blk.m_endpoints[i]];
2579
		}
2580
				
2581
		// Dequantize weights to [0,64]
2582
		uint8_t dequantized_weights[2][12 * 12];
2583
		
2584
		const dequant_table& weight_dequant_tab = g_dequant_tables.get_weight_tab(log_blk.m_weight_ise_range);
2585
		const uint8_t* pWeight_dequant = weight_dequant_tab.m_ISE_to_val.data();
2586
		
2587
		const uint32_t total_weight_vals = (log_blk.m_dual_plane ? 2 : 1) * log_blk.m_grid_width * log_blk.m_grid_height;
2588
		for (uint32_t i = 0; i < total_weight_vals; i++)
2589
		{
2590
			if (log_blk.m_weights[i] >= total_weight_levels)
2591
				return false;
2592

2593
			const uint32_t plane_index = log_blk.m_dual_plane ? (i & 1) : 0;
2594
			const uint32_t grid_index = log_blk.m_dual_plane ? (i >> 1) : i;
2595

2596
			dequantized_weights[plane_index][grid_index] = pWeight_dequant[log_blk.m_weights[i]];
2597
		}
2598

2599
		// Upsample weight grid. [0,64] weights
2600
		uint8_t upsampled_weights[2][12 * 12];
2601

2602
		upsample_weight_grid(blk_width, blk_height, log_blk.m_grid_width, log_blk.m_grid_height, &dequantized_weights[0][0], &upsampled_weights[0][0]);
2603
		if (log_blk.m_dual_plane)
2604
			upsample_weight_grid(blk_width, blk_height, log_blk.m_grid_width, log_blk.m_grid_height, &dequantized_weights[1][0], &upsampled_weights[1][0]);
2605

2606
		// Decode CEM's
2607
		int endpoints[4][4][2]; // [subset][comp][l/h]
2608

2609
		uint32_t endpoint_val_index = 0;
2610
		for (uint32_t subset = 0; subset < log_blk.m_num_partitions; subset++)
2611
		{
2612
			const uint32_t cem_index = log_blk.m_color_endpoint_modes[subset];
2613

2614
			decode_endpoint(cem_index, &endpoints[subset][0], &dequantized_endpoints[endpoint_val_index]);
2615

2616
			endpoint_val_index += get_num_cem_values(cem_index);
2617
		}
2618

2619
		// Decode texels
2620
		const bool small_block = num_blk_pixels < 31;
2621
		const bool use_precomputed_texel_partitions_4x4 = (blk_width == 4) && (blk_height == 4) && (log_blk.m_num_partitions >= 2) && (log_blk.m_num_partitions <= 3);
2622
		const bool use_precomputed_texel_partitions_6x6 = (blk_width == 6) && (blk_height == 6) && (log_blk.m_num_partitions >= 2) && (log_blk.m_num_partitions <= 3);
2623
		const uint32_t ccs = log_blk.m_dual_plane ? log_blk.m_color_component_selector : UINT32_MAX;
2624
		
2625
		bool success = true;
2626

2627
		if (dec_mode == cDecodeModeRGB9E5)
2628
		{
2629
			// returns uint32_t's
2630
			for (uint32_t y = 0; y < blk_height; y++)
2631
			{
2632
				for (uint32_t x = 0; x < blk_width; x++)
2633
				{
2634
					const uint32_t pixel_index = x + y * blk_width;
2635
					
2636
					uint32_t subset = 0;
2637
					if (log_blk.m_num_partitions > 1)
2638
					{
2639
						if (use_precomputed_texel_partitions_4x4)
2640
							subset = get_precompute_texel_partitions_4x4(log_blk.m_partition_id, x, y, log_blk.m_num_partitions);
2641
						else if (use_precomputed_texel_partitions_6x6)
2642
							subset = get_precompute_texel_partitions_6x6(log_blk.m_partition_id, x, y, log_blk.m_num_partitions);
2643
						else
2644
							subset = compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block);
2645
					}
2646

2647
					int comp[3];
2648

2649
					for (uint32_t c = 0; c < 3; c++)
2650
					{
2651
						const uint32_t w = upsampled_weights[(c == ccs) ? 1 : 0][pixel_index];
2652

2653
						if (is_ldr_endpoints[subset])
2654
						{
2655
							assert((endpoints[subset][c][0] >= 0) && (endpoints[subset][c][0] <= 0xFF));
2656
							assert((endpoints[subset][c][1] >= 0) && (endpoints[subset][c][1] <= 0xFF));
2657

2658
							int le = endpoints[subset][c][0];
2659
							int he = endpoints[subset][c][1];
2660

2661
							le = (le << 8) | le;
2662
							he = (he << 8) | he;
2663

2664
							int k = weight_interpolate(le, he, w);
2665
							assert((k >= 0) && (k <= 0xFFFF));
2666

2667
							comp[c] = k; // 1.0
2668
						}
2669
						else
2670
						{
2671
							assert((endpoints[subset][c][0] >= 0) && (endpoints[subset][c][0] <= 0xFFF));
2672
							assert((endpoints[subset][c][1] >= 0) && (endpoints[subset][c][1] <= 0xFFF));
2673

2674
							int le = endpoints[subset][c][0] << 4;
2675
							int he = endpoints[subset][c][1] << 4;
2676

2677
							int qlog16 = weight_interpolate(le, he, w);
2678

2679
							comp[c] = qlog16_to_half(qlog16);
2680

2681
							if (is_half_inf_or_nan((half_float)comp[c]))
2682
								comp[c] = 0x7BFF;
2683
						}
2684
						
2685
					} // c
2686

2687
					uint32_t packed;
2688
					if (is_ldr_endpoints[subset])
2689
						packed = pack_rgb9e5_ldr_astc(comp[0], comp[1], comp[2]);
2690
					else
2691
						packed = pack_rgb9e5_hdr_astc(comp[0], comp[1], comp[2]);
2692

2693
					((uint32_t*)pPixels)[pixel_index] = packed;
2694

2695
				} // x
2696
			} // y
2697
		}
2698
		else if (dec_mode == cDecodeModeHDR16)
2699
		{
2700
			// Note: must round towards zero when converting float to half for ASTC (18.19 Weight Application)
2701
			
2702
			// returns half floats
2703
			for (uint32_t y = 0; y < blk_height; y++)
2704
			{
2705
				for (uint32_t x = 0; x < blk_width; x++)
2706
				{
2707
					const uint32_t pixel_index = x + y * blk_width;
2708
					
2709
					uint32_t subset = 0;
2710
					if (log_blk.m_num_partitions > 1)
2711
					{
2712
						if (use_precomputed_texel_partitions_4x4)
2713
							subset = get_precompute_texel_partitions_4x4(log_blk.m_partition_id, x, y, log_blk.m_num_partitions);
2714
						else if (use_precomputed_texel_partitions_6x6)
2715
							subset = get_precompute_texel_partitions_6x6(log_blk.m_partition_id, x, y, log_blk.m_num_partitions);
2716
						else
2717
							subset = compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block);
2718
					}
2719

2720
					for (uint32_t c = 0; c < 4; c++)
2721
					{
2722
						const uint32_t w = upsampled_weights[(c == ccs) ? 1 : 0][pixel_index];
2723

2724
						half_float o;
2725

2726
						if ( (is_ldr_endpoints[subset]) ||
2727
							 ((log_blk.m_color_endpoint_modes[subset] == CEM_HDR_RGB_LDR_ALPHA) && (c == 3)) )
2728
						{
2729
							assert((endpoints[subset][c][0] >= 0) && (endpoints[subset][c][0] <= 0xFF));
2730
							assert((endpoints[subset][c][1] >= 0) && (endpoints[subset][c][1] <= 0xFF));
2731

2732
							int le = endpoints[subset][c][0];
2733
							int he = endpoints[subset][c][1];
2734

2735
							le = (le << 8) | le;
2736
							he = (he << 8) | he;
2737

2738
							int k = weight_interpolate(le, he, w);
2739
							assert((k >= 0) && (k <= 0xFFFF));
2740

2741
							if (k == 0xFFFF)
2742
								o = 0x3C00; // 1.0
2743
							else
2744
								o = float_to_half((float)k * (1.0f / 65536.0f), true);
2745
						}
2746
						else
2747
						{
2748
							assert((endpoints[subset][c][0] >= 0) && (endpoints[subset][c][0] <= 0xFFF));
2749
							assert((endpoints[subset][c][1] >= 0) && (endpoints[subset][c][1] <= 0xFFF));
2750

2751
							int le = endpoints[subset][c][0] << 4;
2752
							int he = endpoints[subset][c][1] << 4;
2753

2754
							int qlog16 = weight_interpolate(le, he, w);
2755
							
2756
							o = qlog16_to_half(qlog16);
2757

2758
							if (is_half_inf_or_nan(o))
2759
								o = 0x7BFF;
2760
						}
2761
												
2762
						((half_float*)pPixels)[pixel_index * 4 + c] = o;
2763
					}
2764

2765
				} // x
2766
			} // y
2767
		}
2768
		else
2769
		{
2770
			// returns uint8_t's
2771
			for (uint32_t y = 0; y < blk_height; y++)
2772
			{
2773
				for (uint32_t x = 0; x < blk_width; x++)
2774
				{
2775
					const uint32_t pixel_index = x + y * blk_width;
2776

2777
					uint32_t subset = 0;
2778
					if (log_blk.m_num_partitions > 1)
2779
					{
2780
						if (use_precomputed_texel_partitions_4x4)
2781
							subset = get_precompute_texel_partitions_4x4(log_blk.m_partition_id, x, y, log_blk.m_num_partitions);
2782
						else if (use_precomputed_texel_partitions_6x6)
2783
							subset = get_precompute_texel_partitions_6x6(log_blk.m_partition_id, x, y, log_blk.m_num_partitions);
2784
						else
2785
							subset = compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block);
2786
					}
2787

2788
					if (!is_ldr_endpoints[subset])
2789
					{
2790
						((uint32_t*)pPixels)[pixel_index * 4] = 0xFFFF00FF;
2791
						success = false;
2792
					}
2793
					else
2794
					{
2795
						for (uint32_t c = 0; c < 4; c++)
2796
						{
2797
							const uint32_t w = upsampled_weights[(c == ccs) ? 1 : 0][pixel_index];
2798

2799
							int le = endpoints[subset][c][0];
2800
							int he = endpoints[subset][c][1];
2801

2802
							// FIXME: the spec is apparently wrong? this matches ARM's and Google's decoder
2803
							//if ((dec_mode == cDecodeModeSRGB8) && (c <= 2))
2804
							// See https://github.com/ARM-software/astc-encoder/issues/447
2805
							if (dec_mode == cDecodeModeSRGB8)
2806
							{
2807
								le = (le << 8) | 0x80;
2808
								he = (he << 8) | 0x80;
2809
							}
2810
							else
2811
							{
2812
								le = (le << 8) | le;
2813
								he = (he << 8) | he;
2814
							}
2815

2816
							uint32_t k = weight_interpolate(le, he, w);
2817

2818
							// FIXME: This is what the spec says to do in LDR mode, but this is not what ARM's decoder does
2819
							// See decompress_symbolic_block(), decode_texel() and unorm16_to_sf16. 
2820
							// It seems to effectively divide by 65535.0 and convert to FP16, then back to float, mul by 255.0, add .5 and then convert to 8-bit.
2821
							((uint8_t*)pPixels)[pixel_index * 4 + c] = (uint8_t)(k >> 8);
2822
						}
2823
					}
2824

2825
				} // x
2826
			} // y
2827
		}
2828
		
2829
		return success;
2830
	}
2831

2832
	//------------------------------------------------
2833
	// Physical to logical block decoding
2834

2835
	// unsigned 128-bit int, with some signed helpers
2836
	class uint128
2837
	{
2838
		uint64_t m_lo, m_hi;
2839

2840
	public:
2841
		uint128() = default;
2842
		inline uint128(uint64_t lo) : m_lo(lo), m_hi(0) { }
2843
		inline uint128(uint64_t lo, uint64_t hi) : m_lo(lo), m_hi(hi) { }
2844
		inline uint128(const uint128& other) : m_lo(other.m_lo), m_hi(other.m_hi) { }
2845

2846
		inline uint128& set_signed(int64_t lo) { m_lo = lo; m_hi = (lo < 0) ? UINT64_MAX : 0; return *this; }
2847
		inline uint128& set(uint64_t lo) { m_lo = lo; m_hi = 0; return *this; }
2848

2849
		inline explicit operator uint8_t () const { return (uint8_t)m_lo; }
2850
		inline explicit operator uint16_t () const { return (uint16_t)m_lo; }
2851
		inline explicit operator uint32_t () const { return (uint32_t)m_lo; }
2852
		inline explicit operator uint64_t () const { return m_lo; }
2853

2854
		inline uint128& operator= (const uint128& rhs) { m_lo = rhs.m_lo; m_hi = rhs.m_hi; return *this; }
2855
		inline uint128& operator= (const uint64_t val) { m_lo = val; m_hi = 0; return *this; }
2856

2857
		inline uint64_t get_low() const { return m_lo; }
2858
		inline uint64_t& get_low() { return m_lo; }
2859

2860
		inline uint64_t get_high() const { return m_hi; }
2861
		inline uint64_t& get_high() { return m_hi; }
2862

2863
		inline bool operator== (const uint128& rhs) const { return (m_lo == rhs.m_lo) && (m_hi == rhs.m_hi); }
2864
		inline bool operator!= (const uint128& rhs) const { return (m_lo != rhs.m_lo) || (m_hi != rhs.m_hi); }
2865

2866
		inline bool operator< (const uint128& rhs) const
2867
		{
2868
			if (m_hi < rhs.m_hi)
2869
				return true;
2870

2871
			if (m_hi == rhs.m_hi)
2872
			{
2873
				if (m_lo < rhs.m_lo)
2874
					return true;
2875
			}
2876

2877
			return false;
2878
		}
2879

2880
		inline bool operator> (const uint128& rhs) const { return (rhs < *this); }
2881

2882
		inline bool operator<= (const uint128& rhs) const { return (*this == rhs) || (*this < rhs); }
2883
		inline bool operator>= (const uint128& rhs) const { return (*this == rhs) || (*this > rhs); }
2884

2885
		inline bool is_zero() const { return (m_lo == 0) && (m_hi == 0); }
2886
		inline bool is_all_ones() const { return (m_lo == UINT64_MAX) && (m_hi == UINT64_MAX); }
2887
		inline bool is_non_zero() const { return (m_lo != 0) || (m_hi != 0); }
2888
		inline explicit operator bool() const { return is_non_zero(); }
2889
		inline bool is_signed() const { return ((int64_t)m_hi) < 0; }
2890

2891
		inline bool signed_less(const uint128& rhs) const
2892
		{
2893
			const bool l_signed = is_signed(), r_signed = rhs.is_signed();
2894

2895
			if (l_signed == r_signed)
2896
				return *this < rhs;
2897

2898
			if (l_signed && !r_signed)
2899
				return true;
2900

2901
			assert(!l_signed && r_signed);
2902
			return false;
2903
		}
2904

2905
		inline bool signed_greater(const uint128& rhs) const { return rhs.signed_less(*this); }
2906
		inline bool signed_less_equal(const uint128& rhs) const { return !rhs.signed_less(*this); }
2907
		inline bool signed_greater_equal(const uint128& rhs) const { return !signed_less(rhs); }
2908

2909
		double get_double() const
2910
		{
2911
			double res = 0;
2912

2913
			if (m_hi)
2914
				res = (double)m_hi * pow(2.0f, 64.0f);
2915

2916
			res += (double)m_lo;
2917

2918
			return res;
2919
		}
2920

2921
		double get_signed_double() const
2922
		{
2923
			if (is_signed())
2924
				return -(uint128(*this).abs().get_double());
2925
			else
2926
				return get_double();
2927
		}
2928

2929
		inline uint128 abs() const
2930
		{
2931
			uint128 res(*this);
2932
			if (res.is_signed())
2933
				res = -res;
2934
			return res;
2935
		}
2936

2937
		inline uint128& operator<<= (int shift)
2938
		{
2939
			assert(shift >= 0);
2940
			if (shift < 0)
2941
				return *this;
2942

2943
			m_hi = (shift >= 64) ? ((shift >= 128) ? 0 : (m_lo << (shift - 64))) : (m_hi << shift);
2944

2945
			if ((shift) && (shift < 64))
2946
				m_hi |= (m_lo >> (64 - shift));
2947

2948
			m_lo = (shift >= 64) ? 0 : (m_lo << shift);
2949

2950
			return *this;
2951
		}
2952

2953
		inline uint128 operator<< (int shift) const { uint128 res(*this); res <<= shift; return res; }
2954

2955
		inline uint128& operator>>= (int shift)
2956
		{
2957
			assert(shift >= 0);
2958
			if (shift < 0)
2959
				return *this;
2960

2961
			m_lo = (shift >= 64) ? ((shift >= 128) ? 0 : (m_hi >> (shift - 64))) : (m_lo >> shift);
2962

2963
			if ((shift) && (shift < 64))
2964
				m_lo |= (m_hi << (64 - shift));
2965

2966
			m_hi = (shift >= 64) ? 0 : (m_hi >> shift);
2967

2968
			return *this;
2969
		}
2970

2971
		inline uint128 operator>> (int shift) const { uint128 res(*this); res >>= shift; return res; }
2972

2973
		inline uint128 signed_shift_right(int shift) const
2974
		{
2975
			uint128 res(*this);
2976
			res >>= shift;
2977

2978
			if (is_signed())
2979
			{
2980
				uint128 x(0U);
2981
				x = ~x;
2982
				x >>= shift;
2983
				res |= (~x);
2984
			}
2985

2986
			return res;
2987
		}
2988

2989
		inline uint128& operator |= (const uint128& rhs) { m_lo |= rhs.m_lo; m_hi |= rhs.m_hi; return *this; }
2990
		inline uint128 operator | (const uint128& rhs) const { uint128 res(*this); res |= rhs; return res; }
2991

2992
		inline uint128& operator &= (const uint128& rhs) { m_lo &= rhs.m_lo; m_hi &= rhs.m_hi; return *this; }
2993
		inline uint128 operator & (const uint128& rhs) const { uint128 res(*this); res &= rhs;	return res; }
2994

2995
		inline uint128& operator ^= (const uint128& rhs) { m_lo ^= rhs.m_lo; m_hi ^= rhs.m_hi; return *this; }
2996
		inline uint128 operator ^ (const uint128& rhs) const { uint128 res(*this); res ^= rhs;	return res; }
2997

2998
		inline uint128 operator ~() const { return uint128(~m_lo, ~m_hi); }
2999

3000
		inline uint128 operator -() const { uint128 res(~*this); if (++res.m_lo == 0) ++res.m_hi; return res; }
3001

3002
		// prefix
3003
		inline uint128 operator ++()
3004
		{
3005
			if (++m_lo == 0)
3006
				++m_hi;
3007
			return *this;
3008
		}
3009

3010
		// postfix
3011
		inline uint128 operator ++(int)
3012
		{
3013
			uint128 res(*this);
3014
			if (++m_lo == 0)
3015
				++m_hi;
3016
			return res;
3017
		}
3018

3019
		// prefix
3020
		inline uint128 operator --()
3021
		{
3022
			const uint64_t t = m_lo;
3023
			if (--m_lo > t)
3024
				--m_hi;
3025
			return *this;
3026
		}
3027

3028
		// postfix
3029
		inline uint128 operator --(int)
3030
		{
3031
			const uint64_t t = m_lo;
3032
			uint128 res(*this);
3033
			if (--m_lo > t)
3034
				--m_hi;
3035
			return res;
3036
		}
3037

3038
		inline uint128& operator+= (const uint128& rhs)
3039
		{
3040
			const uint64_t t = m_lo + rhs.m_lo;
3041
			m_hi = m_hi + rhs.m_hi + (t < m_lo);
3042
			m_lo = t;
3043
			return *this;
3044
		}
3045

3046
		inline uint128 operator+ (const uint128& rhs) const { uint128 res(*this); res += rhs; return res; }
3047

3048
		inline uint128& operator-= (const uint128& rhs)
3049
		{
3050
			const uint64_t t = m_lo - rhs.m_lo;
3051
			m_hi = m_hi - rhs.m_hi - (t > m_lo);
3052
			m_lo = t;
3053
			return *this;
3054
		}
3055

3056
		inline uint128 operator- (const uint128& rhs) const { uint128 res(*this); res -= rhs; return res; }
3057

3058
		// computes bit by bit, very slow
3059
		uint128& operator*=(const uint128& rhs)
3060
		{
3061
			uint128 temp(*this), result(0U);
3062

3063
			for (uint128 bitmask(rhs); bitmask; bitmask >>= 1, temp <<= 1)
3064
				if (bitmask.get_low() & 1)
3065
					result += temp;
3066

3067
			*this = result;
3068
			return *this;
3069
		}
3070

3071
		uint128 operator*(const uint128& rhs) const { uint128 res(*this); res *= rhs; return res; }
3072

3073
		// computes bit by bit, very slow
3074
		friend uint128 divide(const uint128& dividend, const uint128& divisor, uint128& remainder)
3075
		{
3076
			remainder = 0;
3077

3078
			if (!divisor)
3079
			{
3080
				assert(0);
3081
				return ~uint128(0U);
3082
			}
3083

3084
			uint128 quotient(0), one(1);
3085

3086
			for (int i = 127; i >= 0; i--)
3087
			{
3088
				remainder = (remainder << 1) | ((dividend >> i) & one);
3089
				if (remainder >= divisor)
3090
				{
3091
					remainder -= divisor;
3092
					quotient |= (one << i);
3093
				}
3094
			}
3095

3096
			return quotient;
3097
		}
3098

3099
		uint128 operator/(const uint128& rhs) const { uint128 remainder, res; res = divide(*this, rhs, remainder); return res; }
3100
		uint128 operator/=(const uint128& rhs) { uint128 remainder; *this = divide(*this, rhs, remainder); return *this; }
3101

3102
		uint128 operator%(const uint128& rhs) const { uint128 remainder; divide(*this, rhs, remainder); return remainder; }
3103
		uint128 operator%=(const uint128& rhs) { uint128 remainder; divide(*this, rhs, remainder); *this = remainder; return *this; }
3104

3105
		void print_hex(FILE* pFile) const
3106
		{
3107
			fprintf(pFile, "0x%016llx%016llx", (unsigned long long int)m_hi, (unsigned long long int)m_lo);
3108
		}
3109

3110
		void format_unsigned(std::string& res) const
3111
		{
3112
			basisu::vector<uint8_t> digits;
3113
			digits.reserve(39 + 1);
3114

3115
			uint128 k(*this), ten(10);
3116
			do
3117
			{
3118
				uint128 r;
3119
				k = divide(k, ten, r);
3120
				digits.push_back((uint8_t)r);
3121
			} while (k);
3122

3123
			for (int i = (int)digits.size() - 1; i >= 0; i--)
3124
				res += ('0' + digits[i]);
3125
		}
3126

3127
		void format_signed(std::string& res) const
3128
		{
3129
			uint128 val(*this);
3130

3131
			if (val.is_signed())
3132
			{
3133
				res.push_back('-');
3134
				val = -val;
3135
			}
3136

3137
			val.format_unsigned(res);
3138
		}
3139

3140
		void print_unsigned(FILE* pFile)
3141
		{
3142
			std::string str;
3143
			format_unsigned(str);
3144
			fprintf(pFile, "%s", str.c_str());
3145
		}
3146

3147
		void print_signed(FILE* pFile)
3148
		{
3149
			std::string str;
3150
			format_signed(str);
3151
			fprintf(pFile, "%s", str.c_str());
3152
		}
3153

3154
		uint128 get_reversed_bits() const
3155
		{
3156
			uint128 res;
3157

3158
			const uint32_t* pSrc = (const uint32_t*)this;
3159
			uint32_t* pDst = (uint32_t*)&res;
3160

3161
			pDst[0] = rev_dword(pSrc[3]);
3162
			pDst[1] = rev_dword(pSrc[2]);
3163
			pDst[2] = rev_dword(pSrc[1]);
3164
			pDst[3] = rev_dword(pSrc[0]);
3165

3166
			return res;
3167
		}
3168

3169
		uint128 get_byteswapped() const
3170
		{
3171
			uint128 res;
3172

3173
			const uint8_t* pSrc = (const uint8_t*)this;
3174
			uint8_t* pDst = (uint8_t*)&res;
3175

3176
			for (uint32_t i = 0; i < 16; i++)
3177
				pDst[i] = pSrc[15 - i];
3178

3179
			return res;
3180
		}
3181

3182
		inline uint64_t get_bits64(uint32_t bit_ofs, uint32_t bit_len) const
3183
		{
3184
			assert(bit_ofs < 128);
3185
			assert(bit_len && (bit_len <= 64) && ((bit_ofs + bit_len) <= 128));
3186

3187
			uint128 res(*this);
3188
			res >>= bit_ofs;
3189

3190
			const uint64_t bitmask = (bit_len == 64) ? UINT64_MAX : ((1ull << bit_len) - 1);
3191
			return res.get_low() & bitmask;
3192
		}
3193

3194
		inline uint32_t get_bits(uint32_t bit_ofs, uint32_t bit_len) const
3195
		{
3196
			assert(bit_len <= 32);
3197
			return (uint32_t)get_bits64(bit_ofs, bit_len);
3198
		}
3199

3200
		inline uint32_t next_bits(uint32_t& bit_ofs, uint32_t len) const
3201
		{
3202
			assert(len && (len <= 32));
3203
			uint32_t x = get_bits(bit_ofs, len);
3204
			bit_ofs += len;
3205
			return x;
3206
		}
3207

3208
		inline uint128& set_bits(uint64_t val, uint32_t bit_ofs, uint32_t num_bits)
3209
		{
3210
			assert(bit_ofs < 128);
3211
			assert(num_bits && (num_bits <= 64) && ((bit_ofs + num_bits) <= 128));
3212

3213
			uint128 bitmask(1);
3214
			bitmask = (bitmask << num_bits) - 1;
3215
			assert(uint128(val) <= bitmask);
3216

3217
			bitmask <<= bit_ofs;
3218
			*this &= ~bitmask;
3219

3220
			*this = *this | (uint128(val) << bit_ofs);
3221
			return *this;
3222
		}
3223
	};
3224
		
3225
	static bool decode_void_extent(const uint128& bits, log_astc_block& log_blk)
3226
	{
3227
		if (bits.get_bits(10, 2) != 0b11)
3228
			return false;
3229

3230
		uint32_t bit_ofs = 12;
3231
		const uint32_t min_s = bits.next_bits(bit_ofs, 13);
3232
		const uint32_t max_s = bits.next_bits(bit_ofs, 13);
3233
		const uint32_t min_t = bits.next_bits(bit_ofs, 13);
3234
		const uint32_t max_t = bits.next_bits(bit_ofs, 13);
3235
		assert(bit_ofs == 64);
3236
		
3237
		const bool all_extents_all_ones = (min_s == 0x1FFF) && (max_s == 0x1FFF) && (min_t == 0x1FFF) && (max_t == 0x1FFF);
3238
		
3239
		if (!all_extents_all_ones && ((min_s >= max_s) || (min_t >= max_t)))
3240
			return false;
3241

3242
		const bool hdr_flag = bits.get_bits(9, 1) != 0;
3243

3244
		if (hdr_flag)
3245
			log_blk.m_solid_color_flag_hdr = true;
3246
		else
3247
			log_blk.m_solid_color_flag_ldr = true;
3248

3249
		log_blk.m_solid_color[0] = (uint16_t)bits.get_bits(64, 16);
3250
		log_blk.m_solid_color[1] = (uint16_t)bits.get_bits(80, 16);
3251
		log_blk.m_solid_color[2] = (uint16_t)bits.get_bits(96, 16);
3252
		log_blk.m_solid_color[3] = (uint16_t)bits.get_bits(112, 16);
3253

3254
		if (log_blk.m_solid_color_flag_hdr)
3255
		{
3256
			for (uint32_t c = 0; c < 4; c++)
3257
				if (is_half_inf_or_nan(log_blk.m_solid_color[c]))
3258
					return false;
3259
		}
3260
		
3261
		return true;
3262
	}
3263

3264
	// One block mode layout as consumed by decode_config(). All offsets are bit positions
	// within the 128-bit block.
	struct astc_dec_row
	{
		int8_t Dp_ofs;	// bit offset of the dual plane flag, negative if the layout has none
		int8_t P_ofs;	// bit offset of the flag that adds BISE_10_LEVELS to the weight range, negative if none
		int8_t W_ofs;	// bit offset of the grid width field
		int8_t W_size;	// size in bits of the grid width field, 0 if the width is just W_bias
		int8_t H_ofs;	// bit offset of the grid height field
		int8_t H_size;	// size in bits of the grid height field, 0 if the height is just H_bias
		int8_t W_bias;	// value added to the width field to get the grid width
		int8_t H_bias;	// value added to the height field to get the grid height
		int8_t p0_ofs;	// bit offset of weight range selector bit 0
		int8_t p1_ofs;	// bit offset of weight range selector bit 1
		int8_t p2_ofs;	// bit offset of weight range selector bit 2
	};
3268

3269
	// Block mode layouts, indexed by the row index chosen in decode_config().
	// The trailing comment on each row repeats its W_bias and H_bias.
	static const astc_dec_row s_dec_rows[10] =
	{
		// Dp_ofs, P_ofs, W_ofs, W_size, H_ofs, H_size, W_bias, H_bias, p0_ofs, p1_ofs, p2_ofs
		{ 10,  9, 7, 2, 5, 2,  4,  2, 4, 0, 1 }, // row 0: 4 2
		{ 10,  9, 7, 2, 5, 2,  8,  2, 4, 0, 1 }, // row 1: 8 2
		{ 10,  9, 5, 2, 7, 2,  2,  8, 4, 0, 1 }, // row 2: 2 8
		{ 10,  9, 5, 2, 7, 1,  2,  6, 4, 0, 1 }, // row 3: 2 6
		{ 10,  9, 7, 1, 5, 2,  2,  2, 4, 0, 1 }, // row 4: 2 2
		{ 10,  9, 0, 0, 5, 2, 12,  2, 4, 2, 3 }, // row 5: 12 2
		{ 10,  9, 5, 2, 0, 0,  2, 12, 4, 2, 3 }, // row 6: 2 12
		{ 10,  9, 0, 0, 0, 0,  6, 10, 4, 2, 3 }, // row 7: 6 10
		{ 10,  9, 0, 0, 0, 0, 10,  6, 4, 2, 3 }, // row 8: 10 6
		{ -1, -1, 5, 2, 9, 2,  6,  6, 4, 2, 3 }, // row 9: 6 6 (no dual plane or P bit)
	};
3285

3286
	static bool decode_config(const uint128& bits, log_astc_block& log_blk)
3287
	{
3288
		// Reserved
3289
		if (bits.get_bits(0, 4) == 0)
3290
			return false;
3291

3292
		// Reserved
3293
		if ((bits.get_bits(0, 2) == 0) && (bits.get_bits(6, 3) == 0b111))
3294
		{
3295
			if (bits.get_bits(2, 4) != 0b1111) 
3296
				return false;
3297
		}
3298

3299
		// Void extent
3300
		if (bits.get_bits(0, 9) == 0b111111100)
3301
			return decode_void_extent(bits, log_blk);
3302
												
3303
		// Check rows
3304
		const uint32_t x0_2 = bits.get_bits(0, 2), x2_2 = bits.get_bits(2, 2);
3305
		const uint32_t x5_4 = bits.get_bits(5, 4), x8_1 = bits.get_bits(8, 1);
3306
		const uint32_t x7_2 = bits.get_bits(7, 2);
3307

3308
		int row_index = -1;
3309
		if (x0_2 == 0)
3310
		{
3311
			if (x7_2 == 0b00)
3312
				row_index = 5;
3313
			else if (x7_2 == 0b01)
3314
				row_index = 6;
3315
			else if (x5_4 == 0b1100)
3316
				row_index = 7;
3317
			else if (x5_4 == 0b1101)
3318
				row_index = 8;
3319
			else if (x7_2 == 0b10)
3320
				row_index = 9;
3321
		}
3322
		else
3323
		{
3324
			if (x2_2 == 0b00)
3325
				row_index = 0;
3326
			else if (x2_2 == 0b01)
3327
				row_index = 1;
3328
			else if (x2_2 == 0b10)
3329
				row_index = 2;
3330
			else if ((x2_2 == 0b11) && (x8_1 == 0))
3331
				row_index = 3;
3332
			else if ((x2_2 == 0b11) && (x8_1 == 1))
3333
				row_index = 4;
3334
		}
3335
		if (row_index < 0)
3336
			return false;
3337

3338
		const astc_dec_row& r = s_dec_rows[row_index];
3339

3340
		bool P = false, Dp = false;
3341
		uint32_t W = r.W_bias, H = r.H_bias;
3342

3343
		if (r.P_ofs >= 0)
3344
			P = bits.get_bits(r.P_ofs, 1) != 0;
3345

3346
		if (r.Dp_ofs >= 0)
3347
			Dp = bits.get_bits(r.Dp_ofs, 1) != 0;
3348
				
3349
		if (r.W_size)
3350
			W += bits.get_bits(r.W_ofs, r.W_size);
3351

3352
		if (r.H_size)
3353
			H += bits.get_bits(r.H_ofs, r.H_size);
3354

3355
		assert((W >= MIN_GRID_DIM) && (W <= MAX_BLOCK_DIM));
3356
		assert((H >= MIN_GRID_DIM) && (H <= MAX_BLOCK_DIM));
3357
		
3358
		int p0 = bits.get_bits(r.p0_ofs, 1);
3359
		int p1 = bits.get_bits(r.p1_ofs, 1);
3360
		int p2 = bits.get_bits(r.p2_ofs, 1);
3361

3362
		uint32_t p = p0 | (p1 << 1) | (p2 << 2);
3363
		if (p < 2)
3364
			return false;
3365
		
3366
		log_blk.m_grid_width = (uint8_t)W;
3367
		log_blk.m_grid_height = (uint8_t)H;
3368
		
3369
		log_blk.m_weight_ise_range = (uint8_t)((p - 2) + (P * BISE_10_LEVELS));
3370
		assert(log_blk.m_weight_ise_range <= LAST_VALID_WEIGHT_ISE_RANGE);
3371

3372
		log_blk.m_dual_plane = Dp;
3373

3374
		return true;
3375
	}
3376

3377
	// Assembles a 32-bit value from the 4 bytes at pBytes, least significant byte first.
	// Works for any alignment and on any host byte order.
	static inline uint32_t read_le_dword(const uint8_t* pBytes)
	{
		// Widen each byte to uint32_t before shifting: a uint8_t operand promotes to (signed) int,
		// and shifting a byte >= 0x80 left by 24 does not fit in that type.
		return ((uint32_t)pBytes[0]) |
			(((uint32_t)pBytes[1]) << 8) |
			(((uint32_t)pBytes[2]) << 16) |
			(((uint32_t)pBytes[3]) << 24);
	}
3381

3382
	// See 18.12.Integer Sequence Encoding - tables computed by executing the decoder functions with all possible 8/7-bit inputs.
3383
	// Trit block lookup: s_trit_decode[T][i] is the i-th of the five trits (each 0..2)
	// packed in the 8-bit value T. Used by decode_trit_block().
	static const uint8_t s_trit_decode[256][5] =
	{
		{0,0,0,0,0},{1,0,0,0,0},{2,0,0,0,0},{0,0,2,0,0},{0,1,0,0,0},{1,1,0,0,0},{2,1,0,0,0},{1,0,2,0,0}, // 0
		{0,2,0,0,0},{1,2,0,0,0},{2,2,0,0,0},{2,0,2,0,0},{0,2,2,0,0},{1,2,2,0,0},{2,2,2,0,0},{2,0,2,0,0}, // 8
		{0,0,1,0,0},{1,0,1,0,0},{2,0,1,0,0},{0,1,2,0,0},{0,1,1,0,0},{1,1,1,0,0},{2,1,1,0,0},{1,1,2,0,0}, // 16
		{0,2,1,0,0},{1,2,1,0,0},{2,2,1,0,0},{2,1,2,0,0},{0,0,0,2,2},{1,0,0,2,2},{2,0,0,2,2},{0,0,2,2,2}, // 24
		{0,0,0,1,0},{1,0,0,1,0},{2,0,0,1,0},{0,0,2,1,0},{0,1,0,1,0},{1,1,0,1,0},{2,1,0,1,0},{1,0,2,1,0}, // 32
		{0,2,0,1,0},{1,2,0,1,0},{2,2,0,1,0},{2,0,2,1,0},{0,2,2,1,0},{1,2,2,1,0},{2,2,2,1,0},{2,0,2,1,0}, // 40
		{0,0,1,1,0},{1,0,1,1,0},{2,0,1,1,0},{0,1,2,1,0},{0,1,1,1,0},{1,1,1,1,0},{2,1,1,1,0},{1,1,2,1,0}, // 48
		{0,2,1,1,0},{1,2,1,1,0},{2,2,1,1,0},{2,1,2,1,0},{0,1,0,2,2},{1,1,0,2,2},{2,1,0,2,2},{1,0,2,2,2}, // 56
		{0,0,0,2,0},{1,0,0,2,0},{2,0,0,2,0},{0,0,2,2,0},{0,1,0,2,0},{1,1,0,2,0},{2,1,0,2,0},{1,0,2,2,0}, // 64
		{0,2,0,2,0},{1,2,0,2,0},{2,2,0,2,0},{2,0,2,2,0},{0,2,2,2,0},{1,2,2,2,0},{2,2,2,2,0},{2,0,2,2,0}, // 72
		{0,0,1,2,0},{1,0,1,2,0},{2,0,1,2,0},{0,1,2,2,0},{0,1,1,2,0},{1,1,1,2,0},{2,1,1,2,0},{1,1,2,2,0}, // 80
		{0,2,1,2,0},{1,2,1,2,0},{2,2,1,2,0},{2,1,2,2,0},{0,2,0,2,2},{1,2,0,2,2},{2,2,0,2,2},{2,0,2,2,2}, // 88
		{0,0,0,0,2},{1,0,0,0,2},{2,0,0,0,2},{0,0,2,0,2},{0,1,0,0,2},{1,1,0,0,2},{2,1,0,0,2},{1,0,2,0,2}, // 96
		{0,2,0,0,2},{1,2,0,0,2},{2,2,0,0,2},{2,0,2,0,2},{0,2,2,0,2},{1,2,2,0,2},{2,2,2,0,2},{2,0,2,0,2}, // 104
		{0,0,1,0,2},{1,0,1,0,2},{2,0,1,0,2},{0,1,2,0,2},{0,1,1,0,2},{1,1,1,0,2},{2,1,1,0,2},{1,1,2,0,2}, // 112
		{0,2,1,0,2},{1,2,1,0,2},{2,2,1,0,2},{2,1,2,0,2},{0,2,2,2,2},{1,2,2,2,2},{2,2,2,2,2},{2,0,2,2,2}, // 120
		{0,0,0,0,1},{1,0,0,0,1},{2,0,0,0,1},{0,0,2,0,1},{0,1,0,0,1},{1,1,0,0,1},{2,1,0,0,1},{1,0,2,0,1}, // 128
		{0,2,0,0,1},{1,2,0,0,1},{2,2,0,0,1},{2,0,2,0,1},{0,2,2,0,1},{1,2,2,0,1},{2,2,2,0,1},{2,0,2,0,1}, // 136
		{0,0,1,0,1},{1,0,1,0,1},{2,0,1,0,1},{0,1,2,0,1},{0,1,1,0,1},{1,1,1,0,1},{2,1,1,0,1},{1,1,2,0,1}, // 144
		{0,2,1,0,1},{1,2,1,0,1},{2,2,1,0,1},{2,1,2,0,1},{0,0,1,2,2},{1,0,1,2,2},{2,0,1,2,2},{0,1,2,2,2}, // 152
		{0,0,0,1,1},{1,0,0,1,1},{2,0,0,1,1},{0,0,2,1,1},{0,1,0,1,1},{1,1,0,1,1},{2,1,0,1,1},{1,0,2,1,1}, // 160
		{0,2,0,1,1},{1,2,0,1,1},{2,2,0,1,1},{2,0,2,1,1},{0,2,2,1,1},{1,2,2,1,1},{2,2,2,1,1},{2,0,2,1,1}, // 168
		{0,0,1,1,1},{1,0,1,1,1},{2,0,1,1,1},{0,1,2,1,1},{0,1,1,1,1},{1,1,1,1,1},{2,1,1,1,1},{1,1,2,1,1}, // 176
		{0,2,1,1,1},{1,2,1,1,1},{2,2,1,1,1},{2,1,2,1,1},{0,1,1,2,2},{1,1,1,2,2},{2,1,1,2,2},{1,1,2,2,2}, // 184
		{0,0,0,2,1},{1,0,0,2,1},{2,0,0,2,1},{0,0,2,2,1},{0,1,0,2,1},{1,1,0,2,1},{2,1,0,2,1},{1,0,2,2,1}, // 192
		{0,2,0,2,1},{1,2,0,2,1},{2,2,0,2,1},{2,0,2,2,1},{0,2,2,2,1},{1,2,2,2,1},{2,2,2,2,1},{2,0,2,2,1}, // 200
		{0,0,1,2,1},{1,0,1,2,1},{2,0,1,2,1},{0,1,2,2,1},{0,1,1,2,1},{1,1,1,2,1},{2,1,1,2,1},{1,1,2,2,1}, // 208
		{0,2,1,2,1},{1,2,1,2,1},{2,2,1,2,1},{2,1,2,2,1},{0,2,1,2,2},{1,2,1,2,2},{2,2,1,2,2},{2,1,2,2,2}, // 216
		{0,0,0,1,2},{1,0,0,1,2},{2,0,0,1,2},{0,0,2,1,2},{0,1,0,1,2},{1,1,0,1,2},{2,1,0,1,2},{1,0,2,1,2}, // 224
		{0,2,0,1,2},{1,2,0,1,2},{2,2,0,1,2},{2,0,2,1,2},{0,2,2,1,2},{1,2,2,1,2},{2,2,2,1,2},{2,0,2,1,2}, // 232
		{0,0,1,1,2},{1,0,1,1,2},{2,0,1,1,2},{0,1,2,1,2},{0,1,1,1,2},{1,1,1,1,2},{2,1,1,1,2},{1,1,2,1,2}, // 240
		{0,2,1,1,2},{1,2,1,1,2},{2,2,1,1,2},{2,1,2,1,2},{0,2,2,2,2},{1,2,2,2,2},{2,2,2,2,2},{2,1,2,2,2}  // 248
	};
3418

3419
	// Quint block lookup: s_quint_decode[Q][i] is the i-th of the three quints (each 0..4)
	// packed in the 7-bit value Q. Used by decode_quint_block().
	static const uint8_t s_quint_decode[128][3] =
	{
		{0,0,0},{1,0,0},{2,0,0},{3,0,0},{4,0,0},{0,4,0},{4,4,0},{4,4,4}, // 0
		{0,1,0},{1,1,0},{2,1,0},{3,1,0},{4,1,0},{1,4,0},{4,4,1},{4,4,4}, // 8
		{0,2,0},{1,2,0},{2,2,0},{3,2,0},{4,2,0},{2,4,0},{4,4,2},{4,4,4}, // 16
		{0,3,0},{1,3,0},{2,3,0},{3,3,0},{4,3,0},{3,4,0},{4,4,3},{4,4,4}, // 24
		{0,0,1},{1,0,1},{2,0,1},{3,0,1},{4,0,1},{0,4,1},{4,0,4},{0,4,4}, // 32
		{0,1,1},{1,1,1},{2,1,1},{3,1,1},{4,1,1},{1,4,1},{4,1,4},{1,4,4}, // 40
		{0,2,1},{1,2,1},{2,2,1},{3,2,1},{4,2,1},{2,4,1},{4,2,4},{2,4,4}, // 48
		{0,3,1},{1,3,1},{2,3,1},{3,3,1},{4,3,1},{3,4,1},{4,3,4},{3,4,4}, // 56
		{0,0,2},{1,0,2},{2,0,2},{3,0,2},{4,0,2},{0,4,2},{2,0,4},{3,0,4}, // 64
		{0,1,2},{1,1,2},{2,1,2},{3,1,2},{4,1,2},{1,4,2},{2,1,4},{3,1,4}, // 72
		{0,2,2},{1,2,2},{2,2,2},{3,2,2},{4,2,2},{2,4,2},{2,2,4},{3,2,4}, // 80
		{0,3,2},{1,3,2},{2,3,2},{3,3,2},{4,3,2},{3,4,2},{2,3,4},{3,3,4}, // 88
		{0,0,3},{1,0,3},{2,0,3},{3,0,3},{4,0,3},{0,4,3},{0,0,4},{1,0,4}, // 96
		{0,1,3},{1,1,3},{2,1,3},{3,1,3},{4,1,3},{1,4,3},{0,1,4},{1,1,4}, // 104
		{0,2,3},{1,2,3},{2,2,3},{3,2,3},{4,2,3},{2,4,3},{0,2,4},{1,2,4}, // 112
		{0,3,3},{1,3,3},{2,3,3},{3,3,3},{4,3,3},{3,4,3},{0,3,4},{1,3,4}  // 120
	};
3438

3439
	static void decode_trit_block(uint8_t* pVals, uint32_t num_vals, const uint128& bits, uint32_t& bit_ofs, uint32_t bits_per_val)
3440
	{
3441
		assert((num_vals >= 1) && (num_vals <= 5));
3442
		uint32_t m[5] = { 0 }, T = 0;
3443

3444
		static const uint8_t s_t_bits[5] = { 2, 2, 1, 2, 1 };
3445

3446
		for (uint32_t T_ofs = 0, c = 0; c < num_vals; c++)
3447
		{
3448
			if (bits_per_val)
3449
				m[c] = bits.next_bits(bit_ofs, bits_per_val);
3450
			T |= (bits.next_bits(bit_ofs, s_t_bits[c]) << T_ofs);
3451
			T_ofs += s_t_bits[c];
3452
		}
3453

3454
		const uint8_t (&p_trits)[5] = s_trit_decode[T];
3455

3456
		for (uint32_t i = 0; i < num_vals; i++)
3457
			pVals[i] = (uint8_t)((p_trits[i] << bits_per_val) | m[i]);
3458
	}
3459

3460
	static void decode_quint_block(uint8_t* pVals, uint32_t num_vals, const uint128& bits, uint32_t& bit_ofs, uint32_t bits_per_val)
3461
	{
3462
		assert((num_vals >= 1) && (num_vals <= 3));
3463
		uint32_t m[3] = { 0 }, T = 0;
3464

3465
		static const uint8_t s_t_bits[3] = { 3, 2, 2 };
3466

3467
		for (uint32_t T_ofs = 0, c = 0; c < num_vals; c++)
3468
		{
3469
			if (bits_per_val)
3470
				m[c] = bits.next_bits(bit_ofs, bits_per_val);
3471
			T |= (bits.next_bits(bit_ofs, s_t_bits[c]) << T_ofs);
3472
			T_ofs += s_t_bits[c];
3473
		}
3474

3475
		const uint8_t (&p_quints)[3] = s_quint_decode[T];
3476

3477
		for (uint32_t i = 0; i < num_vals; i++)
3478
			pVals[i] = (uint8_t)((p_quints[i] << bits_per_val) | m[i]);
3479
	}
3480

3481
	static void decode_bise(uint32_t ise_range, uint8_t* pVals, uint32_t num_vals, const uint128& bits, uint32_t bit_ofs)
3482
	{
3483
		assert(num_vals && (ise_range < TOTAL_ISE_RANGES));
3484
		
3485
		const uint32_t bits_per_val = g_ise_range_table[ise_range][0];
3486

3487
		if (g_ise_range_table[ise_range][1])
3488
		{
3489
			// Trits+bits, 5 vals per block, 7 bits extra per block
3490
			const uint32_t total_blocks = (num_vals + 4) / 5;
3491
			for (uint32_t b = 0; b < total_blocks; b++)
3492
			{
3493
				const uint32_t num_vals_in_block = std::min<int>(num_vals - 5 * b, 5);
3494
				decode_trit_block(pVals + 5 * b, num_vals_in_block, bits, bit_ofs, bits_per_val);
3495
			}
3496
		}
3497
		else if (g_ise_range_table[ise_range][2])
3498
		{
3499
			// Quints+bits, 3 vals per block, 8 bits extra per block
3500
			const uint32_t total_blocks = (num_vals + 2) / 3;
3501
			for (uint32_t b = 0; b < total_blocks; b++)
3502
			{
3503
				const uint32_t num_vals_in_block = std::min<int>(num_vals - 3 * b, 3);
3504
				decode_quint_block(pVals + 3 * b, num_vals_in_block, bits, bit_ofs, bits_per_val);
3505
			}
3506
		}
3507
		else
3508
		{
3509
			assert(bits_per_val);
3510

3511
			// Only bits
3512
			for (uint32_t i = 0; i < num_vals; i++)
3513
				pVals[i] = (uint8_t)bits.next_bits(bit_ofs, bits_per_val);
3514
		}
3515
	}
3516

3517
	void decode_bise(uint32_t ise_range, uint8_t* pVals, uint32_t num_vals, const uint8_t* pBits128, uint32_t bit_ofs)
3518
	{
3519
		const uint128 bits(
3520
			(uint64_t)read_le_dword(pBits128) | (((uint64_t)read_le_dword(pBits128 + sizeof(uint32_t))) << 32),
3521
			(uint64_t)read_le_dword(pBits128 + sizeof(uint32_t) * 2) | (((uint64_t)read_le_dword(pBits128 + sizeof(uint32_t) * 3)) << 32));
3522

3523
		return decode_bise(ise_range, pVals, num_vals, bits, bit_ofs);
3524
	}
3525
		
3526
	// Decodes a physical ASTC block to a logical ASTC block.
3527
	// blk_width/blk_height are only used to validate the weight grid's dimensions.
3528
	bool unpack_block(const void* pASTC_block, log_astc_block& log_blk, uint32_t blk_width, uint32_t blk_height)
3529
	{
3530
		assert(is_valid_block_size(blk_width, blk_height));
3531
				
3532
		const uint8_t* pS = (uint8_t*)pASTC_block;
3533

3534
		log_blk.clear();
3535
		log_blk.m_error_flag = true;
3536
		
3537
		const uint128 bits(
3538
			(uint64_t)read_le_dword(pS) | (((uint64_t)read_le_dword(pS + sizeof(uint32_t))) << 32),
3539
			(uint64_t)read_le_dword(pS + sizeof(uint32_t) * 2) | (((uint64_t)read_le_dword(pS + sizeof(uint32_t) * 3)) << 32));
3540
		
3541
		const uint128 rev_bits(bits.get_reversed_bits());
3542
				
3543
		if (!decode_config(bits, log_blk))
3544
			return false;
3545

3546
		if (log_blk.m_solid_color_flag_hdr || log_blk.m_solid_color_flag_ldr)
3547
		{
3548
			// Void extent
3549
			log_blk.m_error_flag = false;
3550
			return true;
3551
		}
3552

3553
		// Check grid dimensions
3554
		if ((log_blk.m_grid_width > blk_width) || (log_blk.m_grid_height > blk_height))
3555
			return false;
3556
		
3557
		// Now we have the grid width/height, dual plane, weight ISE range
3558
		
3559
		const uint32_t total_grid_weights = (log_blk.m_dual_plane ? 2 : 1) * (log_blk.m_grid_width * log_blk.m_grid_height);
3560
		const uint32_t total_weight_bits = get_ise_sequence_bits(total_grid_weights, log_blk.m_weight_ise_range);
3561
				
3562
		// 18.24 Illegal Encodings
3563
		if ((!total_grid_weights) || (total_grid_weights > MAX_GRID_WEIGHTS) || (total_weight_bits < 24) || (total_weight_bits > 96))
3564
			return false;
3565
		
3566
		const uint32_t end_of_weight_bit_ofs = 128 - total_weight_bits;
3567

3568
		uint32_t total_extra_bits = 0;
3569

3570
		// Right before the weight bits, there may be extra CEM bits, then the 2 CCS bits if dual plane.
3571

3572
		log_blk.m_num_partitions = (uint8_t)(bits.get_bits(11, 2) + 1);
3573
		if (log_blk.m_num_partitions == 1)
3574
			log_blk.m_color_endpoint_modes[0] = (uint8_t)(bits.get_bits(13, 4)); // read CEM bits
3575
		else
3576
		{
3577
			// 2 or more partitions
3578
			if (log_blk.m_dual_plane && (log_blk.m_num_partitions == 4))
3579
				return false;
3580

3581
			log_blk.m_partition_id = (uint16_t)bits.get_bits(13, 10);
3582

3583
			uint32_t cem_bits = bits.get_bits(23, 6);
3584

3585
			if ((cem_bits & 3) == 0)
3586
			{
3587
				// All CEM's the same
3588
				for (uint32_t i = 0; i < log_blk.m_num_partitions; i++)
3589
					log_blk.m_color_endpoint_modes[i] = (uint8_t)(cem_bits >> 2);
3590
			}
3591
			else
3592
			{
3593
				// CEM's different, but within up to 2 adjacent classes
3594
				const uint32_t first_cem_index = ((cem_bits & 3) - 1) * 4;
3595

3596
				total_extra_bits = 3 * log_blk.m_num_partitions - 4;
3597

3598
				if ((total_weight_bits + total_extra_bits) > 128)
3599
					return false;
3600

3601
				uint32_t cem_bit_pos = end_of_weight_bit_ofs - total_extra_bits;
3602
				
3603
				uint32_t c[4] = { 0 }, m[4] = { 0 };
3604
				
3605
				cem_bits >>= 2;
3606
				for (uint32_t i = 0; i < log_blk.m_num_partitions; i++, cem_bits >>= 1)
3607
					c[i] = cem_bits & 1;
3608

3609
				switch (log_blk.m_num_partitions)
3610
				{
3611
				case 2:
3612
				{
3613
					m[0] = cem_bits & 3;
3614
					m[1] = bits.next_bits(cem_bit_pos, 2);
3615
					break;
3616
				}
3617
				case 3:
3618
				{
3619
					m[0] = cem_bits & 1;
3620
					m[0] |= (bits.next_bits(cem_bit_pos, 1) << 1);
3621
					m[1] = bits.next_bits(cem_bit_pos, 2);
3622
					m[2] = bits.next_bits(cem_bit_pos, 2);
3623
					break;
3624
				}
3625
				case 4:
3626
				{
3627
					for (uint32_t i = 0; i < 4; i++)
3628
						m[i] = bits.next_bits(cem_bit_pos, 2);
3629
					break;
3630
				}
3631
				default:
3632
				{
3633
					assert(0);
3634
					break;
3635
				}
3636
				}
3637

3638
				assert(cem_bit_pos == end_of_weight_bit_ofs);
3639

3640
				for (uint32_t i = 0; i < log_blk.m_num_partitions; i++)
3641
				{
3642
					log_blk.m_color_endpoint_modes[i] = (uint8_t)(first_cem_index + (c[i] * 4) + m[i]);
3643
					assert(log_blk.m_color_endpoint_modes[i] <= 15);
3644
				}
3645
			}
3646
		}
3647

3648
		// Now we have all the CEM indices.
3649

3650
		if (log_blk.m_dual_plane)
3651
		{
3652
			// Read CCS bits, beneath any CEM bits
3653
			total_extra_bits += 2;
3654

3655
			if (total_extra_bits > end_of_weight_bit_ofs)
3656
				return false;
3657

3658
			uint32_t ccs_bit_pos = end_of_weight_bit_ofs - total_extra_bits;
3659
			log_blk.m_color_component_selector = (uint8_t)(bits.get_bits(ccs_bit_pos, 2));
3660
		}
3661

3662
		uint32_t config_bit_pos = 11 + 2; // config+num_parts
3663
		if (log_blk.m_num_partitions == 1)
3664
			config_bit_pos += 4; // CEM bits
3665
		else
3666
			config_bit_pos += 10 + 6; // part_id+CEM bits
3667

3668
		// config+num_parts+total_extra_bits (CEM extra+CCS)
3669
		uint32_t total_config_bits = config_bit_pos + total_extra_bits;
3670
		
3671
		// Compute number of remaining bits in block
3672
		const int num_remaining_bits = 128 - (int)total_config_bits - (int)total_weight_bits;
3673
		if (num_remaining_bits < 0)
3674
			return false;
3675

3676
		// Compute total number of ISE encoded color endpoint mode values
3677
		uint32_t total_cem_vals = 0;
3678
		for (uint32_t j = 0; j < log_blk.m_num_partitions; j++)
3679
			total_cem_vals += get_num_cem_values(log_blk.m_color_endpoint_modes[j]);
3680

3681
		if (total_cem_vals > MAX_ENDPOINTS)
3682
			return false;
3683

3684
		// Infer endpoint ISE range based off the # of values we need to encode, and the # of remaining bits in the block
3685
		int endpoint_ise_range = -1;
3686
		for (int k = 20; k > 0; k--)
3687
		{
3688
			int b = get_ise_sequence_bits(total_cem_vals, k);
3689
			if (b <= num_remaining_bits)
3690
			{
3691
				endpoint_ise_range = k;
3692
				break;
3693
			}
3694
		}
3695

3696
		// See 23.24 Illegal Encodings, [0,5] is the minimum ISE encoding for endpoints
3697
		if (endpoint_ise_range < (int)FIRST_VALID_ENDPOINT_ISE_RANGE)
3698
			return false;
3699

3700
		log_blk.m_endpoint_ise_range = (uint8_t)endpoint_ise_range;
3701

3702
		// Decode endpoints forwards in block
3703
		decode_bise(log_blk.m_endpoint_ise_range, log_blk.m_endpoints, total_cem_vals, bits, config_bit_pos);
3704

3705
		// Decode grid weights backwards in block
3706
		decode_bise(log_blk.m_weight_ise_range, log_blk.m_weights, total_grid_weights, rev_bits, 0);
3707

3708
		log_blk.m_error_flag = false;
3709

3710
		return true;
3711
	}
3712
		
3713
} // namespace astc_helpers
3714

3715
#endif //BASISU_ASTC_HELPERS_IMPLEMENTATION
3716

3717
Product

Resources

Company