CoCalc -- meshletcodec.cpp

GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/meshoptimizer/meshletcodec.cpp
⁵⁹²⁰⁹ views
1
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
2
#include "meshoptimizer.h"
3

4
#include <assert.h>
5
#include <string.h>
6

7
// The block below auto-detects SIMD ISA that can be used on the target platform
8
#ifndef MESHOPTIMIZER_NO_SIMD
9

10
// The SIMD implementation requires SSE4.1, which can be enabled unconditionally through compiler settings
11
#if defined(__AVX__) || defined(__SSE4_1__)
12
#define SIMD_SSE
13
#endif
14

15
// MSVC supports compiling SSE4.1 code regardless of compile options; we use a cpuid-based scalar fallback
16
#if !defined(SIMD_SSE) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC)))
17
#define SIMD_SSE
18
#define SIMD_FALLBACK
19
#endif
20

21
// GCC 4.9+ and clang 3.8+ support targeting SIMD ISA from individual functions; we use a cpuid-based scalar fallback
22
#if !defined(SIMD_SSE) && ((defined(__clang__) && __clang_major__ * 100 + __clang_minor__ >= 308) || (defined(__GNUC__) && __GNUC__ * 100 + __GNUC_MINOR__ >= 409)) && (defined(__i386__) || defined(__x86_64__))
23
#define SIMD_SSE
24
#define SIMD_FALLBACK
25
#define SIMD_TARGET __attribute__((target("sse4.1")))
26
#endif
27

28
// When targeting AArch64, enable NEON SIMD unconditionally; we do not support SIMD decoding for 32-bit ARM
29
#if defined(__aarch64__) || (defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC)) && _MSC_VER >= 1922)
30
#define SIMD_NEON
31
#endif
32

33
#if defined(_MSC_VER) && !defined(__clang__) && _MSC_VER > 1930
34
#define SIMD_FLATTEN [[msvc::flatten]]
35
#elif defined(__GNUC__) || defined(__clang__)
36
#define SIMD_FLATTEN __attribute__((flatten))
37
#else
38
#define SIMD_FLATTEN
39
#endif
40

41
#ifndef SIMD_TARGET
42
#define SIMD_TARGET
43
#endif
44

45
#endif // !MESHOPTIMIZER_NO_SIMD
46

47
#ifdef SIMD_SSE
48
#include <smmintrin.h>
49
#endif
50

51
#ifdef SIMD_NEON
52
#include <arm_neon.h>
53
#endif
54

55
#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
56
#ifdef _MSC_VER
57
#include <intrin.h> // __cpuid
58
#else
59
#include <cpuid.h> // __cpuid
60
#endif
61
#endif
62

63
#ifndef TRACE
64
#define TRACE 0
65
#endif
66

67
#if TRACE
68
#include <stdio.h>
69
#endif
70

71
namespace meshopt
72
{
73

74
typedef unsigned int EdgeFifo8[8][2];
75

76
static int rotateTriangle(unsigned int a, unsigned int b, unsigned int c)
77
{
78
	return (a > b && a > c) ? 1 : (b > c ? 2 : 0);
79
}
80

81
static int getEdgeFifo8(EdgeFifo8 fifo, unsigned int a, unsigned int b, unsigned int c, size_t offset)
82
{
83
	for (int i = 0; i < 8; ++i)
84
	{
85
		size_t index = (offset - 1 - i) & 7;
86

87
		unsigned int e0 = fifo[index][0];
88
		unsigned int e1 = fifo[index][1];
89

90
		if (e0 == a && e1 == b)
91
			return (i << 2) | 0;
92
		if (e0 == b && e1 == c)
93
			return (i << 2) | 1;
94
		if (e0 == c && e1 == a)
95
			return (i << 2) | 2;
96
	}
97

98
	return -1;
99
}
100

101
static void pushEdgeFifo8(EdgeFifo8 fifo, unsigned int a, unsigned int b, size_t& offset)
102
{
103
	fifo[offset][0] = a;
104
	fifo[offset][1] = b;
105
	offset = (offset + 1) & 7;
106
}
107

108
static size_t encodeTriangles(unsigned char* codes, unsigned char* extra, const unsigned char* triangles, size_t triangle_count)
109
{
110
	EdgeFifo8 edgefifo;
111
	memset(edgefifo, -1, sizeof(edgefifo));
112

113
	size_t edgefifooffset = 0;
114

115
	unsigned int next = 0;
116

117
	// 4-bit triangle codes give us 16 options that we use as follows:
118
	// 3*2 edge reuse (2 edges * 3 last triangles) * 2 next/explicit = 12 options
119
	// 4 remaining options = next bits; 000, 001, 011, 111.
120
	// triangles are rotated to make next bits line up.
121
	memset(codes, 0, (triangle_count + 1) / 2);
122

123
	static const int rotations[] = {0, 1, 2, 0, 1};
124

125
	unsigned char* start = extra;
126

127
	for (size_t i = 0; i < triangle_count; ++i)
128
	{
129
#if TRACE > 1
130
		unsigned int last = next;
131
#endif
132

133
		int fer = getEdgeFifo8(edgefifo, triangles[i * 3 + 0], triangles[i * 3 + 1], triangles[i * 3 + 2], edgefifooffset);
134

135
		if (fer >= 0 && (fer >> 2) < 6)
136
		{
137
			// note: getEdgeFifo8 implicitly rotates triangles by matching a/b to existing edge
138
			const int* order = rotations + (fer & 3);
139

140
			unsigned int a = triangles[i * 3 + order[0]], b = triangles[i * 3 + order[1]], c = triangles[i * 3 + order[2]];
141

142
			int fec = (c == next) ? (next++, 0) : 1;
143

144
#if TRACE > 1
145
			printf("%3d+ | %3d %3d %3d | edge: e%d c%d\n", last, a, b, c, fer >> 2, fec);
146
#endif
147

148
			unsigned int code = (fer >> 2) * 2 + fec;
149

150
			codes[i / 2] |= (unsigned char)(code << ((i & 1) * 4));
151

152
			if (fec)
153
				*extra++ = (unsigned char)c;
154

155
			pushEdgeFifo8(edgefifo, c, b, edgefifooffset);
156
			pushEdgeFifo8(edgefifo, a, c, edgefifooffset);
157
		}
158
		else
159
		{
160
			// rotate triangles to minimize the need for extra vertices
161
			int rotation = rotateTriangle(triangles[i * 3 + 0], triangles[i * 3 + 1], triangles[i * 3 + 2]);
162
			const int* order = rotations + rotation;
163

164
			unsigned int a = triangles[i * 3 + order[0]], b = triangles[i * 3 + order[1]], c = triangles[i * 3 + order[2]];
165

166
			// fe must be continuous: once a vertex is encoded with next, further vertices must also be encoded with next
167
			int fea = (a == next && b == next + 1 && c == next + 2) ? (next++, 0) : 1;
168
			int feb = (b == next && c == next + 1) ? (next++, 0) : 1;
169
			int fec = (c == next) ? (next++, 0) : 1;
170

171
			assert(fea == 1 || feb == 0);
172
			assert(feb == 1 || fec == 0);
173

174
#if TRACE > 1
175
			printf("%3d+ | %3d %3d %3d | restart: %d%d%d\n", last, a, b, c, fea, feb, fec);
176
#endif
177

178
			unsigned int code = 12 + (fea + feb + fec);
179

180
			codes[i / 2] |= (unsigned char)(code << ((i & 1) * 4));
181

182
			if (fea)
183
				*extra++ = (unsigned char)a;
184
			if (feb)
185
				*extra++ = (unsigned char)b;
186
			if (fec)
187
				*extra++ = (unsigned char)c;
188

189
			pushEdgeFifo8(edgefifo, c, b, edgefifooffset);
190
			pushEdgeFifo8(edgefifo, a, c, edgefifooffset);
191
		}
192
	}
193

194
	return extra - start;
195
}
196

197
static size_t encodeVertices(unsigned char* ctrl, unsigned char* data, const unsigned int* vertices, size_t vertex_count)
198
{
199
	// grouped varint, 2 bit per value to indicate 0/1/2/3 byte deltas, with per-group 4-byte fallback
200
	memset(ctrl, 0, (vertex_count + 3) / 4);
201

202
	unsigned char* start = data;
203

204
	unsigned int last = ~0u;
205

206
	for (size_t i = 0; i < vertex_count; i += 4)
207
	{
208
		unsigned int gv[4] = {};
209

210
		for (int k = 0; k < 4 && i + k < vertex_count; ++k)
211
		{
212
			unsigned int d = vertices[i + k] - last - 1;
213
			unsigned int v = (d << 1) ^ (int(d) >> 31);
214

215
			gv[k] = v;
216
			last = vertices[i + k];
217
		}
218

219
		// if any value needs 4 bytes, or if *all* values need 3 bytes, we use 4 bytes for all values
220
		// this allows us to encode most 3-byte deltas with 3 bytes which saves space overall
221
		bool use4 = (gv[0] | gv[1] | gv[2] | gv[3]) > 0xffffff || (gv[0] > 0xffff && gv[1] > 0xffff && gv[2] > 0xffff && gv[3] > 0xffff);
222

223
		for (int k = 0; k < 4; ++k)
224
		{
225
			unsigned int v = gv[k];
226

227
			// 0/1/2/3 bytes per value, or all 4 values use 4 bytes
228
			int code = use4 ? 3 : (v == 0 ? 0 : (v < 256 ? 1 : (v < 65536 ? 2 : 3)));
229

230
			if (code > 0)
231
				*data++ = (unsigned char)(v & 0xff);
232
			if (code > 1)
233
				*data++ = (unsigned char)((v >> 8) & 0xff);
234
			if (code > 2)
235
				*data++ = (unsigned char)((v >> 16) & 0xff);
236
			if (use4)
237
				*data++ = (unsigned char)((v >> 24) & 0xff);
238

239
			// split low and high bits into two nibbles for better packing
240
			ctrl[i / 4] |= ((code & 1) << k) | ((code >> 1) << (k + 4));
241
		}
242
	}
243

244
	return data - start;
245
}
246

247
#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON))
248
inline void writeTriangle(unsigned int* triangles, size_t i, unsigned int fifo)
249
{
250
	// output triangle is stored without extra edge vertex (0xcbac => 0xcba)
251
	triangles[i] = fifo >> 8;
252
}
253

254
inline void writeTriangle(unsigned char* triangles, size_t i, unsigned int fifo)
255
{
256
	triangles[i * 3 + 0] = (unsigned char)(fifo >> 8);
257
	triangles[i * 3 + 1] = (unsigned char)(fifo >> 16);
258
	triangles[i * 3 + 2] = (unsigned char)(fifo >> 24);
259
}
260

261
template <typename T>
262
static const unsigned char* decodeTriangles(T* triangles, const unsigned char* codes, const unsigned char* extra, const unsigned char* bound, size_t triangle_count)
263
{
264
	// branchlessly read next or extra vertex and advance pointers
265
#define NEXT(var, ec) \
266
	e = *extra; \
267
	unsigned int var = (ec) ? e : next; \
268
	extra += (ec), next += 1 - (ec)
269

270
	unsigned int next = 0;
271
	unsigned int fifo[3] = {}; // two edge fifo entries in one uint: 0xcbac
272

273
	for (size_t i = 0; i < triangle_count; ++i)
274
	{
275
		if (extra > bound)
276
			return NULL;
277

278
		unsigned int code = (codes[i / 2] >> ((i & 1) * 4)) & 0xF;
279
		unsigned int tri;
280

281
		if (code < 12)
282
		{
283
			// reuse
284
			unsigned int edge = fifo[code / 4];
285
			edge >>= (code << 3) & 16; // shift by 16 if bit 1 is set (odd edge for each triangle)
286

287
			// 0-1 extra vertices
288
			unsigned int e;
289
			NEXT(c, code & 1);
290

291
			// repack triangle into edge format (0xcbac)
292
			tri = ((edge & 0xff) << 16) | (edge & 0xff00) | c | (c << 24);
293
		}
294
		else
295
		{
296
			// restart
297
			int fea = code > 12;
298
			int feb = code > 13;
299
			int fec = code > 14;
300

301
			// 0-3 extra vertices
302
			unsigned int e;
303
			NEXT(a, fea);
304
			NEXT(b, feb);
305
			NEXT(c, fec);
306

307
			// repack triangle into edge format (0xcbac)
308
			tri = c | (a << 8) | (b << 16) | (c << 24);
309
		}
310

311
		writeTriangle(triangles, i, tri);
312

313
		fifo[2] = fifo[1];
314
		fifo[1] = fifo[0];
315
		fifo[0] = tri;
316
	}
317

318
	return extra;
319

320
#undef NEXT
321
}
322

323
template <typename V>
324
static const unsigned char* decodeVertices(V* vertices, const unsigned char* ctrl, const unsigned char* data, const unsigned char* bound, size_t vertex_count)
325
{
326
	unsigned int last = ~0u;
327

328
	for (size_t i = 0; i < vertex_count; i += 4)
329
	{
330
		if (data > bound)
331
			return NULL;
332

333
		unsigned char code4 = ctrl[i / 4];
334

335
		for (int k = 0; k < 4; ++k)
336
		{
337
			int code = ((code4 >> k) & 1) | ((code4 >> (k + 3)) & 2);
338
			int length = code4 == 0xff ? 4 : code;
339

340
			// branchlessly read up to 4 bytes
341
			unsigned int mask = (length == 4) ? ~0u : (1 << (8 * length)) - 1;
342
			unsigned int v = (data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24)) & mask;
343

344
			// unzigzag + 1
345
			unsigned int d = (v >> 1) ^ -int(v & 1);
346
			unsigned int r = last + d + 1;
347

348
			if (i + k < vertex_count)
349
				vertices[i + k] = V(r);
350

351
			data += length;
352
			last = r;
353
		}
354
	}
355

356
	return data;
357
}
358

359
static int decodeMeshlet(void* vertices, void* triangles, const unsigned char* codes, const unsigned char* ctrl, const unsigned char* data, const unsigned char* bound, size_t vertex_count, size_t triangle_count, size_t vertex_size, size_t triangle_size)
360
{
361
	if (vertex_size == 4)
362
		data = decodeVertices(static_cast<unsigned int*>(vertices), ctrl, data, bound, vertex_count);
363
	else
364
		data = decodeVertices(static_cast<unsigned short*>(vertices), ctrl, data, bound, vertex_count);
365
	if (!data)
366
		return -2;
367

368
	if (triangle_size == 4)
369
		data = decodeTriangles(static_cast<unsigned int*>(triangles), codes, data, bound, triangle_count);
370
	else
371
		data = decodeTriangles(static_cast<unsigned char*>(triangles), codes, data, bound, triangle_count);
372
	if (!data)
373
		return -2;
374

375
	return (data == bound) ? 0 : -3;
376
}
377
#endif
378

379
#if defined(SIMD_SSE) || defined(SIMD_NEON)
380
// SIMD state is stored in a single 16b register as follows:
381
// 0..5: 6 next extra bytes
382
// 6..14: 9 bytes = 3 triangles worth of index data
383
// 15: 'next' byte
384

385
// upon reading each triangle pair we need to transform this state such that the 9 bytes with triangle data contain the newly decoded triangles,
386
// which is a permutation of original state modulo per-element additions
387
// this transform can be chained to decode second triangle from original state; we create tables for 256 combinations of two 4-bit triangle codes
388
// the actual decoding becomes shuffle+add per triangle pair, plus management of extra bytes
389
static unsigned char kDecodeTableMasks[256][16];
390
static unsigned char kDecodeTableExtra[256];
391

392
// for SIMD vertex decoding we need to unpack 4 values with 0-4 bytes in each
393
// this can be done with a single control-dependent shuffle per group
394
static unsigned char kDecodeTableVerts[256][16];
395
static unsigned char kDecodeTableLength[256];
396

397
static bool decodeBuildTables()
398
{
399
#define NEXT(var, ec) \
400
	shuf[var] = (ec) ? (unsigned char)extra : 15; \
401
	next[var] = (ec) ? 0 : (unsigned char)nextoff; \
402
	extra += (ec), nextoff += 1 - (ec)
403

404
	// check for SSE4.1 support if we have a fallback path
405
#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
406
	int cpuinfo[4] = {};
407
#ifdef _MSC_VER
408
	__cpuid(cpuinfo, 1);
409
#else
410
	__cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
411
#endif
412
	// bit 19 = SSE4.1
413
	if ((cpuinfo[2] & (1 << 19)) == 0)
414
		return false;
415
#endif
416

417
	// fill triangle decoding tables for each combination of two triangle codes
418
	for (int code = 0; code < 256; ++code)
419
	{
420
		unsigned char shuf[16] = {};
421
		unsigned char next[16] = {};
422
		int extra = 0;
423
		int nextoff = 0;
424

425
		// state 0..5 will be refilled every iteration, so we ignore that
426
		// state 6..8 will always contain the last decoded triangle because every triangle shifts fifo equally, so we can decode it independently
427
		shuf[6] = 12;
428
		shuf[7] = 13;
429
		shuf[8] = 14;
430

431
		// state 15 will contain next (potentially incremented a few times)
432
		shuf[15] = 15;
433

434
		// state 9..11 will contain the first decoded triangle (tri0), which can refer to extra/next and the original triangle history
435
		// state 12..14 will contain the second decoded triangle (tri1); when decoding edge reuse, we need to handle edge 0/1 specially as it was just decoded earlier
436
		for (int k = 0; k < 2; ++k)
437
		{
438
			int tri = (code >> (k * 4)) & 0xf;
439

440
			if (tri < 12)
441
			{
442
				if (k == 1 && tri / 4 == 0)
443
				{
444
					// we need to decode one of two edges from the triangle we just decoded earlier
445
					// for that we simply need to copy shuf/next values for the two decoded indices
446
					shuf[9 + k * 3] = shuf[9 + ((tri & 2) ? 2 : 0)];
447
					next[9 + k * 3] = next[9 + ((tri & 2) ? 2 : 0)];
448

449
					shuf[10 + k * 3] = shuf[9 + ((tri & 2) ? 1 : 2)];
450
					next[10 + k * 3] = next[9 + ((tri & 2) ? 1 : 2)];
451
				}
452
				else
453
				{
454
					// reuse: edge comes from the history based on edge index
455
					// note: we reuse with an offset because last triangle in the original history was consumed by tri0
456
					int trioff = 6 + k * 3 + (2 - tri / 4) * 3;
457

458
					// edge cb or ac
459
					shuf[9 + k * 3] = (unsigned char)(trioff + ((tri & 2) ? 2 : 0));
460
					shuf[10 + k * 3] = (unsigned char)(trioff + ((tri & 2) ? 1 : 2));
461
				}
462

463
				// third vertex is either next or comes from extra
464
				NEXT(11 + k * 3, tri & 1);
465
			}
466
			else
467
			{
468
				// restart: three vertices, each comes from next or extra
469
				int fea = tri > 12;
470
				int feb = tri > 13;
471
				int fec = tri > 14;
472

473
				NEXT(9 + k * 3, fea);
474
				NEXT(10 + k * 3, feb);
475
				NEXT(11 + k * 3, fec);
476
			}
477
		}
478

479
		// next needs to advance
480
		next[15] = (unsigned char)nextoff;
481

482
		// next[0..8] = 0 trivially (never written to); next[9] must also be 0 because nextoff is 0 initially
483
		// shuf[0..5] is not used, which allows us to pack next[10..15] + shuf[6..15] into a single 16-byte entry
484
		assert(next[9] == 0);
485
		memcpy(&kDecodeTableMasks[code][0], &next[10], 6);
486
		memcpy(&kDecodeTableMasks[code][6], &shuf[6], 10);
487
		kDecodeTableExtra[code] = (unsigned char)extra;
488
	}
489

490
	// fill vertex decoding tables for each combination of four vertex references
491
	for (unsigned int i = 0; i < 256; ++i)
492
	{
493
		unsigned char shuf[16] = {};
494
		int offset = 0;
495

496
		for (int k = 0; k < 4; ++k)
497
		{
498
			int code = ((i >> k) & 1) | ((i >> (k + 3)) & 2);
499
			int length = i == 0xff ? 4 : code; // 0/1/2/3 bytes, or all 4 bytes if code==0xff
500

501
			shuf[k * 4 + 0] = (length > 0) ? (unsigned char)(offset + 0) : 0x80;
502
			shuf[k * 4 + 1] = (length > 1) ? (unsigned char)(offset + 1) : 0x80;
503
			shuf[k * 4 + 2] = (length > 2) ? (unsigned char)(offset + 2) : 0x80;
504
			shuf[k * 4 + 3] = (length > 3) ? (unsigned char)(offset + 3) : 0x80;
505

506
			offset += length;
507
		}
508

509
		memcpy(kDecodeTableVerts[i], shuf, sizeof(shuf));
510
		kDecodeTableLength[i] = (unsigned char)offset;
511
	}
512

513
	return true;
514

515
#undef NEXT
516
}
517

518
static bool gDecodeTablesInitialized = decodeBuildTables();
519
#endif
520

521
#if defined(SIMD_SSE)
522
SIMD_TARGET
523
inline __m128i decodeTriangleGroup(__m128i state, unsigned char code, const unsigned char*& extra)
524
{
525
	__m128i shuf = _mm_loadu_si128(reinterpret_cast<const __m128i*>(kDecodeTableMasks[code]));
526
	__m128i next = _mm_slli_si128(shuf, 10);
527

528
	// patch first 6 bytes with current extra and roll state forward
529
	__m128i ext = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(extra));
530
	state = _mm_blend_epi16(state, ext, 7);
531
	state = _mm_add_epi8(_mm_shuffle_epi8(state, shuf), next);
532

533
	extra += kDecodeTableExtra[code];
534

535
	return state;
536
}
537

538
SIMD_TARGET
539
inline __m128i decodeVertexGroup(__m128i last, unsigned char code, const unsigned char*& data)
540
{
541
	__m128i word = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
542
	__m128i shuf = _mm_loadu_si128(reinterpret_cast<const __m128i*>(kDecodeTableVerts[code]));
543

544
	__m128i v = _mm_shuffle_epi8(word, shuf);
545

546
	// unzigzag+1
547
	__m128i xl = _mm_sub_epi32(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi32(1)));
548
	__m128i xr = _mm_srli_epi32(v, 1);
549
	__m128i x = _mm_add_epi32(_mm_xor_si128(xl, xr), _mm_set1_epi32(1));
550

551
	// prefix sum
552
	x = _mm_add_epi32(x, _mm_slli_si128(x, 8));
553
	x = _mm_add_epi32(x, _mm_slli_si128(x, 4));
554
	x = _mm_add_epi32(x, _mm_shuffle_epi32(last, 0xff));
555

556
	data += kDecodeTableLength[code];
557

558
	return x;
559
}
560
#endif
561

562
#if defined(SIMD_NEON)
563
SIMD_TARGET
564
inline uint8x16_t decodeTriangleGroup(uint8x16_t state, unsigned char code, const unsigned char*& extra)
565
{
566
	uint8x16_t shuf = vld1q_u8(kDecodeTableMasks[code]);
567
	uint8x16_t next = vextq_u8(vdupq_n_u8(0), shuf, 6);
568

569
	// patch first 6 bytes with current extra and roll state forward
570
	uint8x8_t extl = vld1_u8(extra);
571
	uint8x16_t ext = vcombine_u8(extl, vdup_n_u8(0));
572
	state = vbslq_u8(vcombine_u8(vcreate_u8(0xffffffffffffull), vdup_n_u8(0)), ext, state);
573
	state = vaddq_u8(vqtbl1q_u8(state, shuf), next);
574

575
	extra += kDecodeTableExtra[code];
576

577
	return state;
578
}
579

580
SIMD_TARGET
581
inline uint32x4_t decodeVertexGroup(uint32x4_t last, unsigned char code, const unsigned char*& data)
582
{
583
	uint8x16_t word = vld1q_u8(data);
584
	uint8x16_t shuf = vld1q_u8(kDecodeTableVerts[code]);
585

586
	uint32x4_t v = vreinterpretq_u32_u8(vqtbl1q_u8(word, shuf));
587

588
	// unzigzag+1
589
	uint32x4_t xl = vsubq_u32(vdupq_n_u32(0), vandq_u32(v, vdupq_n_u32(1)));
590
	uint32x4_t xr = vshrq_n_u32(v, 1);
591
	uint32x4_t x = vaddq_u32(veorq_u32(xl, xr), vdupq_n_u32(1));
592

593
	// prefix sum
594
	x = vaddq_u32(x, vextq_u32(vdupq_n_u32(0), x, 2));
595
	x = vaddq_u32(x, vextq_u32(vdupq_n_u32(0), x, 3));
596
	x = vaddq_u32(x, vdupq_n_u32(vgetq_lane_u32(last, 3)));
597

598
	data += kDecodeTableLength[code];
599

600
	return x;
601
}
602
#endif
603

604
#if defined(SIMD_SSE)
605
#ifdef __GNUC__
606
typedef int __attribute__((aligned(1))) unaligned_int;
607
#else
608
typedef int unaligned_int;
609
#endif
610
#endif
611

612
#if defined(SIMD_SSE) || defined(SIMD_NEON)
613
SIMD_TARGET
614
static const unsigned char* decodeTrianglesSimd(unsigned int* triangles, const unsigned char* codes, const unsigned char* extra, const unsigned char* bound, size_t triangle_count)
615
{
616
#if defined(SIMD_SSE)
617
	__m128i repack = _mm_setr_epi8(9, 10, 11, -1, 12, 13, 14, -1, 0, 0, 0, 0, 0, 0, 0, 0);
618
	__m128i state = _mm_setzero_si128();
619
#elif defined(SIMD_NEON)
620
	uint8x8_t repack = vcreate_u8(0xff0e0d0cff0b0a09ull);
621
	uint8x16_t state = vdupq_n_u8(0);
622
#endif
623

624
	size_t groups = triangle_count / 2;
625

626
	// process all complete groups
627
	for (size_t i = 0; i < groups; ++i)
628
	{
629
		unsigned char code = *codes++;
630

631
		if (extra > bound)
632
			return NULL;
633

634
		state = decodeTriangleGroup(state, code, extra);
635

636
		// write 6 bytes of new triangle data into output, formatted as 8 bytes with 0 padding
637
#if defined(SIMD_SSE)
638
		__m128i r = _mm_shuffle_epi8(state, repack);
639
		_mm_storel_epi64(reinterpret_cast<__m128i*>(&triangles[i * 2]), r);
640
#elif defined(SIMD_NEON)
641
		uint32x2_t r = vreinterpret_u32_u8(vqtbl1_u8(state, repack));
642
		vst1_u32(&triangles[i * 2], r);
643
#endif
644
	}
645

646
	// process a 1 triangle tail; to maintain the memory safety guarantee we have to write a 32-bit element
647
	if (triangle_count & 1)
648
	{
649
		unsigned char code = *codes++;
650

651
		if (extra > bound)
652
			return NULL;
653

654
		state = decodeTriangleGroup(state, code, extra);
655

656
		unsigned int* tail = &triangles[triangle_count & ~1u];
657

658
#if defined(SIMD_SSE)
659
		__m128i r = _mm_shuffle_epi8(state, repack);
660
		*tail = unsigned(_mm_cvtsi128_si32(r));
661
#elif defined(SIMD_NEON)
662
		uint32x2_t r = vreinterpret_u32_u8(vqtbl1_u8(state, repack));
663
		vst1_lane_u32(tail, r, 0);
664
#endif
665
	}
666

667
	return extra;
668
}
669

670
SIMD_TARGET
671
static const unsigned char* decodeTrianglesSimd(unsigned char* triangles, const unsigned char* codes, const unsigned char* extra, const unsigned char* bound, size_t triangle_count)
672
{
673
#if defined(SIMD_SSE)
674
	__m128i state = _mm_setzero_si128();
675
#elif defined(SIMD_NEON)
676
	uint8x16_t state = vdupq_n_u8(0);
677
#endif
678

679
	// because the output buffer is guaranteed to have 32-bit aligned size available, we can optimize writes and tail processing
680
	// instead of processing triangles 2 at a time, we process 2 *pairs* at a time (12-byte write) followed by a tail pair, if present
681
	// if the number of triangles mod 4 is 3, we'd normally need to write 12k+9 bytes, but we can instead overwrite up to 3 bytes in the main loop
682
	size_t groups = (triangle_count + 1) / 4;
683

684
	// process all complete groups
685
	for (size_t i = 0; i < groups; ++i)
686
	{
687
		unsigned char code0 = *codes++;
688
		unsigned char code1 = *codes++;
689

690
		// each triangle pair reads <=6 bytes from extra, so two pairs need <=12 bytes and gap guarantees 16 byte of overread
691
		if (extra > bound)
692
			return NULL;
693

694
		state = decodeTriangleGroup(state, code0, extra);
695

696
		// write first decoded triangle and first index of second decoded triangle
697
#if defined(SIMD_SSE)
698
		__m128i r0 = _mm_srli_si128(state, 9);
699
		*reinterpret_cast<unaligned_int*>(&triangles[i * 12]) = _mm_cvtsi128_si32(r0);
700
#elif defined(SIMD_NEON)
701
		uint8x16_t r0 = vextq_u8(state, vdupq_n_u8(0), 9);
702
		vst1q_lane_u32(reinterpret_cast<unsigned int*>(&triangles[i * 12]), vreinterpretq_u32_u8(r0), 0);
703
#endif
704

705
		state = decodeTriangleGroup(state, code1, extra);
706

707
		// write last two indices of second decoded triangle that we didn't write above plus two new ones
708
		// note that the second decoded triangle has shifted down to 6-8 bytes, hence shift by 7
709
#if defined(SIMD_SSE)
710
		__m128i r1 = _mm_srli_si128(state, 7);
711
		_mm_storel_epi64(reinterpret_cast<__m128i*>(&triangles[i * 12 + 4]), r1);
712
#elif defined(SIMD_NEON)
713
		uint8x16_t r1 = vextq_u8(state, vdupq_n_u8(0), 7);
714
		vst1_u8(&triangles[i * 12 + 4], vget_low_u8(r1));
715
#endif
716
	}
717

718
	// process a 1-2 triangle tail; to maintain the memory safety guarantee we have to write 1-2 32-bit elements
719
	if (groups * 4 < triangle_count)
720
	{
721
		unsigned char code = *codes++;
722

723
		if (extra > bound)
724
			return NULL;
725

726
		state = decodeTriangleGroup(state, code, extra);
727

728
		unsigned char* tail = &triangles[(triangle_count & ~3u) * 3];
729

730
#if defined(SIMD_SSE)
731
		__m128i r = _mm_srli_si128(state, 9);
732

733
		*reinterpret_cast<unaligned_int*>(tail) = _mm_cvtsi128_si32(r);
734
		if ((triangle_count & 3) > 1)
735
			*reinterpret_cast<unaligned_int*>(tail + 4) = _mm_extract_epi32(r, 1);
736
#elif defined(SIMD_NEON)
737
		uint8x16_t r = vextq_u8(state, vdupq_n_u8(0), 9);
738

739
		vst1q_lane_u32(reinterpret_cast<unsigned int*>(tail), vreinterpretq_u32_u8(r), 0);
740
		if ((triangle_count & 3) > 1)
741
			vst1q_lane_u32(reinterpret_cast<unsigned int*>(tail + 4), vreinterpretq_u32_u8(r), 1);
742
#endif
743
	}
744

745
	return extra;
746
}
747

748
SIMD_TARGET
749
static const unsigned char* decodeVerticesSimd(unsigned int* vertices, const unsigned char* ctrl, const unsigned char* data, const unsigned char* bound, size_t vertex_count)
750
{
751
#if defined(SIMD_SSE)
752
	__m128i last = _mm_set1_epi32(-1);
753
#elif defined(SIMD_NEON)
754
	uint32x4_t last = vdupq_n_u32(~0u);
755
#endif
756

757
	size_t groups = vertex_count / 4;
758

759
	// process all complete groups
760
	for (size_t i = 0; i < groups; ++i)
761
	{
762
		unsigned char code = *ctrl++;
763
		if (data > bound)
764
			return NULL;
765

766
		last = decodeVertexGroup(last, code, data);
767

768
#if defined(SIMD_SSE)
769
		_mm_storeu_si128(reinterpret_cast<__m128i*>(&vertices[i * 4]), last);
770
#elif defined(SIMD_NEON)
771
		vst1q_u32(&vertices[i * 4], last);
772
#endif
773
	}
774

775
	// process a 1-3 vertex tail; to maintain the memory safety guarantee we have to write individual elements
776
	if (vertex_count & 3)
777
	{
778
		unsigned char code = *ctrl++;
779

780
		if (data > bound)
781
			return NULL;
782

783
		last = decodeVertexGroup(last, code, data);
784

785
		unsigned int* tail = &vertices[vertex_count & ~3u];
786

787
#if defined(SIMD_SSE)
788
		tail[0] = _mm_cvtsi128_si32(last);
789
		if ((vertex_count & 3) > 1)
790
			tail[1] = _mm_extract_epi32(last, 1);
791
		if ((vertex_count & 3) > 2)
792
			tail[2] = _mm_extract_epi32(last, 2);
793
#elif defined(SIMD_NEON)
794
		vst1q_lane_u32(&tail[0], last, 0);
795
		if ((vertex_count & 3) > 1)
796
			vst1q_lane_u32(&tail[1], last, 1);
797
		if ((vertex_count & 3) > 2)
798
			vst1q_lane_u32(&tail[2], last, 2);
799
#endif
800
	}
801

802
	return data;
803
}
804

805
SIMD_TARGET
806
static const unsigned char* decodeVerticesSimd(unsigned short* vertices, const unsigned char* ctrl, const unsigned char* data, const unsigned char* bound, size_t vertex_count)
807
{
808
#if defined(SIMD_SSE)
809
	__m128i repack = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 0, 0, 0, 0, 0, 0, 0);
810
	__m128i last = _mm_set1_epi32(-1);
811
#elif defined(SIMD_NEON)
812
	uint32x4_t last = vdupq_n_u32(~0u);
813
#endif
814

815
	// because the output buffer is guaranteed to have 32-bit aligned size available, we can simplify tail processing
816
	// if the number of vertices mod 4 is 3, we'd normally need to write 8+6 bytes, but we can instead overwrite up to 2 bytes in the main loop
817
	size_t groups = (vertex_count + 1) / 4;
818

819
	// process all complete groups
820
	for (size_t i = 0; i < groups; ++i)
821
	{
822
		unsigned char code = *ctrl++;
823

824
		if (data > bound)
825
			return NULL;
826

827
		last = decodeVertexGroup(last, code, data);
828

829
#if defined(SIMD_SSE)
830
		__m128i r = _mm_shuffle_epi8(last, repack);
831
		_mm_storel_epi64(reinterpret_cast<__m128i*>(&vertices[i * 4]), r);
832
#elif defined(SIMD_NEON)
833
		uint16x4_t r = vmovn_u32(last);
834
		vst1_u16(&vertices[i * 4], r);
835
#endif
836
	}
837

838
	// process a 1-2 vertex tail; to maintain the memory safety guarantee we have to write a 32-bit element
839
	if (groups * 4 < vertex_count)
840
	{
841
		unsigned char code = *ctrl++;
842

843
		if (data > bound)
844
			return NULL;
845

846
		last = decodeVertexGroup(last, code, data);
847

848
		unsigned short* tail = &vertices[vertex_count & ~3u];
849

850
#if defined(SIMD_SSE)
851
		__m128i r = _mm_shufflelo_epi16(last, 8);
852
		*reinterpret_cast<unaligned_int*>(tail) = _mm_cvtsi128_si32(r);
853
#elif defined(SIMD_NEON)
854
		uint16x4_t r = vmovn_u32(last);
855
		vst1_lane_u32(reinterpret_cast<unsigned int*>(tail), vreinterpret_u32_u16(r), 0);
856
#endif
857
	}
858

859
	return data;
860
}
861

862
template <int Raw>
863
SIMD_TARGET SIMD_FLATTEN static int
864
decodeMeshletSimd(void* vertices, void* triangles, const unsigned char* codes, const unsigned char* ctrl, const unsigned char* data, const unsigned char* bound, size_t vertex_count, size_t triangle_count, size_t vertex_size, size_t triangle_size)
865
{
866
	assert(gDecodeTablesInitialized);
867
	(void)gDecodeTablesInitialized;
868

869
#ifdef __clang__
870
	// data is guaranteed to be non-null initially; if decode loops never hit bounds errors, it remains non-null
871
	__builtin_assume(data);
872
#endif
873

874
	// decodes 4 vertices at a time with tail processing; writes up to align(vertex_size * vertex_count, 4)
875
	// raw decoding skips tail processing by rounding up vertex count; it's safe because output buffer is guaranteed to have extra space, and tail control data is 0
876
	if (vertex_size == 4 || Raw)
877
		data = decodeVerticesSimd(static_cast<unsigned int*>(vertices), ctrl, data, bound, Raw ? (vertex_count + 3) & ~3 : vertex_count);
878
	else
879
		data = decodeVerticesSimd(static_cast<unsigned short*>(vertices), ctrl, data, bound, vertex_count);
880
	if (!data)
881
		return -2;
882

883
	// decodes 2/4 triangles at a time with tail processing; writes up to align(triangle_size * triangle_count, 4)
884
	// raw decoding skips tail processing by rounding up triangle count; it's safe because output buffer is guaranteed to have extra space, and tail code data is 0
885
	if (triangle_size == 4 || Raw)
886
		data = decodeTrianglesSimd(static_cast<unsigned int*>(triangles), codes, data, bound, Raw ? (triangle_count + 1) & ~1 : triangle_count);
887
	else
888
		data = decodeTrianglesSimd(static_cast<unsigned char*>(triangles), codes, data, bound, triangle_count);
889
	if (!data)
890
		return -2;
891

892
	return (data == bound) ? 0 : -3;
893
}
894
#endif
895

896
} // namespace meshopt
897

898
size_t meshopt_encodeMeshletBound(size_t max_vertices, size_t max_triangles)
899
{
900
	size_t codes_size = (max_triangles + 1) / 2;
901
	size_t extra_size = max_triangles * 3;
902

903
	size_t ctrl_size = (max_vertices + 3) / 4;
904
	size_t data_size = (max_vertices + 3) / 4 * 16; // worst case: 16 bytes per vertex group
905

906
	size_t gap_size = (codes_size + ctrl_size < 16) ? 16 - (codes_size + ctrl_size) : 0;
907

908
	return codes_size + extra_size + ctrl_size + data_size + gap_size;
909
}
910

911
size_t meshopt_encodeMeshlet(unsigned char* buffer, size_t buffer_size, const unsigned int* vertices, size_t vertex_count, const unsigned char* triangles, size_t triangle_count)
912
{
913
	using namespace meshopt;
914

915
	assert(triangle_count <= 256 && vertex_count <= 256);
916

917
	// 4 bits per triangle + up to three bytes of extra data
918
	unsigned char codes[256 / 2];
919
	unsigned char extra[256 * 3];
920
	size_t codes_size = (triangle_count + 1) / 2;
921
	size_t extra_size = encodeTriangles(codes, extra, triangles, triangle_count);
922
	assert(extra_size <= sizeof(extra));
923

924
	// 2 bits per vertex + up to 4 bytes of actual data
925
	unsigned char ctrl[256 / 4];
926
	unsigned char data[256 * 4];
927
	size_t ctrl_size = (vertex_count + 3) / 4;
928
	size_t data_size = encodeVertices(ctrl, data, vertices, vertex_count);
929
	assert(data_size <= sizeof(data));
930

931
	// we need to ensure that up to 16 bytes after extra+data are available for SIMD decoding
932
	// to minimize overhead, we place fixed-size codes+control at the end of the buffer
933
	size_t gap_size = (codes_size + ctrl_size < 16) ? 16 - (codes_size + ctrl_size) : 0;
934

935
	size_t result = codes_size + extra_size + ctrl_size + data_size + gap_size;
936

937
	if (result > buffer_size)
938
		return 0;
939

940
	// variable-size data first
941
	memcpy(buffer, data, data_size);
942
	buffer += data_size;
943
	memcpy(buffer, extra, extra_size);
944
	buffer += extra_size;
945

946
	// gap (for accelerated decoding) separates variable-size and fixed-size data
947
	memset(buffer, 0, gap_size);
948
	buffer += gap_size;
949

950
	// fixed-size data last; it can be located from buffer end during decoding
951
	memcpy(buffer, ctrl, ctrl_size);
952
	buffer += ctrl_size;
953
	memcpy(buffer, codes, codes_size);
954
	buffer += codes_size;
955

956
#if TRACE > 1
957
	printf("extra:");
958
	for (size_t i = 0; i < extra_size; ++i)
959
		printf(" %d", extra[i]);
960
	printf("\n");
961

962
	unsigned int minv = ~0u;
963
	for (size_t i = 0; i < vertex_count; ++i)
964
		minv = minv < vertices[i] ? minv : vertices[i];
965

966
	printf("vertices: [%d+]", minv);
967
	for (size_t i = 0; i < vertex_count; ++i)
968
		printf(" %d", vertices[i] - minv);
969
	printf("\n");
970
#endif
971

972
#if TRACE
973
	printf("stats: %d vertices, %d triangles => %d bytes (triangles: %d codes, %d extra; vertices: %d control, %d data; %d gap)\n",
974
	    int(vertex_count), int(triangle_count), int(result),
975
	    int(codes_size), int(extra_size), int(ctrl_size), int(data_size), int(gap_size));
976
#endif
977

978
	return result;
979
}
980

981
int meshopt_decodeMeshlet(void* vertices, size_t vertex_count, size_t vertex_size, void* triangles, size_t triangle_count, size_t triangle_size, const unsigned char* buffer, size_t buffer_size)
982
{
983
	using namespace meshopt;
984

985
	assert(triangle_count <= 256 && vertex_count <= 256);
986
	assert(vertex_size == 4 || vertex_size == 2);
987
	assert(triangle_size == 4 || triangle_size == 3);
988

989
	// layout must match encoding
990
	size_t codes_size = (triangle_count + 1) / 2;
991
	size_t ctrl_size = (vertex_count + 3) / 4;
992
	size_t gap_size = (codes_size + ctrl_size < 16) ? 16 - (codes_size + ctrl_size) : 0;
993

994
	if (buffer_size < codes_size + ctrl_size + gap_size)
995
		return -2;
996

997
	const unsigned char* end = buffer + buffer_size;
998
	const unsigned char* codes = end - codes_size;
999
	const unsigned char* ctrl = codes - ctrl_size;
1000
	const unsigned char* data = buffer;
1001

1002
	// gap ensures we have at least 16 bytes available after bound; this allows SIMD decoders to over-read safely
1003
	const unsigned char* bound = ctrl - gap_size;
1004
	assert(bound >= buffer && bound + 16 <= buffer + buffer_size);
1005

1006
#if defined(SIMD_FALLBACK)
1007
	return (gDecodeTablesInitialized ? decodeMeshletSimd<0> : decodeMeshlet)(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, vertex_size, triangle_size);
1008
#elif defined(SIMD_SSE) || defined(SIMD_NEON)
1009
	return decodeMeshletSimd<0>(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, vertex_size, triangle_size);
1010
#else
1011
	return decodeMeshlet(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, vertex_size, triangle_size);
1012
#endif
1013
}
1014

1015
int meshopt_decodeMeshletRaw(unsigned int* vertices, size_t vertex_count, unsigned int* triangles, size_t triangle_count, const unsigned char* buffer, size_t buffer_size)
1016
{
1017
	using namespace meshopt;
1018

1019
	assert(triangle_count <= 256 && vertex_count <= 256);
1020

1021
	// layout must match encoding
1022
	size_t codes_size = (triangle_count + 1) / 2;
1023
	size_t ctrl_size = (vertex_count + 3) / 4;
1024
	size_t gap_size = (codes_size + ctrl_size < 16) ? 16 - (codes_size + ctrl_size) : 0;
1025

1026
	if (buffer_size < codes_size + ctrl_size + gap_size)
1027
		return -2;
1028

1029
	const unsigned char* end = buffer + buffer_size;
1030
	const unsigned char* codes = end - codes_size;
1031
	const unsigned char* ctrl = codes - ctrl_size;
1032
	const unsigned char* data = buffer;
1033

1034
	// gap ensures we have at least 16 bytes available after bound; this allows SIMD decoders to over-read safely
1035
	const unsigned char* bound = ctrl - gap_size;
1036
	assert(bound >= buffer && bound + 16 <= buffer + buffer_size);
1037

1038
#if defined(SIMD_FALLBACK)
1039
	return (gDecodeTablesInitialized ? decodeMeshletSimd<1> : decodeMeshlet)(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, 4, 4);
1040
#elif defined(SIMD_SSE) || defined(SIMD_NEON)
1041
	return decodeMeshletSimd<1>(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, 4, 4);
1042
#else
1043
	return decodeMeshlet(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, 4, 4);
1044
#endif
1045
}
1046

1047
#undef SIMD_SSE
1048
#undef SIMD_NEON
1049
#undef SIMD_FALLBACK
1050
#undef SIMD_FLATTEN
1051
#undef SIMD_TARGET
1052

1053
Product

Resources

Company