Path: blob/master/thirdparty/meshoptimizer/meshletcodec.cpp
59209 views
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details1#include "meshoptimizer.h"23#include <assert.h>4#include <string.h>56// The block below auto-detects SIMD ISA that can be used on the target platform7#ifndef MESHOPTIMIZER_NO_SIMD89// The SIMD implementation requires SSE4.1, which can be enabled unconditionally through compiler settings10#if defined(__AVX__) || defined(__SSE4_1__)11#define SIMD_SSE12#endif1314// MSVC supports compiling SSE4.1 code regardless of compile options; we use a cpuid-based scalar fallback15#if !defined(SIMD_SSE) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC)))16#define SIMD_SSE17#define SIMD_FALLBACK18#endif1920// GCC 4.9+ and clang 3.8+ support targeting SIMD ISA from individual functions; we use a cpuid-based scalar fallback21#if !defined(SIMD_SSE) && ((defined(__clang__) && __clang_major__ * 100 + __clang_minor__ >= 308) || (defined(__GNUC__) && __GNUC__ * 100 + __GNUC_MINOR__ >= 409)) && (defined(__i386__) || defined(__x86_64__))22#define SIMD_SSE23#define SIMD_FALLBACK24#define SIMD_TARGET __attribute__((target("sse4.1")))25#endif2627// When targeting AArch64, enable NEON SIMD unconditionally; we do not support SIMD decoding for 32-bit ARM28#if defined(__aarch64__) || (defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC)) && _MSC_VER >= 1922)29#define SIMD_NEON30#endif3132#if defined(_MSC_VER) && !defined(__clang__) && _MSC_VER > 193033#define SIMD_FLATTEN [[msvc::flatten]]34#elif defined(__GNUC__) || defined(__clang__)35#define SIMD_FLATTEN __attribute__((flatten))36#else37#define SIMD_FLATTEN38#endif3940#ifndef SIMD_TARGET41#define SIMD_TARGET42#endif4344#endif // !MESHOPTIMIZER_NO_SIMD4546#ifdef SIMD_SSE47#include <smmintrin.h>48#endif4950#ifdef SIMD_NEON51#include <arm_neon.h>52#endif5354#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)55#ifdef _MSC_VER56#include <intrin.h> // __cpuid57#else58#include <cpuid.h> // __cpuid59#endif60#endif6162#ifndef TRACE63#define TRACE 064#endif6566#if TRACE67#include <stdio.h>68#endif6970namespace meshopt71{7273typedef unsigned int EdgeFifo8[8][2];7475static int rotateTriangle(unsigned int a, unsigned int b, unsigned int c)76{77return (a > b && a > c) ? 1 : (b > c ? 2 : 0);78}7980static int getEdgeFifo8(EdgeFifo8 fifo, unsigned int a, unsigned int b, unsigned int c, size_t offset)81{82for (int i = 0; i < 8; ++i)83{84size_t index = (offset - 1 - i) & 7;8586unsigned int e0 = fifo[index][0];87unsigned int e1 = fifo[index][1];8889if (e0 == a && e1 == b)90return (i << 2) | 0;91if (e0 == b && e1 == c)92return (i << 2) | 1;93if (e0 == c && e1 == a)94return (i << 2) | 2;95}9697return -1;98}99100static void pushEdgeFifo8(EdgeFifo8 fifo, unsigned int a, unsigned int b, size_t& offset)101{102fifo[offset][0] = a;103fifo[offset][1] = b;104offset = (offset + 1) & 7;105}106107static size_t encodeTriangles(unsigned char* codes, unsigned char* extra, const unsigned char* triangles, size_t triangle_count)108{109EdgeFifo8 edgefifo;110memset(edgefifo, -1, sizeof(edgefifo));111112size_t edgefifooffset = 0;113114unsigned int next = 0;115116// 4-bit triangle codes give us 16 options that we use as follows:117// 3*2 edge reuse (2 edges * 3 last triangles) * 2 next/explicit = 12 options118// 4 remaining options = next bits; 000, 001, 011, 111.119// triangles are rotated to make next bits line up.120memset(codes, 0, (triangle_count + 1) / 2);121122static const int rotations[] = {0, 1, 2, 0, 1};123124unsigned char* start = extra;125126for (size_t i = 0; i < triangle_count; ++i)127{128#if TRACE > 1129unsigned int last = next;130#endif131132int fer = getEdgeFifo8(edgefifo, triangles[i * 3 + 0], triangles[i * 3 + 1], triangles[i * 3 + 2], edgefifooffset);133134if (fer >= 0 && (fer >> 2) < 6)135{136// note: getEdgeFifo8 implicitly rotates triangles by matching a/b to existing edge137const int* order = rotations + (fer & 3);138139unsigned int a = triangles[i * 3 + order[0]], b = triangles[i * 3 + order[1]], c = triangles[i * 3 + order[2]];140141int fec = (c == next) ? (next++, 0) : 1;142143#if TRACE > 1144printf("%3d+ | %3d %3d %3d | edge: e%d c%d\n", last, a, b, c, fer >> 2, fec);145#endif146147unsigned int code = (fer >> 2) * 2 + fec;148149codes[i / 2] |= (unsigned char)(code << ((i & 1) * 4));150151if (fec)152*extra++ = (unsigned char)c;153154pushEdgeFifo8(edgefifo, c, b, edgefifooffset);155pushEdgeFifo8(edgefifo, a, c, edgefifooffset);156}157else158{159// rotate triangles to minimize the need for extra vertices160int rotation = rotateTriangle(triangles[i * 3 + 0], triangles[i * 3 + 1], triangles[i * 3 + 2]);161const int* order = rotations + rotation;162163unsigned int a = triangles[i * 3 + order[0]], b = triangles[i * 3 + order[1]], c = triangles[i * 3 + order[2]];164165// fe must be continuous: once a vertex is encoded with next, further vertices must also be encoded with next166int fea = (a == next && b == next + 1 && c == next + 2) ? (next++, 0) : 1;167int feb = (b == next && c == next + 1) ? (next++, 0) : 1;168int fec = (c == next) ? (next++, 0) : 1;169170assert(fea == 1 || feb == 0);171assert(feb == 1 || fec == 0);172173#if TRACE > 1174printf("%3d+ | %3d %3d %3d | restart: %d%d%d\n", last, a, b, c, fea, feb, fec);175#endif176177unsigned int code = 12 + (fea + feb + fec);178179codes[i / 2] |= (unsigned char)(code << ((i & 1) * 4));180181if (fea)182*extra++ = (unsigned char)a;183if (feb)184*extra++ = (unsigned char)b;185if (fec)186*extra++ = (unsigned char)c;187188pushEdgeFifo8(edgefifo, c, b, edgefifooffset);189pushEdgeFifo8(edgefifo, a, c, edgefifooffset);190}191}192193return extra - start;194}195196static size_t encodeVertices(unsigned char* ctrl, unsigned char* data, const unsigned int* vertices, size_t vertex_count)197{198// grouped varint, 2 bit per value to indicate 0/1/2/3 byte deltas, with per-group 4-byte fallback199memset(ctrl, 0, (vertex_count + 3) / 4);200201unsigned char* start = data;202203unsigned int last = ~0u;204205for (size_t i = 0; i < vertex_count; i += 4)206{207unsigned int gv[4] = {};208209for (int k = 0; k < 4 && i + k < vertex_count; ++k)210{211unsigned int d = vertices[i + k] - last - 1;212unsigned int v = (d << 1) ^ (int(d) >> 31);213214gv[k] = v;215last = vertices[i + k];216}217218// if any value needs 4 bytes, or if *all* values need 3 bytes, we use 4 bytes for all values219// this allows us to encode most 3-byte deltas with 3 bytes which saves space overall220bool use4 = (gv[0] | gv[1] | gv[2] | gv[3]) > 0xffffff || (gv[0] > 0xffff && gv[1] > 0xffff && gv[2] > 0xffff && gv[3] > 0xffff);221222for (int k = 0; k < 4; ++k)223{224unsigned int v = gv[k];225226// 0/1/2/3 bytes per value, or all 4 values use 4 bytes227int code = use4 ? 3 : (v == 0 ? 0 : (v < 256 ? 1 : (v < 65536 ? 2 : 3)));228229if (code > 0)230*data++ = (unsigned char)(v & 0xff);231if (code > 1)232*data++ = (unsigned char)((v >> 8) & 0xff);233if (code > 2)234*data++ = (unsigned char)((v >> 16) & 0xff);235if (use4)236*data++ = (unsigned char)((v >> 24) & 0xff);237238// split low and high bits into two nibbles for better packing239ctrl[i / 4] |= ((code & 1) << k) | ((code >> 1) << (k + 4));240}241}242243return data - start;244}245246#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON))247inline void writeTriangle(unsigned int* triangles, size_t i, unsigned int fifo)248{249// output triangle is stored without extra edge vertex (0xcbac => 0xcba)250triangles[i] = fifo >> 8;251}252253inline void writeTriangle(unsigned char* triangles, size_t i, unsigned int fifo)254{255triangles[i * 3 + 0] = (unsigned char)(fifo >> 8);256triangles[i * 3 + 1] = (unsigned char)(fifo >> 16);257triangles[i * 3 + 2] = (unsigned char)(fifo >> 24);258}259260template <typename T>261static const unsigned char* decodeTriangles(T* triangles, const unsigned char* codes, const unsigned char* extra, const unsigned char* bound, size_t triangle_count)262{263// branchlessly read next or extra vertex and advance pointers264#define NEXT(var, ec) \265e = *extra; \266unsigned int var = (ec) ? e : next; \267extra += (ec), next += 1 - (ec)268269unsigned int next = 0;270unsigned int fifo[3] = {}; // two edge fifo entries in one uint: 0xcbac271272for (size_t i = 0; i < triangle_count; ++i)273{274if (extra > bound)275return NULL;276277unsigned int code = (codes[i / 2] >> ((i & 1) * 4)) & 0xF;278unsigned int tri;279280if (code < 12)281{282// reuse283unsigned int edge = fifo[code / 4];284edge >>= (code << 3) & 16; // shift by 16 if bit 1 is set (odd edge for each triangle)285286// 0-1 extra vertices287unsigned int e;288NEXT(c, code & 1);289290// repack triangle into edge format (0xcbac)291tri = ((edge & 0xff) << 16) | (edge & 0xff00) | c | (c << 24);292}293else294{295// restart296int fea = code > 12;297int feb = code > 13;298int fec = code > 14;299300// 0-3 extra vertices301unsigned int e;302NEXT(a, fea);303NEXT(b, feb);304NEXT(c, fec);305306// repack triangle into edge format (0xcbac)307tri = c | (a << 8) | (b << 16) | (c << 24);308}309310writeTriangle(triangles, i, tri);311312fifo[2] = fifo[1];313fifo[1] = fifo[0];314fifo[0] = tri;315}316317return extra;318319#undef NEXT320}321322template <typename V>323static const unsigned char* decodeVertices(V* vertices, const unsigned char* ctrl, const unsigned char* data, const unsigned char* bound, size_t vertex_count)324{325unsigned int last = ~0u;326327for (size_t i = 0; i < vertex_count; i += 4)328{329if (data > bound)330return NULL;331332unsigned char code4 = ctrl[i / 4];333334for (int k = 0; k < 4; ++k)335{336int code = ((code4 >> k) & 1) | ((code4 >> (k + 3)) & 2);337int length = code4 == 0xff ? 4 : code;338339// branchlessly read up to 4 bytes340unsigned int mask = (length == 4) ? ~0u : (1 << (8 * length)) - 1;341unsigned int v = (data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24)) & mask;342343// unzigzag + 1344unsigned int d = (v >> 1) ^ -int(v & 1);345unsigned int r = last + d + 1;346347if (i + k < vertex_count)348vertices[i + k] = V(r);349350data += length;351last = r;352}353}354355return data;356}357358static int decodeMeshlet(void* vertices, void* triangles, const unsigned char* codes, const unsigned char* ctrl, const unsigned char* data, const unsigned char* bound, size_t vertex_count, size_t triangle_count, size_t vertex_size, size_t triangle_size)359{360if (vertex_size == 4)361data = decodeVertices(static_cast<unsigned int*>(vertices), ctrl, data, bound, vertex_count);362else363data = decodeVertices(static_cast<unsigned short*>(vertices), ctrl, data, bound, vertex_count);364if (!data)365return -2;366367if (triangle_size == 4)368data = decodeTriangles(static_cast<unsigned int*>(triangles), codes, data, bound, triangle_count);369else370data = decodeTriangles(static_cast<unsigned char*>(triangles), codes, data, bound, triangle_count);371if (!data)372return -2;373374return (data == bound) ? 0 : -3;375}376#endif377378#if defined(SIMD_SSE) || defined(SIMD_NEON)379// SIMD state is stored in a single 16b register as follows:380// 0..5: 6 next extra bytes381// 6..14: 9 bytes = 3 triangles worth of index data382// 15: 'next' byte383384// upon reading each triangle pair we need to transform this state such that the 9 bytes with triangle data contain the newly decoded triangles,385// which is a permutation of original state modulo per-element additions386// this transform can be chained to decode second triangle from original state; we create tables for 256 combinations of two 4-bit triangle codes387// the actual decoding becomes shuffle+add per triangle pair, plus management of extra bytes388static unsigned char kDecodeTableMasks[256][16];389static unsigned char kDecodeTableExtra[256];390391// for SIMD vertex decoding we need to unpack 4 values with 0-4 bytes in each392// this can be done with a single control-dependent shuffle per group393static unsigned char kDecodeTableVerts[256][16];394static unsigned char kDecodeTableLength[256];395396static bool decodeBuildTables()397{398#define NEXT(var, ec) \399shuf[var] = (ec) ? (unsigned char)extra : 15; \400next[var] = (ec) ? 0 : (unsigned char)nextoff; \401extra += (ec), nextoff += 1 - (ec)402403// check for SSE4.1 support if we have a fallback path404#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)405int cpuinfo[4] = {};406#ifdef _MSC_VER407__cpuid(cpuinfo, 1);408#else409__cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);410#endif411// bit 19 = SSE4.1412if ((cpuinfo[2] & (1 << 19)) == 0)413return false;414#endif415416// fill triangle decoding tables for each combination of two triangle codes417for (int code = 0; code < 256; ++code)418{419unsigned char shuf[16] = {};420unsigned char next[16] = {};421int extra = 0;422int nextoff = 0;423424// state 0..5 will be refilled every iteration, so we ignore that425// state 6..8 will always contain the last decoded triangle because every triangle shifts fifo equally, so we can decode it independently426shuf[6] = 12;427shuf[7] = 13;428shuf[8] = 14;429430// state 15 will contain next (potentially incremented a few times)431shuf[15] = 15;432433// state 9..11 will contain the first decoded triangle (tri0), which can refer to extra/next and the original triangle history434// state 12..14 will contain the second decoded triangle (tri1); when decoding edge reuse, we need to handle edge 0/1 specially as it was just decoded earlier435for (int k = 0; k < 2; ++k)436{437int tri = (code >> (k * 4)) & 0xf;438439if (tri < 12)440{441if (k == 1 && tri / 4 == 0)442{443// we need to decode one of two edges from the triangle we just decoded earlier444// for that we simply need to copy shuf/next values for the two decoded indices445shuf[9 + k * 3] = shuf[9 + ((tri & 2) ? 2 : 0)];446next[9 + k * 3] = next[9 + ((tri & 2) ? 2 : 0)];447448shuf[10 + k * 3] = shuf[9 + ((tri & 2) ? 1 : 2)];449next[10 + k * 3] = next[9 + ((tri & 2) ? 1 : 2)];450}451else452{453// reuse: edge comes from the history based on edge index454// note: we reuse with an offset because last triangle in the original history was consumed by tri0455int trioff = 6 + k * 3 + (2 - tri / 4) * 3;456457// edge cb or ac458shuf[9 + k * 3] = (unsigned char)(trioff + ((tri & 2) ? 2 : 0));459shuf[10 + k * 3] = (unsigned char)(trioff + ((tri & 2) ? 1 : 2));460}461462// third vertex is either next or comes from extra463NEXT(11 + k * 3, tri & 1);464}465else466{467// restart: three vertices, each comes from next or extra468int fea = tri > 12;469int feb = tri > 13;470int fec = tri > 14;471472NEXT(9 + k * 3, fea);473NEXT(10 + k * 3, feb);474NEXT(11 + k * 3, fec);475}476}477478// next needs to advance479next[15] = (unsigned char)nextoff;480481// next[0..8] = 0 trivially (never written to); next[9] must also be 0 because nextoff is 0 initially482// shuf[0..5] is not used, which allows us to pack next[10..15] + shuf[6..15] into a single 16-byte entry483assert(next[9] == 0);484memcpy(&kDecodeTableMasks[code][0], &next[10], 6);485memcpy(&kDecodeTableMasks[code][6], &shuf[6], 10);486kDecodeTableExtra[code] = (unsigned char)extra;487}488489// fill vertex decoding tables for each combination of four vertex references490for (unsigned int i = 0; i < 256; ++i)491{492unsigned char shuf[16] = {};493int offset = 0;494495for (int k = 0; k < 4; ++k)496{497int code = ((i >> k) & 1) | ((i >> (k + 3)) & 2);498int length = i == 0xff ? 4 : code; // 0/1/2/3 bytes, or all 4 bytes if code==0xff499500shuf[k * 4 + 0] = (length > 0) ? (unsigned char)(offset + 0) : 0x80;501shuf[k * 4 + 1] = (length > 1) ? (unsigned char)(offset + 1) : 0x80;502shuf[k * 4 + 2] = (length > 2) ? (unsigned char)(offset + 2) : 0x80;503shuf[k * 4 + 3] = (length > 3) ? (unsigned char)(offset + 3) : 0x80;504505offset += length;506}507508memcpy(kDecodeTableVerts[i], shuf, sizeof(shuf));509kDecodeTableLength[i] = (unsigned char)offset;510}511512return true;513514#undef NEXT515}516517static bool gDecodeTablesInitialized = decodeBuildTables();518#endif519520#if defined(SIMD_SSE)521SIMD_TARGET522inline __m128i decodeTriangleGroup(__m128i state, unsigned char code, const unsigned char*& extra)523{524__m128i shuf = _mm_loadu_si128(reinterpret_cast<const __m128i*>(kDecodeTableMasks[code]));525__m128i next = _mm_slli_si128(shuf, 10);526527// patch first 6 bytes with current extra and roll state forward528__m128i ext = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(extra));529state = _mm_blend_epi16(state, ext, 7);530state = _mm_add_epi8(_mm_shuffle_epi8(state, shuf), next);531532extra += kDecodeTableExtra[code];533534return state;535}536537SIMD_TARGET538inline __m128i decodeVertexGroup(__m128i last, unsigned char code, const unsigned char*& data)539{540__m128i word = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));541__m128i shuf = _mm_loadu_si128(reinterpret_cast<const __m128i*>(kDecodeTableVerts[code]));542543__m128i v = _mm_shuffle_epi8(word, shuf);544545// unzigzag+1546__m128i xl = _mm_sub_epi32(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi32(1)));547__m128i xr = _mm_srli_epi32(v, 1);548__m128i x = _mm_add_epi32(_mm_xor_si128(xl, xr), _mm_set1_epi32(1));549550// prefix sum551x = _mm_add_epi32(x, _mm_slli_si128(x, 8));552x = _mm_add_epi32(x, _mm_slli_si128(x, 4));553x = _mm_add_epi32(x, _mm_shuffle_epi32(last, 0xff));554555data += kDecodeTableLength[code];556557return x;558}559#endif560561#if defined(SIMD_NEON)562SIMD_TARGET563inline uint8x16_t decodeTriangleGroup(uint8x16_t state, unsigned char code, const unsigned char*& extra)564{565uint8x16_t shuf = vld1q_u8(kDecodeTableMasks[code]);566uint8x16_t next = vextq_u8(vdupq_n_u8(0), shuf, 6);567568// patch first 6 bytes with current extra and roll state forward569uint8x8_t extl = vld1_u8(extra);570uint8x16_t ext = vcombine_u8(extl, vdup_n_u8(0));571state = vbslq_u8(vcombine_u8(vcreate_u8(0xffffffffffffull), vdup_n_u8(0)), ext, state);572state = vaddq_u8(vqtbl1q_u8(state, shuf), next);573574extra += kDecodeTableExtra[code];575576return state;577}578579SIMD_TARGET580inline uint32x4_t decodeVertexGroup(uint32x4_t last, unsigned char code, const unsigned char*& data)581{582uint8x16_t word = vld1q_u8(data);583uint8x16_t shuf = vld1q_u8(kDecodeTableVerts[code]);584585uint32x4_t v = vreinterpretq_u32_u8(vqtbl1q_u8(word, shuf));586587// unzigzag+1588uint32x4_t xl = vsubq_u32(vdupq_n_u32(0), vandq_u32(v, vdupq_n_u32(1)));589uint32x4_t xr = vshrq_n_u32(v, 1);590uint32x4_t x = vaddq_u32(veorq_u32(xl, xr), vdupq_n_u32(1));591592// prefix sum593x = vaddq_u32(x, vextq_u32(vdupq_n_u32(0), x, 2));594x = vaddq_u32(x, vextq_u32(vdupq_n_u32(0), x, 3));595x = vaddq_u32(x, vdupq_n_u32(vgetq_lane_u32(last, 3)));596597data += kDecodeTableLength[code];598599return x;600}601#endif602603#if defined(SIMD_SSE)604#ifdef __GNUC__605typedef int __attribute__((aligned(1))) unaligned_int;606#else607typedef int unaligned_int;608#endif609#endif610611#if defined(SIMD_SSE) || defined(SIMD_NEON)612SIMD_TARGET613static const unsigned char* decodeTrianglesSimd(unsigned int* triangles, const unsigned char* codes, const unsigned char* extra, const unsigned char* bound, size_t triangle_count)614{615#if defined(SIMD_SSE)616__m128i repack = _mm_setr_epi8(9, 10, 11, -1, 12, 13, 14, -1, 0, 0, 0, 0, 0, 0, 0, 0);617__m128i state = _mm_setzero_si128();618#elif defined(SIMD_NEON)619uint8x8_t repack = vcreate_u8(0xff0e0d0cff0b0a09ull);620uint8x16_t state = vdupq_n_u8(0);621#endif622623size_t groups = triangle_count / 2;624625// process all complete groups626for (size_t i = 0; i < groups; ++i)627{628unsigned char code = *codes++;629630if (extra > bound)631return NULL;632633state = decodeTriangleGroup(state, code, extra);634635// write 6 bytes of new triangle data into output, formatted as 8 bytes with 0 padding636#if defined(SIMD_SSE)637__m128i r = _mm_shuffle_epi8(state, repack);638_mm_storel_epi64(reinterpret_cast<__m128i*>(&triangles[i * 2]), r);639#elif defined(SIMD_NEON)640uint32x2_t r = vreinterpret_u32_u8(vqtbl1_u8(state, repack));641vst1_u32(&triangles[i * 2], r);642#endif643}644645// process a 1 triangle tail; to maintain the memory safety guarantee we have to write a 32-bit element646if (triangle_count & 1)647{648unsigned char code = *codes++;649650if (extra > bound)651return NULL;652653state = decodeTriangleGroup(state, code, extra);654655unsigned int* tail = &triangles[triangle_count & ~1u];656657#if defined(SIMD_SSE)658__m128i r = _mm_shuffle_epi8(state, repack);659*tail = unsigned(_mm_cvtsi128_si32(r));660#elif defined(SIMD_NEON)661uint32x2_t r = vreinterpret_u32_u8(vqtbl1_u8(state, repack));662vst1_lane_u32(tail, r, 0);663#endif664}665666return extra;667}668669SIMD_TARGET670static const unsigned char* decodeTrianglesSimd(unsigned char* triangles, const unsigned char* codes, const unsigned char* extra, const unsigned char* bound, size_t triangle_count)671{672#if defined(SIMD_SSE)673__m128i state = _mm_setzero_si128();674#elif defined(SIMD_NEON)675uint8x16_t state = vdupq_n_u8(0);676#endif677678// because the output buffer is guaranteed to have 32-bit aligned size available, we can optimize writes and tail processing679// instead of processing triangles 2 at a time, we process 2 *pairs* at a time (12-byte write) followed by a tail pair, if present680// if the number of triangles mod 4 is 3, we'd normally need to write 12k+9 bytes, but we can instead overwrite up to 3 bytes in the main loop681size_t groups = (triangle_count + 1) / 4;682683// process all complete groups684for (size_t i = 0; i < groups; ++i)685{686unsigned char code0 = *codes++;687unsigned char code1 = *codes++;688689// each triangle pair reads <=6 bytes from extra, so two pairs need <=12 bytes and gap guarantees 16 byte of overread690if (extra > bound)691return NULL;692693state = decodeTriangleGroup(state, code0, extra);694695// write first decoded triangle and first index of second decoded triangle696#if defined(SIMD_SSE)697__m128i r0 = _mm_srli_si128(state, 9);698*reinterpret_cast<unaligned_int*>(&triangles[i * 12]) = _mm_cvtsi128_si32(r0);699#elif defined(SIMD_NEON)700uint8x16_t r0 = vextq_u8(state, vdupq_n_u8(0), 9);701vst1q_lane_u32(reinterpret_cast<unsigned int*>(&triangles[i * 12]), vreinterpretq_u32_u8(r0), 0);702#endif703704state = decodeTriangleGroup(state, code1, extra);705706// write last two indices of second decoded triangle that we didn't write above plus two new ones707// note that the second decoded triangle has shifted down to 6-8 bytes, hence shift by 7708#if defined(SIMD_SSE)709__m128i r1 = _mm_srli_si128(state, 7);710_mm_storel_epi64(reinterpret_cast<__m128i*>(&triangles[i * 12 + 4]), r1);711#elif defined(SIMD_NEON)712uint8x16_t r1 = vextq_u8(state, vdupq_n_u8(0), 7);713vst1_u8(&triangles[i * 12 + 4], vget_low_u8(r1));714#endif715}716717// process a 1-2 triangle tail; to maintain the memory safety guarantee we have to write 1-2 32-bit elements718if (groups * 4 < triangle_count)719{720unsigned char code = *codes++;721722if (extra > bound)723return NULL;724725state = decodeTriangleGroup(state, code, extra);726727unsigned char* tail = &triangles[(triangle_count & ~3u) * 3];728729#if defined(SIMD_SSE)730__m128i r = _mm_srli_si128(state, 9);731732*reinterpret_cast<unaligned_int*>(tail) = _mm_cvtsi128_si32(r);733if ((triangle_count & 3) > 1)734*reinterpret_cast<unaligned_int*>(tail + 4) = _mm_extract_epi32(r, 1);735#elif defined(SIMD_NEON)736uint8x16_t r = vextq_u8(state, vdupq_n_u8(0), 9);737738vst1q_lane_u32(reinterpret_cast<unsigned int*>(tail), vreinterpretq_u32_u8(r), 0);739if ((triangle_count & 3) > 1)740vst1q_lane_u32(reinterpret_cast<unsigned int*>(tail + 4), vreinterpretq_u32_u8(r), 1);741#endif742}743744return extra;745}746747SIMD_TARGET748static const unsigned char* decodeVerticesSimd(unsigned int* vertices, const unsigned char* ctrl, const unsigned char* data, const unsigned char* bound, size_t vertex_count)749{750#if defined(SIMD_SSE)751__m128i last = _mm_set1_epi32(-1);752#elif defined(SIMD_NEON)753uint32x4_t last = vdupq_n_u32(~0u);754#endif755756size_t groups = vertex_count / 4;757758// process all complete groups759for (size_t i = 0; i < groups; ++i)760{761unsigned char code = *ctrl++;762if (data > bound)763return NULL;764765last = decodeVertexGroup(last, code, data);766767#if defined(SIMD_SSE)768_mm_storeu_si128(reinterpret_cast<__m128i*>(&vertices[i * 4]), last);769#elif defined(SIMD_NEON)770vst1q_u32(&vertices[i * 4], last);771#endif772}773774// process a 1-3 vertex tail; to maintain the memory safety guarantee we have to write individual elements775if (vertex_count & 3)776{777unsigned char code = *ctrl++;778779if (data > bound)780return NULL;781782last = decodeVertexGroup(last, code, data);783784unsigned int* tail = &vertices[vertex_count & ~3u];785786#if defined(SIMD_SSE)787tail[0] = _mm_cvtsi128_si32(last);788if ((vertex_count & 3) > 1)789tail[1] = _mm_extract_epi32(last, 1);790if ((vertex_count & 3) > 2)791tail[2] = _mm_extract_epi32(last, 2);792#elif defined(SIMD_NEON)793vst1q_lane_u32(&tail[0], last, 0);794if ((vertex_count & 3) > 1)795vst1q_lane_u32(&tail[1], last, 1);796if ((vertex_count & 3) > 2)797vst1q_lane_u32(&tail[2], last, 2);798#endif799}800801return data;802}803804SIMD_TARGET805static const unsigned char* decodeVerticesSimd(unsigned short* vertices, const unsigned char* ctrl, const unsigned char* data, const unsigned char* bound, size_t vertex_count)806{807#if defined(SIMD_SSE)808__m128i repack = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 0, 0, 0, 0, 0, 0, 0);809__m128i last = _mm_set1_epi32(-1);810#elif defined(SIMD_NEON)811uint32x4_t last = vdupq_n_u32(~0u);812#endif813814// because the output buffer is guaranteed to have 32-bit aligned size available, we can simplify tail processing815// if the number of vertices mod 4 is 3, we'd normally need to write 8+6 bytes, but we can instead overwrite up to 2 bytes in the main loop816size_t groups = (vertex_count + 1) / 4;817818// process all complete groups819for (size_t i = 0; i < groups; ++i)820{821unsigned char code = *ctrl++;822823if (data > bound)824return NULL;825826last = decodeVertexGroup(last, code, data);827828#if defined(SIMD_SSE)829__m128i r = _mm_shuffle_epi8(last, repack);830_mm_storel_epi64(reinterpret_cast<__m128i*>(&vertices[i * 4]), r);831#elif defined(SIMD_NEON)832uint16x4_t r = vmovn_u32(last);833vst1_u16(&vertices[i * 4], r);834#endif835}836837// process a 1-2 vertex tail; to maintain the memory safety guarantee we have to write a 32-bit element838if (groups * 4 < vertex_count)839{840unsigned char code = *ctrl++;841842if (data > bound)843return NULL;844845last = decodeVertexGroup(last, code, data);846847unsigned short* tail = &vertices[vertex_count & ~3u];848849#if defined(SIMD_SSE)850__m128i r = _mm_shufflelo_epi16(last, 8);851*reinterpret_cast<unaligned_int*>(tail) = _mm_cvtsi128_si32(r);852#elif defined(SIMD_NEON)853uint16x4_t r = vmovn_u32(last);854vst1_lane_u32(reinterpret_cast<unsigned int*>(tail), vreinterpret_u32_u16(r), 0);855#endif856}857858return data;859}860861template <int Raw>862SIMD_TARGET SIMD_FLATTEN static int863decodeMeshletSimd(void* vertices, void* triangles, const unsigned char* codes, const unsigned char* ctrl, const unsigned char* data, const unsigned char* bound, size_t vertex_count, size_t triangle_count, size_t vertex_size, size_t triangle_size)864{865assert(gDecodeTablesInitialized);866(void)gDecodeTablesInitialized;867868#ifdef __clang__869// data is guaranteed to be non-null initially; if decode loops never hit bounds errors, it remains non-null870__builtin_assume(data);871#endif872873// decodes 4 vertices at a time with tail processing; writes up to align(vertex_size * vertex_count, 4)874// raw decoding skips tail processing by rounding up vertex count; it's safe because output buffer is guaranteed to have extra space, and tail control data is 0875if (vertex_size == 4 || Raw)876data = decodeVerticesSimd(static_cast<unsigned int*>(vertices), ctrl, data, bound, Raw ? (vertex_count + 3) & ~3 : vertex_count);877else878data = decodeVerticesSimd(static_cast<unsigned short*>(vertices), ctrl, data, bound, vertex_count);879if (!data)880return -2;881882// decodes 2/4 triangles at a time with tail processing; writes up to align(triangle_size * triangle_count, 4)883// raw decoding skips tail processing by rounding up triangle count; it's safe because output buffer is guaranteed to have extra space, and tail code data is 0884if (triangle_size == 4 || Raw)885data = decodeTrianglesSimd(static_cast<unsigned int*>(triangles), codes, data, bound, Raw ? (triangle_count + 1) & ~1 : triangle_count);886else887data = decodeTrianglesSimd(static_cast<unsigned char*>(triangles), codes, data, bound, triangle_count);888if (!data)889return -2;890891return (data == bound) ? 0 : -3;892}893#endif894895} // namespace meshopt896897size_t meshopt_encodeMeshletBound(size_t max_vertices, size_t max_triangles)898{899size_t codes_size = (max_triangles + 1) / 2;900size_t extra_size = max_triangles * 3;901902size_t ctrl_size = (max_vertices + 3) / 4;903size_t data_size = (max_vertices + 3) / 4 * 16; // worst case: 16 bytes per vertex group904905size_t gap_size = (codes_size + ctrl_size < 16) ? 16 - (codes_size + ctrl_size) : 0;906907return codes_size + extra_size + ctrl_size + data_size + gap_size;908}909910size_t meshopt_encodeMeshlet(unsigned char* buffer, size_t buffer_size, const unsigned int* vertices, size_t vertex_count, const unsigned char* triangles, size_t triangle_count)911{912using namespace meshopt;913914assert(triangle_count <= 256 && vertex_count <= 256);915916// 4 bits per triangle + up to three bytes of extra data917unsigned char codes[256 / 2];918unsigned char extra[256 * 3];919size_t codes_size = (triangle_count + 1) / 2;920size_t extra_size = encodeTriangles(codes, extra, triangles, triangle_count);921assert(extra_size <= sizeof(extra));922923// 2 bits per vertex + up to 4 bytes of actual data924unsigned char ctrl[256 / 4];925unsigned char data[256 * 4];926size_t ctrl_size = (vertex_count + 3) / 4;927size_t data_size = encodeVertices(ctrl, data, vertices, vertex_count);928assert(data_size <= sizeof(data));929930// we need to ensure that up to 16 bytes after extra+data are available for SIMD decoding931// to minimize overhead, we place fixed-size codes+control at the end of the buffer932size_t gap_size = (codes_size + ctrl_size < 16) ? 16 - (codes_size + ctrl_size) : 0;933934size_t result = codes_size + extra_size + ctrl_size + data_size + gap_size;935936if (result > buffer_size)937return 0;938939// variable-size data first940memcpy(buffer, data, data_size);941buffer += data_size;942memcpy(buffer, extra, extra_size);943buffer += extra_size;944945// gap (for accelerated decoding) separates variable-size and fixed-size data946memset(buffer, 0, gap_size);947buffer += gap_size;948949// fixed-size data last; it can be located from buffer end during decoding950memcpy(buffer, ctrl, ctrl_size);951buffer += ctrl_size;952memcpy(buffer, codes, codes_size);953buffer += codes_size;954955#if TRACE > 1956printf("extra:");957for (size_t i = 0; i < extra_size; ++i)958printf(" %d", extra[i]);959printf("\n");960961unsigned int minv = ~0u;962for (size_t i = 0; i < vertex_count; ++i)963minv = minv < vertices[i] ? minv : vertices[i];964965printf("vertices: [%d+]", minv);966for (size_t i = 0; i < vertex_count; ++i)967printf(" %d", vertices[i] - minv);968printf("\n");969#endif970971#if TRACE972printf("stats: %d vertices, %d triangles => %d bytes (triangles: %d codes, %d extra; vertices: %d control, %d data; %d gap)\n",973int(vertex_count), int(triangle_count), int(result),974int(codes_size), int(extra_size), int(ctrl_size), int(data_size), int(gap_size));975#endif976977return result;978}979980int meshopt_decodeMeshlet(void* vertices, size_t vertex_count, size_t vertex_size, void* triangles, size_t triangle_count, size_t triangle_size, const unsigned char* buffer, size_t buffer_size)981{982using namespace meshopt;983984assert(triangle_count <= 256 && vertex_count <= 256);985assert(vertex_size == 4 || vertex_size == 2);986assert(triangle_size == 4 || triangle_size == 3);987988// layout must match encoding989size_t codes_size = (triangle_count + 1) / 2;990size_t ctrl_size = (vertex_count + 3) / 4;991size_t gap_size = (codes_size + ctrl_size < 16) ? 16 - (codes_size + ctrl_size) : 0;992993if (buffer_size < codes_size + ctrl_size + gap_size)994return -2;995996const unsigned char* end = buffer + buffer_size;997const unsigned char* codes = end - codes_size;998const unsigned char* ctrl = codes - ctrl_size;999const unsigned char* data = buffer;10001001// gap ensures we have at least 16 bytes available after bound; this allows SIMD decoders to over-read safely1002const unsigned char* bound = ctrl - gap_size;1003assert(bound >= buffer && bound + 16 <= buffer + buffer_size);10041005#if defined(SIMD_FALLBACK)1006return (gDecodeTablesInitialized ? decodeMeshletSimd<0> : decodeMeshlet)(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, vertex_size, triangle_size);1007#elif defined(SIMD_SSE) || defined(SIMD_NEON)1008return decodeMeshletSimd<0>(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, vertex_size, triangle_size);1009#else1010return decodeMeshlet(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, vertex_size, triangle_size);1011#endif1012}10131014int meshopt_decodeMeshletRaw(unsigned int* vertices, size_t vertex_count, unsigned int* triangles, size_t triangle_count, const unsigned char* buffer, size_t buffer_size)1015{1016using namespace meshopt;10171018assert(triangle_count <= 256 && vertex_count <= 256);10191020// layout must match encoding1021size_t codes_size = (triangle_count + 1) / 2;1022size_t ctrl_size = (vertex_count + 3) / 4;1023size_t gap_size = (codes_size + ctrl_size < 16) ? 16 - (codes_size + ctrl_size) : 0;10241025if (buffer_size < codes_size + ctrl_size + gap_size)1026return -2;10271028const unsigned char* end = buffer + buffer_size;1029const unsigned char* codes = end - codes_size;1030const unsigned char* ctrl = codes - ctrl_size;1031const unsigned char* data = buffer;10321033// gap ensures we have at least 16 bytes available after bound; this allows SIMD decoders to over-read safely1034const unsigned char* bound = ctrl - gap_size;1035assert(bound >= buffer && bound + 16 <= buffer + buffer_size);10361037#if defined(SIMD_FALLBACK)1038return (gDecodeTablesInitialized ? decodeMeshletSimd<1> : decodeMeshlet)(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, 4, 4);1039#elif defined(SIMD_SSE) || defined(SIMD_NEON)1040return decodeMeshletSimd<1>(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, 4, 4);1041#else1042return decodeMeshlet(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, 4, 4);1043#endif1044}10451046#undef SIMD_SSE1047#undef SIMD_NEON1048#undef SIMD_FALLBACK1049#undef SIMD_FLATTEN1050#undef SIMD_TARGET105110521053