// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"

#include <assert.h>
#include <string.h>

// The block below auto-detects SIMD ISA that can be used on the target platform
#ifndef MESHOPTIMIZER_NO_SIMD

// The SIMD implementation requires SSSE3, which can be enabled unconditionally through compiler settings
#if defined(__AVX__) || defined(__SSSE3__)
#define SIMD_SSE
#endif

// An experimental implementation using AVX512 instructions; it's only enabled when AVX512 is enabled through compiler settings
#if defined(__AVX512VBMI2__) && defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__POPCNT__)
#undef SIMD_SSE
#define SIMD_AVX
#endif

// MSVC supports compiling SSSE3 code regardless of compile options; we use a cpuid-based scalar fallback
#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
#define SIMD_SSE
#define SIMD_FALLBACK
#endif

// GCC 4.9+ and clang 3.8+ support targeting SIMD ISA from individual functions; we use a cpuid-based scalar fallback
#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && ((defined(__clang__) && __clang_major__ * 100 + __clang_minor__ >= 308) || (defined(__GNUC__) && __GNUC__ * 100 + __GNUC_MINOR__ >= 409)) && (defined(__i386__) || defined(__x86_64__))
#define SIMD_SSE
#define SIMD_FALLBACK
#define SIMD_TARGET __attribute__((target("ssse3")))
#endif

// GCC/clang define these when NEON support is available
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#define SIMD_NEON
#endif

// On MSVC, we assume that ARM builds always target NEON-capable devices
#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
#define SIMD_NEON
#endif

// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
#if defined(__wasm_simd128__)
#define SIMD_WASM
// Prevent compiling other variant when wasm simd compilation is active
#undef SIMD_NEON
#undef SIMD_SSE
#undef SIMD_AVX
#endif

#ifndef SIMD_TARGET
#define SIMD_TARGET
#endif

// When targeting AArch64/x64, optimize for latency to allow decoding of individual 16-byte groups to overlap
// We don't do this for 32-bit systems because we need 64-bit math for this and this will hurt in-order CPUs
#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64)
#define SIMD_LATENCYOPT
#endif

// In switch dispatch, marking default case as unreachable allows to remove redundant bounds checks
#if defined(__GNUC__)
#define SIMD_UNREACHABLE() __builtin_unreachable()
#elif defined(_MSC_VER)
#define SIMD_UNREACHABLE() __assume(false)
#else
#define SIMD_UNREACHABLE() assert(!"Unreachable")
#endif

#endif // !MESHOPTIMIZER_NO_SIMD

#ifdef SIMD_SSE
#include <tmmintrin.h>
#endif

#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
#ifdef _MSC_VER
#include <intrin.h> // __cpuid
#else
#include <cpuid.h> // __cpuid
#endif
#endif

#ifdef SIMD_AVX
#include <immintrin.h>
#endif

#ifdef SIMD_NEON
#if defined(_MSC_VER) && defined(_M_ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif

#ifdef SIMD_WASM
#include <wasm_simd128.h>
#endif

#ifndef TRACE
#define TRACE 0
#endif

#if TRACE
#include <stdio.h>
#endif

#ifdef SIMD_WASM
#define wasmx_splat_v32x4(v, i) wasm_i32x4_shuffle(v, v, i, i, i, i)
#define wasmx_unpacklo_v8x16(a, b) wasm_i8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
#define wasmx_unpackhi_v8x16(a, b) wasm_i8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
#define wasmx_unpacklo_v16x8(a, b) wasm_i16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
#define wasmx_unpackhi_v16x8(a, b) wasm_i16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
#define wasmx_unpacklo_v64x2(a, b) wasm_i64x2_shuffle(a, b, 0, 2)
#define wasmx_unpackhi_v64x2(a, b) wasm_i64x2_shuffle(a, b, 1, 3)
#endif

namespace meshopt
{

const unsigned char kVertexHeader = 0xa0;

static int gEncodeVertexVersion = 0;
const int kDecodeVertexVersion = 1;

const size_t kVertexBlockSizeBytes = 8192;
const size_t kVertexBlockMaxSize = 256;
const size_t kByteGroupSize = 16;
const size_t kByteGroupDecodeLimit = 24;
const size_t kTailMinSizeV0 = 32;
const size_t kTailMinSizeV1 = 24;

static const int kBitsV0[4] = {0, 2, 4, 8};
static const int kBitsV1[5] = {0, 1, 2, 4, 8};

const int kEncodeDefaultLevel = 2;

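// Note: the first byte of every encoded buffer is kVertexHeader | version (0xa0 for v0, 0xa1 for v1);
// meshopt_decodeVertexVersion at the end of this file recovers the version from that byte.
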
static size_t getVertexBlockSize(size_t vertex_size)
{
	// make sure the entire block fits into the scratch buffer and is aligned to byte group size
	// note: the block size is implicitly part of the format, so we can't change it without breaking compatibility
	size_t result = (kVertexBlockSizeBytes / vertex_size) & ~(kByteGroupSize - 1);

	return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
}

inline unsigned int rotate(unsigned int v, int r)
{
	return (v << r) | (v >> ((32 - r) & 31));
}

template <typename T>
inline T zigzag(T v)
{
	return (0 - (v >> (sizeof(T) * 8 - 1))) ^ (v << 1);
}

template <typename T>
inline T unzigzag(T v)
{
	return (0 - (v & 1)) ^ (v >> 1);
}

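// Worked example for the zigzag mapping on unsigned char deltas: 0 -> 0x00, -1 (0xff) -> 0x01,
// +1 -> 0x02, -2 (0xfe) -> 0x03; small signed deltas thus become small unsigned values that fit
// the narrow bit groups below, and unzigzag inverts the mapping exactly.
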
#if TRACE
struct Stats
{
	size_t size;
	size_t header;  // bytes for header
	size_t bitg[9]; // bytes for bit groups
	size_t bitc[8]; // bit consistency: how many bits are shared between all bytes in a group
	size_t ctrl[4]; // number of control groups
};

static Stats* bytestats = NULL;
static Stats vertexstats[256];
#endif

static bool encodeBytesGroupZero(const unsigned char* buffer)
{
	assert(kByteGroupSize == sizeof(unsigned long long) * 2);

	unsigned long long v[2];
	memcpy(v, buffer, sizeof(v));

	return (v[0] | v[1]) == 0;
}

static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
{
	assert(bits >= 0 && bits <= 8);

	if (bits == 0)
		return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);

	if (bits == 8)
		return kByteGroupSize;

	size_t result = kByteGroupSize * bits / 8;

	unsigned char sentinel = (1 << bits) - 1;

	for (size_t i = 0; i < kByteGroupSize; ++i)
		result += buffer[i] >= sentinel;

	return result;
}

static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits)
{
	assert(bits >= 0 && bits <= 8);
	assert(kByteGroupSize % 8 == 0);

	if (bits == 0)
		return data;

	if (bits == 8)
	{
		memcpy(data, buffer, kByteGroupSize);
		return data + kByteGroupSize;
	}

	size_t byte_size = 8 / bits;
	assert(kByteGroupSize % byte_size == 0);

	// fixed portion: bits bits for each value
	// variable portion: full byte for each out-of-range value (using 1...1 as sentinel)
	unsigned char sentinel = (1 << bits) - 1;

	for (size_t i = 0; i < kByteGroupSize; i += byte_size)
	{
		unsigned char byte = 0;

		for (size_t k = 0; k < byte_size; ++k)
		{
			unsigned char enc = (buffer[i + k] >= sentinel) ? sentinel : buffer[i + k];

			byte <<= bits;
			byte |= enc;
		}

		// encode 1-bit groups in reverse bit order
		// this makes them faster to decode alongside other groups
		if (bits == 1)
			byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32);

		*data++ = byte;
	}

	for (size_t i = 0; i < kByteGroupSize; ++i)
	{
		unsigned char v = buffer[i];

		// branchless append of out-of-range values
		*data = v;
		data += v >= sentinel;
	}

	return data;
}

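// Worked example: with bits == 2 the sentinel is 3, so a 16-byte group costs 16 * 2 / 8 = 4 fixed
// bytes plus one escape byte per value >= 3; values {0, 1, 2, 5, ...} pack as 2-bit fields
// {0, 1, 2, 3(sentinel), ...} followed by the raw byte 5 after the fixed portion. The
// multiply-and-mask expression above reverses the bit order of a byte, which is what makes 1-bit
// groups cheaper to decode alongside the other group sizes.
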
static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size, const int bits[4])
{
	assert(buffer_size % kByteGroupSize == 0);

	unsigned char* header = data;

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

	if (size_t(data_end - data) < header_size)
		return NULL;

	data += header_size;

	memset(header, 0, header_size);

	int last_bits = -1;

	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
	{
		if (size_t(data_end - data) < kByteGroupDecodeLimit)
			return NULL;

		int best_bitk = 3;
		size_t best_size = encodeBytesGroupMeasure(buffer + i, bits[best_bitk]);

		for (int bitk = 0; bitk < 3; ++bitk)
		{
			size_t size = encodeBytesGroupMeasure(buffer + i, bits[bitk]);

			// favor consistent bit selection across groups, but never replace literals
			if (size < best_size || (size == best_size && bits[bitk] == last_bits && bits[best_bitk] != 8))
			{
				best_bitk = bitk;
				best_size = size;
			}
		}

		size_t header_offset = i / kByteGroupSize;
		header[header_offset / 4] |= best_bitk << ((header_offset % 4) * 2);

		int best_bits = bits[best_bitk];
		unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits);

		assert(data + best_size == next);
		data = next;
		last_bits = best_bits;

#if TRACE
		bytestats->bitg[best_bits] += best_size;
#endif
	}

#if TRACE
	bytestats->header += header_size;
#endif

	return data;
}

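// Note: four 2-bit selectors are packed per header byte, group i using bits [2i+1:2i]; e.g. a header
// byte of 0xe4 (0b11100100) assigns selector 0 to group 0, 1 to group 1, 2 to group 2 and 3 to group 3.
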
template <typename T, bool Xor>
static void encodeDeltas1(unsigned char* buffer, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, const unsigned char last_vertex[256], size_t k, int rot)
{
	size_t k0 = k & ~(sizeof(T) - 1);
	int ks = (k & (sizeof(T) - 1)) * 8;

	T p = last_vertex[k0];
	for (size_t j = 1; j < sizeof(T); ++j)
		p |= T(last_vertex[k0 + j]) << (j * 8);

	const unsigned char* vertex = vertex_data + k0;

	for (size_t i = 0; i < vertex_count; ++i)
	{
		T v = vertex[0];
		for (size_t j = 1; j < sizeof(T); ++j)
			v |= vertex[j] << (j * 8);

		T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p));

		buffer[i] = (unsigned char)(d >> ks);
		p = v;
		vertex += vertex_size;
	}
}

static void encodeDeltas(unsigned char* buffer, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, const unsigned char last_vertex[256], size_t k, int channel)
{
	switch (channel & 3)
	{
	case 0:
		return encodeDeltas1<unsigned char, false>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0);
	case 1:
		return encodeDeltas1<unsigned short, false>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0);
	case 2:
		return encodeDeltas1<unsigned int, true>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, channel >> 4);
	default:
		assert(!"Unsupported channel encoding"); // unreachable
	}
}

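// Note: channel & 3 selects the delta transform for each 4-byte word of the vertex: 0 encodes
// per-byte zigzag deltas, 1 per-uint16 zigzag deltas, and 2 per-uint32 XOR deltas rotated left by
// channel >> 4 bits (the rotation is chosen by estimateRotate below).
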
static int estimateBits(unsigned char v)
{
	return v <= 15 ? (v <= 3 ? (v == 0 ? 0 : 2) : 4) : 8;
}

static int estimateRotate(const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, size_t k, size_t group_size)
{
	size_t sizes[8] = {};

	const unsigned char* vertex = vertex_data + k;
	unsigned int last = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24);

	for (size_t i = 0; i < vertex_count; i += group_size)
	{
		unsigned int bitg = 0;

		// calculate bit consistency mask for the group
		for (size_t j = 0; j < group_size && i + j < vertex_count; ++j)
		{
			unsigned int v = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24);
			unsigned int d = v ^ last;

			bitg |= d;
			last = v;
			vertex += vertex_size;
		}

#if TRACE
		for (int j = 0; j < 32; ++j)
			vertexstats[k + (j / 8)].bitc[j % 8] += (i + group_size < vertex_count ? group_size : vertex_count - i) * (1 - ((bitg >> j) & 1));
#endif

		for (int j = 0; j < 8; ++j)
		{
			unsigned int bitr = rotate(bitg, j);

			sizes[j] += estimateBits((unsigned char)(bitr >> 0)) + estimateBits((unsigned char)(bitr >> 8));
			sizes[j] += estimateBits((unsigned char)(bitr >> 16)) + estimateBits((unsigned char)(bitr >> 24));
		}
	}

	int best_rot = 0;
	for (int rot = 1; rot < 8; ++rot)
		best_rot = (sizes[rot] < sizes[best_rot]) ? rot : best_rot;

	return best_rot;
}

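// Worked example: if consecutive 32-bit values differ only in bits 6..9, the XOR deltas span two
// bytes (estimated at 8 + 2 bits by the estimateBits buckets), while rotating left by 2 moves all
// varying bits into bits 8..11, i.e. a single 4-bit byte.
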
static int estimateChannel(const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, size_t k, size_t vertex_block_size, size_t block_skip, int max_channel, int xor_rot)
{
	unsigned char block[kVertexBlockMaxSize];
	assert(vertex_block_size <= kVertexBlockMaxSize);

	unsigned char last_vertex[256] = {};

	size_t sizes[3] = {};
	assert(max_channel <= 3);

	for (size_t i = 0; i < vertex_count; i += vertex_block_size * block_skip)
	{
		size_t block_size = i + vertex_block_size < vertex_count ? vertex_block_size : vertex_count - i;
		size_t block_size_aligned = (block_size + kByteGroupSize - 1) & ~(kByteGroupSize - 1);

		memcpy(last_vertex, vertex_data + (i == 0 ? 0 : i - 1) * vertex_size, vertex_size);

		// we sometimes encode elements we didn't fill when rounding to kByteGroupSize
		if (block_size < block_size_aligned)
			memset(block + block_size, 0, block_size_aligned - block_size);

		for (int channel = 0; channel < max_channel; ++channel)
			for (size_t j = 0; j < 4; ++j)
			{
				encodeDeltas(block, vertex_data + i * vertex_size, block_size, vertex_size, last_vertex, k + j, channel | (xor_rot << 4));

				for (size_t ig = 0; ig < block_size; ig += kByteGroupSize)
				{
					// to maximize encoding performance we only evaluate 1/2/4/8 bit groups
					size_t size1 = encodeBytesGroupMeasure(block + ig, 1);
					size_t size2 = encodeBytesGroupMeasure(block + ig, 2);
					size_t size4 = encodeBytesGroupMeasure(block + ig, 4);
					size_t size8 = encodeBytesGroupMeasure(block + ig, 8);

					size_t best_size = size1 < size2 ? size1 : size2;
					best_size = best_size < size4 ? best_size : size4;
					best_size = best_size < size8 ? best_size : size8;

					sizes[channel] += best_size;
				}
			}
	}

	int best_channel = 0;
	for (int channel = 1; channel < max_channel; ++channel)
		best_channel = (sizes[channel] < sizes[best_channel]) ? channel : best_channel;

	return best_channel == 2 ? best_channel | (xor_rot << 4) : best_channel;
}

static bool estimateControlZero(const unsigned char* buffer, size_t vertex_count_aligned)
{
	for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize)
		if (!encodeBytesGroupZero(buffer + i))
			return false;

	return true;
}

static int estimateControl(const unsigned char* buffer, size_t vertex_count, size_t vertex_count_aligned, int level)
{
	if (estimateControlZero(buffer, vertex_count_aligned))
		return 2; // zero encoding

	if (level == 0)
		return 1; // 1248 encoding in level 0 for encoding speed

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (vertex_count_aligned / kByteGroupSize + 3) / 4;

	size_t est_bytes0 = header_size, est_bytes1 = header_size;

	for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize)
	{
		// assumes kBitsV1[] = {0, 1, 2, 4, 8} for performance
		size_t size0 = encodeBytesGroupMeasure(buffer + i, 0);
		size_t size1 = encodeBytesGroupMeasure(buffer + i, 1);
		size_t size2 = encodeBytesGroupMeasure(buffer + i, 2);
		size_t size4 = encodeBytesGroupMeasure(buffer + i, 4);
		size_t size8 = encodeBytesGroupMeasure(buffer + i, 8);

		// both control modes have access to 1/2/4 bit encoding
		size_t size12 = size1 < size2 ? size1 : size2;
		size_t size124 = size12 < size4 ? size12 : size4;

		// each control mode has access to 0/8 bit encoding respectively
		est_bytes0 += size124 < size0 ? size124 : size0;
		est_bytes1 += size124 < size8 ? size124 : size8;
	}

	// pick shortest control entry but prefer literal encoding
	if (est_bytes0 < vertex_count || est_bytes1 < vertex_count)
		return est_bytes0 < est_bytes1 ? 0 : 1;
	else
		return 3; // literal encoding
}

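// Note: each vertex byte k gets a 2-bit control mode stored in control[k / 4]: mode 0 selects bit
// groups of 0/1/2/4 bits (kBitsV1), mode 1 selects 1/2/4/8 bits (kBitsV1 + 1), mode 2 stores nothing
// (the stream is all zeroes), and mode 3 stores vertex_count literal bytes.
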
static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version, int level)
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
	assert(vertex_size % 4 == 0);

	unsigned char buffer[kVertexBlockMaxSize];
	assert(sizeof(buffer) % kByteGroupSize == 0);

	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);

	// we sometimes encode elements we didn't fill when rounding to kByteGroupSize
	memset(buffer, 0, sizeof(buffer));

	size_t control_size = version == 0 ? 0 : vertex_size / 4;
	if (size_t(data_end - data) < control_size)
		return NULL;

	unsigned char* control = data;
	data += control_size;

	memset(control, 0, control_size);

	for (size_t k = 0; k < vertex_size; ++k)
	{
		encodeDeltas(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, version == 0 ? 0 : channels[k / 4]);

#if TRACE
		const unsigned char* olddata = data;
		bytestats = &vertexstats[k];
#endif

		int ctrl = 0;

		if (version != 0)
		{
			ctrl = estimateControl(buffer, vertex_count, vertex_count_aligned, level);

			assert(unsigned(ctrl) < 4);
			control[k / 4] |= ctrl << ((k % 4) * 2);

#if TRACE
			vertexstats[k].ctrl[ctrl]++;
#endif
		}

		if (ctrl == 3)
		{
			// literal encoding
			if (size_t(data_end - data) < vertex_count)
				return NULL;

			memcpy(data, buffer, vertex_count);
			data += vertex_count;
		}
		else if (ctrl != 2) // non-zero encoding
		{
			data = encodeBytes(data, data_end, buffer, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl);
			if (!data)
				return NULL;
		}

#if TRACE
		bytestats = NULL;
		vertexstats[k].size += data - olddata;
#endif
	}

	memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}

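// Note: a v1 block is thus laid out as vertex_size / 4 control bytes followed by one encoded stream
// per vertex byte, where each stream is literal bytes, nothing (zero mode), or a group header plus
// bit-packed groups as produced by encodeBytes.
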
#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX) && !defined(SIMD_WASM))
static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bits)
{
#define READ() byte = *data++
#define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? encv : enc, data_var += (enc == (1 << bits) - 1)

	unsigned char byte, enc, encv;
	const unsigned char* data_var;

	switch (bits)
	{
	case 0:
		memset(buffer, 0, kByteGroupSize);
		return data;
	case 1:
		data_var = data + 2;

		// 2 groups with 8 1-bit values in each byte (reversed from the order in other groups)
		READ();
		byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32);
		NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1);
		READ();
		byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32);
		NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1);

		return data_var;
	case 2:
		data_var = data + 4;

		// 4 groups with 4 2-bit values in each byte
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);

		return data_var;
	case 4:
		data_var = data + 8;

		// 8 groups with 2 4-bit values in each byte
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);

		return data_var;
	case 8:
		memcpy(buffer, data, kByteGroupSize);
		return data + kByteGroupSize;
	default:
		assert(!"Unexpected bit length"); // unreachable
		return data;
	}

#undef READ
#undef NEXT
}

static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size, const int* bits)
{
	assert(buffer_size % kByteGroupSize == 0);

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
	if (size_t(data_end - data) < header_size)
		return NULL;

	const unsigned char* header = data;
	data += header_size;

	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
	{
		if (size_t(data_end - data) < kByteGroupDecodeLimit)
			return NULL;

		size_t header_offset = i / kByteGroupSize;
		int bitsk = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;

		data = decodeBytesGroup(data, buffer + i, bits[bitsk]);
	}

	return data;
}

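// Worked example: decoding one 2-bit group reads 4 control bytes; each 2-bit field is either the
// value itself (0..2) or the sentinel 3, which redirects to the next escape byte past the fixed
// portion (data_var). This mirrors encodeBytesGroup exactly.
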
template <typename T, bool Xor>
static void decodeDeltas1(const unsigned char* buffer, unsigned char* transposed, size_t vertex_count, size_t vertex_size, const unsigned char* last_vertex, int rot)
{
	for (size_t k = 0; k < 4; k += sizeof(T))
	{
		size_t vertex_offset = k;

		T p = last_vertex[0];
		for (size_t j = 1; j < sizeof(T); ++j)
			p |= last_vertex[j] << (8 * j);

		for (size_t i = 0; i < vertex_count; ++i)
		{
			T v = buffer[i];
			for (size_t j = 1; j < sizeof(T); ++j)
				v |= buffer[i + vertex_count * j] << (8 * j);

			v = Xor ? T(rotate(v, rot)) ^ p : unzigzag(v) + p;

			for (size_t j = 0; j < sizeof(T); ++j)
				transposed[vertex_offset + j] = (unsigned char)(v >> (j * 8));

			p = v;

			vertex_offset += vertex_size;
		}

		buffer += vertex_count * sizeof(T);
		last_vertex += sizeof(T);
	}
}

static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version)
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	unsigned char buffer[kVertexBlockMaxSize * 4];
	unsigned char transposed[kVertexBlockSizeBytes];

	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
	assert(vertex_count <= vertex_count_aligned);

	size_t control_size = version == 0 ? 0 : vertex_size / 4;
	if (size_t(data_end - data) < control_size)
		return NULL;

	const unsigned char* control = data;
	data += control_size;

	for (size_t k = 0; k < vertex_size; k += 4)
	{
		unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4];

		for (size_t j = 0; j < 4; ++j)
		{
			int ctrl = (ctrl_byte >> (j * 2)) & 3;

			if (ctrl == 3)
			{
				// literal encoding
				if (size_t(data_end - data) < vertex_count)
					return NULL;

				memcpy(buffer + j * vertex_count, data, vertex_count);
				data += vertex_count;
			}
			else if (ctrl == 2)
			{
				// zero encoding
				memset(buffer + j * vertex_count, 0, vertex_count);
			}
			else
			{
				data = decodeBytes(data, data_end, buffer + j * vertex_count, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl);
				if (!data)
					return NULL;
			}
		}

		int channel = version == 0 ? 0 : channels[k / 4];

		switch (channel & 3)
		{
		case 0:
			decodeDeltas1<unsigned char, false>(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, 0);
			break;
		case 1:
			decodeDeltas1<unsigned short, false>(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, 0);
			break;
		case 2:
			decodeDeltas1<unsigned int, true>(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, (32 - (channel >> 4)) & 31);
			break;
		default:
			return NULL; // invalid channel type
		}
	}

	memcpy(vertex_data, transposed, vertex_count * vertex_size);

	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}
#endif

#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
static unsigned char kDecodeBytesGroupShuffle[256][8];
static unsigned char kDecodeBytesGroupCount[256];

#ifdef __wasm__
__attribute__((cold)) // this saves 500 bytes in the output binary - we don't need to vectorize this loop!
#endif
static bool
decodeBytesGroupBuildTables()
{
	for (int mask = 0; mask < 256; ++mask)
	{
		unsigned char shuffle[8];
		unsigned char count = 0;

		for (int i = 0; i < 8; ++i)
		{
			int maski = (mask >> i) & 1;
			shuffle[i] = maski ? count : 0x80;
			count += (unsigned char)(maski);
		}

		memcpy(kDecodeBytesGroupShuffle[mask], shuffle, 8);
		kDecodeBytesGroupCount[mask] = count;
	}

	return true;
}

static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
#endif

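// Worked example: for mask 0b00000101 the tables store shuffle {0, 0x80, 1, 0x80, ...} and count 2:
// lanes whose mask bit is set pull consecutive bytes from the escape stream, while 0x80 yields zero
// in PSHUFB-style shuffles so the inline selector value can be merged in.
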
#ifdef SIMD_SSE
SIMD_TARGET
inline __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
{
	__m128i sm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask0]));
	__m128i sm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask1]));
	__m128i sm1off = _mm_set1_epi8(kDecodeBytesGroupCount[mask0]);

	__m128i sm1r = _mm_add_epi8(sm1, sm1off);

	return _mm_unpacklo_epi64(sm0, sm1r);
}

SIMD_TARGET
inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
{
	switch (hbits)
	{
	case 0:
	case 4:
	{
		__m128i result = _mm_setzero_si128();

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data;
	}

	case 1:
	case 6:
	{
#ifdef __GNUC__
		typedef int __attribute__((aligned(1))) unaligned_int;
#else
		typedef int unaligned_int;
#endif

#ifdef SIMD_LATENCYOPT
		unsigned int data32;
		memcpy(&data32, data, 4);
		data32 &= data32 >> 1;

		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);

		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
#endif

		__m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));

		__m128i sel22 = _mm_unpacklo_epi8(_mm_srli_epi16(sel2, 4), sel2);
		__m128i sel2222 = _mm_unpacklo_epi8(_mm_srli_epi16(sel22, 2), sel22);
		__m128i sel = _mm_and_si128(sel2222, _mm_set1_epi8(3));

		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(3));
		int mask16 = _mm_movemask_epi8(mask);
		unsigned char mask0 = (unsigned char)(mask16 & 255);
		unsigned char mask1 = (unsigned char)(mask16 >> 8);

		__m128i shuf = decodeShuffleMask(mask0, mask1);
		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

#ifdef SIMD_LATENCYOPT
		return data + 4 + datacnt;
#else
		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
#endif
	}

	case 2:
	case 7:
	{
#ifdef SIMD_LATENCYOPT
		unsigned long long data64;
		memcpy(&data64, data, 8);
		data64 &= data64 >> 1;
		data64 &= data64 >> 2;

		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
#endif

		__m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));

		__m128i sel44 = _mm_unpacklo_epi8(_mm_srli_epi16(sel4, 4), sel4);
		__m128i sel = _mm_and_si128(sel44, _mm_set1_epi8(15));

		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(15));
		int mask16 = _mm_movemask_epi8(mask);
		unsigned char mask0 = (unsigned char)(mask16 & 255);
		unsigned char mask1 = (unsigned char)(mask16 >> 8);

		__m128i shuf = decodeShuffleMask(mask0, mask1);
		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

#ifdef SIMD_LATENCYOPT
		return data + 8 + datacnt;
#else
		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
#endif
	}

	case 3:
	case 8:
	{
		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 16;
	}

	case 5:
	{
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 2));

		unsigned char mask0 = data[0];
		unsigned char mask1 = data[1];

		__m128i shuf = decodeShuffleMask(mask0, mask1);
		__m128i result = _mm_shuffle_epi8(rest, shuf);

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	default:
		SIMD_UNREACHABLE(); // unreachable
	}
}
#endif

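// Note: hbits 0..3 are the v0 header selectors (0/2/4/8-bit groups) and hbits 4..8 are the v1
// selectors (0/1/2/4/8-bit groups); decodeBytesSimd adds an hshift of 0 (v0) or 4 + ctrl (v1) to the
// 2-bit selector to pick the case, so both formats share one dispatch. The same numbering is used by
// the AVX/NEON/WASM variants below.
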
#ifdef SIMD_AVX
static const __m128i kDecodeBytesGroupConfig[8][2] = {
    {_mm_setzero_si128(), _mm_setzero_si128()},
    {_mm_set1_epi8(3), _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24)},
    {_mm_set1_epi8(15), _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56)},
    {_mm_setzero_si128(), _mm_setzero_si128()},
    {_mm_setzero_si128(), _mm_setzero_si128()},
    {_mm_set1_epi8(1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)},
    {_mm_set1_epi8(3), _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24)},
    {_mm_set1_epi8(15), _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56)},
};

SIMD_TARGET
inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
{
	switch (hbits)
	{
	case 0:
	case 4:
	{
		__m128i result = _mm_setzero_si128();

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data;
	}

	case 5: // 1-bit
	case 1: // 2-bit
	case 6:
	case 2: // 4-bit
	case 7:
	{
		const unsigned char* skip = data + (2 << (hbits < 3 ? hbits : hbits - 5));

		__m128i selb = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(skip));

		__m128i sent = kDecodeBytesGroupConfig[hbits][0];
		__m128i ctrl = kDecodeBytesGroupConfig[hbits][1];

		__m128i selw = _mm_shuffle_epi32(selb, 0x44);
		__m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw));
		__mmask16 mask16 = _mm_cmp_epi8_mask(sel, sent, _MM_CMPINT_EQ);

		__m128i result = _mm_mask_expand_epi8(sel, mask16, rest);

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return skip + _mm_popcnt_u32(mask16);
	}

	case 3:
	case 8:
	{
		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 16;
	}

	default:
		SIMD_UNREACHABLE(); // unreachable
	}
}
#endif

#ifdef SIMD_NEON
SIMD_TARGET
inline uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1)
{
	uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]);
	uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]);

	uint8x8_t r0 = vtbl1_u8(rest0, sm0);
	uint8x8_t r1 = vtbl1_u8(rest1, sm1);

	return vcombine_u8(r0, r1);
}

SIMD_TARGET
inline void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
{
	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
	const uint64_t magic = 0x000103070f1f3f80ull;

	uint64x2_t mask2 = vreinterpretq_u64_u8(mask);

	mask0 = uint8_t((vgetq_lane_u64(mask2, 0) * magic) >> 56);
	mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56);
}

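// Note: each byte of mask is either 0x00 or 0xff; multiplying a 64-bit lane by the magic constant
// sums shifted copies so that the top byte of the product collects one bit per input byte, and
// >> 56 extracts the same 8-bit mask that SSE gets from _mm_movemask_epi8 (wasmMoveMask below uses
// the identical trick).
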
SIMD_TARGET
inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
{
	switch (hbits)
	{
	case 0:
	case 4:
	{
		uint8x16_t result = vdupq_n_u8(0);

		vst1q_u8(buffer, result);

		return data;
	}

	case 1:
	case 6:
	{
#ifdef SIMD_LATENCYOPT
		unsigned int data32;
		memcpy(&data32, data, 4);
		data32 &= data32 >> 1;

		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);

		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
#endif

		uint8x8_t sel2 = vld1_u8(data);
		uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
		uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
		uint8x16_t sel = vandq_u8(vcombine_u8(sel2222.val[0], sel2222.val[1]), vdupq_n_u8(3));

		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(3));
		unsigned char mask0, mask1;
		neonMoveMask(mask, mask0, mask1);

		uint8x8_t rest0 = vld1_u8(data + 4);
		uint8x8_t rest1 = vld1_u8(data + 4 + kDecodeBytesGroupCount[mask0]);

		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);

		vst1q_u8(buffer, result);

#ifdef SIMD_LATENCYOPT
		return data + 4 + datacnt;
#else
		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
#endif
	}

	case 2:
	case 7:
	{
#ifdef SIMD_LATENCYOPT
		unsigned long long data64;
		memcpy(&data64, data, 8);
		data64 &= data64 >> 1;
		data64 &= data64 >> 2;

		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
#endif

		uint8x8_t sel4 = vld1_u8(data);
		uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
		uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);

		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(15));
		unsigned char mask0, mask1;
		neonMoveMask(mask, mask0, mask1);

		uint8x8_t rest0 = vld1_u8(data + 8);
		uint8x8_t rest1 = vld1_u8(data + 8 + kDecodeBytesGroupCount[mask0]);

		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);

		vst1q_u8(buffer, result);

#ifdef SIMD_LATENCYOPT
		return data + 8 + datacnt;
#else
		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
#endif
	}

	case 3:
	case 8:
	{
		uint8x16_t result = vld1q_u8(data);

		vst1q_u8(buffer, result);

		return data + 16;
	}

	case 5:
	{
		unsigned char mask0 = data[0];
		unsigned char mask1 = data[1];

		uint8x8_t rest0 = vld1_u8(data + 2);
		uint8x8_t rest1 = vld1_u8(data + 2 + kDecodeBytesGroupCount[mask0]);

		uint8x16_t result = shuffleBytes(mask0, mask1, rest0, rest1);

		vst1q_u8(buffer, result);

		return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	default:
		SIMD_UNREACHABLE(); // unreachable
	}
}
#endif

#ifdef SIMD_WASM
SIMD_TARGET
inline v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
{
	v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]);
	v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]);

	v128_t sm1off = wasm_v128_load8_splat(&kDecodeBytesGroupCount[mask0]);
	v128_t sm1r = wasm_i8x16_add(sm1, sm1off);

	return wasmx_unpacklo_v64x2(sm0, sm1r);
}

SIMD_TARGET
inline void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
{
	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
	const uint64_t magic = 0x000103070f1f3f80ull;

	mask0 = uint8_t((wasm_i64x2_extract_lane(mask, 0) * magic) >> 56);
	mask1 = uint8_t((wasm_i64x2_extract_lane(mask, 1) * magic) >> 56);
}

SIMD_TARGET
inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
{
	switch (hbits)
	{
	case 0:
	case 4:
	{
		v128_t result = wasm_i8x16_splat(0);

		wasm_v128_store(buffer, result);

		return data;
	}

	case 1:
	case 6:
	{
		v128_t sel2 = wasm_v128_load(data);
		v128_t rest = wasm_v128_load(data + 4);

		v128_t sel22 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2);
		v128_t sel2222 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22);
		v128_t sel = wasm_v128_and(sel2222, wasm_i8x16_splat(3));

		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(3));

		unsigned char mask0, mask1;
		wasmMoveMask(mask, mask0, mask1);

		v128_t shuf = decodeShuffleMask(mask0, mask1);
		v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);

		wasm_v128_store(buffer, result);

		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 2:
	case 7:
	{
		v128_t sel4 = wasm_v128_load(data);
		v128_t rest = wasm_v128_load(data + 8);

		v128_t sel44 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4);
		v128_t sel = wasm_v128_and(sel44, wasm_i8x16_splat(15));

		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(15));

		unsigned char mask0, mask1;
		wasmMoveMask(mask, mask0, mask1);

		v128_t shuf = decodeShuffleMask(mask0, mask1);
		v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);

		wasm_v128_store(buffer, result);

		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 3:
	case 8:
	{
		v128_t result = wasm_v128_load(data);

		wasm_v128_store(buffer, result);

		return data + 16;
	}

	case 5:
	{
		v128_t rest = wasm_v128_load(data + 2);

		unsigned char mask0 = data[0];
		unsigned char mask1 = data[1];

		v128_t shuf = decodeShuffleMask(mask0, mask1);
		v128_t result = wasm_i8x16_swizzle(rest, shuf);

		wasm_v128_store(buffer, result);

		return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	default:
		SIMD_UNREACHABLE(); // unreachable
	}
}
#endif

#if defined(SIMD_SSE) || defined(SIMD_AVX)
SIMD_TARGET
inline void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
{
	__m128i t0 = _mm_unpacklo_epi8(x0, x1);
	__m128i t1 = _mm_unpackhi_epi8(x0, x1);
	__m128i t2 = _mm_unpacklo_epi8(x2, x3);
	__m128i t3 = _mm_unpackhi_epi8(x2, x3);

	x0 = _mm_unpacklo_epi16(t0, t2);
	x1 = _mm_unpackhi_epi16(t0, t2);
	x2 = _mm_unpacklo_epi16(t1, t3);
	x3 = _mm_unpackhi_epi16(t1, t3);
}

SIMD_TARGET
inline __m128i unzigzag8(__m128i v)
{
	__m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1)));
	__m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127));

	return _mm_xor_si128(xl, xr);
}

SIMD_TARGET
inline __m128i unzigzag16(__m128i v)
{
	__m128i xl = _mm_sub_epi16(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi16(1)));
	__m128i xr = _mm_srli_epi16(v, 1);

	return _mm_xor_si128(xl, xr);
}

SIMD_TARGET
inline __m128i rotate32(__m128i v, int r)
{
	return _mm_or_si128(_mm_slli_epi32(v, r), _mm_srli_epi32(v, 32 - r));
}
#endif

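// Note: transpose8 regroups four 16-byte registers, each holding one byte stream for 16 vertices,
// so that every 32-bit lane ends up with the 4 consecutive bytes of a single vertex; e.g. lane 0 of
// x0 becomes {x0[0], x1[0], x2[0], x3[0]} after the two interleave passes. The NEON and WASM
// variants below perform the same permutation.
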
#ifdef SIMD_NEON
SIMD_TARGET
inline void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
{
	uint8x16x2_t t01 = vzipq_u8(x0, x1);
	uint8x16x2_t t23 = vzipq_u8(x2, x3);

	uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0]));
	uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1]));

	x0 = vreinterpretq_u8_u16(x01.val[0]);
	x1 = vreinterpretq_u8_u16(x01.val[1]);
	x2 = vreinterpretq_u8_u16(x23.val[0]);
	x3 = vreinterpretq_u8_u16(x23.val[1]);
}

SIMD_TARGET
inline uint8x16_t unzigzag8(uint8x16_t v)
{
	uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1)))));
	uint8x16_t xr = vshrq_n_u8(v, 1);

	return veorq_u8(xl, xr);
}

SIMD_TARGET
inline uint8x16_t unzigzag16(uint8x16_t v)
{
	uint16x8_t vv = vreinterpretq_u16_u8(v);
	uint8x16_t xl = vreinterpretq_u8_s16(vnegq_s16(vreinterpretq_s16_u16(vandq_u16(vv, vdupq_n_u16(1)))));
	uint8x16_t xr = vreinterpretq_u8_u16(vshrq_n_u16(vv, 1));

	return veorq_u8(xl, xr);
}

SIMD_TARGET
inline uint8x16_t rotate32(uint8x16_t v, int r)
{
	uint32x4_t v32 = vreinterpretq_u32_u8(v);
	return vreinterpretq_u8_u32(vorrq_u32(vshlq_u32(v32, vdupq_n_s32(r)), vshlq_u32(v32, vdupq_n_s32(r - 32))));
}

template <int Channel>
SIMD_TARGET inline uint8x8_t rebase(uint8x8_t npi, uint8x16_t r0, uint8x16_t r1, uint8x16_t r2, uint8x16_t r3)
{
	switch (Channel)
	{
	case 0:
	{
		uint8x16_t rsum = vaddq_u8(vaddq_u8(r0, r1), vaddq_u8(r2, r3));
		uint8x8_t rsumx = vadd_u8(vget_low_u8(rsum), vget_high_u8(rsum));
		return vadd_u8(vadd_u8(npi, rsumx), vext_u8(rsumx, rsumx, 4));
	}
	case 1:
	{
		uint16x8_t rsum = vaddq_u16(vaddq_u16(vreinterpretq_u16_u8(r0), vreinterpretq_u16_u8(r1)), vaddq_u16(vreinterpretq_u16_u8(r2), vreinterpretq_u16_u8(r3)));
		uint16x4_t rsumx = vadd_u16(vget_low_u16(rsum), vget_high_u16(rsum));
		return vreinterpret_u8_u16(vadd_u16(vadd_u16(vreinterpret_u16_u8(npi), rsumx), vext_u16(rsumx, rsumx, 2)));
	}
	case 2:
	{
		uint8x16_t rsum = veorq_u8(veorq_u8(r0, r1), veorq_u8(r2, r3));
		uint8x8_t rsumx = veor_u8(vget_low_u8(rsum), vget_high_u8(rsum));
		return veor_u8(veor_u8(npi, rsumx), vext_u8(rsumx, rsumx, 4));
	}
	default:
		return npi;
	}
}
#endif

#ifdef SIMD_WASM
SIMD_TARGET
inline void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
{
	v128_t t0 = wasmx_unpacklo_v8x16(x0, x1);
	v128_t t1 = wasmx_unpackhi_v8x16(x0, x1);
	v128_t t2 = wasmx_unpacklo_v8x16(x2, x3);
	v128_t t3 = wasmx_unpackhi_v8x16(x2, x3);

	x0 = wasmx_unpacklo_v16x8(t0, t2);
	x1 = wasmx_unpackhi_v16x8(t0, t2);
	x2 = wasmx_unpacklo_v16x8(t1, t3);
	x3 = wasmx_unpackhi_v16x8(t1, t3);
}

SIMD_TARGET
inline v128_t unzigzag8(v128_t v)
{
	v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1)));
	v128_t xr = wasm_u8x16_shr(v, 1);

	return wasm_v128_xor(xl, xr);
}

SIMD_TARGET
inline v128_t unzigzag16(v128_t v)
{
	v128_t xl = wasm_i16x8_neg(wasm_v128_and(v, wasm_i16x8_splat(1)));
	v128_t xr = wasm_u16x8_shr(v, 1);

	return wasm_v128_xor(xl, xr);
}

SIMD_TARGET
inline v128_t rotate32(v128_t v, int r)
{
	return wasm_v128_or(wasm_i32x4_shl(v, r), wasm_i32x4_shr(v, 32 - r));
}
#endif

#if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
SIMD_TARGET
static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size, int hshift)
{
	assert(buffer_size % kByteGroupSize == 0);
	assert(kByteGroupSize == 16);

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
	if (size_t(data_end - data) < header_size)
		return NULL;

	const unsigned char* header = data;
	data += header_size;

	size_t i = 0;

	// fast-path: process 4 groups at a time, do a shared bounds check
	for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4)
	{
		size_t header_offset = i / kByteGroupSize;
		unsigned char header_byte = header[header_offset / 4];

		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, hshift + ((header_byte >> 0) & 3));
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, hshift + ((header_byte >> 2) & 3));
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, hshift + ((header_byte >> 4) & 3));
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, hshift + ((header_byte >> 6) & 3));
	}

	// slow-path: process remaining groups
	for (; i < buffer_size; i += kByteGroupSize)
	{
		if (size_t(data_end - data) < kByteGroupDecodeLimit)
			return NULL;

		size_t header_offset = i / kByteGroupSize;
		unsigned char header_byte = header[header_offset / 4];

		data = decodeBytesGroupSimd(data, buffer + i, hshift + ((header_byte >> ((header_offset % 4) * 2)) & 3));
	}

	return data;
}

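// Note: kByteGroupDecodeLimit (24) matches the worst-case encoded size of one group: 8 control bytes
// for the 4-bit mode plus 16 escape bytes; bounding each iteration by it lets decodeBytesGroupSimd
// load full 16-byte vectors without per-byte checks.
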
template <int Channel>
SIMD_TARGET static void
decodeDeltas4Simd(const unsigned char* buffer, unsigned char* transposed, size_t vertex_count_aligned, size_t vertex_size, unsigned char last_vertex[4], int rot)
{
#if defined(SIMD_SSE) || defined(SIMD_AVX)
#define TEMP __m128i
#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex))
#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
#endif

#ifdef SIMD_NEON
#define TEMP uint8x8_t
#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex), vdup_n_u32(0), 0))
#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i))
#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
#endif

#ifdef SIMD_WASM
#define TEMP v128_t
#define PREP() v128_t pi = wasm_v128_load(last_vertex)
#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i))
#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size
#endif

#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))

	PREP();

	unsigned char* savep = transposed;

	for (size_t j = 0; j < vertex_count_aligned; j += 16)
	{
		LOAD(0);
		LOAD(1);
		LOAD(2);
		LOAD(3);

		transpose8(r0, r1, r2, r3);

		TEMP t0, t1, t2, t3;
		TEMP npi = pi;

		UNZR(0);
		GRP4(0);
		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
		SAVE(0), SAVE(1), SAVE(2), SAVE(3);

		UNZR(1);
		GRP4(1);
		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
		SAVE(0), SAVE(1), SAVE(2), SAVE(3);

		UNZR(2);
		GRP4(2);
		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
		SAVE(0), SAVE(1), SAVE(2), SAVE(3);

		UNZR(3);
		GRP4(3);
		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
		SAVE(0), SAVE(1), SAVE(2), SAVE(3);

#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32))
		// instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations
		pi = rebase<Channel>(npi, r0, r1, r2, r3);
#else
		(void)npi;
#endif

#undef UNZR
#undef TEMP
#undef PREP
#undef LOAD
#undef GRP4
#undef FIXD
#undef SAVE
	}
}

SIMD_TARGET
static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version)
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	unsigned char buffer[kVertexBlockMaxSize * 4];
	unsigned char transposed[kVertexBlockSizeBytes];

	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);

	size_t control_size = version == 0 ? 0 : vertex_size / 4;
	if (size_t(data_end - data) < control_size)
		return NULL;

	const unsigned char* control = data;
	data += control_size;

	for (size_t k = 0; k < vertex_size; k += 4)
	{
		unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4];

		for (size_t j = 0; j < 4; ++j)
		{
			int ctrl = (ctrl_byte >> (j * 2)) & 3;

			if (ctrl == 3)
			{
				// literal encoding; safe to over-copy due to tail
				if (size_t(data_end - data) < vertex_count_aligned)
					return NULL;

				memcpy(buffer + j * vertex_count_aligned, data, vertex_count_aligned);
				data += vertex_count;
			}
			else if (ctrl == 2)
			{
				// zero encoding
				memset(buffer + j * vertex_count_aligned, 0, vertex_count_aligned);
			}
			else
			{
				// for v0, headers are mapped to 0..3; for v1, headers are mapped to 4..8
				int hshift = version == 0 ? 0 : 4 + ctrl;

				data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned, hshift);
				if (!data)
					return NULL;
			}
		}

		int channel = version == 0 ? 0 : channels[k / 4];

		switch (channel & 3)
		{
		case 0:
			decodeDeltas4Simd<0>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, 0);
			break;
		case 1:
			decodeDeltas4Simd<1>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, 0);
			break;
		case 2:
			decodeDeltas4Simd<2>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, (32 - (channel >> 4)) & 31);
			break;
		default:
			return NULL; // invalid channel type
		}
	}

	memcpy(vertex_data, transposed, vertex_count * vertex_size);

	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}
#endif

#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
static unsigned int getCpuFeatures()
{
	int cpuinfo[4] = {};
#ifdef _MSC_VER
	__cpuid(cpuinfo, 1);
#else
	__cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
#endif
	return cpuinfo[2];
}

static unsigned int cpuid = getCpuFeatures();
#endif

} // namespace meshopt

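// Illustrative round-trip for the public API below (see meshoptimizer.h for the authoritative
// contract; the buffer names here are hypothetical):
//   std::vector<unsigned char> vbuf(meshopt_encodeVertexBufferBound(vertex_count, vertex_size));
//   vbuf.resize(meshopt_encodeVertexBuffer(vbuf.data(), vbuf.size(), vertices, vertex_count, vertex_size));
//   ...
//   if (meshopt_decodeVertexBuffer(vertices_out, vertex_count, vertex_size, vbuf.data(), vbuf.size()) != 0)
//       ; // buffer was malformed or truncated
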
size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level, int version)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);
	assert(level >= 0 && level <= 9); // only a subset of this range is used right now
	assert(version < 0 || unsigned(version) <= kDecodeVertexVersion);

	version = version < 0 ? gEncodeVertexVersion : version;

#if TRACE
	memset(vertexstats, 0, sizeof(vertexstats));
#endif

	const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices);

	unsigned char* data = buffer;
	unsigned char* data_end = buffer + buffer_size;

	if (size_t(data_end - data) < 1)
		return 0;

	*data++ = (unsigned char)(kVertexHeader | version);

	unsigned char first_vertex[256] = {};
	if (vertex_count > 0)
		memcpy(first_vertex, vertex_data, vertex_size);

	unsigned char last_vertex[256] = {};
	memcpy(last_vertex, first_vertex, vertex_size);

	size_t vertex_block_size = getVertexBlockSize(vertex_size);

	unsigned char channels[64] = {};
	if (version != 0 && level > 1 && vertex_count > 1)
		for (size_t k = 0; k < vertex_size; k += 4)
		{
			int rot = level >= 3 ? estimateRotate(vertex_data, vertex_count, vertex_size, k, /* group_size= */ 16) : 0;
			int channel = estimateChannel(vertex_data, vertex_count, vertex_size, k, vertex_block_size, /* block_skip= */ 3, /* max_channels= */ level >= 3 ? 3 : 2, rot);

			assert(unsigned(channel) < 2 || ((channel & 3) == 2 && unsigned(channel >> 4) < 8));
			channels[k / 4] = (unsigned char)channel;
		}

	size_t vertex_offset = 0;

	while (vertex_offset < vertex_count)
	{
		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;

		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version, level);
		if (!data)
			return 0;

		vertex_offset += block_size;
	}

	size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
	size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1;
	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;

	if (size_t(data_end - data) < tail_size_pad)
		return 0;

	if (tail_size < tail_size_pad)
	{
		memset(data, 0, tail_size_pad - tail_size);
		data += tail_size_pad - tail_size;
	}

	memcpy(data, first_vertex, vertex_size);
	data += vertex_size;

	if (version != 0)
	{
		memcpy(data, channels, vertex_size / 4);
		data += vertex_size / 4;
	}

	assert(data >= buffer + tail_size);
	assert(data <= buffer + buffer_size);

#if TRACE
	size_t total_size = data - buffer;

	for (size_t k = 0; k < vertex_size; ++k)
	{
		const Stats& vsk = vertexstats[k];

		printf("%2d: %7d bytes [%4.1f%%] %.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8);

		size_t total_k = vsk.header + vsk.bitg[1] + vsk.bitg[2] + vsk.bitg[4] + vsk.bitg[8];
		double total_kr = total_k ? 1.0 / double(total_k) : 0;

		if (version != 0)
		{
			int channel = channels[k / 4];

			if ((channel & 3) == 2 && k % 4 == 0)
				printf(" | ^%d", channel >> 4);
			else
				printf(" | %2s", channel == 0 ? "1" : (channel == 1 && k % 2 == 0 ? "2" : "."));
		}

		printf(" | hdr [%5.1f%%] bitg [1 %4.1f%% 2 %4.1f%% 4 %4.1f%% 8 %4.1f%%]",
		    double(vsk.header) * total_kr * 100,
		    double(vsk.bitg[1]) * total_kr * 100, double(vsk.bitg[2]) * total_kr * 100,
		    double(vsk.bitg[4]) * total_kr * 100, double(vsk.bitg[8]) * total_kr * 100);

		size_t total_ctrl = vsk.ctrl[0] + vsk.ctrl[1] + vsk.ctrl[2] + vsk.ctrl[3];

		if (total_ctrl)
		{
			printf(" | ctrl %3.0f%% %3.0f%% %3.0f%% %3.0f%%",
			    double(vsk.ctrl[0]) / double(total_ctrl) * 100, double(vsk.ctrl[1]) / double(total_ctrl) * 100,
			    double(vsk.ctrl[2]) / double(total_ctrl) * 100, double(vsk.ctrl[3]) / double(total_ctrl) * 100);
		}

		if (level >= 3)
			printf(" | bitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]",
			    double(vsk.bitc[0]) / double(vertex_count) * 100, double(vsk.bitc[1]) / double(vertex_count) * 100,
			    double(vsk.bitc[2]) / double(vertex_count) * 100, double(vsk.bitc[3]) / double(vertex_count) * 100,
			    double(vsk.bitc[4]) / double(vertex_count) * 100, double(vsk.bitc[5]) / double(vertex_count) * 100,
			    double(vsk.bitc[6]) / double(vertex_count) * 100, double(vsk.bitc[7]) / double(vertex_count) * 100);

		printf("\n");
	}
#endif

	return data - buffer;
}

size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
{
	return meshopt_encodeVertexBufferLevel(buffer, buffer_size, vertices, vertex_count, vertex_size, meshopt::kEncodeDefaultLevel, meshopt::gEncodeVertexVersion);
}

size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);

	size_t vertex_block_size = getVertexBlockSize(vertex_size);
	size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size;

	size_t vertex_block_control_size = vertex_size / 4;
	size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4;
	size_t vertex_block_data_size = vertex_block_size;

	size_t tail_size = vertex_size + (vertex_size / 4);
	size_t tail_size_min = kTailMinSizeV0 > kTailMinSizeV1 ? kTailMinSizeV0 : kTailMinSizeV1;
	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
	assert(tail_size_pad >= kByteGroupDecodeLimit);

	return 1 + vertex_block_count * vertex_size * (vertex_block_control_size + vertex_block_header_size + vertex_block_data_size) + tail_size_pad;
}

void meshopt_encodeVertexVersion(int version)
{
	assert(unsigned(version) <= unsigned(meshopt::kDecodeVertexVersion));

	meshopt::gEncodeVertexVersion = version;
}

int meshopt_decodeVertexVersion(const unsigned char* buffer, size_t buffer_size)
{
	if (buffer_size < 1)
		return -1;

	unsigned char header = buffer[0];

	if ((header & 0xf0) != meshopt::kVertexHeader)
		return -1;

	int version = header & 0x0f;
	if (version > meshopt::kDecodeVertexVersion)
		return -1;

	return version;
}

int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);

	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256], const unsigned char*, int) = NULL;

#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
	decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
#elif defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
	decode = decodeVertexBlockSimd;
#else
	decode = decodeVertexBlock;
#endif

#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
	assert(gDecodeBytesGroupInitialized);
	(void)gDecodeBytesGroupInitialized;
#endif

	unsigned char* vertex_data = static_cast<unsigned char*>(destination);

	const unsigned char* data = buffer;
	const unsigned char* data_end = buffer + buffer_size;

	if (size_t(data_end - data) < 1)
		return -2;

	unsigned char data_header = *data++;

	if ((data_header & 0xf0) != kVertexHeader)
		return -1;

	int version = data_header & 0x0f;
	if (version > kDecodeVertexVersion)
		return -1;

	size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
	size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1;
	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;

	if (size_t(data_end - data) < tail_size_pad)
		return -2;

	const unsigned char* tail = data_end - tail_size;

	unsigned char last_vertex[256];
	memcpy(last_vertex, tail, vertex_size);

	const unsigned char* channels = version == 0 ? NULL : tail + vertex_size;

	size_t vertex_block_size = getVertexBlockSize(vertex_size);

	size_t vertex_offset = 0;

	while (vertex_offset < vertex_count)
	{
		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;

		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version);
		if (!data)
			return -2;

		vertex_offset += block_size;
	}

	if (size_t(data_end - data) != tail_size_pad)
		return -3;

	return 0;
}

#undef SIMD_NEON
#undef SIMD_SSE
#undef SIMD_AVX
#undef SIMD_WASM
#undef SIMD_FALLBACK
#undef SIMD_TARGET
#undef SIMD_LATENCYOPT