Path: blob/master/thirdparty/meshoptimizer/clusterizer.cpp
20843 views
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details1#include "meshoptimizer.h"23#include <assert.h>4#include <float.h>5#include <math.h>6#include <string.h>78// The block below auto-detects SIMD ISA that can be used on the target platform9#ifndef MESHOPTIMIZER_NO_SIMD10#if defined(__SSE2__) || (defined(_MSC_VER) && defined(_M_X64))11#define SIMD_SSE12#include <emmintrin.h>13#elif defined(__aarch64__) || (defined(_MSC_VER) && defined(_M_ARM64) && _MSC_VER >= 1922)14#define SIMD_NEON15#include <arm_neon.h>16#endif17#endif // !MESHOPTIMIZER_NO_SIMD1819// This work is based on:20// Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 201621// Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 201622// Jack Ritter. An Efficient Bounding Sphere. 199023// Thomas Larsson. Fast and Tight Fitting Bounding Spheres. 200824// Ingo Wald, Vlastimil Havran. On building fast kd-Trees for Ray Tracing, and on doing that in O(N log N). 200625namespace meshopt26{2728// This must be <= 256 since meshlet indices are stored as bytes29const size_t kMeshletMaxVertices = 256;3031// A reasonable limit is around 2*max_vertices or less32const size_t kMeshletMaxTriangles = 512;3334// We keep a limited number of seed triangles and add a few triangles per finished meshlet35const size_t kMeshletMaxSeeds = 256;36const size_t kMeshletAddSeeds = 4;3738// To avoid excessive recursion for malformed inputs, we limit the maximum depth of the tree39const int kMeshletMaxTreeDepth = 50;4041struct TriangleAdjacency242{43unsigned int* counts;44unsigned int* offsets;45unsigned int* data;46};4748static void buildTriangleAdjacency(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)49{50size_t face_count = index_count / 3;5152// allocate arrays53adjacency.counts = allocator.allocate<unsigned int>(vertex_count);54adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);55adjacency.data = allocator.allocate<unsigned int>(index_count);5657// fill triangle counts58memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));5960for (size_t i = 0; i < index_count; ++i)61{62assert(indices[i] < vertex_count);6364adjacency.counts[indices[i]]++;65}6667// fill offset table68unsigned int offset = 0;6970for (size_t i = 0; i < vertex_count; ++i)71{72adjacency.offsets[i] = offset;73offset += adjacency.counts[i];74}7576assert(offset == index_count);7778// fill triangle data79for (size_t i = 0; i < face_count; ++i)80{81unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];8283adjacency.data[adjacency.offsets[a]++] = unsigned(i);84adjacency.data[adjacency.offsets[b]++] = unsigned(i);85adjacency.data[adjacency.offsets[c]++] = unsigned(i);86}8788// fix offsets that have been disturbed by the previous pass89for (size_t i = 0; i < vertex_count; ++i)90{91assert(adjacency.offsets[i] >= adjacency.counts[i]);92adjacency.offsets[i] -= adjacency.counts[i];93}94}9596static void buildTriangleAdjacencySparse(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)97{98size_t face_count = index_count / 3;99100// sparse mode can build adjacency more quickly by ignoring unused vertices, using a bit to mark visited vertices101const unsigned int sparse_seen = 1u << 31;102assert(index_count < sparse_seen);103104// allocate arrays105adjacency.counts = allocator.allocate<unsigned int>(vertex_count);106adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);107adjacency.data = allocator.allocate<unsigned int>(index_count);108109// fill triangle counts110for (size_t i = 0; i < index_count; ++i)111assert(indices[i] < vertex_count);112113for (size_t i = 0; i < index_count; ++i)114adjacency.counts[indices[i]] = 0;115116for (size_t i = 0; i < index_count; ++i)117adjacency.counts[indices[i]]++;118119// fill offset table; uses sparse_seen bit to tag visited vertices120unsigned int offset = 0;121122for (size_t i = 0; i < index_count; ++i)123{124unsigned int v = indices[i];125126if ((adjacency.counts[v] & sparse_seen) == 0)127{128adjacency.offsets[v] = offset;129offset += adjacency.counts[v];130adjacency.counts[v] |= sparse_seen;131}132}133134assert(offset == index_count);135136// fill triangle data137for (size_t i = 0; i < face_count; ++i)138{139unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];140141adjacency.data[adjacency.offsets[a]++] = unsigned(i);142adjacency.data[adjacency.offsets[b]++] = unsigned(i);143adjacency.data[adjacency.offsets[c]++] = unsigned(i);144}145146// fix offsets that have been disturbed by the previous pass147// also fix counts (that were marked with sparse_seen by the first pass)148for (size_t i = 0; i < index_count; ++i)149{150unsigned int v = indices[i];151152if (adjacency.counts[v] & sparse_seen)153{154adjacency.counts[v] &= ~sparse_seen;155156assert(adjacency.offsets[v] >= adjacency.counts[v]);157adjacency.offsets[v] -= adjacency.counts[v];158}159}160}161162static void clearUsed(short* used, size_t vertex_count, const unsigned int* indices, size_t index_count)163{164// for sparse inputs, it's faster to only clear vertices referenced by the index buffer165if (vertex_count <= index_count)166memset(used, -1, vertex_count * sizeof(short));167else168for (size_t i = 0; i < index_count; ++i)169{170assert(indices[i] < vertex_count);171used[indices[i]] = -1;172}173}174175static void computeBoundingSphere(float result[4], const float* points, size_t count, size_t points_stride, const float* radii, size_t radii_stride, size_t axis_count)176{177static const float kAxes[7][3] = {178// X, Y, Z179{1, 0, 0},180{0, 1, 0},181{0, 0, 1},182183// XYZ, -XYZ, X-YZ, XY-Z; normalized to unit length184{0.57735026f, 0.57735026f, 0.57735026f},185{-0.57735026f, 0.57735026f, 0.57735026f},186{0.57735026f, -0.57735026f, 0.57735026f},187{0.57735026f, 0.57735026f, -0.57735026f},188};189190assert(count > 0);191assert(axis_count <= sizeof(kAxes) / sizeof(kAxes[0]));192193size_t points_stride_float = points_stride / sizeof(float);194size_t radii_stride_float = radii_stride / sizeof(float);195196// find extremum points along all axes; for each axis we get a pair of points with min/max coordinates197size_t pmin[7], pmax[7];198float tmin[7], tmax[7];199200for (size_t axis = 0; axis < axis_count; ++axis)201{202pmin[axis] = pmax[axis] = 0;203tmin[axis] = FLT_MAX;204tmax[axis] = -FLT_MAX;205}206207for (size_t i = 0; i < count; ++i)208{209const float* p = points + i * points_stride_float;210float r = radii[i * radii_stride_float];211212for (size_t axis = 0; axis < axis_count; ++axis)213{214const float* ax = kAxes[axis];215216float tp = ax[0] * p[0] + ax[1] * p[1] + ax[2] * p[2];217float tpmin = tp - r, tpmax = tp + r;218219pmin[axis] = (tpmin < tmin[axis]) ? i : pmin[axis];220pmax[axis] = (tpmax > tmax[axis]) ? i : pmax[axis];221tmin[axis] = (tpmin < tmin[axis]) ? tpmin : tmin[axis];222tmax[axis] = (tpmax > tmax[axis]) ? tpmax : tmax[axis];223}224}225226// find the pair of points with largest distance227size_t paxis = 0;228float paxisdr = 0;229230for (size_t axis = 0; axis < axis_count; ++axis)231{232const float* p1 = points + pmin[axis] * points_stride_float;233const float* p2 = points + pmax[axis] * points_stride_float;234float r1 = radii[pmin[axis] * radii_stride_float];235float r2 = radii[pmax[axis] * radii_stride_float];236237float d2 = (p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]);238float dr = sqrtf(d2) + r1 + r2;239240if (dr > paxisdr)241{242paxisdr = dr;243paxis = axis;244}245}246247// use the longest segment as the initial sphere diameter248const float* p1 = points + pmin[paxis] * points_stride_float;249const float* p2 = points + pmax[paxis] * points_stride_float;250float r1 = radii[pmin[paxis] * radii_stride_float];251float r2 = radii[pmax[paxis] * radii_stride_float];252253float paxisd = sqrtf((p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]));254float paxisk = paxisd > 0 ? (paxisd + r2 - r1) / (2 * paxisd) : 0.f;255256float center[3] = {p1[0] + (p2[0] - p1[0]) * paxisk, p1[1] + (p2[1] - p1[1]) * paxisk, p1[2] + (p2[2] - p1[2]) * paxisk};257float radius = paxisdr / 2;258259// iteratively adjust the sphere up until all points fit260for (size_t i = 0; i < count; ++i)261{262const float* p = points + i * points_stride_float;263float r = radii[i * radii_stride_float];264265float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]);266float d = sqrtf(d2);267268if (d + r > radius)269{270float k = d > 0 ? (d + r - radius) / (2 * d) : 0.f;271272center[0] += k * (p[0] - center[0]);273center[1] += k * (p[1] - center[1]);274center[2] += k * (p[2] - center[2]);275radius = (radius + d + r) / 2;276}277}278279result[0] = center[0];280result[1] = center[1];281result[2] = center[2];282result[3] = radius;283}284285struct Cone286{287float px, py, pz;288float nx, ny, nz;289};290291static float getMeshletScore(float distance, float spread, float cone_weight, float expected_radius)292{293float cone = 1.f - spread * cone_weight;294float cone_clamped = cone < 1e-3f ? 1e-3f : cone;295296return (1 + distance / expected_radius * (1 - cone_weight)) * cone_clamped;297}298299static Cone getMeshletCone(const Cone& acc, unsigned int triangle_count)300{301Cone result = acc;302303float center_scale = triangle_count == 0 ? 0.f : 1.f / float(triangle_count);304305result.px *= center_scale;306result.py *= center_scale;307result.pz *= center_scale;308309float axis_length = result.nx * result.nx + result.ny * result.ny + result.nz * result.nz;310float axis_scale = axis_length == 0.f ? 0.f : 1.f / sqrtf(axis_length);311312result.nx *= axis_scale;313result.ny *= axis_scale;314result.nz *= axis_scale;315316return result;317}318319static float computeTriangleCones(Cone* triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)320{321(void)vertex_count;322323size_t vertex_stride_float = vertex_positions_stride / sizeof(float);324size_t face_count = index_count / 3;325326float mesh_area = 0;327328for (size_t i = 0; i < face_count; ++i)329{330unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];331assert(a < vertex_count && b < vertex_count && c < vertex_count);332333const float* p0 = vertex_positions + vertex_stride_float * a;334const float* p1 = vertex_positions + vertex_stride_float * b;335const float* p2 = vertex_positions + vertex_stride_float * c;336337float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};338float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};339340float normalx = p10[1] * p20[2] - p10[2] * p20[1];341float normaly = p10[2] * p20[0] - p10[0] * p20[2];342float normalz = p10[0] * p20[1] - p10[1] * p20[0];343344float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);345float invarea = (area == 0.f) ? 0.f : 1.f / area;346347triangles[i].px = (p0[0] + p1[0] + p2[0]) / 3.f;348triangles[i].py = (p0[1] + p1[1] + p2[1]) / 3.f;349triangles[i].pz = (p0[2] + p1[2] + p2[2]) / 3.f;350351triangles[i].nx = normalx * invarea;352triangles[i].ny = normaly * invarea;353triangles[i].nz = normalz * invarea;354355mesh_area += area;356}357358return mesh_area;359}360361static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, short* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles, bool split = false)362{363short& av = used[a];364short& bv = used[b];365short& cv = used[c];366367bool result = false;368369int used_extra = (av < 0) + (bv < 0) + (cv < 0);370371if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles || split)372{373meshlets[meshlet_offset] = meshlet;374375for (size_t j = 0; j < meshlet.vertex_count; ++j)376used[meshlet_vertices[meshlet.vertex_offset + j]] = -1;377378meshlet.vertex_offset += meshlet.vertex_count;379meshlet.triangle_offset += meshlet.triangle_count * 3;380meshlet.vertex_count = 0;381meshlet.triangle_count = 0;382383result = true;384}385386if (av < 0)387{388av = short(meshlet.vertex_count);389meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = a;390}391392if (bv < 0)393{394bv = short(meshlet.vertex_count);395meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = b;396}397398if (cv < 0)399{400cv = short(meshlet.vertex_count);401meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = c;402}403404meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 0] = (unsigned char)av;405meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 1] = (unsigned char)bv;406meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 2] = (unsigned char)cv;407meshlet.triangle_count++;408409return result;410}411412static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone& meshlet_cone, const unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const short* used, float meshlet_expected_radius, float cone_weight)413{414unsigned int best_triangle = ~0u;415int best_priority = 5;416float best_score = FLT_MAX;417418for (size_t i = 0; i < meshlet.vertex_count; ++i)419{420unsigned int index = meshlet_vertices[meshlet.vertex_offset + i];421422unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];423size_t neighbors_size = adjacency.counts[index];424425for (size_t j = 0; j < neighbors_size; ++j)426{427unsigned int triangle = neighbors[j];428unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];429430int extra = (used[a] < 0) + (used[b] < 0) + (used[c] < 0);431assert(extra <= 2);432433int priority = -1;434435// triangles that don't add new vertices to meshlets are max. priority436if (extra == 0)437priority = 0;438// artificially increase the priority of dangling triangles as they're expensive to add to new meshlets439else if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1)440priority = 1;441// if two vertices have live count of 2, removing this triangle will make another triangle dangling which is good for overall flow442else if ((live_triangles[a] == 2) + (live_triangles[b] == 2) + (live_triangles[c] == 2) >= 2)443priority = 1 + extra;444// otherwise adjust priority to be after the above cases, 3 or 4 based on used[] count445else446priority = 2 + extra;447448// since topology-based priority is always more important than the score, we can skip scoring in some cases449if (priority > best_priority)450continue;451452const Cone& tri_cone = triangles[triangle];453454float dx = tri_cone.px - meshlet_cone.px, dy = tri_cone.py - meshlet_cone.py, dz = tri_cone.pz - meshlet_cone.pz;455float distance = sqrtf(dx * dx + dy * dy + dz * dz);456float spread = tri_cone.nx * meshlet_cone.nx + tri_cone.ny * meshlet_cone.ny + tri_cone.nz * meshlet_cone.nz;457458float score = getMeshletScore(distance, spread, cone_weight, meshlet_expected_radius);459460// note that topology-based priority is always more important than the score461// this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost462if (priority < best_priority || score < best_score)463{464best_triangle = triangle;465best_priority = priority;466best_score = score;467}468}469}470471return best_triangle;472}473474static size_t appendSeedTriangles(unsigned int* seeds, const meshopt_Meshlet& meshlet, const unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, float cornerx, float cornery, float cornerz)475{476unsigned int best_seeds[kMeshletAddSeeds];477unsigned int best_live[kMeshletAddSeeds];478float best_score[kMeshletAddSeeds];479480for (size_t i = 0; i < kMeshletAddSeeds; ++i)481{482best_seeds[i] = ~0u;483best_live[i] = ~0u;484best_score[i] = FLT_MAX;485}486487for (size_t i = 0; i < meshlet.vertex_count; ++i)488{489unsigned int index = meshlet_vertices[meshlet.vertex_offset + i];490491unsigned int best_neighbor = ~0u;492unsigned int best_neighbor_live = ~0u;493494// find the neighbor with the smallest live metric495unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];496size_t neighbors_size = adjacency.counts[index];497498for (size_t j = 0; j < neighbors_size; ++j)499{500unsigned int triangle = neighbors[j];501unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];502503unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c];504505if (live < best_neighbor_live)506{507best_neighbor = triangle;508best_neighbor_live = live;509}510}511512// add the neighbor to the list of seeds; the list is unsorted and the replacement criteria is approximate513if (best_neighbor == ~0u)514continue;515516float dx = triangles[best_neighbor].px - cornerx, dy = triangles[best_neighbor].py - cornery, dz = triangles[best_neighbor].pz - cornerz;517float best_neighbor_score = sqrtf(dx * dx + dy * dy + dz * dz);518519for (size_t j = 0; j < kMeshletAddSeeds; ++j)520{521// non-strict comparison reduces the number of duplicate seeds (triangles adjacent to multiple vertices)522if (best_neighbor_live < best_live[j] || (best_neighbor_live == best_live[j] && best_neighbor_score <= best_score[j]))523{524best_seeds[j] = best_neighbor;525best_live[j] = best_neighbor_live;526best_score[j] = best_neighbor_score;527break;528}529}530}531532// add surviving seeds to the meshlet533size_t seed_count = 0;534535for (size_t i = 0; i < kMeshletAddSeeds; ++i)536if (best_seeds[i] != ~0u)537seeds[seed_count++] = best_seeds[i];538539return seed_count;540}541542static size_t pruneSeedTriangles(unsigned int* seeds, size_t seed_count, const unsigned char* emitted_flags)543{544size_t result = 0;545546for (size_t i = 0; i < seed_count; ++i)547{548unsigned int index = seeds[i];549550seeds[result] = index;551result += emitted_flags[index] == 0;552}553554return result;555}556557static unsigned int selectSeedTriangle(const unsigned int* seeds, size_t seed_count, const unsigned int* indices, const Cone* triangles, const unsigned int* live_triangles, float cornerx, float cornery, float cornerz)558{559unsigned int best_seed = ~0u;560unsigned int best_live = ~0u;561float best_score = FLT_MAX;562563for (size_t i = 0; i < seed_count; ++i)564{565unsigned int index = seeds[i];566unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];567568unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c];569float dx = triangles[index].px - cornerx, dy = triangles[index].py - cornery, dz = triangles[index].pz - cornerz;570float score = sqrtf(dx * dx + dy * dy + dz * dz);571572if (live < best_live || (live == best_live && score < best_score))573{574best_seed = index;575best_live = live;576best_score = score;577}578}579580return best_seed;581}582583struct KDNode584{585union586{587float split;588unsigned int index;589};590591// leaves: axis = 3, children = number of points including this one592// branches: axis != 3, left subtree = skip 1, right subtree = skip 1+children593unsigned int axis : 2;594unsigned int children : 30;595};596597static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, int axis, float pivot)598{599size_t m = 0;600601// invariant: elements in range [0, m) are < pivot, elements in range [m, i) are >= pivot602for (size_t i = 0; i < count; ++i)603{604float v = points[indices[i] * stride + axis];605606// swap(m, i) unconditionally607unsigned int t = indices[m];608indices[m] = indices[i];609indices[i] = t;610611// when v >= pivot, we swap i with m without advancing it, preserving invariants612m += v < pivot;613}614615return m;616}617618static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, unsigned int* indices, size_t count)619{620assert(offset + count <= node_count);621(void)node_count;622623KDNode& result = nodes[offset];624625result.index = indices[0];626result.axis = 3;627result.children = unsigned(count);628629// all remaining points are stored in nodes immediately following the leaf630for (size_t i = 1; i < count; ++i)631{632KDNode& tail = nodes[offset + i];633634tail.index = indices[i];635tail.axis = 3;636tail.children = ~0u >> 2; // bogus value to prevent misuse637}638639return offset + count;640}641642static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size, int depth)643{644assert(count > 0);645assert(offset < node_count);646647if (count <= leaf_size)648return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);649650float mean[3] = {};651float vars[3] = {};652float runc = 1, runs = 1;653654// gather statistics on the points in the subtree using Welford's algorithm655for (size_t i = 0; i < count; ++i, runc += 1.f, runs = 1.f / runc)656{657const float* point = points + indices[i] * stride;658659for (int k = 0; k < 3; ++k)660{661float delta = point[k] - mean[k];662mean[k] += delta * runs;663vars[k] += delta * (point[k] - mean[k]);664}665}666667// split axis is one where the variance is largest668int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2);669670float split = mean[axis];671size_t middle = kdtreePartition(indices, count, points, stride, axis, split);672673// when the partition is degenerate simply consolidate the points into a single node674// this also ensures recursion depth is bounded on pathological inputs675if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2 || depth >= kMeshletMaxTreeDepth)676return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);677678KDNode& result = nodes[offset];679680result.split = split;681result.axis = axis;682683// left subtree is right after our node684size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size, depth + 1);685686// distance to the right subtree is represented explicitly687assert(next_offset - offset > 1);688result.children = unsigned(next_offset - offset - 1);689690return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size, depth + 1);691}692693static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, unsigned int& result, float& limit)694{695const KDNode& node = nodes[root];696697if (node.children == 0)698return;699700if (node.axis == 3)701{702// leaf703bool inactive = true;704705for (unsigned int i = 0; i < node.children; ++i)706{707unsigned int index = nodes[root + i].index;708709if (emitted_flags[index])710continue;711712inactive = false;713714const float* point = points + index * stride;715716float dx = point[0] - position[0], dy = point[1] - position[1], dz = point[2] - position[2];717float distance = sqrtf(dx * dx + dy * dy + dz * dz);718719if (distance < limit)720{721result = index;722limit = distance;723}724}725726// deactivate leaves that no longer have items to emit727if (inactive)728nodes[root].children = 0;729}730else731{732// branch; we order recursion to process the node that search position is in first733float delta = position[node.axis] - node.split;734unsigned int first = (delta <= 0) ? 0 : node.children;735unsigned int second = first ^ node.children;736737// deactivate branches that no longer have items to emit to accelerate traversal738// note that we do this *before* recursing which delays deactivation but keeps tail calls739if ((nodes[root + 1 + first].children | nodes[root + 1 + second].children) == 0)740nodes[root].children = 0;741742// recursion depth is bounded by tree depth (which is limited by construction)743kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, result, limit);744745// only process the other node if it can have a match based on closest distance so far746if (fabsf(delta) <= limit)747kdtreeNearest(nodes, root + 1 + second, points, stride, emitted_flags, position, result, limit);748}749}750751struct BVHBoxT752{753float min[4];754float max[4];755};756757struct BVHBox758{759float min[3];760float max[3];761};762763#if defined(SIMD_SSE)764static float boxMerge(BVHBoxT& box, const BVHBox& other)765{766__m128 min = _mm_loadu_ps(box.min);767__m128 max = _mm_loadu_ps(box.max);768769// note: over-read is safe because BVHBox array is allocated with padding770min = _mm_min_ps(min, _mm_loadu_ps(other.min));771max = _mm_max_ps(max, _mm_loadu_ps(other.max));772773_mm_storeu_ps(box.min, min);774_mm_storeu_ps(box.max, max);775776__m128 size = _mm_sub_ps(max, min);777__m128 size_yzx = _mm_shuffle_ps(size, size, _MM_SHUFFLE(0, 0, 2, 1));778__m128 mul = _mm_mul_ps(size, size_yzx);779__m128 sum_xy = _mm_add_ss(mul, _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 1, 1, 1)));780__m128 sum_xyz = _mm_add_ss(sum_xy, _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 2, 2, 2)));781782return _mm_cvtss_f32(sum_xyz);783}784#elif defined(SIMD_NEON)785static float boxMerge(BVHBoxT& box, const BVHBox& other)786{787float32x4_t min = vld1q_f32(box.min);788float32x4_t max = vld1q_f32(box.max);789790// note: over-read is safe because BVHBox array is allocated with padding791min = vminq_f32(min, vld1q_f32(other.min));792max = vmaxq_f32(max, vld1q_f32(other.max));793794vst1q_f32(box.min, min);795vst1q_f32(box.max, max);796797float32x4_t size = vsubq_f32(max, min);798float32x4_t size_yzx = vextq_f32(vextq_f32(size, size, 3), size, 2);799float32x4_t mul = vmulq_f32(size, size_yzx);800float sum_xy = vgetq_lane_f32(mul, 0) + vgetq_lane_f32(mul, 1);801float sum_xyz = sum_xy + vgetq_lane_f32(mul, 2);802803return sum_xyz;804}805#else806static float boxMerge(BVHBoxT& box, const BVHBox& other)807{808for (int k = 0; k < 3; ++k)809{810box.min[k] = other.min[k] < box.min[k] ? other.min[k] : box.min[k];811box.max[k] = other.max[k] > box.max[k] ? other.max[k] : box.max[k];812}813814float sx = box.max[0] - box.min[0], sy = box.max[1] - box.min[1], sz = box.max[2] - box.min[2];815return sx * sy + sx * sz + sy * sz;816}817#endif818819inline unsigned int radixFloat(unsigned int v)820{821// if sign bit is 0, flip sign bit822// if sign bit is 1, flip everything823unsigned int mask = (int(v) >> 31) | 0x80000000;824return v ^ mask;825}826827static void computeHistogram(unsigned int (&hist)[1024][3], const float* data, size_t count)828{829memset(hist, 0, sizeof(hist));830831const unsigned int* bits = reinterpret_cast<const unsigned int*>(data);832833// compute 3 10-bit histograms in parallel (dropping 2 LSB)834for (size_t i = 0; i < count; ++i)835{836unsigned int id = radixFloat(bits[i]);837838hist[(id >> 2) & 1023][0]++;839hist[(id >> 12) & 1023][1]++;840hist[(id >> 22) & 1023][2]++;841}842843unsigned int sum0 = 0, sum1 = 0, sum2 = 0;844845// replace histogram data with prefix histogram sums in-place846for (int i = 0; i < 1024; ++i)847{848unsigned int hx = hist[i][0], hy = hist[i][1], hz = hist[i][2];849850hist[i][0] = sum0;851hist[i][1] = sum1;852hist[i][2] = sum2;853854sum0 += hx;855sum1 += hy;856sum2 += hz;857}858859assert(sum0 == count && sum1 == count && sum2 == count);860}861862static void radixPass(unsigned int* destination, const unsigned int* source, const float* keys, size_t count, unsigned int (&hist)[1024][3], int pass)863{864const unsigned int* bits = reinterpret_cast<const unsigned int*>(keys);865int bitoff = pass * 10 + 2; // drop 2 LSB to be able to use 3 10-bit passes866867for (size_t i = 0; i < count; ++i)868{869unsigned int id = (radixFloat(bits[source[i]]) >> bitoff) & 1023;870871destination[hist[id][pass]++] = source[i];872}873}874875static void bvhPrepare(BVHBox* boxes, float* centroids, const unsigned int* indices, size_t face_count, const float* vertex_positions, size_t vertex_count, size_t vertex_stride_float)876{877(void)vertex_count;878879for (size_t i = 0; i < face_count; ++i)880{881unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];882assert(a < vertex_count && b < vertex_count && c < vertex_count);883884const float* va = vertex_positions + vertex_stride_float * a;885const float* vb = vertex_positions + vertex_stride_float * b;886const float* vc = vertex_positions + vertex_stride_float * c;887888BVHBox& box = boxes[i];889890for (int k = 0; k < 3; ++k)891{892box.min[k] = va[k] < vb[k] ? va[k] : vb[k];893box.min[k] = vc[k] < box.min[k] ? vc[k] : box.min[k];894895box.max[k] = va[k] > vb[k] ? va[k] : vb[k];896box.max[k] = vc[k] > box.max[k] ? vc[k] : box.max[k];897898centroids[i + face_count * k] = (box.min[k] + box.max[k]) / 2.f;899}900}901}902903static size_t bvhCountVertices(const unsigned int* order, size_t count, short* used, const unsigned int* indices, unsigned int* out = NULL)904{905// count number of unique vertices906size_t used_vertices = 0;907for (size_t i = 0; i < count; ++i)908{909unsigned int index = order[i];910unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];911912used_vertices += (used[a] < 0) + (used[b] < 0) + (used[c] < 0);913used[a] = used[b] = used[c] = 1;914915if (out)916out[i] = unsigned(used_vertices);917}918919// reset used[] for future invocations920for (size_t i = 0; i < count; ++i)921{922unsigned int index = order[i];923unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];924925used[a] = used[b] = used[c] = -1;926}927928return used_vertices;929}930931static void bvhPackLeaf(unsigned char* boundary, size_t count)932{933// mark meshlet boundary for future reassembly934assert(count > 0);935936boundary[0] = 1;937memset(boundary + 1, 0, count - 1);938}939940static void bvhPackTail(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices, size_t max_triangles)941{942for (size_t i = 0; i < count;)943{944size_t chunk = i + max_triangles <= count ? max_triangles : count - i;945946if (bvhCountVertices(order + i, chunk, used, indices) <= max_vertices)947{948bvhPackLeaf(boundary + i, chunk);949i += chunk;950continue;951}952953// chunk is vertex bound, split it into smaller meshlets954assert(chunk > max_vertices / 3);955956bvhPackLeaf(boundary + i, max_vertices / 3);957i += max_vertices / 3;958}959}960961static bool bvhDivisible(size_t count, size_t min, size_t max)962{963// count is representable as a sum of values in [min..max] if if it in range of [k*min..k*min+k*(max-min)]964// equivalent to ceil(count / max) <= floor(count / min), but the form below allows using idiv (see nv_cluster_builder)965// we avoid expensive integer divisions in the common case where min is <= max/2966return min * 2 <= max ? count >= min : count % min <= (count / min) * (max - min);967}968969static void bvhComputeArea(float* areas, const BVHBox* boxes, const unsigned int* order, size_t count)970{971BVHBoxT accuml = {{FLT_MAX, FLT_MAX, FLT_MAX, 0}, {-FLT_MAX, -FLT_MAX, -FLT_MAX, 0}};972BVHBoxT accumr = accuml;973974for (size_t i = 0; i < count; ++i)975{976float larea = boxMerge(accuml, boxes[order[i]]);977float rarea = boxMerge(accumr, boxes[order[count - 1 - i]]);978979areas[i] = larea;980areas[i + count] = rarea;981}982}983984static size_t bvhPivot(const float* areas, const unsigned int* vertices, size_t count, size_t step, size_t min, size_t max, float fill, size_t maxfill, float* out_cost)985{986bool aligned = count >= min * 2 && bvhDivisible(count, min, max);987size_t end = aligned ? count - min : count - 1;988989float rmaxfill = 1.f / float(int(maxfill));990991// find best split that minimizes SAH992size_t bestsplit = 0;993float bestcost = FLT_MAX;994995for (size_t i = min - 1; i < end; i += step)996{997size_t lsplit = i + 1, rsplit = count - (i + 1);998999if (!bvhDivisible(lsplit, min, max))1000continue;1001if (aligned && !bvhDivisible(rsplit, min, max))1002continue;10031004// areas[x] = inclusive surface area of boxes[0..x]1005// areas[count-1-x] = inclusive surface area of boxes[x..count-1]1006float larea = areas[i], rarea = areas[(count - 1 - (i + 1)) + count];1007float cost = larea * float(int(lsplit)) + rarea * float(int(rsplit));10081009if (cost > bestcost)1010continue;10111012// use vertex fill when splitting vertex limited clusters; note that we use the same (left->right) vertex count1013// using bidirectional vertex counts is a little more expensive to compute and produces slightly worse results in practice1014size_t lfill = vertices ? vertices[i] : lsplit;1015size_t rfill = vertices ? vertices[i] : rsplit;10161017// fill cost; use floating point math to round up to maxfill to avoid expensive integer modulo1018int lrest = int(float(int(lfill + maxfill - 1)) * rmaxfill) * int(maxfill) - int(lfill);1019int rrest = int(float(int(rfill + maxfill - 1)) * rmaxfill) * int(maxfill) - int(rfill);10201021cost += fill * (float(lrest) * larea + float(rrest) * rarea);10221023if (cost < bestcost)1024{1025bestcost = cost;1026bestsplit = i + 1;1027}1028}10291030*out_cost = bestcost;1031return bestsplit;1032}10331034static void bvhPartition(unsigned int* target, const unsigned int* order, const unsigned char* sides, size_t split, size_t count)1035{1036size_t l = 0, r = split;10371038for (size_t i = 0; i < count; ++i)1039{1040unsigned char side = sides[order[i]];1041target[side ? r : l] = order[i];1042l += 1;1043l -= side;1044r += side;1045}10461047assert(l == split && r == count);1048}10491050static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, unsigned char* boundary, size_t count, int depth, void* scratch, short* used, const unsigned int* indices, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)1051{1052if (count <= max_triangles && bvhCountVertices(orderx, count, used, indices) <= max_vertices)1053return bvhPackLeaf(boundary, count);10541055unsigned int* axes[3] = {orderx, ordery, orderz};10561057// we can use step=1 unconditionally but to reduce the cost for min=max case we use step=max1058size_t step = min_triangles == max_triangles && count > max_triangles ? max_triangles : 1;10591060// if we could not pack the meshlet, we must be vertex bound1061size_t mint = count <= max_triangles && max_vertices / 3 < min_triangles ? max_vertices / 3 : min_triangles;1062size_t maxfill = count <= max_triangles ? max_vertices : max_triangles;10631064// find best split that minimizes SAH1065int bestk = -1;1066size_t bestsplit = 0;1067float bestcost = FLT_MAX;10681069for (int k = 0; k < 3; ++k)1070{1071float* areas = static_cast<float*>(scratch);1072unsigned int* vertices = NULL;10731074bvhComputeArea(areas, boxes, axes[k], count);10751076if (count <= max_triangles)1077{1078// for vertex bound clusters, count number of unique vertices for each split1079vertices = reinterpret_cast<unsigned int*>(areas + 2 * count);1080bvhCountVertices(axes[k], count, used, indices, vertices);1081}10821083float axiscost = FLT_MAX;1084size_t axissplit = bvhPivot(areas, vertices, count, step, mint, max_triangles, fill_weight, maxfill, &axiscost);10851086if (axissplit && axiscost < bestcost)1087{1088bestk = k;1089bestcost = axiscost;1090bestsplit = axissplit;1091}1092}10931094// this may happen if SAH costs along the admissible splits are NaN, or due to imbalanced splits on pathological inputs1095if (bestk < 0 || depth >= kMeshletMaxTreeDepth)1096return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles);10971098// mark sides of split for partitioning1099unsigned char* sides = static_cast<unsigned char*>(scratch) + count * sizeof(unsigned int);11001101for (size_t i = 0; i < bestsplit; ++i)1102sides[axes[bestk][i]] = 0;11031104for (size_t i = bestsplit; i < count; ++i)1105sides[axes[bestk][i]] = 1;11061107// partition all axes into two sides, maintaining order1108unsigned int* temp = static_cast<unsigned int*>(scratch);11091110for (int k = 0; k < 3; ++k)1111{1112if (k == bestk)1113continue;11141115unsigned int* axis = axes[k];1116memcpy(temp, axis, sizeof(unsigned int) * count);1117bvhPartition(axis, temp, sides, bestsplit, count);1118}11191120// recursion depth is bounded due to max depth check above1121bvhSplit(boxes, orderx, ordery, orderz, boundary, bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);1122bvhSplit(boxes, orderx + bestsplit, ordery + bestsplit, orderz + bestsplit, boundary + bestsplit, count - bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);1123}11241125} // namespace meshopt11261127size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles)1128{1129using namespace meshopt;11301131assert(index_count % 3 == 0);1132assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);1133assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);11341135(void)kMeshletMaxVertices;1136(void)kMeshletMaxTriangles;11371138// meshlet construction is limited by max vertices and max triangles per meshlet1139// the worst case is that the input is an unindexed stream since this equally stresses both limits1140// note that we assume that in the worst case, we leave 2 vertices unpacked in each meshlet - if we have space for 3 we can pack any triangle1141size_t max_vertices_conservative = max_vertices - 2;1142size_t meshlet_limit_vertices = (index_count + max_vertices_conservative - 1) / max_vertices_conservative;1143size_t meshlet_limit_triangles = (index_count / 3 + max_triangles - 1) / max_triangles;11441145return meshlet_limit_vertices > meshlet_limit_triangles ? meshlet_limit_vertices : meshlet_limit_triangles;1146}11471148size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor)1149{1150using namespace meshopt;11511152assert(index_count % 3 == 0);1153assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);1154assert(vertex_positions_stride % sizeof(float) == 0);11551156assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);1157assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles);11581159assert(cone_weight >= 0 && cone_weight <= 1);1160assert(split_factor >= 0);11611162if (index_count == 0)1163return 0;11641165meshopt_Allocator allocator;11661167TriangleAdjacency2 adjacency = {};1168if (vertex_count > index_count && index_count < (1u << 31))1169buildTriangleAdjacencySparse(adjacency, indices, index_count, vertex_count, allocator);1170else1171buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);11721173// live triangle counts; note, we alias adjacency.counts as we remove triangles after emitting them so the counts always match1174unsigned int* live_triangles = adjacency.counts;11751176size_t face_count = index_count / 3;11771178unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);1179memset(emitted_flags, 0, face_count);11801181// for each triangle, precompute centroid & normal to use for scoring1182Cone* triangles = allocator.allocate<Cone>(face_count);1183float mesh_area = computeTriangleCones(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride);11841185// assuming each meshlet is a square patch, expected radius is sqrt(expected area)1186float triangle_area_avg = face_count == 0 ? 0.f : mesh_area / float(face_count) * 0.5f;1187float meshlet_expected_radius = sqrtf(triangle_area_avg * max_triangles) * 0.5f;11881189// build a kd-tree for nearest neighbor lookup1190unsigned int* kdindices = allocator.allocate<unsigned int>(face_count);1191for (size_t i = 0; i < face_count; ++i)1192kdindices[i] = unsigned(i);11931194KDNode* nodes = allocator.allocate<KDNode>(face_count * 2);1195kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8, 0);11961197// find a specific corner of the mesh to use as a starting point for meshlet flow1198float cornerx = FLT_MAX, cornery = FLT_MAX, cornerz = FLT_MAX;11991200for (size_t i = 0; i < face_count; ++i)1201{1202const Cone& tri = triangles[i];12031204cornerx = cornerx > tri.px ? tri.px : cornerx;1205cornery = cornery > tri.py ? tri.py : cornery;1206cornerz = cornerz > tri.pz ? tri.pz : cornerz;1207}12081209// index of the vertex in the meshlet, -1 if the vertex isn't used1210short* used = allocator.allocate<short>(vertex_count);1211clearUsed(used, vertex_count, indices, index_count);12121213// initial seed triangle is the one closest to the corner1214unsigned int initial_seed = ~0u;1215float initial_score = FLT_MAX;12161217for (size_t i = 0; i < face_count; ++i)1218{1219const Cone& tri = triangles[i];12201221float dx = tri.px - cornerx, dy = tri.py - cornery, dz = tri.pz - cornerz;1222float score = sqrtf(dx * dx + dy * dy + dz * dz);12231224if (initial_seed == ~0u || score < initial_score)1225{1226initial_seed = unsigned(i);1227initial_score = score;1228}1229}12301231// seed triangles to continue meshlet flow1232unsigned int seeds[kMeshletMaxSeeds] = {};1233size_t seed_count = 0;12341235meshopt_Meshlet meshlet = {};1236size_t meshlet_offset = 0;12371238Cone meshlet_cone_acc = {};12391240for (;;)1241{1242Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count);12431244unsigned int best_triangle = ~0u;12451246// for the first triangle, we don't have a meshlet cone yet, so we use the initial seed1247// to continue the meshlet, we select an adjacent triangle based on connectivity and spatial scoring1248if (meshlet_offset == 0 && meshlet.triangle_count == 0)1249best_triangle = initial_seed;1250else1251best_triangle = getNeighborTriangle(meshlet, meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight);12521253bool split = false;12541255// when we run out of adjacent triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity1256if (best_triangle == ~0u)1257{1258float position[3] = {meshlet_cone.px, meshlet_cone.py, meshlet_cone.pz};1259unsigned int index = ~0u;1260float distance = FLT_MAX;12611262kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, distance);12631264best_triangle = index;1265split = meshlet.triangle_count >= min_triangles && split_factor > 0 && distance > meshlet_expected_radius * split_factor;1266}12671268if (best_triangle == ~0u)1269break;12701271int best_extra = (used[indices[best_triangle * 3 + 0]] < 0) + (used[indices[best_triangle * 3 + 1]] < 0) + (used[indices[best_triangle * 3 + 2]] < 0);12721273// if the best triangle doesn't fit into current meshlet, we re-select using seeds to maintain global flow1274if (split || (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles))1275{1276seed_count = pruneSeedTriangles(seeds, seed_count, emitted_flags);1277seed_count = (seed_count + kMeshletAddSeeds <= kMeshletMaxSeeds) ? seed_count : kMeshletMaxSeeds - kMeshletAddSeeds;1278seed_count += appendSeedTriangles(seeds + seed_count, meshlet, meshlet_vertices, indices, adjacency, triangles, live_triangles, cornerx, cornery, cornerz);12791280unsigned int best_seed = selectSeedTriangle(seeds, seed_count, indices, triangles, live_triangles, cornerx, cornery, cornerz);12811282// we may not find a valid seed triangle if the mesh is disconnected as seeds are based on adjacency1283best_triangle = best_seed != ~0u ? best_seed : best_triangle;1284}12851286unsigned int a = indices[best_triangle * 3 + 0], b = indices[best_triangle * 3 + 1], c = indices[best_triangle * 3 + 2];1287assert(a < vertex_count && b < vertex_count && c < vertex_count);12881289// add meshlet to the output; when the current meshlet is full we reset the accumulated bounds1290if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles, split))1291{1292meshlet_offset++;1293memset(&meshlet_cone_acc, 0, sizeof(meshlet_cone_acc));1294}12951296// remove emitted triangle from adjacency data1297// this makes sure that we spend less time traversing these lists on subsequent iterations1298// live triangle counts are updated as a byproduct of these adjustments1299for (size_t k = 0; k < 3; ++k)1300{1301unsigned int index = indices[best_triangle * 3 + k];13021303unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];1304size_t neighbors_size = adjacency.counts[index];13051306for (size_t i = 0; i < neighbors_size; ++i)1307{1308unsigned int tri = neighbors[i];13091310if (tri == best_triangle)1311{1312neighbors[i] = neighbors[neighbors_size - 1];1313adjacency.counts[index]--;1314break;1315}1316}1317}13181319// update aggregated meshlet cone data for scoring subsequent triangles1320meshlet_cone_acc.px += triangles[best_triangle].px;1321meshlet_cone_acc.py += triangles[best_triangle].py;1322meshlet_cone_acc.pz += triangles[best_triangle].pz;1323meshlet_cone_acc.nx += triangles[best_triangle].nx;1324meshlet_cone_acc.ny += triangles[best_triangle].ny;1325meshlet_cone_acc.nz += triangles[best_triangle].nz;13261327assert(!emitted_flags[best_triangle]);1328emitted_flags[best_triangle] = 1;1329}13301331if (meshlet.triangle_count)1332meshlets[meshlet_offset++] = meshlet;13331334assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles));1335assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count);1336return meshlet_offset;1337}13381339size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)1340{1341return meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, max_triangles, max_triangles, cone_weight, 0.0f);1342}13431344size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)1345{1346using namespace meshopt;13471348assert(index_count % 3 == 0);13491350assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);1351assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);13521353meshopt_Allocator allocator;13541355// index of the vertex in the meshlet, -1 if the vertex isn't used1356short* used = allocator.allocate<short>(vertex_count);1357clearUsed(used, vertex_count, indices, index_count);13581359meshopt_Meshlet meshlet = {};1360size_t meshlet_offset = 0;13611362for (size_t i = 0; i < index_count; i += 3)1363{1364unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];1365assert(a < vertex_count && b < vertex_count && c < vertex_count);13661367// appends triangle to the meshlet and writes previous meshlet to the output if full1368meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles);1369}13701371if (meshlet.triangle_count)1372meshlets[meshlet_offset++] = meshlet;13731374assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));1375assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count);1376return meshlet_offset;1377}13781379size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)1380{1381using namespace meshopt;13821383assert(index_count % 3 == 0);1384assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);1385assert(vertex_positions_stride % sizeof(float) == 0);13861387assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);1388assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles);13891390if (index_count == 0)1391return 0;13921393size_t face_count = index_count / 3;1394size_t vertex_stride_float = vertex_positions_stride / sizeof(float);13951396meshopt_Allocator allocator;13971398// 3 floats plus 1 uint for sorting, or1399// 2 floats plus 1 uint for pivoting, or1400// 1 uint plus 1 byte for partitioning1401float* scratch = allocator.allocate<float>(face_count * 4);14021403// compute bounding boxes and centroids for sorting1404BVHBox* boxes = allocator.allocate<BVHBox>(face_count + 1); // padding for SIMD1405bvhPrepare(boxes, scratch, indices, face_count, vertex_positions, vertex_count, vertex_stride_float);1406memset(boxes + face_count, 0, sizeof(BVHBox));14071408unsigned int* axes = allocator.allocate<unsigned int>(face_count * 3);1409unsigned int* temp = reinterpret_cast<unsigned int*>(scratch) + face_count * 3;14101411for (int k = 0; k < 3; ++k)1412{1413unsigned int* order = axes + k * face_count;1414const float* keys = scratch + k * face_count;14151416unsigned int hist[1024][3];1417computeHistogram(hist, keys, face_count);14181419// 3-pass radix sort computes the resulting order into axes1420for (size_t i = 0; i < face_count; ++i)1421temp[i] = unsigned(i);14221423radixPass(order, temp, keys, face_count, hist, 0);1424radixPass(temp, order, keys, face_count, hist, 1);1425radixPass(order, temp, keys, face_count, hist, 2);1426}14271428// index of the vertex in the meshlet, -1 if the vertex isn't used1429short* used = allocator.allocate<short>(vertex_count);1430clearUsed(used, vertex_count, indices, index_count);14311432unsigned char* boundary = allocator.allocate<unsigned char>(face_count);14331434bvhSplit(boxes, &axes[0], &axes[face_count], &axes[face_count * 2], boundary, face_count, 0, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);14351436// compute the desired number of meshlets; note that on some meshes with a lot of vertex bound clusters this might go over the bound1437size_t meshlet_count = 0;1438for (size_t i = 0; i < face_count; ++i)1439{1440assert(boundary[i] <= 1);1441meshlet_count += boundary[i];1442}14431444size_t meshlet_bound = meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles);14451446// pack triangles into meshlets according to the order and boundaries marked by bvhSplit1447meshopt_Meshlet meshlet = {};1448size_t meshlet_offset = 0;1449size_t meshlet_pending = meshlet_count;14501451for (size_t i = 0; i < face_count; ++i)1452{1453assert(boundary[i] <= 1);1454bool split = i > 0 && boundary[i] == 1;14551456// while we are over the limit, we ignore boundary[] data and disable splits until we free up enough space1457if (split && meshlet_count > meshlet_bound && meshlet_offset + meshlet_pending >= meshlet_bound)1458split = false;14591460unsigned int index = axes[i];1461assert(index < face_count);14621463unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];14641465// appends triangle to the meshlet and writes previous meshlet to the output if full1466meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles, split);1467meshlet_pending -= boundary[i];1468}14691470if (meshlet.triangle_count)1471meshlets[meshlet_offset++] = meshlet;14721473assert(meshlet_offset <= meshlet_bound);1474assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count);1475return meshlet_offset;1476}14771478meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)1479{1480using namespace meshopt;14811482assert(index_count % 3 == 0);1483assert(index_count / 3 <= kMeshletMaxTriangles);1484assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);1485assert(vertex_positions_stride % sizeof(float) == 0);14861487(void)vertex_count;14881489size_t vertex_stride_float = vertex_positions_stride / sizeof(float);14901491// compute triangle normals and gather triangle corners1492float normals[kMeshletMaxTriangles][3];1493float corners[kMeshletMaxTriangles][3][3];1494size_t triangles = 0;14951496for (size_t i = 0; i < index_count; i += 3)1497{1498unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];1499assert(a < vertex_count && b < vertex_count && c < vertex_count);15001501const float* p0 = vertex_positions + vertex_stride_float * a;1502const float* p1 = vertex_positions + vertex_stride_float * b;1503const float* p2 = vertex_positions + vertex_stride_float * c;15041505float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};1506float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};15071508float normalx = p10[1] * p20[2] - p10[2] * p20[1];1509float normaly = p10[2] * p20[0] - p10[0] * p20[2];1510float normalz = p10[0] * p20[1] - p10[1] * p20[0];15111512float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);15131514// no need to include degenerate triangles - they will be invisible anyway1515if (area == 0.f)1516continue;15171518// record triangle normals & corners for future use; normal and corner 0 define a plane equation1519normals[triangles][0] = normalx / area;1520normals[triangles][1] = normaly / area;1521normals[triangles][2] = normalz / area;1522memcpy(corners[triangles][0], p0, 3 * sizeof(float));1523memcpy(corners[triangles][1], p1, 3 * sizeof(float));1524memcpy(corners[triangles][2], p2, 3 * sizeof(float));1525triangles++;1526}15271528meshopt_Bounds bounds = {};15291530// degenerate cluster, no valid triangles => trivial reject (cone data is 0)1531if (triangles == 0)1532return bounds;15331534const float rzero = 0.f;15351536// compute cluster bounding sphere; we'll use the center to determine normal cone apex as well1537float psphere[4] = {};1538computeBoundingSphere(psphere, corners[0][0], triangles * 3, sizeof(float) * 3, &rzero, 0, 7);15391540float center[3] = {psphere[0], psphere[1], psphere[2]};15411542// treating triangle normals as points, find the bounding sphere - the sphere center determines the optimal cone axis1543float nsphere[4] = {};1544computeBoundingSphere(nsphere, normals[0], triangles, sizeof(float) * 3, &rzero, 0, 3);15451546float axis[3] = {nsphere[0], nsphere[1], nsphere[2]};1547float axislength = sqrtf(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]);1548float invaxislength = axislength == 0.f ? 0.f : 1.f / axislength;15491550axis[0] *= invaxislength;1551axis[1] *= invaxislength;1552axis[2] *= invaxislength;15531554// compute a tight cone around all normals, mindp = cos(angle/2)1555float mindp = 1.f;15561557for (size_t i = 0; i < triangles; ++i)1558{1559float dp = normals[i][0] * axis[0] + normals[i][1] * axis[1] + normals[i][2] * axis[2];15601561mindp = (dp < mindp) ? dp : mindp;1562}15631564// fill bounding sphere info; note that below we can return bounds without cone information for degenerate cones1565bounds.center[0] = center[0];1566bounds.center[1] = center[1];1567bounds.center[2] = center[2];1568bounds.radius = psphere[3];15691570// degenerate cluster, normal cone is larger than a hemisphere => trivial accept1571// note that if mindp is positive but close to 0, the triangle intersection code below gets less stable1572// we arbitrarily decide that if a normal cone is ~168 degrees wide or more, the cone isn't useful1573if (mindp <= 0.1f)1574{1575bounds.cone_cutoff = 1;1576bounds.cone_cutoff_s8 = 127;1577return bounds;1578}15791580float maxt = 0;15811582// we need to find the point on center-t*axis ray that lies in negative half-space of all triangles1583for (size_t i = 0; i < triangles; ++i)1584{1585// dot(center-t*axis-corner, trinormal) = 01586// dot(center-corner, trinormal) - t * dot(axis, trinormal) = 01587float cx = center[0] - corners[i][0][0];1588float cy = center[1] - corners[i][0][1];1589float cz = center[2] - corners[i][0][2];15901591float dc = cx * normals[i][0] + cy * normals[i][1] + cz * normals[i][2];1592float dn = axis[0] * normals[i][0] + axis[1] * normals[i][1] + axis[2] * normals[i][2];15931594// dn should be larger than mindp cutoff above1595assert(dn > 0.f);1596float t = dc / dn;15971598maxt = (t > maxt) ? t : maxt;1599}16001601// cone apex should be in the negative half-space of all cluster triangles by construction1602bounds.cone_apex[0] = center[0] - axis[0] * maxt;1603bounds.cone_apex[1] = center[1] - axis[1] * maxt;1604bounds.cone_apex[2] = center[2] - axis[2] * maxt;16051606// note: this axis is the axis of the normal cone, but our test for perspective camera effectively negates the axis1607bounds.cone_axis[0] = axis[0];1608bounds.cone_axis[1] = axis[1];1609bounds.cone_axis[2] = axis[2];16101611// cos(a) for normal cone is mindp; we need to add 90 degrees on both sides and invert the cone1612// which gives us -cos(a+90) = -(-sin(a)) = sin(a) = sqrt(1 - cos^2(a))1613bounds.cone_cutoff = sqrtf(1 - mindp * mindp);16141615// quantize axis & cutoff to 8-bit SNORM format1616bounds.cone_axis_s8[0] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[0], 8));1617bounds.cone_axis_s8[1] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[1], 8));1618bounds.cone_axis_s8[2] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[2], 8));16191620// for the 8-bit test to be conservative, we need to adjust the cutoff by measuring the max. error1621float cone_axis_s8_e0 = fabsf(bounds.cone_axis_s8[0] / 127.f - bounds.cone_axis[0]);1622float cone_axis_s8_e1 = fabsf(bounds.cone_axis_s8[1] / 127.f - bounds.cone_axis[1]);1623float cone_axis_s8_e2 = fabsf(bounds.cone_axis_s8[2] / 127.f - bounds.cone_axis[2]);16241625// note that we need to round this up instead of rounding to nearest, hence +11626int cone_cutoff_s8 = int(127 * (bounds.cone_cutoff + cone_axis_s8_e0 + cone_axis_s8_e1 + cone_axis_s8_e2) + 1);16271628bounds.cone_cutoff_s8 = (cone_cutoff_s8 > 127) ? 127 : (signed char)(cone_cutoff_s8);16291630return bounds;1631}16321633meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)1634{1635using namespace meshopt;16361637assert(triangle_count <= kMeshletMaxTriangles);1638assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);1639assert(vertex_positions_stride % sizeof(float) == 0);16401641unsigned int indices[kMeshletMaxTriangles * 3];16421643for (size_t i = 0; i < triangle_count * 3; ++i)1644{1645unsigned int index = meshlet_vertices[meshlet_triangles[i]];1646assert(index < vertex_count);16471648indices[i] = index;1649}16501651return meshopt_computeClusterBounds(indices, triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride);1652}16531654meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride)1655{1656using namespace meshopt;16571658assert(positions_stride >= 12 && positions_stride <= 256);1659assert(positions_stride % sizeof(float) == 0);1660assert((radii_stride >= 4 && radii_stride <= 256) || radii == NULL);1661assert(radii_stride % sizeof(float) == 0);16621663meshopt_Bounds bounds = {};16641665if (count == 0)1666return bounds;16671668const float rzero = 0.f;16691670float psphere[4] = {};1671computeBoundingSphere(psphere, positions, count, positions_stride, radii ? radii : &rzero, radii ? radii_stride : 0, 7);16721673bounds.center[0] = psphere[0];1674bounds.center[1] = psphere[1];1675bounds.center[2] = psphere[2];1676bounds.radius = psphere[3];16771678return bounds;1679}16801681void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count)1682{1683using namespace meshopt;16841685assert(triangle_count <= kMeshletMaxTriangles);1686assert(vertex_count <= kMeshletMaxVertices);16871688unsigned char* indices = meshlet_triangles;1689unsigned int* vertices = meshlet_vertices;16901691// cache tracks vertex timestamps (corresponding to triangle index! all 3 vertices are added at the same time and never removed)1692unsigned char cache[kMeshletMaxVertices];1693memset(cache, 0, vertex_count);16941695// note that we start from a value that means all vertices aren't in cache1696unsigned char cache_last = 128;1697const unsigned char cache_cutoff = 3; // 3 triangles = ~5..9 vertices depending on reuse16981699for (size_t i = 0; i < triangle_count; ++i)1700{1701int next = -1;1702int next_match = -1;17031704for (size_t j = i; j < triangle_count; ++j)1705{1706unsigned char a = indices[j * 3 + 0], b = indices[j * 3 + 1], c = indices[j * 3 + 2];1707assert(a < vertex_count && b < vertex_count && c < vertex_count);17081709// score each triangle by how many vertices are in cache1710// note: the distance is computed using unsigned 8-bit values, so cache timestamp overflow is handled gracefully1711int aok = (unsigned char)(cache_last - cache[a]) < cache_cutoff;1712int bok = (unsigned char)(cache_last - cache[b]) < cache_cutoff;1713int cok = (unsigned char)(cache_last - cache[c]) < cache_cutoff;17141715if (aok + bok + cok > next_match)1716{1717next = (int)j;1718next_match = aok + bok + cok;17191720// note that we could end up with all 3 vertices in the cache, but 2 is enough for ~strip traversal1721if (next_match >= 2)1722break;1723}1724}17251726assert(next >= 0);17271728unsigned char a = indices[next * 3 + 0], b = indices[next * 3 + 1], c = indices[next * 3 + 2];17291730// shift triangles before the next one forward so that we always keep an ordered partition1731// note: this could have swapped triangles [i] and [next] but that distorts the order and may skew the output sequence1732memmove(indices + (i + 1) * 3, indices + i * 3, (next - i) * 3 * sizeof(unsigned char));17331734indices[i * 3 + 0] = a;1735indices[i * 3 + 1] = b;1736indices[i * 3 + 2] = c;17371738// cache timestamp is the same between all vertices of each triangle to reduce overflow1739cache_last++;1740cache[a] = cache_last;1741cache[b] = cache_last;1742cache[c] = cache_last;1743}17441745// reorder meshlet vertices for access locality assuming index buffer is scanned sequentially1746unsigned int order[kMeshletMaxVertices];17471748short remap[kMeshletMaxVertices];1749memset(remap, -1, vertex_count * sizeof(short));17501751size_t vertex_offset = 0;17521753for (size_t i = 0; i < triangle_count * 3; ++i)1754{1755short& r = remap[indices[i]];17561757if (r < 0)1758{1759r = short(vertex_offset);1760order[vertex_offset] = vertices[indices[i]];1761vertex_offset++;1762}17631764indices[i] = (unsigned char)r;1765}17661767assert(vertex_offset <= vertex_count);1768memcpy(vertices, order, vertex_offset * sizeof(unsigned int));1769}17701771#undef SIMD_SSE1772#undef SIMD_NEON177317741775