Path: blob/master/thirdparty/meshoptimizer/clusterizer.cpp
9903 views
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details1#include "meshoptimizer.h"23#include <assert.h>4#include <float.h>5#include <math.h>6#include <string.h>78// This work is based on:9// Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 201610// Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 201611// Jack Ritter. An Efficient Bounding Sphere. 199012// Thomas Larsson. Fast and Tight Fitting Bounding Spheres. 200813// Ingo Wald, Vlastimil Havran. On building fast kd-Trees for Ray Tracing, and on doing that in O(N log N). 200614namespace meshopt15{1617// This must be <= 256 since meshlet indices are stored as bytes18const size_t kMeshletMaxVertices = 256;1920// A reasonable limit is around 2*max_vertices or less21const size_t kMeshletMaxTriangles = 512;2223// We keep a limited number of seed triangles and add a few triangles per finished meshlet24const size_t kMeshletMaxSeeds = 256;25const size_t kMeshletAddSeeds = 4;2627// To avoid excessive recursion for malformed inputs, we limit the maximum depth of the tree28const int kMeshletMaxTreeDepth = 50;2930struct TriangleAdjacency231{32unsigned int* counts;33unsigned int* offsets;34unsigned int* data;35};3637static void buildTriangleAdjacency(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)38{39size_t face_count = index_count / 3;4041// allocate arrays42adjacency.counts = allocator.allocate<unsigned int>(vertex_count);43adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);44adjacency.data = allocator.allocate<unsigned int>(index_count);4546// fill triangle counts47memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));4849for (size_t i = 0; i < index_count; ++i)50{51assert(indices[i] < vertex_count);5253adjacency.counts[indices[i]]++;54}5556// fill offset table57unsigned int offset = 0;5859for (size_t i = 0; i < vertex_count; ++i)60{61adjacency.offsets[i] = offset;62offset += adjacency.counts[i];63}6465assert(offset == index_count);6667// fill triangle data68for (size_t i = 0; i < face_count; ++i)69{70unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];7172adjacency.data[adjacency.offsets[a]++] = unsigned(i);73adjacency.data[adjacency.offsets[b]++] = unsigned(i);74adjacency.data[adjacency.offsets[c]++] = unsigned(i);75}7677// fix offsets that have been disturbed by the previous pass78for (size_t i = 0; i < vertex_count; ++i)79{80assert(adjacency.offsets[i] >= adjacency.counts[i]);81adjacency.offsets[i] -= adjacency.counts[i];82}83}8485static void buildTriangleAdjacencySparse(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)86{87size_t face_count = index_count / 3;8889// sparse mode can build adjacency more quickly by ignoring unused vertices, using a bit to mark visited vertices90const unsigned int sparse_seen = 1u << 31;91assert(index_count < sparse_seen);9293// allocate arrays94adjacency.counts = allocator.allocate<unsigned int>(vertex_count);95adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);96adjacency.data = allocator.allocate<unsigned int>(index_count);9798// fill triangle counts99for (size_t i = 0; i < index_count; ++i)100assert(indices[i] < vertex_count);101102for (size_t i = 0; i < index_count; ++i)103adjacency.counts[indices[i]] = 0;104105for (size_t i = 0; i < index_count; ++i)106adjacency.counts[indices[i]]++;107108// fill offset table; uses sparse_seen bit to tag visited vertices109unsigned int offset = 0;110111for (size_t i = 0; i < index_count; ++i)112{113unsigned int v = indices[i];114115if ((adjacency.counts[v] & sparse_seen) == 0)116{117adjacency.offsets[v] = offset;118offset += adjacency.counts[v];119adjacency.counts[v] |= sparse_seen;120}121}122123assert(offset == index_count);124125// fill triangle data126for (size_t i = 0; i < face_count; ++i)127{128unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];129130adjacency.data[adjacency.offsets[a]++] = unsigned(i);131adjacency.data[adjacency.offsets[b]++] = unsigned(i);132adjacency.data[adjacency.offsets[c]++] = unsigned(i);133}134135// fix offsets that have been disturbed by the previous pass136// also fix counts (that were marked with sparse_seen by the first pass)137for (size_t i = 0; i < index_count; ++i)138{139unsigned int v = indices[i];140141if (adjacency.counts[v] & sparse_seen)142{143adjacency.counts[v] &= ~sparse_seen;144145assert(adjacency.offsets[v] >= adjacency.counts[v]);146adjacency.offsets[v] -= adjacency.counts[v];147}148}149}150151static void computeBoundingSphere(float result[4], const float* points, size_t count, size_t points_stride, const float* radii, size_t radii_stride, size_t axis_count)152{153static const float kAxes[7][3] = {154// X, Y, Z155{1, 0, 0},156{0, 1, 0},157{0, 0, 1},158159// XYZ, -XYZ, X-YZ, XY-Z; normalized to unit length160{0.57735026f, 0.57735026f, 0.57735026f},161{-0.57735026f, 0.57735026f, 0.57735026f},162{0.57735026f, -0.57735026f, 0.57735026f},163{0.57735026f, 0.57735026f, -0.57735026f},164};165166assert(count > 0);167assert(axis_count <= sizeof(kAxes) / sizeof(kAxes[0]));168169size_t points_stride_float = points_stride / sizeof(float);170size_t radii_stride_float = radii_stride / sizeof(float);171172// find extremum points along all axes; for each axis we get a pair of points with min/max coordinates173size_t pmin[7], pmax[7];174float tmin[7], tmax[7];175176for (size_t axis = 0; axis < axis_count; ++axis)177{178pmin[axis] = pmax[axis] = 0;179tmin[axis] = FLT_MAX;180tmax[axis] = -FLT_MAX;181}182183for (size_t i = 0; i < count; ++i)184{185const float* p = points + i * points_stride_float;186float r = radii[i * radii_stride_float];187188for (size_t axis = 0; axis < axis_count; ++axis)189{190const float* ax = kAxes[axis];191192float tp = ax[0] * p[0] + ax[1] * p[1] + ax[2] * p[2];193float tpmin = tp - r, tpmax = tp + r;194195pmin[axis] = (tpmin < tmin[axis]) ? i : pmin[axis];196pmax[axis] = (tpmax > tmax[axis]) ? i : pmax[axis];197tmin[axis] = (tpmin < tmin[axis]) ? tpmin : tmin[axis];198tmax[axis] = (tpmax > tmax[axis]) ? tpmax : tmax[axis];199}200}201202// find the pair of points with largest distance203size_t paxis = 0;204float paxisdr = 0;205206for (size_t axis = 0; axis < axis_count; ++axis)207{208const float* p1 = points + pmin[axis] * points_stride_float;209const float* p2 = points + pmax[axis] * points_stride_float;210float r1 = radii[pmin[axis] * radii_stride_float];211float r2 = radii[pmax[axis] * radii_stride_float];212213float d2 = (p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]);214float dr = sqrtf(d2) + r1 + r2;215216if (dr > paxisdr)217{218paxisdr = dr;219paxis = axis;220}221}222223// use the longest segment as the initial sphere diameter224const float* p1 = points + pmin[paxis] * points_stride_float;225const float* p2 = points + pmax[paxis] * points_stride_float;226float r1 = radii[pmin[paxis] * radii_stride_float];227float r2 = radii[pmax[paxis] * radii_stride_float];228229float paxisd = sqrtf((p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]));230float paxisk = paxisd > 0 ? (paxisd + r2 - r1) / (2 * paxisd) : 0.f;231232float center[3] = {p1[0] + (p2[0] - p1[0]) * paxisk, p1[1] + (p2[1] - p1[1]) * paxisk, p1[2] + (p2[2] - p1[2]) * paxisk};233float radius = paxisdr / 2;234235// iteratively adjust the sphere up until all points fit236for (size_t i = 0; i < count; ++i)237{238const float* p = points + i * points_stride_float;239float r = radii[i * radii_stride_float];240241float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]);242float d = sqrtf(d2);243244if (d + r > radius)245{246float k = d > 0 ? (d + r - radius) / (2 * d) : 0.f;247248center[0] += k * (p[0] - center[0]);249center[1] += k * (p[1] - center[1]);250center[2] += k * (p[2] - center[2]);251radius = (radius + d + r) / 2;252}253}254255result[0] = center[0];256result[1] = center[1];257result[2] = center[2];258result[3] = radius;259}260261struct Cone262{263float px, py, pz;264float nx, ny, nz;265};266267static float getDistance(float dx, float dy, float dz, bool aa)268{269if (!aa)270return sqrtf(dx * dx + dy * dy + dz * dz);271272float rx = fabsf(dx), ry = fabsf(dy), rz = fabsf(dz);273float rxy = rx > ry ? rx : ry;274return rxy > rz ? rxy : rz;275}276277static float getMeshletScore(float distance, float spread, float cone_weight, float expected_radius)278{279if (cone_weight < 0)280return 1 + distance / expected_radius;281282float cone = 1.f - spread * cone_weight;283float cone_clamped = cone < 1e-3f ? 1e-3f : cone;284285return (1 + distance / expected_radius * (1 - cone_weight)) * cone_clamped;286}287288static Cone getMeshletCone(const Cone& acc, unsigned int triangle_count)289{290Cone result = acc;291292float center_scale = triangle_count == 0 ? 0.f : 1.f / float(triangle_count);293294result.px *= center_scale;295result.py *= center_scale;296result.pz *= center_scale;297298float axis_length = result.nx * result.nx + result.ny * result.ny + result.nz * result.nz;299float axis_scale = axis_length == 0.f ? 0.f : 1.f / sqrtf(axis_length);300301result.nx *= axis_scale;302result.ny *= axis_scale;303result.nz *= axis_scale;304305return result;306}307308static float computeTriangleCones(Cone* triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)309{310(void)vertex_count;311312size_t vertex_stride_float = vertex_positions_stride / sizeof(float);313size_t face_count = index_count / 3;314315float mesh_area = 0;316317for (size_t i = 0; i < face_count; ++i)318{319unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];320assert(a < vertex_count && b < vertex_count && c < vertex_count);321322const float* p0 = vertex_positions + vertex_stride_float * a;323const float* p1 = vertex_positions + vertex_stride_float * b;324const float* p2 = vertex_positions + vertex_stride_float * c;325326float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};327float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};328329float normalx = p10[1] * p20[2] - p10[2] * p20[1];330float normaly = p10[2] * p20[0] - p10[0] * p20[2];331float normalz = p10[0] * p20[1] - p10[1] * p20[0];332333float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);334float invarea = (area == 0.f) ? 0.f : 1.f / area;335336triangles[i].px = (p0[0] + p1[0] + p2[0]) / 3.f;337triangles[i].py = (p0[1] + p1[1] + p2[1]) / 3.f;338triangles[i].pz = (p0[2] + p1[2] + p2[2]) / 3.f;339340triangles[i].nx = normalx * invarea;341triangles[i].ny = normaly * invarea;342triangles[i].nz = normalz * invarea;343344mesh_area += area;345}346347return mesh_area;348}349350static void finishMeshlet(meshopt_Meshlet& meshlet, unsigned char* meshlet_triangles)351{352size_t offset = meshlet.triangle_offset + meshlet.triangle_count * 3;353354// fill 4b padding with 0355while (offset & 3)356meshlet_triangles[offset++] = 0;357}358359static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, short* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles, bool split = false)360{361short& av = used[a];362short& bv = used[b];363short& cv = used[c];364365bool result = false;366367int used_extra = (av < 0) + (bv < 0) + (cv < 0);368369if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles || split)370{371meshlets[meshlet_offset] = meshlet;372373for (size_t j = 0; j < meshlet.vertex_count; ++j)374used[meshlet_vertices[meshlet.vertex_offset + j]] = -1;375376finishMeshlet(meshlet, meshlet_triangles);377378meshlet.vertex_offset += meshlet.vertex_count;379meshlet.triangle_offset += (meshlet.triangle_count * 3 + 3) & ~3; // 4b padding380meshlet.vertex_count = 0;381meshlet.triangle_count = 0;382383result = true;384}385386if (av < 0)387{388av = short(meshlet.vertex_count);389meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = a;390}391392if (bv < 0)393{394bv = short(meshlet.vertex_count);395meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = b;396}397398if (cv < 0)399{400cv = short(meshlet.vertex_count);401meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = c;402}403404meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 0] = (unsigned char)av;405meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 1] = (unsigned char)bv;406meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 2] = (unsigned char)cv;407meshlet.triangle_count++;408409return result;410}411412static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone& meshlet_cone, const unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const short* used, float meshlet_expected_radius, float cone_weight)413{414unsigned int best_triangle = ~0u;415int best_priority = 5;416float best_score = FLT_MAX;417418for (size_t i = 0; i < meshlet.vertex_count; ++i)419{420unsigned int index = meshlet_vertices[meshlet.vertex_offset + i];421422unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];423size_t neighbors_size = adjacency.counts[index];424425for (size_t j = 0; j < neighbors_size; ++j)426{427unsigned int triangle = neighbors[j];428unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];429430int extra = (used[a] < 0) + (used[b] < 0) + (used[c] < 0);431assert(extra <= 2);432433int priority = -1;434435// triangles that don't add new vertices to meshlets are max. priority436if (extra == 0)437priority = 0;438// artificially increase the priority of dangling triangles as they're expensive to add to new meshlets439else if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1)440priority = 1;441// if two vertices have live count of 2, removing this triangle will make another triangle dangling which is good for overall flow442else if ((live_triangles[a] == 2) + (live_triangles[b] == 2) + (live_triangles[c] == 2) >= 2)443priority = 1 + extra;444// otherwise adjust priority to be after the above cases, 3 or 4 based on used[] count445else446priority = 2 + extra;447448// since topology-based priority is always more important than the score, we can skip scoring in some cases449if (priority > best_priority)450continue;451452const Cone& tri_cone = triangles[triangle];453454float dx = tri_cone.px - meshlet_cone.px, dy = tri_cone.py - meshlet_cone.py, dz = tri_cone.pz - meshlet_cone.pz;455float distance = getDistance(dx, dy, dz, cone_weight < 0);456float spread = tri_cone.nx * meshlet_cone.nx + tri_cone.ny * meshlet_cone.ny + tri_cone.nz * meshlet_cone.nz;457458float score = getMeshletScore(distance, spread, cone_weight, meshlet_expected_radius);459460// note that topology-based priority is always more important than the score461// this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost462if (priority < best_priority || score < best_score)463{464best_triangle = triangle;465best_priority = priority;466best_score = score;467}468}469}470471return best_triangle;472}473474static size_t appendSeedTriangles(unsigned int* seeds, const meshopt_Meshlet& meshlet, const unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, float cornerx, float cornery, float cornerz)475{476unsigned int best_seeds[kMeshletAddSeeds];477unsigned int best_live[kMeshletAddSeeds];478float best_score[kMeshletAddSeeds];479480for (size_t i = 0; i < kMeshletAddSeeds; ++i)481{482best_seeds[i] = ~0u;483best_live[i] = ~0u;484best_score[i] = FLT_MAX;485}486487for (size_t i = 0; i < meshlet.vertex_count; ++i)488{489unsigned int index = meshlet_vertices[meshlet.vertex_offset + i];490491unsigned int best_neighbor = ~0u;492unsigned int best_neighbor_live = ~0u;493494// find the neighbor with the smallest live metric495unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];496size_t neighbors_size = adjacency.counts[index];497498for (size_t j = 0; j < neighbors_size; ++j)499{500unsigned int triangle = neighbors[j];501unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];502503unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c];504505if (live < best_neighbor_live)506{507best_neighbor = triangle;508best_neighbor_live = live;509}510}511512// add the neighbor to the list of seeds; the list is unsorted and the replacement criteria is approximate513if (best_neighbor == ~0u)514continue;515516float best_neighbor_score = getDistance(triangles[best_neighbor].px - cornerx, triangles[best_neighbor].py - cornery, triangles[best_neighbor].pz - cornerz, false);517518for (size_t j = 0; j < kMeshletAddSeeds; ++j)519{520// non-strict comparison reduces the number of duplicate seeds (triangles adjacent to multiple vertices)521if (best_neighbor_live < best_live[j] || (best_neighbor_live == best_live[j] && best_neighbor_score <= best_score[j]))522{523best_seeds[j] = best_neighbor;524best_live[j] = best_neighbor_live;525best_score[j] = best_neighbor_score;526break;527}528}529}530531// add surviving seeds to the meshlet532size_t seed_count = 0;533534for (size_t i = 0; i < kMeshletAddSeeds; ++i)535if (best_seeds[i] != ~0u)536seeds[seed_count++] = best_seeds[i];537538return seed_count;539}540541static size_t pruneSeedTriangles(unsigned int* seeds, size_t seed_count, const unsigned char* emitted_flags)542{543size_t result = 0;544545for (size_t i = 0; i < seed_count; ++i)546{547unsigned int index = seeds[i];548549seeds[result] = index;550result += emitted_flags[index] == 0;551}552553return result;554}555556static unsigned int selectSeedTriangle(const unsigned int* seeds, size_t seed_count, const unsigned int* indices, const Cone* triangles, const unsigned int* live_triangles, float cornerx, float cornery, float cornerz)557{558unsigned int best_seed = ~0u;559unsigned int best_live = ~0u;560float best_score = FLT_MAX;561562for (size_t i = 0; i < seed_count; ++i)563{564unsigned int index = seeds[i];565unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];566567unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c];568float score = getDistance(triangles[index].px - cornerx, triangles[index].py - cornery, triangles[index].pz - cornerz, false);569570if (live < best_live || (live == best_live && score < best_score))571{572best_seed = index;573best_live = live;574best_score = score;575}576}577578return best_seed;579}580581struct KDNode582{583union584{585float split;586unsigned int index;587};588589// leaves: axis = 3, children = number of extra points after this one (0 if 'index' is the only point)590// branches: axis != 3, left subtree = skip 1, right subtree = skip 1+children591unsigned int axis : 2;592unsigned int children : 30;593};594595static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, unsigned int axis, float pivot)596{597size_t m = 0;598599// invariant: elements in range [0, m) are < pivot, elements in range [m, i) are >= pivot600for (size_t i = 0; i < count; ++i)601{602float v = points[indices[i] * stride + axis];603604// swap(m, i) unconditionally605unsigned int t = indices[m];606indices[m] = indices[i];607indices[i] = t;608609// when v >= pivot, we swap i with m without advancing it, preserving invariants610m += v < pivot;611}612613return m;614}615616static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, unsigned int* indices, size_t count)617{618assert(offset + count <= node_count);619(void)node_count;620621KDNode& result = nodes[offset];622623result.index = indices[0];624result.axis = 3;625result.children = unsigned(count - 1);626627// all remaining points are stored in nodes immediately following the leaf628for (size_t i = 1; i < count; ++i)629{630KDNode& tail = nodes[offset + i];631632tail.index = indices[i];633tail.axis = 3;634tail.children = ~0u >> 2; // bogus value to prevent misuse635}636637return offset + count;638}639640static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size)641{642assert(count > 0);643assert(offset < node_count);644645if (count <= leaf_size)646return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);647648float mean[3] = {};649float vars[3] = {};650float runc = 1, runs = 1;651652// gather statistics on the points in the subtree using Welford's algorithm653for (size_t i = 0; i < count; ++i, runc += 1.f, runs = 1.f / runc)654{655const float* point = points + indices[i] * stride;656657for (int k = 0; k < 3; ++k)658{659float delta = point[k] - mean[k];660mean[k] += delta * runs;661vars[k] += delta * (point[k] - mean[k]);662}663}664665// split axis is one where the variance is largest666unsigned int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2);667668float split = mean[axis];669size_t middle = kdtreePartition(indices, count, points, stride, axis, split);670671// when the partition is degenerate simply consolidate the points into a single node672if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2)673return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);674675KDNode& result = nodes[offset];676677result.split = split;678result.axis = axis;679680// left subtree is right after our node681size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size);682683// distance to the right subtree is represented explicitly684result.children = unsigned(next_offset - offset - 1);685686return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size);687}688689static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, bool aa, unsigned int& result, float& limit)690{691const KDNode& node = nodes[root];692693if (node.axis == 3)694{695// leaf696for (unsigned int i = 0; i <= node.children; ++i)697{698unsigned int index = nodes[root + i].index;699700if (emitted_flags[index])701continue;702703const float* point = points + index * stride;704705float dx = point[0] - position[0], dy = point[1] - position[1], dz = point[2] - position[2];706float distance = getDistance(dx, dy, dz, aa);707708if (distance < limit)709{710result = index;711limit = distance;712}713}714}715else716{717// branch; we order recursion to process the node that search position is in first718float delta = position[node.axis] - node.split;719unsigned int first = (delta <= 0) ? 0 : node.children;720unsigned int second = first ^ node.children;721722kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, aa, result, limit);723724// only process the other node if it can have a match based on closest distance so far725if (fabsf(delta) <= limit)726kdtreeNearest(nodes, root + 1 + second, points, stride, emitted_flags, position, aa, result, limit);727}728}729730struct BVHBox731{732float min[3];733float max[3];734};735736static void boxMerge(BVHBox& box, const BVHBox& other)737{738for (int k = 0; k < 3; ++k)739{740box.min[k] = other.min[k] < box.min[k] ? other.min[k] : box.min[k];741box.max[k] = other.max[k] > box.max[k] ? other.max[k] : box.max[k];742}743}744745inline float boxSurface(const BVHBox& box)746{747float sx = box.max[0] - box.min[0], sy = box.max[1] - box.min[1], sz = box.max[2] - box.min[2];748return sx * sy + sx * sz + sy * sz;749}750751inline unsigned int radixFloat(unsigned int v)752{753// if sign bit is 0, flip sign bit754// if sign bit is 1, flip everything755unsigned int mask = (int(v) >> 31) | 0x80000000;756return v ^ mask;757}758759static void computeHistogram(unsigned int (&hist)[1024][3], const float* data, size_t count)760{761memset(hist, 0, sizeof(hist));762763const unsigned int* bits = reinterpret_cast<const unsigned int*>(data);764765// compute 3 10-bit histograms in parallel (dropping 2 LSB)766for (size_t i = 0; i < count; ++i)767{768unsigned int id = radixFloat(bits[i]);769770hist[(id >> 2) & 1023][0]++;771hist[(id >> 12) & 1023][1]++;772hist[(id >> 22) & 1023][2]++;773}774775unsigned int sum0 = 0, sum1 = 0, sum2 = 0;776777// replace histogram data with prefix histogram sums in-place778for (int i = 0; i < 1024; ++i)779{780unsigned int hx = hist[i][0], hy = hist[i][1], hz = hist[i][2];781782hist[i][0] = sum0;783hist[i][1] = sum1;784hist[i][2] = sum2;785786sum0 += hx;787sum1 += hy;788sum2 += hz;789}790791assert(sum0 == count && sum1 == count && sum2 == count);792}793794static void radixPass(unsigned int* destination, const unsigned int* source, const float* keys, size_t count, unsigned int (&hist)[1024][3], int pass)795{796const unsigned int* bits = reinterpret_cast<const unsigned int*>(keys);797int bitoff = pass * 10 + 2; // drop 2 LSB to be able to use 3 10-bit passes798799for (size_t i = 0; i < count; ++i)800{801unsigned int id = (radixFloat(bits[source[i]]) >> bitoff) & 1023;802803destination[hist[id][pass]++] = source[i];804}805}806807static void bvhPrepare(BVHBox* boxes, float* centroids, const unsigned int* indices, size_t face_count, const float* vertex_positions, size_t vertex_count, size_t vertex_stride_float)808{809(void)vertex_count;810811for (size_t i = 0; i < face_count; ++i)812{813unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];814assert(a < vertex_count && b < vertex_count && c < vertex_count);815816const float* va = vertex_positions + vertex_stride_float * a;817const float* vb = vertex_positions + vertex_stride_float * b;818const float* vc = vertex_positions + vertex_stride_float * c;819820BVHBox& box = boxes[i];821822for (int k = 0; k < 3; ++k)823{824box.min[k] = va[k] < vb[k] ? va[k] : vb[k];825box.min[k] = vc[k] < box.min[k] ? vc[k] : box.min[k];826827box.max[k] = va[k] > vb[k] ? va[k] : vb[k];828box.max[k] = vc[k] > box.max[k] ? vc[k] : box.max[k];829830centroids[i + face_count * k] = (box.min[k] + box.max[k]) / 2.f;831}832}833}834835static bool bvhPackLeaf(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices)836{837// count number of unique vertices838size_t used_vertices = 0;839for (size_t i = 0; i < count; ++i)840{841unsigned int index = order[i];842unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];843844used_vertices += (used[a] < 0) + (used[b] < 0) + (used[c] < 0);845used[a] = used[b] = used[c] = 1;846}847848// reset used[] for future invocations849for (size_t i = 0; i < count; ++i)850{851unsigned int index = order[i];852unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];853854used[a] = used[b] = used[c] = -1;855}856857if (used_vertices > max_vertices)858return false;859860// mark meshlet boundary for future reassembly861assert(count > 0);862863boundary[0] = 1;864memset(boundary + 1, 0, count - 1);865866return true;867}868869static void bvhPackTail(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices, size_t max_triangles)870{871for (size_t i = 0; i < count;)872{873size_t chunk = i + max_triangles <= count ? max_triangles : count - i;874875if (bvhPackLeaf(boundary + i, order + i, chunk, used, indices, max_vertices))876{877i += chunk;878continue;879}880881// chunk is vertex bound, split it into smaller meshlets882assert(chunk > max_vertices / 3);883884bvhPackLeaf(boundary + i, order + i, max_vertices / 3, used, indices, max_vertices);885i += max_vertices / 3;886}887}888889static bool bvhDivisible(size_t count, size_t min, size_t max)890{891// count is representable as a sum of values in [min..max] if if it in range of [k*min..k*min+k*(max-min)]892// equivalent to ceil(count / max) <= floor(count / min), but the form below allows using idiv (see nv_cluster_builder)893// we avoid expensive integer divisions in the common case where min is <= max/2894return min * 2 <= max ? count >= min : count % min <= (count / min) * (max - min);895}896897static size_t bvhPivot(const BVHBox* boxes, const unsigned int* order, size_t count, void* scratch, size_t step, size_t min, size_t max, float fill, float* out_cost)898{899BVHBox accuml = boxes[order[0]], accumr = boxes[order[count - 1]];900float* costs = static_cast<float*>(scratch);901902// accumulate SAH cost in forward and backward directions903for (size_t i = 0; i < count; ++i)904{905boxMerge(accuml, boxes[order[i]]);906boxMerge(accumr, boxes[order[count - 1 - i]]);907908costs[i] = boxSurface(accuml);909costs[i + count] = boxSurface(accumr);910}911912bool aligned = count >= min * 2 && bvhDivisible(count, min, max);913size_t end = aligned ? count - min : count - 1;914915float rmaxf = 1.f / float(int(max));916917// find best split that minimizes SAH918size_t bestsplit = 0;919float bestcost = FLT_MAX;920921for (size_t i = min - 1; i < end; i += step)922{923size_t lsplit = i + 1, rsplit = count - (i + 1);924925if (!bvhDivisible(lsplit, min, max))926continue;927if (aligned && !bvhDivisible(rsplit, min, max))928continue;929930// costs[x] = inclusive surface area of boxes[0..x]931// costs[count-1-x] = inclusive surface area of boxes[x..count-1]932float larea = costs[i], rarea = costs[(count - 1 - (i + 1)) + count];933float cost = larea * float(int(lsplit)) + rarea * float(int(rsplit));934935if (cost > bestcost)936continue;937938// fill cost; use floating point math to avoid expensive integer modulo939int lrest = int(float(int(lsplit + max - 1)) * rmaxf) * int(max) - int(lsplit);940int rrest = int(float(int(rsplit + max - 1)) * rmaxf) * int(max) - int(rsplit);941942cost += fill * (float(lrest) * larea + float(rrest) * rarea);943944if (cost < bestcost)945{946bestcost = cost;947bestsplit = i + 1;948}949}950951*out_cost = bestcost;952return bestsplit;953}954955static void bvhPartition(unsigned int* target, const unsigned int* order, const unsigned char* sides, size_t split, size_t count)956{957size_t l = 0, r = split;958959for (size_t i = 0; i < count; ++i)960{961unsigned char side = sides[order[i]];962target[side ? r : l] = order[i];963l += 1;964l -= side;965r += side;966}967968assert(l == split && r == count);969}970971static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, unsigned char* boundary, size_t count, int depth, void* scratch, short* used, const unsigned int* indices, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)972{973if (depth >= kMeshletMaxTreeDepth)974return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles);975976if (count <= max_triangles && bvhPackLeaf(boundary, orderx, count, used, indices, max_vertices))977return;978979unsigned int* axes[3] = {orderx, ordery, orderz};980981// we can use step=1 unconditionally but to reduce the cost for min=max case we use step=max982size_t step = min_triangles == max_triangles && count > max_triangles ? max_triangles : 1;983984// if we could not pack the meshlet, we must be vertex bound985size_t mint = count <= max_triangles && max_vertices / 3 < min_triangles ? max_vertices / 3 : min_triangles;986987// only use fill weight if we are optimizing for triangle count988float fill = count <= max_triangles ? 0.f : fill_weight;989990// find best split that minimizes SAH991int bestk = -1;992size_t bestsplit = 0;993float bestcost = FLT_MAX;994995for (int k = 0; k < 3; ++k)996{997float axiscost = FLT_MAX;998size_t axissplit = bvhPivot(boxes, axes[k], count, scratch, step, mint, max_triangles, fill, &axiscost);9991000if (axissplit && axiscost < bestcost)1001{1002bestk = k;1003bestcost = axiscost;1004bestsplit = axissplit;1005}1006}10071008// this may happen if SAH costs along the admissible splits are NaN1009if (bestk < 0)1010return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles);10111012// mark sides of split for partitioning1013unsigned char* sides = static_cast<unsigned char*>(scratch) + count * sizeof(unsigned int);10141015for (size_t i = 0; i < bestsplit; ++i)1016sides[axes[bestk][i]] = 0;10171018for (size_t i = bestsplit; i < count; ++i)1019sides[axes[bestk][i]] = 1;10201021// partition all axes into two sides, maintaining order1022unsigned int* temp = static_cast<unsigned int*>(scratch);10231024for (int k = 0; k < 3; ++k)1025{1026if (k == bestk)1027continue;10281029unsigned int* axis = axes[k];1030memcpy(temp, axis, sizeof(unsigned int) * count);1031bvhPartition(axis, temp, sides, bestsplit, count);1032}10331034bvhSplit(boxes, orderx, ordery, orderz, boundary, bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);1035bvhSplit(boxes, orderx + bestsplit, ordery + bestsplit, orderz + bestsplit, boundary + bestsplit, count - bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);1036}10371038} // namespace meshopt10391040size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles)1041{1042using namespace meshopt;10431044assert(index_count % 3 == 0);1045assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);1046assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);1047assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned10481049(void)kMeshletMaxVertices;1050(void)kMeshletMaxTriangles;10511052// meshlet construction is limited by max vertices and max triangles per meshlet1053// the worst case is that the input is an unindexed stream since this equally stresses both limits1054// note that we assume that in the worst case, we leave 2 vertices unpacked in each meshlet - if we have space for 3 we can pack any triangle1055size_t max_vertices_conservative = max_vertices - 2;1056size_t meshlet_limit_vertices = (index_count + max_vertices_conservative - 1) / max_vertices_conservative;1057size_t meshlet_limit_triangles = (index_count / 3 + max_triangles - 1) / max_triangles;10581059return meshlet_limit_vertices > meshlet_limit_triangles ? meshlet_limit_vertices : meshlet_limit_triangles;1060}10611062size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor)1063{1064using namespace meshopt;10651066assert(index_count % 3 == 0);1067assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);1068assert(vertex_positions_stride % sizeof(float) == 0);10691070assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);1071assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles);1072assert(min_triangles % 4 == 0 && max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned10731074assert(cone_weight <= 1); // negative cone weight switches metric to optimize for axis-aligned meshlets1075assert(split_factor >= 0);10761077if (index_count == 0)1078return 0;10791080meshopt_Allocator allocator;10811082TriangleAdjacency2 adjacency = {};1083if (vertex_count > index_count && index_count < (1u << 31))1084buildTriangleAdjacencySparse(adjacency, indices, index_count, vertex_count, allocator);1085else1086buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);10871088// live triangle counts; note, we alias adjacency.counts as we remove triangles after emitting them so the counts always match1089unsigned int* live_triangles = adjacency.counts;10901091size_t face_count = index_count / 3;10921093unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);1094memset(emitted_flags, 0, face_count);10951096// for each triangle, precompute centroid & normal to use for scoring1097Cone* triangles = allocator.allocate<Cone>(face_count);1098float mesh_area = computeTriangleCones(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride);10991100// assuming each meshlet is a square patch, expected radius is sqrt(expected area)1101float triangle_area_avg = face_count == 0 ? 0.f : mesh_area / float(face_count) * 0.5f;1102float meshlet_expected_radius = sqrtf(triangle_area_avg * max_triangles) * 0.5f;11031104// build a kd-tree for nearest neighbor lookup1105unsigned int* kdindices = allocator.allocate<unsigned int>(face_count);1106for (size_t i = 0; i < face_count; ++i)1107kdindices[i] = unsigned(i);11081109KDNode* nodes = allocator.allocate<KDNode>(face_count * 2);1110kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8);11111112// find a specific corner of the mesh to use as a starting point for meshlet flow1113float cornerx = FLT_MAX, cornery = FLT_MAX, cornerz = FLT_MAX;11141115for (size_t i = 0; i < face_count; ++i)1116{1117const Cone& tri = triangles[i];11181119cornerx = cornerx > tri.px ? tri.px : cornerx;1120cornery = cornery > tri.py ? tri.py : cornery;1121cornerz = cornerz > tri.pz ? tri.pz : cornerz;1122}11231124// index of the vertex in the meshlet, -1 if the vertex isn't used1125short* used = allocator.allocate<short>(vertex_count);1126memset(used, -1, vertex_count * sizeof(short));11271128// initial seed triangle is the one closest to the corner1129unsigned int initial_seed = ~0u;1130float initial_score = FLT_MAX;11311132for (size_t i = 0; i < face_count; ++i)1133{1134const Cone& tri = triangles[i];11351136float score = getDistance(tri.px - cornerx, tri.py - cornery, tri.pz - cornerz, false);11371138if (initial_seed == ~0u || score < initial_score)1139{1140initial_seed = unsigned(i);1141initial_score = score;1142}1143}11441145// seed triangles to continue meshlet flow1146unsigned int seeds[kMeshletMaxSeeds] = {};1147size_t seed_count = 0;11481149meshopt_Meshlet meshlet = {};1150size_t meshlet_offset = 0;11511152Cone meshlet_cone_acc = {};11531154for (;;)1155{1156Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count);11571158unsigned int best_triangle = ~0u;11591160// for the first triangle, we don't have a meshlet cone yet, so we use the initial seed1161// to continue the meshlet, we select an adjacent triangle based on connectivity and spatial scoring1162if (meshlet_offset == 0 && meshlet.triangle_count == 0)1163best_triangle = initial_seed;1164else1165best_triangle = getNeighborTriangle(meshlet, meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight);11661167bool split = false;11681169// when we run out of adjacent triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity1170if (best_triangle == ~0u)1171{1172float position[3] = {meshlet_cone.px, meshlet_cone.py, meshlet_cone.pz};1173unsigned int index = ~0u;1174float distance = FLT_MAX;11751176kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, cone_weight < 0.f, index, distance);11771178best_triangle = index;1179split = meshlet.triangle_count >= min_triangles && split_factor > 0 && distance > meshlet_expected_radius * split_factor;1180}11811182if (best_triangle == ~0u)1183break;11841185int best_extra = (used[indices[best_triangle * 3 + 0]] < 0) + (used[indices[best_triangle * 3 + 1]] < 0) + (used[indices[best_triangle * 3 + 2]] < 0);11861187// if the best triangle doesn't fit into current meshlet, we re-select using seeds to maintain global flow1188if (split || (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles))1189{1190seed_count = pruneSeedTriangles(seeds, seed_count, emitted_flags);1191seed_count = (seed_count + kMeshletAddSeeds <= kMeshletMaxSeeds) ? seed_count : kMeshletMaxSeeds - kMeshletAddSeeds;1192seed_count += appendSeedTriangles(seeds + seed_count, meshlet, meshlet_vertices, indices, adjacency, triangles, live_triangles, cornerx, cornery, cornerz);11931194unsigned int best_seed = selectSeedTriangle(seeds, seed_count, indices, triangles, live_triangles, cornerx, cornery, cornerz);11951196// we may not find a valid seed triangle if the mesh is disconnected as seeds are based on adjacency1197best_triangle = best_seed != ~0u ? best_seed : best_triangle;1198}11991200unsigned int a = indices[best_triangle * 3 + 0], b = indices[best_triangle * 3 + 1], c = indices[best_triangle * 3 + 2];1201assert(a < vertex_count && b < vertex_count && c < vertex_count);12021203// add meshlet to the output; when the current meshlet is full we reset the accumulated bounds1204if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles, split))1205{1206meshlet_offset++;1207memset(&meshlet_cone_acc, 0, sizeof(meshlet_cone_acc));1208}12091210// remove emitted triangle from adjacency data1211// this makes sure that we spend less time traversing these lists on subsequent iterations1212// live triangle counts are updated as a byproduct of these adjustments1213for (size_t k = 0; k < 3; ++k)1214{1215unsigned int index = indices[best_triangle * 3 + k];12161217unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];1218size_t neighbors_size = adjacency.counts[index];12191220for (size_t i = 0; i < neighbors_size; ++i)1221{1222unsigned int tri = neighbors[i];12231224if (tri == best_triangle)1225{1226neighbors[i] = neighbors[neighbors_size - 1];1227adjacency.counts[index]--;1228break;1229}1230}1231}12321233// update aggregated meshlet cone data for scoring subsequent triangles1234meshlet_cone_acc.px += triangles[best_triangle].px;1235meshlet_cone_acc.py += triangles[best_triangle].py;1236meshlet_cone_acc.pz += triangles[best_triangle].pz;1237meshlet_cone_acc.nx += triangles[best_triangle].nx;1238meshlet_cone_acc.ny += triangles[best_triangle].ny;1239meshlet_cone_acc.nz += triangles[best_triangle].nz;12401241assert(!emitted_flags[best_triangle]);1242emitted_flags[best_triangle] = 1;1243}12441245if (meshlet.triangle_count)1246{1247finishMeshlet(meshlet, meshlet_triangles);12481249meshlets[meshlet_offset++] = meshlet;1250}12511252assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles));1253return meshlet_offset;1254}12551256size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)1257{1258assert(cone_weight >= 0); // to use negative cone weight, use meshopt_buildMeshletsFlex12591260return meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, max_triangles, max_triangles, cone_weight, 0.0f);1261}12621263size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)1264{1265using namespace meshopt;12661267assert(index_count % 3 == 0);12681269assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);1270assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);1271assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned12721273meshopt_Allocator allocator;12741275// index of the vertex in the meshlet, -1 if the vertex isn't used1276short* used = allocator.allocate<short>(vertex_count);1277memset(used, -1, vertex_count * sizeof(short));12781279meshopt_Meshlet meshlet = {};1280size_t meshlet_offset = 0;12811282for (size_t i = 0; i < index_count; i += 3)1283{1284unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];1285assert(a < vertex_count && b < vertex_count && c < vertex_count);12861287// appends triangle to the meshlet and writes previous meshlet to the output if full1288meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles);1289}12901291if (meshlet.triangle_count)1292{1293finishMeshlet(meshlet, meshlet_triangles);12941295meshlets[meshlet_offset++] = meshlet;1296}12971298assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));1299return meshlet_offset;1300}13011302size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)1303{1304using namespace meshopt;13051306assert(index_count % 3 == 0);1307assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);1308assert(vertex_positions_stride % sizeof(float) == 0);13091310assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);1311assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles);1312assert(min_triangles % 4 == 0 && max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned13131314if (index_count == 0)1315return 0;13161317size_t face_count = index_count / 3;1318size_t vertex_stride_float = vertex_positions_stride / sizeof(float);13191320meshopt_Allocator allocator;13211322// 3 floats plus 1 uint for sorting, or1323// 2 floats for SAH costs, or1324// 1 uint plus 1 byte for partitioning1325float* scratch = allocator.allocate<float>(face_count * 4);13261327// compute bounding boxes and centroids for sorting1328BVHBox* boxes = allocator.allocate<BVHBox>(face_count);1329bvhPrepare(boxes, scratch, indices, face_count, vertex_positions, vertex_count, vertex_stride_float);13301331unsigned int* axes = allocator.allocate<unsigned int>(face_count * 3);1332unsigned int* temp = reinterpret_cast<unsigned int*>(scratch) + face_count * 3;13331334for (int k = 0; k < 3; ++k)1335{1336unsigned int* order = axes + k * face_count;1337const float* keys = scratch + k * face_count;13381339unsigned int hist[1024][3];1340computeHistogram(hist, keys, face_count);13411342// 3-pass radix sort computes the resulting order into axes1343for (size_t i = 0; i < face_count; ++i)1344temp[i] = unsigned(i);13451346radixPass(order, temp, keys, face_count, hist, 0);1347radixPass(temp, order, keys, face_count, hist, 1);1348radixPass(order, temp, keys, face_count, hist, 2);1349}13501351// index of the vertex in the meshlet, -1 if the vertex isn't used1352short* used = allocator.allocate<short>(vertex_count);1353memset(used, -1, vertex_count * sizeof(short));13541355unsigned char* boundary = allocator.allocate<unsigned char>(face_count);13561357bvhSplit(boxes, &axes[0], &axes[face_count], &axes[face_count * 2], boundary, face_count, 0, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);13581359// compute the desired number of meshlets; note that on some meshes with a lot of vertex bound clusters this might go over the bound1360size_t meshlet_count = 0;1361for (size_t i = 0; i < face_count; ++i)1362{1363assert(boundary[i] <= 1);1364meshlet_count += boundary[i];1365}13661367size_t meshlet_bound = meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles);13681369// pack triangles into meshlets according to the order and boundaries marked by bvhSplit1370meshopt_Meshlet meshlet = {};1371size_t meshlet_offset = 0;1372size_t meshlet_pending = meshlet_count;13731374for (size_t i = 0; i < face_count; ++i)1375{1376assert(boundary[i] <= 1);1377bool split = i > 0 && boundary[i] == 1;13781379// while we are over the limit, we ignore boundary[] data and disable splits until we free up enough space1380if (split && meshlet_count > meshlet_bound && meshlet_offset + meshlet_pending >= meshlet_bound)1381split = false;13821383unsigned int index = axes[i];1384assert(index < face_count);13851386unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];13871388// appends triangle to the meshlet and writes previous meshlet to the output if full1389meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles, split);1390meshlet_pending -= boundary[i];1391}13921393if (meshlet.triangle_count)1394{1395finishMeshlet(meshlet, meshlet_triangles);13961397meshlets[meshlet_offset++] = meshlet;1398}13991400assert(meshlet_offset <= meshlet_bound);1401return meshlet_offset;1402}14031404meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)1405{1406using namespace meshopt;14071408assert(index_count % 3 == 0);1409assert(index_count / 3 <= kMeshletMaxTriangles);1410assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);1411assert(vertex_positions_stride % sizeof(float) == 0);14121413(void)vertex_count;14141415size_t vertex_stride_float = vertex_positions_stride / sizeof(float);14161417// compute triangle normals and gather triangle corners1418float normals[kMeshletMaxTriangles][3];1419float corners[kMeshletMaxTriangles][3][3];1420size_t triangles = 0;14211422for (size_t i = 0; i < index_count; i += 3)1423{1424unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];1425assert(a < vertex_count && b < vertex_count && c < vertex_count);14261427const float* p0 = vertex_positions + vertex_stride_float * a;1428const float* p1 = vertex_positions + vertex_stride_float * b;1429const float* p2 = vertex_positions + vertex_stride_float * c;14301431float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};1432float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};14331434float normalx = p10[1] * p20[2] - p10[2] * p20[1];1435float normaly = p10[2] * p20[0] - p10[0] * p20[2];1436float normalz = p10[0] * p20[1] - p10[1] * p20[0];14371438float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);14391440// no need to include degenerate triangles - they will be invisible anyway1441if (area == 0.f)1442continue;14431444// record triangle normals & corners for future use; normal and corner 0 define a plane equation1445normals[triangles][0] = normalx / area;1446normals[triangles][1] = normaly / area;1447normals[triangles][2] = normalz / area;1448memcpy(corners[triangles][0], p0, 3 * sizeof(float));1449memcpy(corners[triangles][1], p1, 3 * sizeof(float));1450memcpy(corners[triangles][2], p2, 3 * sizeof(float));1451triangles++;1452}14531454meshopt_Bounds bounds = {};14551456// degenerate cluster, no valid triangles => trivial reject (cone data is 0)1457if (triangles == 0)1458return bounds;14591460const float rzero = 0.f;14611462// compute cluster bounding sphere; we'll use the center to determine normal cone apex as well1463float psphere[4] = {};1464computeBoundingSphere(psphere, corners[0][0], triangles * 3, sizeof(float) * 3, &rzero, 0, 7);14651466float center[3] = {psphere[0], psphere[1], psphere[2]};14671468// treating triangle normals as points, find the bounding sphere - the sphere center determines the optimal cone axis1469float nsphere[4] = {};1470computeBoundingSphere(nsphere, normals[0], triangles, sizeof(float) * 3, &rzero, 0, 3);14711472float axis[3] = {nsphere[0], nsphere[1], nsphere[2]};1473float axislength = sqrtf(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]);1474float invaxislength = axislength == 0.f ? 0.f : 1.f / axislength;14751476axis[0] *= invaxislength;1477axis[1] *= invaxislength;1478axis[2] *= invaxislength;14791480// compute a tight cone around all normals, mindp = cos(angle/2)1481float mindp = 1.f;14821483for (size_t i = 0; i < triangles; ++i)1484{1485float dp = normals[i][0] * axis[0] + normals[i][1] * axis[1] + normals[i][2] * axis[2];14861487mindp = (dp < mindp) ? dp : mindp;1488}14891490// fill bounding sphere info; note that below we can return bounds without cone information for degenerate cones1491bounds.center[0] = center[0];1492bounds.center[1] = center[1];1493bounds.center[2] = center[2];1494bounds.radius = psphere[3];14951496// degenerate cluster, normal cone is larger than a hemisphere => trivial accept1497// note that if mindp is positive but close to 0, the triangle intersection code below gets less stable1498// we arbitrarily decide that if a normal cone is ~168 degrees wide or more, the cone isn't useful1499if (mindp <= 0.1f)1500{1501bounds.cone_cutoff = 1;1502bounds.cone_cutoff_s8 = 127;1503return bounds;1504}15051506float maxt = 0;15071508// we need to find the point on center-t*axis ray that lies in negative half-space of all triangles1509for (size_t i = 0; i < triangles; ++i)1510{1511// dot(center-t*axis-corner, trinormal) = 01512// dot(center-corner, trinormal) - t * dot(axis, trinormal) = 01513float cx = center[0] - corners[i][0][0];1514float cy = center[1] - corners[i][0][1];1515float cz = center[2] - corners[i][0][2];15161517float dc = cx * normals[i][0] + cy * normals[i][1] + cz * normals[i][2];1518float dn = axis[0] * normals[i][0] + axis[1] * normals[i][1] + axis[2] * normals[i][2];15191520// dn should be larger than mindp cutoff above1521assert(dn > 0.f);1522float t = dc / dn;15231524maxt = (t > maxt) ? t : maxt;1525}15261527// cone apex should be in the negative half-space of all cluster triangles by construction1528bounds.cone_apex[0] = center[0] - axis[0] * maxt;1529bounds.cone_apex[1] = center[1] - axis[1] * maxt;1530bounds.cone_apex[2] = center[2] - axis[2] * maxt;15311532// note: this axis is the axis of the normal cone, but our test for perspective camera effectively negates the axis1533bounds.cone_axis[0] = axis[0];1534bounds.cone_axis[1] = axis[1];1535bounds.cone_axis[2] = axis[2];15361537// cos(a) for normal cone is mindp; we need to add 90 degrees on both sides and invert the cone1538// which gives us -cos(a+90) = -(-sin(a)) = sin(a) = sqrt(1 - cos^2(a))1539bounds.cone_cutoff = sqrtf(1 - mindp * mindp);15401541// quantize axis & cutoff to 8-bit SNORM format1542bounds.cone_axis_s8[0] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[0], 8));1543bounds.cone_axis_s8[1] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[1], 8));1544bounds.cone_axis_s8[2] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[2], 8));15451546// for the 8-bit test to be conservative, we need to adjust the cutoff by measuring the max. error1547float cone_axis_s8_e0 = fabsf(bounds.cone_axis_s8[0] / 127.f - bounds.cone_axis[0]);1548float cone_axis_s8_e1 = fabsf(bounds.cone_axis_s8[1] / 127.f - bounds.cone_axis[1]);1549float cone_axis_s8_e2 = fabsf(bounds.cone_axis_s8[2] / 127.f - bounds.cone_axis[2]);15501551// note that we need to round this up instead of rounding to nearest, hence +11552int cone_cutoff_s8 = int(127 * (bounds.cone_cutoff + cone_axis_s8_e0 + cone_axis_s8_e1 + cone_axis_s8_e2) + 1);15531554bounds.cone_cutoff_s8 = (cone_cutoff_s8 > 127) ? 127 : (signed char)(cone_cutoff_s8);15551556return bounds;1557}15581559meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)1560{1561using namespace meshopt;15621563assert(triangle_count <= kMeshletMaxTriangles);1564assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);1565assert(vertex_positions_stride % sizeof(float) == 0);15661567unsigned int indices[kMeshletMaxTriangles * 3];15681569for (size_t i = 0; i < triangle_count * 3; ++i)1570{1571unsigned int index = meshlet_vertices[meshlet_triangles[i]];1572assert(index < vertex_count);15731574indices[i] = index;1575}15761577return meshopt_computeClusterBounds(indices, triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride);1578}15791580meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride)1581{1582using namespace meshopt;15831584assert(positions_stride >= 12 && positions_stride <= 256);1585assert(positions_stride % sizeof(float) == 0);1586assert((radii_stride >= 4 && radii_stride <= 256) || radii == NULL);1587assert(radii_stride % sizeof(float) == 0);15881589meshopt_Bounds bounds = {};15901591if (count == 0)1592return bounds;15931594const float rzero = 0.f;15951596float psphere[4] = {};1597computeBoundingSphere(psphere, positions, count, positions_stride, radii ? radii : &rzero, radii ? radii_stride : 0, 7);15981599bounds.center[0] = psphere[0];1600bounds.center[1] = psphere[1];1601bounds.center[2] = psphere[2];1602bounds.radius = psphere[3];16031604return bounds;1605}16061607void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count)1608{1609using namespace meshopt;16101611assert(triangle_count <= kMeshletMaxTriangles);1612assert(vertex_count <= kMeshletMaxVertices);16131614unsigned char* indices = meshlet_triangles;1615unsigned int* vertices = meshlet_vertices;16161617// cache tracks vertex timestamps (corresponding to triangle index! all 3 vertices are added at the same time and never removed)1618unsigned char cache[kMeshletMaxVertices];1619memset(cache, 0, vertex_count);16201621// note that we start from a value that means all vertices aren't in cache1622unsigned char cache_last = 128;1623const unsigned char cache_cutoff = 3; // 3 triangles = ~5..9 vertices depending on reuse16241625for (size_t i = 0; i < triangle_count; ++i)1626{1627int next = -1;1628int next_match = -1;16291630for (size_t j = i; j < triangle_count; ++j)1631{1632unsigned char a = indices[j * 3 + 0], b = indices[j * 3 + 1], c = indices[j * 3 + 2];1633assert(a < vertex_count && b < vertex_count && c < vertex_count);16341635// score each triangle by how many vertices are in cache1636// note: the distance is computed using unsigned 8-bit values, so cache timestamp overflow is handled gracefully1637int aok = (unsigned char)(cache_last - cache[a]) < cache_cutoff;1638int bok = (unsigned char)(cache_last - cache[b]) < cache_cutoff;1639int cok = (unsigned char)(cache_last - cache[c]) < cache_cutoff;16401641if (aok + bok + cok > next_match)1642{1643next = (int)j;1644next_match = aok + bok + cok;16451646// note that we could end up with all 3 vertices in the cache, but 2 is enough for ~strip traversal1647if (next_match >= 2)1648break;1649}1650}16511652assert(next >= 0);16531654unsigned char a = indices[next * 3 + 0], b = indices[next * 3 + 1], c = indices[next * 3 + 2];16551656// shift triangles before the next one forward so that we always keep an ordered partition1657// note: this could have swapped triangles [i] and [next] but that distorts the order and may skew the output sequence1658memmove(indices + (i + 1) * 3, indices + i * 3, (next - i) * 3 * sizeof(unsigned char));16591660indices[i * 3 + 0] = a;1661indices[i * 3 + 1] = b;1662indices[i * 3 + 2] = c;16631664// cache timestamp is the same between all vertices of each triangle to reduce overflow1665cache_last++;1666cache[a] = cache_last;1667cache[b] = cache_last;1668cache[c] = cache_last;1669}16701671// reorder meshlet vertices for access locality assuming index buffer is scanned sequentially1672unsigned int order[kMeshletMaxVertices];16731674short remap[kMeshletMaxVertices];1675memset(remap, -1, vertex_count * sizeof(short));16761677size_t vertex_offset = 0;16781679for (size_t i = 0; i < triangle_count * 3; ++i)1680{1681short& r = remap[indices[i]];16821683if (r < 0)1684{1685r = short(vertex_offset);1686order[vertex_offset] = vertices[indices[i]];1687vertex_offset++;1688}16891690indices[i] = (unsigned char)r;1691}16921693assert(vertex_offset <= vertex_count);1694memcpy(vertices, order, vertex_offset * sizeof(unsigned int));1695}169616971698