Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/meshoptimizer/clusterizer.cpp
20843 views
1
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
2
#include "meshoptimizer.h"
3
4
#include <assert.h>
5
#include <float.h>
6
#include <math.h>
7
#include <string.h>
8
9
// The block below auto-detects SIMD ISA that can be used on the target platform
10
#ifndef MESHOPTIMIZER_NO_SIMD
11
#if defined(__SSE2__) || (defined(_MSC_VER) && defined(_M_X64))
12
#define SIMD_SSE
13
#include <emmintrin.h>
14
#elif defined(__aarch64__) || (defined(_MSC_VER) && defined(_M_ARM64) && _MSC_VER >= 1922)
15
#define SIMD_NEON
16
#include <arm_neon.h>
17
#endif
18
#endif // !MESHOPTIMIZER_NO_SIMD
19
20
// This work is based on:
21
// Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016
22
// Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016
23
// Jack Ritter. An Efficient Bounding Sphere. 1990
24
// Thomas Larsson. Fast and Tight Fitting Bounding Spheres. 2008
25
// Ingo Wald, Vlastimil Havran. On building fast kd-Trees for Ray Tracing, and on doing that in O(N log N). 2006
26
namespace meshopt
27
{
28
29
// Hard cap on vertices per meshlet; meshlet-local vertex indices are stored as bytes, so this must stay <= 256
const size_t kMeshletMaxVertices = 256;

// Hard cap on triangles per meshlet; a reasonable limit is around 2*max_vertices or less
const size_t kMeshletMaxTriangles = 512;

// Seed triangle pool: a bounded number of seeds is kept, and a few are added per finished meshlet
const size_t kMeshletMaxSeeds = 256;
const size_t kMeshletAddSeeds = 4;

// Maximum spatial tree depth; bounds recursion for malformed inputs
const int kMeshletMaxTreeDepth = 50;
41
42
// Vertex -> triangle adjacency stored as three flat arrays (filled by buildTriangleAdjacency*)
struct TriangleAdjacency2
{
	// counts[v]: number of triangle corners that reference vertex v
	unsigned int* counts;
	// offsets[v]: index in data[] where the triangle list of vertex v starts
	unsigned int* offsets;
	// triangle indices grouped by vertex; one entry per index buffer element
	unsigned int* data;
};
48
49
// Builds vertex -> triangle adjacency covering every vertex in [0, vertex_count).
// All three arrays are obtained from `allocator`; on return, for each vertex v the range
// data[offsets[v] .. offsets[v] + counts[v]) holds the triangles that reference v.
static void buildTriangleAdjacency(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
{
	size_t face_count = index_count / 3;

	adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
	adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
	adjacency.data = allocator.allocate<unsigned int>(index_count);

	// pass 1: count how many triangle corners reference each vertex
	memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));

	for (size_t i = 0; i < index_count; ++i)
	{
		unsigned int v = indices[i];
		assert(v < vertex_count);

		adjacency.counts[v] += 1;
	}

	// pass 2: exclusive prefix sum of counts gives the start of each vertex's list
	unsigned int running = 0;

	for (size_t v = 0; v < vertex_count; ++v)
	{
		adjacency.offsets[v] = running;
		running += adjacency.counts[v];
	}

	assert(running == index_count);

	// pass 3: scatter triangle ids, using offsets[] as a moving write cursor
	for (size_t face = 0; face < face_count; ++face)
	{
		const unsigned int* tri = indices + face * 3;

		for (int k = 0; k < 3; ++k)
			adjacency.data[adjacency.offsets[tri[k]]++] = unsigned(face);
	}

	// pass 4: each cursor advanced by exactly counts[v]; rewind it back to the list start
	for (size_t v = 0; v < vertex_count; ++v)
	{
		assert(adjacency.offsets[v] >= adjacency.counts[v]);
		adjacency.offsets[v] -= adjacency.counts[v];
	}
}
96
97
// Sparse variant of adjacency construction: only entries of counts[]/offsets[] that belong to
// vertices referenced by `indices` are written; entries of unreferenced vertices are left untouched.
static void buildTriangleAdjacencySparse(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
{
	size_t face_count = index_count / 3;

	// the top bit of counts[] temporarily tags visited vertices, so real counts must fit in the low 31 bits
	const unsigned int sparse_seen = 1u << 31;
	assert(index_count < sparse_seen);

	adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
	adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
	adjacency.data = allocator.allocate<unsigned int>(index_count);

	// reset counts for referenced vertices only; this must finish before any counting starts
	for (size_t i = 0; i < index_count; ++i)
	{
		assert(indices[i] < vertex_count);
		adjacency.counts[indices[i]] = 0;
	}

	for (size_t i = 0; i < index_count; ++i)
		adjacency.counts[indices[i]] += 1;

	// assign list starts in order of first appearance; sparse_seen marks vertices already handled
	unsigned int running = 0;

	for (size_t i = 0; i < index_count; ++i)
	{
		unsigned int v = indices[i];
		unsigned int& count = adjacency.counts[v];

		if (count & sparse_seen)
			continue;

		adjacency.offsets[v] = running;
		running += count;
		count |= sparse_seen;
	}

	assert(running == index_count);

	// scatter triangle ids, using offsets[] as a moving write cursor
	for (size_t face = 0; face < face_count; ++face)
	{
		const unsigned int* tri = indices + face * 3;

		for (int k = 0; k < 3; ++k)
			adjacency.data[adjacency.offsets[tri[k]]++] = unsigned(face);
	}

	// rewind the cursors and strip the sparse_seen tag from counts; the tag also guarantees
	// that each referenced vertex is fixed up exactly once
	for (size_t i = 0; i < index_count; ++i)
	{
		unsigned int v = indices[i];
		unsigned int& count = adjacency.counts[v];

		if ((count & sparse_seen) == 0)
			continue;

		count &= ~sparse_seen;

		assert(adjacency.offsets[v] >= count);
		adjacency.offsets[v] -= count;
	}
}
162
163
// Resets used[] entries to -1, either for all vertices or only for the ones the index buffer references.
static void clearUsed(short* used, size_t vertex_count, const unsigned int* indices, size_t index_count)
{
	// when there are no more vertices than indices, one bulk fill of the whole array is used
	if (vertex_count <= index_count)
	{
		memset(used, -1, vertex_count * sizeof(short));
		return;
	}

	// for sparse inputs, it's faster to only touch vertices referenced by the index buffer
	for (const unsigned int* it = indices; it != indices + index_count; ++it)
	{
		assert(*it < vertex_count);
		used[*it] = -1;
	}
}
175
176
// Computes an approximate bounding sphere for `count` input spheres (center + radius each).
// points/radii are strided float arrays (strides given in bytes); axis_count selects how many of the
// 7 probe axes are used for the initial guess. Output: result[0..2] = center, result[3] = radius.
static void computeBoundingSphere(float result[4], const float* points, size_t count, size_t points_stride, const float* radii, size_t radii_stride, size_t axis_count)
{
	static const float kAxes[7][3] = {
	    // X, Y, Z
	    {1, 0, 0},
	    {0, 1, 0},
	    {0, 0, 1},

	    // XYZ, -XYZ, X-YZ, XY-Z; normalized to unit length
	    {0.57735026f, 0.57735026f, 0.57735026f},
	    {-0.57735026f, 0.57735026f, 0.57735026f},
	    {0.57735026f, -0.57735026f, 0.57735026f},
	    {0.57735026f, 0.57735026f, -0.57735026f},
	};

	assert(count > 0);
	assert(axis_count <= sizeof(kAxes) / sizeof(kAxes[0]));

	size_t pstride = points_stride / sizeof(float);
	size_t rstride = radii_stride / sizeof(float);

	// per axis: which input sphere reaches furthest in the negative / positive direction
	size_t lo_index[7], hi_index[7];
	float lo_value[7], hi_value[7];

	for (size_t axis = 0; axis < axis_count; ++axis)
	{
		lo_index[axis] = 0;
		hi_index[axis] = 0;
		lo_value[axis] = FLT_MAX;
		hi_value[axis] = -FLT_MAX;
	}

	for (size_t i = 0; i < count; ++i)
	{
		const float* p = points + i * pstride;
		float r = radii[i * rstride];

		for (size_t axis = 0; axis < axis_count; ++axis)
		{
			const float* ax = kAxes[axis];

			float t = ax[0] * p[0] + ax[1] * p[1] + ax[2] * p[2];
			float t_lo = t - r, t_hi = t + r;

			if (t_lo < lo_value[axis])
			{
				lo_index[axis] = i;
				lo_value[axis] = t_lo;
			}

			if (t_hi > hi_value[axis])
			{
				hi_index[axis] = i;
				hi_value[axis] = t_hi;
			}
		}
	}

	// pick the axis whose extreme pair spans the largest distance (including both radii)
	size_t best_axis = 0;
	float best_span = 0;

	for (size_t axis = 0; axis < axis_count; ++axis)
	{
		const float* a = points + lo_index[axis] * pstride;
		const float* b = points + hi_index[axis] * pstride;
		float ra = radii[lo_index[axis] * rstride];
		float rb = radii[hi_index[axis] * rstride];

		float dx = b[0] - a[0], dy = b[1] - a[1], dz = b[2] - a[2];
		float span = sqrtf(dx * dx + dy * dy + dz * dz) + ra + rb;

		if (span > best_span)
		{
			best_span = span;
			best_axis = axis;
		}
	}

	// the winning pair defines the initial sphere: diameter = span, center shifted to account for unequal radii
	const float* a = points + lo_index[best_axis] * pstride;
	const float* b = points + hi_index[best_axis] * pstride;
	float ra = radii[lo_index[best_axis] * rstride];
	float rb = radii[hi_index[best_axis] * rstride];

	float abx = b[0] - a[0], aby = b[1] - a[1], abz = b[2] - a[2];
	float ab = sqrtf(abx * abx + aby * aby + abz * abz);
	float blend = ab > 0 ? (ab + rb - ra) / (2 * ab) : 0.f;

	float center[3] = {a[0] + abx * blend, a[1] + aby * blend, a[2] + abz * blend};
	float radius = best_span / 2;

	// grow pass: whenever an input sphere pokes out, move the center towards it and enlarge the radius
	for (size_t i = 0; i < count; ++i)
	{
		const float* p = points + i * pstride;
		float r = radii[i * rstride];

		float dx = p[0] - center[0], dy = p[1] - center[1], dz = p[2] - center[2];
		float d = sqrtf(dx * dx + dy * dy + dz * dz);

		if (d + r > radius)
		{
			float k = d > 0 ? (d + r - radius) / (2 * d) : 0.f;

			center[0] += k * (p[0] - center[0]);
			center[1] += k * (p[1] - center[1]);
			center[2] += k * (p[2] - center[2]);
			radius = (radius + d + r) / 2;
		}
	}

	result[0] = center[0];
	result[1] = center[1];
	result[2] = center[2];
	result[3] = radius;
}
285
286
// Position + direction pair; used both for per-triangle data (see computeTriangleCones)
// and for values derived from a caller-supplied accumulator (see getMeshletCone)
struct Cone
{
	// position
	float px, py, pz;
	// direction
	float nx, ny, nz;
};
291
292
// Combines a distance term and a normal-alignment term into one score.
// cone_weight scales `spread` inside the alignment factor and scales the distance term by (1 - cone_weight).
static float getMeshletScore(float distance, float spread, float cone_weight, float expected_radius)
{
	// alignment factor, clamped from below so the product never collapses to zero or goes negative
	float cone = 1.f - spread * cone_weight;

	if (cone < 1e-3f)
		cone = 1e-3f;

	float distance_term = 1 + distance / expected_radius * (1 - cone_weight);

	return distance_term * cone;
}
299
300
// Converts an accumulator into a usable cone: position is divided by triangle_count,
// direction is normalized to unit length. Zero count / zero-length direction yield zeros.
static Cone getMeshletCone(const Cone& acc, unsigned int triangle_count)
{
	float inv_count = triangle_count == 0 ? 0.f : 1.f / float(triangle_count);

	float length_sq = acc.nx * acc.nx + acc.ny * acc.ny + acc.nz * acc.nz;
	float inv_length = length_sq == 0.f ? 0.f : 1.f / sqrtf(length_sq);

	Cone result;

	result.px = acc.px * inv_count;
	result.py = acc.py * inv_count;
	result.pz = acc.pz * inv_count;

	result.nx = acc.nx * inv_length;
	result.ny = acc.ny * inv_length;
	result.nz = acc.nz * inv_length;

	return result;
}
319
320
// Fills triangles[i] with the centroid and unit normal of triangle i (zero normal for degenerate triangles).
// Returns the sum of cross-product lengths over all triangles, i.e. twice the geometric surface area.
// vertex_positions_stride is in bytes; vertex_count is only used for validation in assert-enabled builds.
static float computeTriangleCones(Cone* triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
	(void)vertex_count;

	size_t stride = vertex_positions_stride / sizeof(float);
	size_t face_count = index_count / 3;

	float total_area = 0;

	for (size_t face = 0; face < face_count; ++face)
	{
		const unsigned int* tri = indices + face * 3;
		unsigned int a = tri[0], b = tri[1], c = tri[2];
		assert(a < vertex_count && b < vertex_count && c < vertex_count);

		const float* p0 = vertex_positions + stride * a;
		const float* p1 = vertex_positions + stride * b;
		const float* p2 = vertex_positions + stride * c;

		// edge vectors from the first corner
		float e1[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
		float e2[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};

		// cross(e1, e2)
		float nx = e1[1] * e2[2] - e1[2] * e2[1];
		float ny = e1[2] * e2[0] - e1[0] * e2[2];
		float nz = e1[0] * e2[1] - e1[1] * e2[0];

		float area = sqrtf(nx * nx + ny * ny + nz * nz);
		float inv_area = (area == 0.f) ? 0.f : 1.f / area;

		Cone& out = triangles[face];

		out.px = (p0[0] + p1[0] + p2[0]) / 3.f;
		out.py = (p0[1] + p1[1] + p2[1]) / 3.f;
		out.pz = (p0[2] + p1[2] + p2[2]) / 3.f;

		out.nx = nx * inv_area;
		out.ny = ny * inv_area;
		out.nz = nz * inv_area;

		total_area += area;
	}

	return total_area;
}
361
362
// Appends triangle (a, b, c) to the meshlet under construction.
// If the triangle does not fit (vertex or triangle limit) or `split` is set, the current meshlet is first
// written to meshlets[meshlet_offset], its vertices are released in used[], and `meshlet` is advanced to
// start a new, empty meshlet right after the flushed data. Returns true when such a flush happened.
// used[v] holds the meshlet-local index of vertex v, or a negative value if v is not in the current meshlet.
static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, short* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles, bool split = false)
{
	// number of corners that would need a new meshlet-local slot (evaluated before any flush)
	int used_extra = (used[a] < 0) + (used[b] < 0) + (used[c] < 0);

	bool flushed = false;

	if (split || meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles)
	{
		meshlets[meshlet_offset] = meshlet;

		// release the vertices of the finished meshlet; this may also reset the slots of a, b, c
		for (size_t j = 0; j < meshlet.vertex_count; ++j)
			used[meshlet_vertices[meshlet.vertex_offset + j]] = -1;

		meshlet.vertex_offset += meshlet.vertex_count;
		meshlet.triangle_offset += meshlet.triangle_count * 3;
		meshlet.vertex_count = 0;
		meshlet.triangle_count = 0;

		flushed = true;
	}

	const unsigned int corners[3] = {a, b, c};

	for (int k = 0; k < 3; ++k)
	{
		// slots are re-read here on purpose: the flush above may have invalidated them,
		// and repeated corners must share the slot assigned on first encounter
		short& slot = used[corners[k]];

		if (slot < 0)
		{
			slot = short(meshlet.vertex_count);
			meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = corners[k];
		}

		meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + k] = (unsigned char)slot;
	}

	meshlet.triangle_count++;

	return flushed;
}
412
413
static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone& meshlet_cone, const unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const short* used, float meshlet_expected_radius, float cone_weight)
414
{
415
unsigned int best_triangle = ~0u;
416
int best_priority = 5;
417
float best_score = FLT_MAX;
418
419
for (size_t i = 0; i < meshlet.vertex_count; ++i)
420
{
421
unsigned int index = meshlet_vertices[meshlet.vertex_offset + i];
422
423
unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
424
size_t neighbors_size = adjacency.counts[index];
425
426
for (size_t j = 0; j < neighbors_size; ++j)
427
{
428
unsigned int triangle = neighbors[j];
429
unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
430
431
int extra = (used[a] < 0) + (used[b] < 0) + (used[c] < 0);
432
assert(extra <= 2);
433
434
int priority = -1;
435
436
// triangles that don't add new vertices to meshlets are max. priority
437
if (extra == 0)
438
priority = 0;
439
// artificially increase the priority of dangling triangles as they're expensive to add to new meshlets
440
else if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1)
441
priority = 1;
442
// if two vertices have live count of 2, removing this triangle will make another triangle dangling which is good for overall flow
443
else if ((live_triangles[a] == 2) + (live_triangles[b] == 2) + (live_triangles[c] == 2) >= 2)
444
priority = 1 + extra;
445
// otherwise adjust priority to be after the above cases, 3 or 4 based on used[] count
446
else
447
priority = 2 + extra;
448
449
// since topology-based priority is always more important than the score, we can skip scoring in some cases
450
if (priority > best_priority)
451
continue;
452
453
const Cone& tri_cone = triangles[triangle];
454
455
float dx = tri_cone.px - meshlet_cone.px, dy = tri_cone.py - meshlet_cone.py, dz = tri_cone.pz - meshlet_cone.pz;
456
float distance = sqrtf(dx * dx + dy * dy + dz * dz);
457
float spread = tri_cone.nx * meshlet_cone.nx + tri_cone.ny * meshlet_cone.ny + tri_cone.nz * meshlet_cone.nz;
458
459
float score = getMeshletScore(distance, spread, cone_weight, meshlet_expected_radius);
460
461
// note that topology-based priority is always more important than the score
462
// this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost
463
if (priority < best_priority || score < best_score)
464
{
465
best_triangle = triangle;
466
best_priority = priority;
467
best_score = score;
468
}
469
}
470
}
471
472
return best_triangle;
473
}
474
475
static size_t appendSeedTriangles(unsigned int* seeds, const meshopt_Meshlet& meshlet, const unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, float cornerx, float cornery, float cornerz)
476
{
477
unsigned int best_seeds[kMeshletAddSeeds];
478
unsigned int best_live[kMeshletAddSeeds];
479
float best_score[kMeshletAddSeeds];
480
481
for (size_t i = 0; i < kMeshletAddSeeds; ++i)
482
{
483
best_seeds[i] = ~0u;
484
best_live[i] = ~0u;
485
best_score[i] = FLT_MAX;
486
}
487
488
for (size_t i = 0; i < meshlet.vertex_count; ++i)
489
{
490
unsigned int index = meshlet_vertices[meshlet.vertex_offset + i];
491
492
unsigned int best_neighbor = ~0u;
493
unsigned int best_neighbor_live = ~0u;
494
495
// find the neighbor with the smallest live metric
496
unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
497
size_t neighbors_size = adjacency.counts[index];
498
499
for (size_t j = 0; j < neighbors_size; ++j)
500
{
501
unsigned int triangle = neighbors[j];
502
unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
503
504
unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c];
505
506
if (live < best_neighbor_live)
507
{
508
best_neighbor = triangle;
509
best_neighbor_live = live;
510
}
511
}
512
513
// add the neighbor to the list of seeds; the list is unsorted and the replacement criteria is approximate
514
if (best_neighbor == ~0u)
515
continue;
516
517
float dx = triangles[best_neighbor].px - cornerx, dy = triangles[best_neighbor].py - cornery, dz = triangles[best_neighbor].pz - cornerz;
518
float best_neighbor_score = sqrtf(dx * dx + dy * dy + dz * dz);
519
520
for (size_t j = 0; j < kMeshletAddSeeds; ++j)
521
{
522
// non-strict comparison reduces the number of duplicate seeds (triangles adjacent to multiple vertices)
523
if (best_neighbor_live < best_live[j] || (best_neighbor_live == best_live[j] && best_neighbor_score <= best_score[j]))
524
{
525
best_seeds[j] = best_neighbor;
526
best_live[j] = best_neighbor_live;
527
best_score[j] = best_neighbor_score;
528
break;
529
}
530
}
531
}
532
533
// add surviving seeds to the meshlet
534
size_t seed_count = 0;
535
536
for (size_t i = 0; i < kMeshletAddSeeds; ++i)
537
if (best_seeds[i] != ~0u)
538
seeds[seed_count++] = best_seeds[i];
539
540
return seed_count;
541
}
542
543
// Compacts seeds[] in place, keeping only triangles whose emitted flag is zero (relative order preserved).
// Returns the number of seeds kept; entries past that count are unspecified leftovers.
static size_t pruneSeedTriangles(unsigned int* seeds, size_t seed_count, const unsigned char* emitted_flags)
{
	unsigned int* out = seeds;

	for (const unsigned int* it = seeds; it != seeds + seed_count; ++it)
	{
		unsigned int seed = *it;

		// branch-light compaction: always store, only advance the write cursor for survivors
		*out = seed;

		if (emitted_flags[seed] == 0)
			++out;
	}

	return size_t(out - seeds);
}
557
558
static unsigned int selectSeedTriangle(const unsigned int* seeds, size_t seed_count, const unsigned int* indices, const Cone* triangles, const unsigned int* live_triangles, float cornerx, float cornery, float cornerz)
559
{
560
unsigned int best_seed = ~0u;
561
unsigned int best_live = ~0u;
562
float best_score = FLT_MAX;
563
564
for (size_t i = 0; i < seed_count; ++i)
565
{
566
unsigned int index = seeds[i];
567
unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
568
569
unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c];
570
float dx = triangles[index].px - cornerx, dy = triangles[index].py - cornery, dz = triangles[index].pz - cornerz;
571
float score = sqrtf(dx * dx + dy * dy + dz * dz);
572
573
if (live < best_live || (live == best_live && score < best_score))
574
{
575
best_seed = index;
576
best_live = live;
577
best_score = score;
578
}
579
}
580
581
return best_seed;
582
}
583
584
// Node of the flattened KD tree used for nearest-point queries
struct KDNode
{
	// branches store the split coordinate, leaves store a point index
	union
	{
		float split;
		unsigned int index;
	};

	// leaves: axis = 3, children = number of points including this one
	// branches: axis != 3, left subtree = skip 1, right subtree = skip 1+children
	// children == 0 is used by kdtreeNearest to mark subtrees with nothing left to visit
	unsigned int axis : 2;
	unsigned int children : 30;
};
597
598
// Partitions indices[0..count) in place by comparing the `axis` coordinate of each point with `pivot`.
// Returns m such that indices[0..m) refer to points < pivot and indices[m..count) to points >= pivot.
// `stride` is the distance between consecutive points, in floats.
static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, int axis, float pivot)
{
	size_t m = 0;

	// invariant: elements in range [0, m) are < pivot, elements in range [m, i) are >= pivot
	for (size_t i = 0; i < count; ++i)
	{
		unsigned int current = indices[i];
		float v = points[current * stride + axis];

		// exchange slots m and i unconditionally to keep the loop branch-light
		indices[i] = indices[m];
		indices[m] = current;

		// when v >= pivot the element is swapped with itself' range boundary without advancing it,
		// which keeps the invariant above intact
		if (v < pivot)
			++m;
	}

	return m;
}
618
619
static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, unsigned int* indices, size_t count)
620
{
621
assert(offset + count <= node_count);
622
(void)node_count;
623
624
KDNode& result = nodes[offset];
625
626
result.index = indices[0];
627
result.axis = 3;
628
result.children = unsigned(count);
629
630
// all remaining points are stored in nodes immediately following the leaf
631
for (size_t i = 1; i < count; ++i)
632
{
633
KDNode& tail = nodes[offset + i];
634
635
tail.index = indices[i];
636
tail.axis = 3;
637
tail.children = ~0u >> 2; // bogus value to prevent misuse
638
}
639
640
return offset + count;
641
}
642
643
static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size, int depth)
644
{
645
assert(count > 0);
646
assert(offset < node_count);
647
648
if (count <= leaf_size)
649
return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);
650
651
float mean[3] = {};
652
float vars[3] = {};
653
float runc = 1, runs = 1;
654
655
// gather statistics on the points in the subtree using Welford's algorithm
656
for (size_t i = 0; i < count; ++i, runc += 1.f, runs = 1.f / runc)
657
{
658
const float* point = points + indices[i] * stride;
659
660
for (int k = 0; k < 3; ++k)
661
{
662
float delta = point[k] - mean[k];
663
mean[k] += delta * runs;
664
vars[k] += delta * (point[k] - mean[k]);
665
}
666
}
667
668
// split axis is one where the variance is largest
669
int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2);
670
671
float split = mean[axis];
672
size_t middle = kdtreePartition(indices, count, points, stride, axis, split);
673
674
// when the partition is degenerate simply consolidate the points into a single node
675
// this also ensures recursion depth is bounded on pathological inputs
676
if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2 || depth >= kMeshletMaxTreeDepth)
677
return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);
678
679
KDNode& result = nodes[offset];
680
681
result.split = split;
682
result.axis = axis;
683
684
// left subtree is right after our node
685
size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size, depth + 1);
686
687
// distance to the right subtree is represented explicitly
688
assert(next_offset - offset > 1);
689
result.children = unsigned(next_offset - offset - 1);
690
691
return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size, depth + 1);
692
}
693
694
// Searches the subtree rooted at nodes[root] for the non-emitted point closest to `position`.
// `limit` is the best distance so far and `result` the matching point index; both are updated on improvement.
// As a side effect, subtrees found to contain only emitted points get children = 0 so later queries skip them.
static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, unsigned int& result, float& limit)
{
	const KDNode& node = nodes[root];

	// deactivated subtree: nothing left to emit
	if (node.children == 0)
		return;

	if (node.axis != 3)
	{
		// branch; we order recursion to process the side that search position is in first
		float delta = position[node.axis] - node.split;
		unsigned int first = (delta <= 0) ? 0 : node.children;
		unsigned int second = first ^ node.children;

		unsigned int near_child = root + 1 + first;
		unsigned int far_child = root + 1 + second;

		// deactivate branches that no longer have items to emit to accelerate traversal
		// note that we do this *before* recursing which delays deactivation but keeps tail calls
		if ((nodes[near_child].children | nodes[far_child].children) == 0)
			nodes[root].children = 0;

		// recursion depth is bounded by tree depth (which is limited by construction)
		kdtreeNearest(nodes, near_child, points, stride, emitted_flags, position, result, limit);

		// only process the other side if it can have a match based on closest distance so far
		if (fabsf(delta) <= limit)
			kdtreeNearest(nodes, far_child, points, stride, emitted_flags, position, result, limit);

		return;
	}

	// leaf: test every point that has not been emitted yet
	bool any_active = false;

	for (unsigned int i = 0; i < node.children; ++i)
	{
		unsigned int index = nodes[root + i].index;

		if (emitted_flags[index])
			continue;

		any_active = true;

		const float* point = points + index * stride;

		float dx = point[0] - position[0], dy = point[1] - position[1], dz = point[2] - position[2];
		float distance = sqrtf(dx * dx + dy * dy + dz * dz);

		if (distance < limit)
		{
			result = index;
			limit = distance;
		}
	}

	// deactivate leaves that no longer have items to emit
	if (!any_active)
		nodes[root].children = 0;
}
751
752
// Axis-aligned box padded to 4 floats per corner so that boxMerge can load/store it with 4-wide SIMD;
// only the first three components of each corner are meaningful
struct BVHBoxT
{
	float min[4];
	float max[4];
};
757
758
// Tightly packed axis-aligned box (3 floats per corner); the SIMD boxMerge variants read one float past
// each corner, so arrays of BVHBox must be allocated with padding
struct BVHBox
{
	float min[3];
	float max[3];
};
763
764
#if defined(SIMD_SSE)
765
static float boxMerge(BVHBoxT& box, const BVHBox& other)
766
{
767
__m128 min = _mm_loadu_ps(box.min);
768
__m128 max = _mm_loadu_ps(box.max);
769
770
// note: over-read is safe because BVHBox array is allocated with padding
771
min = _mm_min_ps(min, _mm_loadu_ps(other.min));
772
max = _mm_max_ps(max, _mm_loadu_ps(other.max));
773
774
_mm_storeu_ps(box.min, min);
775
_mm_storeu_ps(box.max, max);
776
777
__m128 size = _mm_sub_ps(max, min);
778
__m128 size_yzx = _mm_shuffle_ps(size, size, _MM_SHUFFLE(0, 0, 2, 1));
779
__m128 mul = _mm_mul_ps(size, size_yzx);
780
__m128 sum_xy = _mm_add_ss(mul, _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 1, 1, 1)));
781
__m128 sum_xyz = _mm_add_ss(sum_xy, _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 2, 2, 2)));
782
783
return _mm_cvtss_f32(sum_xyz);
784
}
785
#elif defined(SIMD_NEON)
786
static float boxMerge(BVHBoxT& box, const BVHBox& other)
787
{
788
float32x4_t min = vld1q_f32(box.min);
789
float32x4_t max = vld1q_f32(box.max);
790
791
// note: over-read is safe because BVHBox array is allocated with padding
792
min = vminq_f32(min, vld1q_f32(other.min));
793
max = vmaxq_f32(max, vld1q_f32(other.max));
794
795
vst1q_f32(box.min, min);
796
vst1q_f32(box.max, max);
797
798
float32x4_t size = vsubq_f32(max, min);
799
float32x4_t size_yzx = vextq_f32(vextq_f32(size, size, 3), size, 2);
800
float32x4_t mul = vmulq_f32(size, size_yzx);
801
float sum_xy = vgetq_lane_f32(mul, 0) + vgetq_lane_f32(mul, 1);
802
float sum_xyz = sum_xy + vgetq_lane_f32(mul, 2);
803
804
return sum_xyz;
805
}
806
#else
807
static float boxMerge(BVHBoxT& box, const BVHBox& other)
808
{
809
for (int k = 0; k < 3; ++k)
810
{
811
box.min[k] = other.min[k] < box.min[k] ? other.min[k] : box.min[k];
812
box.max[k] = other.max[k] > box.max[k] ? other.max[k] : box.max[k];
813
}
814
815
float sx = box.max[0] - box.min[0], sy = box.max[1] - box.min[1], sz = box.max[2] - box.min[2];
816
return sx * sy + sx * sz + sy * sz;
817
}
818
#endif
819
820
// Remaps the bit pattern of a 32-bit float into an unsigned key whose integer order matches
// the numeric order of the original floats (suitable as a radix sort key).
// Uses only unsigned arithmetic, so the result does not depend on implementation-defined
// signed conversion or signed right-shift behavior.
inline unsigned int radixFloat(unsigned int v)
{
	// if sign bit is 0, flip sign bit
	// if sign bit is 1, flip everything
	// (v >> 31) is 0 or 1; negating it in unsigned arithmetic gives 0 or all-ones
	unsigned int mask = (0u - (v >> 31)) | 0x80000000u;
	return v ^ mask;
}
827
828
static void computeHistogram(unsigned int (&hist)[1024][3], const float* data, size_t count)
829
{
830
memset(hist, 0, sizeof(hist));
831
832
const unsigned int* bits = reinterpret_cast<const unsigned int*>(data);
833
834
// compute 3 10-bit histograms in parallel (dropping 2 LSB)
835
for (size_t i = 0; i < count; ++i)
836
{
837
unsigned int id = radixFloat(bits[i]);
838
839
hist[(id >> 2) & 1023][0]++;
840
hist[(id >> 12) & 1023][1]++;
841
hist[(id >> 22) & 1023][2]++;
842
}
843
844
unsigned int sum0 = 0, sum1 = 0, sum2 = 0;
845
846
// replace histogram data with prefix histogram sums in-place
847
for (int i = 0; i < 1024; ++i)
848
{
849
unsigned int hx = hist[i][0], hy = hist[i][1], hz = hist[i][2];
850
851
hist[i][0] = sum0;
852
hist[i][1] = sum1;
853
hist[i][2] = sum2;
854
855
sum0 += hx;
856
sum1 += hy;
857
sum2 += hz;
858
}
859
860
assert(sum0 == count && sum1 == count && sum2 == count);
861
}
862
863
static void radixPass(unsigned int* destination, const unsigned int* source, const float* keys, size_t count, unsigned int (&hist)[1024][3], int pass)
864
{
865
const unsigned int* bits = reinterpret_cast<const unsigned int*>(keys);
866
int bitoff = pass * 10 + 2; // drop 2 LSB to be able to use 3 10-bit passes
867
868
for (size_t i = 0; i < count; ++i)
869
{
870
unsigned int id = (radixFloat(bits[source[i]]) >> bitoff) & 1023;
871
872
destination[hist[id][pass]++] = source[i];
873
}
874
}
875
876
// For every triangle computes its bounding box (boxes[i]) and the box center per axis.
// Centers are stored one axis after another: centroids[i + face_count * k] is axis k of triangle i.
// vertex_count is only used for validation in assert-enabled builds.
static void bvhPrepare(BVHBox* boxes, float* centroids, const unsigned int* indices, size_t face_count, const float* vertex_positions, size_t vertex_count, size_t vertex_stride_float)
{
	(void)vertex_count;

	for (size_t face = 0; face < face_count; ++face)
	{
		const unsigned int* tri = indices + face * 3;
		unsigned int a = tri[0], b = tri[1], c = tri[2];
		assert(a < vertex_count && b < vertex_count && c < vertex_count);

		const float* va = vertex_positions + vertex_stride_float * a;
		const float* vb = vertex_positions + vertex_stride_float * b;
		const float* vc = vertex_positions + vertex_stride_float * c;

		BVHBox& box = boxes[face];

		for (int k = 0; k < 3; ++k)
		{
			float lo = va[k] < vb[k] ? va[k] : vb[k];
			lo = vc[k] < lo ? vc[k] : lo;

			float hi = va[k] > vb[k] ? va[k] : vb[k];
			hi = vc[k] > hi ? vc[k] : hi;

			box.min[k] = lo;
			box.max[k] = hi;

			centroids[face + face_count * k] = (lo + hi) / 2.f;
		}
	}
}
903
904
// Counts the vertices referenced by triangles order[0..count) that were not marked in used[] before.
// used[] entries of all touched vertices must be negative on entry and are reset to -1 on exit.
// If `out` is given, out[i] receives the running vertex count after triangle order[i].
static size_t bvhCountVertices(const unsigned int* order, size_t count, short* used, const unsigned int* indices, unsigned int* out = NULL)
{
	size_t total = 0;

	// first sweep: tally vertices seen for the first time and mark them as used
	for (size_t i = 0; i < count; ++i)
	{
		const unsigned int* tri = indices + order[i] * 3;
		unsigned int a = tri[0], b = tri[1], c = tri[2];

		// all three corners are tested before any of them is marked
		total += (used[a] < 0) + (used[b] < 0) + (used[c] < 0);

		used[a] = 1;
		used[b] = 1;
		used[c] = 1;

		if (out)
			out[i] = unsigned(total);
	}

	// second sweep: reset used[] for future invocations
	for (size_t i = 0; i < count; ++i)
	{
		const unsigned int* tri = indices + order[i] * 3;

		used[tri[0]] = -1;
		used[tri[1]] = -1;
		used[tri[2]] = -1;
	}

	return total;
}
931
932
// Marks boundary[0..count) as one meshlet for future reassembly: the first entry is set to 1
// (start of a meshlet), all remaining entries to 0.
static void bvhPackLeaf(unsigned char* boundary, size_t count)
{
	assert(count > 0);

	memset(boundary + 1, 0, count - 1);
	boundary[0] = 1;
}
940
941
static void bvhPackTail(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices, size_t max_triangles)
942
{
943
for (size_t i = 0; i < count;)
944
{
945
size_t chunk = i + max_triangles <= count ? max_triangles : count - i;
946
947
if (bvhCountVertices(order + i, chunk, used, indices) <= max_vertices)
948
{
949
bvhPackLeaf(boundary + i, chunk);
950
i += chunk;
951
continue;
952
}
953
954
// chunk is vertex bound, split it into smaller meshlets
955
assert(chunk > max_vertices / 3);
956
957
bvhPackLeaf(boundary + i, max_vertices / 3);
958
i += max_vertices / 3;
959
}
960
}
961
962
// Returns true if count can be written as a sum of k values, each in [min..max], for some k >= 1.
static bool bvhDivisible(size_t count, size_t min, size_t max)
{
	// count is representable as a sum of values in [min..max] if it is in range of [k*min..k*min+k*(max-min)]
	// equivalent to ceil(count / max) <= floor(count / min), but the form below allows using idiv (see nv_cluster_builder)
	// we avoid expensive integer divisions in the common case where min is <= max/2
	return min * 2 <= max ? count >= min : count % min <= (count / min) * (max - min);
}
969
970
static void bvhComputeArea(float* areas, const BVHBox* boxes, const unsigned int* order, size_t count)
971
{
972
BVHBoxT accuml = {{FLT_MAX, FLT_MAX, FLT_MAX, 0}, {-FLT_MAX, -FLT_MAX, -FLT_MAX, 0}};
973
BVHBoxT accumr = accuml;
974
975
for (size_t i = 0; i < count; ++i)
976
{
977
float larea = boxMerge(accuml, boxes[order[i]]);
978
float rarea = boxMerge(accumr, boxes[order[count - 1 - i]]);
979
980
areas[i] = larea;
981
areas[i + count] = rarea;
982
}
983
}
984
985
static size_t bvhPivot(const float* areas, const unsigned int* vertices, size_t count, size_t step, size_t min, size_t max, float fill, size_t maxfill, float* out_cost)
986
{
987
bool aligned = count >= min * 2 && bvhDivisible(count, min, max);
988
size_t end = aligned ? count - min : count - 1;
989
990
float rmaxfill = 1.f / float(int(maxfill));
991
992
// find best split that minimizes SAH
993
size_t bestsplit = 0;
994
float bestcost = FLT_MAX;
995
996
for (size_t i = min - 1; i < end; i += step)
997
{
998
size_t lsplit = i + 1, rsplit = count - (i + 1);
999
1000
if (!bvhDivisible(lsplit, min, max))
1001
continue;
1002
if (aligned && !bvhDivisible(rsplit, min, max))
1003
continue;
1004
1005
// areas[x] = inclusive surface area of boxes[0..x]
1006
// areas[count-1-x] = inclusive surface area of boxes[x..count-1]
1007
float larea = areas[i], rarea = areas[(count - 1 - (i + 1)) + count];
1008
float cost = larea * float(int(lsplit)) + rarea * float(int(rsplit));
1009
1010
if (cost > bestcost)
1011
continue;
1012
1013
// use vertex fill when splitting vertex limited clusters; note that we use the same (left->right) vertex count
1014
// using bidirectional vertex counts is a little more expensive to compute and produces slightly worse results in practice
1015
size_t lfill = vertices ? vertices[i] : lsplit;
1016
size_t rfill = vertices ? vertices[i] : rsplit;
1017
1018
// fill cost; use floating point math to round up to maxfill to avoid expensive integer modulo
1019
int lrest = int(float(int(lfill + maxfill - 1)) * rmaxfill) * int(maxfill) - int(lfill);
1020
int rrest = int(float(int(rfill + maxfill - 1)) * rmaxfill) * int(maxfill) - int(rfill);
1021
1022
cost += fill * (float(lrest) * larea + float(rrest) * rarea);
1023
1024
if (cost < bestcost)
1025
{
1026
bestcost = cost;
1027
bestsplit = i + 1;
1028
}
1029
}
1030
1031
*out_cost = bestcost;
1032
return bestsplit;
1033
}
1034
1035
// Stable partition of order[0..count) into target[0..count) based on per-triangle side flags.
// sides is indexed by the triangle id stored in order[]; each flag must be 0 (left) or 1 (right).
// Triangles with side 0 are written to target[0..split), triangles with side 1 to target[split..count),
// keeping their relative order; split must equal the number of triangles with side 0.
static void bvhPartition(unsigned int* target, const unsigned int* order, const unsigned char* sides, size_t split, size_t count)
{
	// l and r are the next write positions of the left and right halves
	size_t l = 0, r = split;

	for (size_t i = 0; i < count; ++i)
	{
		unsigned char side = sides[order[i]];
		target[side ? r : l] = order[i];

		// branchless advance: exactly one of l / r moves forward by one
		l += 1;
		l -= side;
		r += side;
	}

	assert(l == split && r == count);
}
1050
1051
static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, unsigned char* boundary, size_t count, int depth, void* scratch, short* used, const unsigned int* indices, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
1052
{
1053
if (count <= max_triangles && bvhCountVertices(orderx, count, used, indices) <= max_vertices)
1054
return bvhPackLeaf(boundary, count);
1055
1056
unsigned int* axes[3] = {orderx, ordery, orderz};
1057
1058
// we can use step=1 unconditionally but to reduce the cost for min=max case we use step=max
1059
size_t step = min_triangles == max_triangles && count > max_triangles ? max_triangles : 1;
1060
1061
// if we could not pack the meshlet, we must be vertex bound
1062
size_t mint = count <= max_triangles && max_vertices / 3 < min_triangles ? max_vertices / 3 : min_triangles;
1063
size_t maxfill = count <= max_triangles ? max_vertices : max_triangles;
1064
1065
// find best split that minimizes SAH
1066
int bestk = -1;
1067
size_t bestsplit = 0;
1068
float bestcost = FLT_MAX;
1069
1070
for (int k = 0; k < 3; ++k)
1071
{
1072
float* areas = static_cast<float*>(scratch);
1073
unsigned int* vertices = NULL;
1074
1075
bvhComputeArea(areas, boxes, axes[k], count);
1076
1077
if (count <= max_triangles)
1078
{
1079
// for vertex bound clusters, count number of unique vertices for each split
1080
vertices = reinterpret_cast<unsigned int*>(areas + 2 * count);
1081
bvhCountVertices(axes[k], count, used, indices, vertices);
1082
}
1083
1084
float axiscost = FLT_MAX;
1085
size_t axissplit = bvhPivot(areas, vertices, count, step, mint, max_triangles, fill_weight, maxfill, &axiscost);
1086
1087
if (axissplit && axiscost < bestcost)
1088
{
1089
bestk = k;
1090
bestcost = axiscost;
1091
bestsplit = axissplit;
1092
}
1093
}
1094
1095
// this may happen if SAH costs along the admissible splits are NaN, or due to imbalanced splits on pathological inputs
1096
if (bestk < 0 || depth >= kMeshletMaxTreeDepth)
1097
return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles);
1098
1099
// mark sides of split for partitioning
1100
unsigned char* sides = static_cast<unsigned char*>(scratch) + count * sizeof(unsigned int);
1101
1102
for (size_t i = 0; i < bestsplit; ++i)
1103
sides[axes[bestk][i]] = 0;
1104
1105
for (size_t i = bestsplit; i < count; ++i)
1106
sides[axes[bestk][i]] = 1;
1107
1108
// partition all axes into two sides, maintaining order
1109
unsigned int* temp = static_cast<unsigned int*>(scratch);
1110
1111
for (int k = 0; k < 3; ++k)
1112
{
1113
if (k == bestk)
1114
continue;
1115
1116
unsigned int* axis = axes[k];
1117
memcpy(temp, axis, sizeof(unsigned int) * count);
1118
bvhPartition(axis, temp, sides, bestsplit, count);
1119
}
1120
1121
// recursion depth is bounded due to max depth check above
1122
bvhSplit(boxes, orderx, ordery, orderz, boundary, bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
1123
bvhSplit(boxes, orderx + bestsplit, ordery + bestsplit, orderz + bestsplit, boundary + bestsplit, count - bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
1124
}
1125
1126
} // namespace meshopt
1127
1128
size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles)
1129
{
1130
using namespace meshopt;
1131
1132
assert(index_count % 3 == 0);
1133
assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
1134
assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
1135
1136
(void)kMeshletMaxVertices;
1137
(void)kMeshletMaxTriangles;
1138
1139
// meshlet construction is limited by max vertices and max triangles per meshlet
1140
// the worst case is that the input is an unindexed stream since this equally stresses both limits
1141
// note that we assume that in the worst case, we leave 2 vertices unpacked in each meshlet - if we have space for 3 we can pack any triangle
1142
size_t max_vertices_conservative = max_vertices - 2;
1143
size_t meshlet_limit_vertices = (index_count + max_vertices_conservative - 1) / max_vertices_conservative;
1144
size_t meshlet_limit_triangles = (index_count / 3 + max_triangles - 1) / max_triangles;
1145
1146
return meshlet_limit_vertices > meshlet_limit_triangles ? meshlet_limit_vertices : meshlet_limit_triangles;
1147
}
1148
1149
// Builds meshlets by greedily growing one meshlet at a time, starting from the triangle closest to the minimum corner
// of all triangle centroids. The next triangle is chosen among neighbors of the current meshlet (getNeighborTriangle);
// when no neighbor is available, the spatially nearest unemitted triangle is used (kdtreeNearest).
// When the chosen triangle does not fit (or a split is requested), the next meshlet is restarted from a seed triangle.
//
// max_vertices must be in [3..kMeshletMaxVertices]; 1 <= min_triangles <= max_triangles <= kMeshletMaxTriangles.
// cone_weight must be in [0..1]; split_factor must be >= 0, with 0 disabling distance based splits.
// vertex_positions_stride is in bytes, a multiple of sizeof(float) in [12..256].
// Output goes to meshlets, meshlet_vertices and meshlet_triangles; returns the number of meshlets written.
size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);
	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
	assert(vertex_positions_stride % sizeof(float) == 0);

	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
	assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles);

	assert(cone_weight >= 0 && cone_weight <= 1);
	assert(split_factor >= 0);

	if (index_count == 0)
		return 0;

	meshopt_Allocator allocator;

	TriangleAdjacency2 adjacency = {};
	if (vertex_count > index_count && index_count < (1u << 31))
		buildTriangleAdjacencySparse(adjacency, indices, index_count, vertex_count, allocator);
	else
		buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);

	// live triangle counts; note, we alias adjacency.counts as we remove triangles after emitting them so the counts always match
	unsigned int* live_triangles = adjacency.counts;

	size_t face_count = index_count / 3;

	unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
	memset(emitted_flags, 0, face_count);

	// for each triangle, precompute centroid & normal to use for scoring
	Cone* triangles = allocator.allocate<Cone>(face_count);
	float mesh_area = computeTriangleCones(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride);

	// assuming each meshlet is a square patch, expected radius is sqrt(expected area)
	float triangle_area_avg = face_count == 0 ? 0.f : mesh_area / float(face_count) * 0.5f;
	float meshlet_expected_radius = sqrtf(triangle_area_avg * max_triangles) * 0.5f;

	// build a kd-tree for nearest neighbor lookup
	unsigned int* kdindices = allocator.allocate<unsigned int>(face_count);
	for (size_t i = 0; i < face_count; ++i)
		kdindices[i] = unsigned(i);

	KDNode* nodes = allocator.allocate<KDNode>(face_count * 2);
	kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8, 0);

	// find a specific corner of the mesh to use as a starting point for meshlet flow
	float cornerx = FLT_MAX, cornery = FLT_MAX, cornerz = FLT_MAX;

	for (size_t i = 0; i < face_count; ++i)
	{
		const Cone& tri = triangles[i];

		cornerx = cornerx > tri.px ? tri.px : cornerx;
		cornery = cornery > tri.py ? tri.py : cornery;
		cornerz = cornerz > tri.pz ? tri.pz : cornerz;
	}

	// index of the vertex in the meshlet, -1 if the vertex isn't used
	short* used = allocator.allocate<short>(vertex_count);
	clearUsed(used, vertex_count, indices, index_count);

	// initial seed triangle is the one closest to the corner
	unsigned int initial_seed = ~0u;
	float initial_score = FLT_MAX;

	for (size_t i = 0; i < face_count; ++i)
	{
		const Cone& tri = triangles[i];

		float dx = tri.px - cornerx, dy = tri.py - cornery, dz = tri.pz - cornerz;
		float score = sqrtf(dx * dx + dy * dy + dz * dz);

		if (initial_seed == ~0u || score < initial_score)
		{
			initial_seed = unsigned(i);
			initial_score = score;
		}
	}

	// seed triangles to continue meshlet flow
	unsigned int seeds[kMeshletMaxSeeds] = {};
	size_t seed_count = 0;

	meshopt_Meshlet meshlet = {};
	size_t meshlet_offset = 0;

	Cone meshlet_cone_acc = {};

	// each iteration emits exactly one triangle; the loop ends when no unemitted triangle can be found
	for (;;)
	{
		Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count);

		unsigned int best_triangle = ~0u;

		// for the first triangle, we don't have a meshlet cone yet, so we use the initial seed
		// to continue the meshlet, we select an adjacent triangle based on connectivity and spatial scoring
		if (meshlet_offset == 0 && meshlet.triangle_count == 0)
			best_triangle = initial_seed;
		else
			best_triangle = getNeighborTriangle(meshlet, meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight);

		bool split = false;

		// when we run out of adjacent triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity
		if (best_triangle == ~0u)
		{
			float position[3] = {meshlet_cone.px, meshlet_cone.py, meshlet_cone.pz};
			unsigned int index = ~0u;
			float distance = FLT_MAX;

			kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, distance);

			best_triangle = index;
			// only split meshlets that already reached min_triangles, and only when the nearest triangle is far enough
			split = meshlet.triangle_count >= min_triangles && split_factor > 0 && distance > meshlet_expected_radius * split_factor;
		}

		if (best_triangle == ~0u)
			break;

		// number of vertices of the triangle that are not part of the current meshlet yet
		int best_extra = (used[indices[best_triangle * 3 + 0]] < 0) + (used[indices[best_triangle * 3 + 1]] < 0) + (used[indices[best_triangle * 3 + 2]] < 0);

		// if the best triangle doesn't fit into current meshlet, we re-select using seeds to maintain global flow
		if (split || (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles))
		{
			seed_count = pruneSeedTriangles(seeds, seed_count, emitted_flags);
			// keep room for the seeds appended below
			seed_count = (seed_count + kMeshletAddSeeds <= kMeshletMaxSeeds) ? seed_count : kMeshletMaxSeeds - kMeshletAddSeeds;
			seed_count += appendSeedTriangles(seeds + seed_count, meshlet, meshlet_vertices, indices, adjacency, triangles, live_triangles, cornerx, cornery, cornerz);

			unsigned int best_seed = selectSeedTriangle(seeds, seed_count, indices, triangles, live_triangles, cornerx, cornery, cornerz);

			// we may not find a valid seed triangle if the mesh is disconnected as seeds are based on adjacency
			best_triangle = best_seed != ~0u ? best_seed : best_triangle;
		}

		unsigned int a = indices[best_triangle * 3 + 0], b = indices[best_triangle * 3 + 1], c = indices[best_triangle * 3 + 2];
		assert(a < vertex_count && b < vertex_count && c < vertex_count);

		// add meshlet to the output; when the current meshlet is full we reset the accumulated bounds
		if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles, split))
		{
			meshlet_offset++;
			memset(&meshlet_cone_acc, 0, sizeof(meshlet_cone_acc));
		}

		// remove emitted triangle from adjacency data
		// this makes sure that we spend less time traversing these lists on subsequent iterations
		// live triangle counts are updated as a byproduct of these adjustments
		for (size_t k = 0; k < 3; ++k)
		{
			unsigned int index = indices[best_triangle * 3 + k];

			unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
			size_t neighbors_size = adjacency.counts[index];

			for (size_t i = 0; i < neighbors_size; ++i)
			{
				unsigned int tri = neighbors[i];

				if (tri == best_triangle)
				{
					// swap-remove: overwrite with the last entry and shrink the list
					neighbors[i] = neighbors[neighbors_size - 1];
					adjacency.counts[index]--;
					break;
				}
			}
		}

		// update aggregated meshlet cone data for scoring subsequent triangles
		meshlet_cone_acc.px += triangles[best_triangle].px;
		meshlet_cone_acc.py += triangles[best_triangle].py;
		meshlet_cone_acc.pz += triangles[best_triangle].pz;
		meshlet_cone_acc.nx += triangles[best_triangle].nx;
		meshlet_cone_acc.ny += triangles[best_triangle].ny;
		meshlet_cone_acc.nz += triangles[best_triangle].nz;

		assert(!emitted_flags[best_triangle]);
		emitted_flags[best_triangle] = 1;
	}

	// flush the last, partially filled meshlet
	if (meshlet.triangle_count)
		meshlets[meshlet_offset++] = meshlet;

	assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles));
	assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count);
	return meshlet_offset;
}
1339
1340
// Convenience wrapper around meshopt_buildMeshletsFlex: uses max_triangles as both the minimum and the maximum
// triangle count and passes split_factor = 0 (no distance based splits). Returns the number of meshlets written.
size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
{
	return meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, max_triangles, max_triangles, cone_weight, 0.0f);
}
1344
1345
// Builds meshlets by scanning the index buffer in its existing order, one triangle at a time.
// max_vertices must be in [3..kMeshletMaxVertices] and max_triangles in [1..kMeshletMaxTriangles].
// Output goes to meshlets, meshlet_vertices and meshlet_triangles; returns the number of meshlets written.
size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);

	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);

	meshopt_Allocator allocator;

	// index of the vertex in the meshlet, -1 if the vertex isn't used
	short* used = allocator.allocate<short>(vertex_count);
	clearUsed(used, vertex_count, indices, index_count);

	meshopt_Meshlet meshlet = {};
	size_t meshlet_offset = 0;

	for (size_t i = 0; i < index_count; i += 3)
	{
		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
		assert(a < vertex_count && b < vertex_count && c < vertex_count);

		// appends triangle to the meshlet and writes previous meshlet to the output if full
		meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles);
	}

	// flush the last, partially filled meshlet
	if (meshlet.triangle_count)
		meshlets[meshlet_offset++] = meshlet;

	assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
	assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count);
	return meshlet_offset;
}
1379
1380
size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
1381
{
1382
using namespace meshopt;
1383
1384
assert(index_count % 3 == 0);
1385
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
1386
assert(vertex_positions_stride % sizeof(float) == 0);
1387
1388
assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
1389
assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles);
1390
1391
if (index_count == 0)
1392
return 0;
1393
1394
size_t face_count = index_count / 3;
1395
size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
1396
1397
meshopt_Allocator allocator;
1398
1399
// 3 floats plus 1 uint for sorting, or
1400
// 2 floats plus 1 uint for pivoting, or
1401
// 1 uint plus 1 byte for partitioning
1402
float* scratch = allocator.allocate<float>(face_count * 4);
1403
1404
// compute bounding boxes and centroids for sorting
1405
BVHBox* boxes = allocator.allocate<BVHBox>(face_count + 1); // padding for SIMD
1406
bvhPrepare(boxes, scratch, indices, face_count, vertex_positions, vertex_count, vertex_stride_float);
1407
memset(boxes + face_count, 0, sizeof(BVHBox));
1408
1409
unsigned int* axes = allocator.allocate<unsigned int>(face_count * 3);
1410
unsigned int* temp = reinterpret_cast<unsigned int*>(scratch) + face_count * 3;
1411
1412
for (int k = 0; k < 3; ++k)
1413
{
1414
unsigned int* order = axes + k * face_count;
1415
const float* keys = scratch + k * face_count;
1416
1417
unsigned int hist[1024][3];
1418
computeHistogram(hist, keys, face_count);
1419
1420
// 3-pass radix sort computes the resulting order into axes
1421
for (size_t i = 0; i < face_count; ++i)
1422
temp[i] = unsigned(i);
1423
1424
radixPass(order, temp, keys, face_count, hist, 0);
1425
radixPass(temp, order, keys, face_count, hist, 1);
1426
radixPass(order, temp, keys, face_count, hist, 2);
1427
}
1428
1429
// index of the vertex in the meshlet, -1 if the vertex isn't used
1430
short* used = allocator.allocate<short>(vertex_count);
1431
clearUsed(used, vertex_count, indices, index_count);
1432
1433
unsigned char* boundary = allocator.allocate<unsigned char>(face_count);
1434
1435
bvhSplit(boxes, &axes[0], &axes[face_count], &axes[face_count * 2], boundary, face_count, 0, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
1436
1437
// compute the desired number of meshlets; note that on some meshes with a lot of vertex bound clusters this might go over the bound
1438
size_t meshlet_count = 0;
1439
for (size_t i = 0; i < face_count; ++i)
1440
{
1441
assert(boundary[i] <= 1);
1442
meshlet_count += boundary[i];
1443
}
1444
1445
size_t meshlet_bound = meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles);
1446
1447
// pack triangles into meshlets according to the order and boundaries marked by bvhSplit
1448
meshopt_Meshlet meshlet = {};
1449
size_t meshlet_offset = 0;
1450
size_t meshlet_pending = meshlet_count;
1451
1452
for (size_t i = 0; i < face_count; ++i)
1453
{
1454
assert(boundary[i] <= 1);
1455
bool split = i > 0 && boundary[i] == 1;
1456
1457
// while we are over the limit, we ignore boundary[] data and disable splits until we free up enough space
1458
if (split && meshlet_count > meshlet_bound && meshlet_offset + meshlet_pending >= meshlet_bound)
1459
split = false;
1460
1461
unsigned int index = axes[i];
1462
assert(index < face_count);
1463
1464
unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
1465
1466
// appends triangle to the meshlet and writes previous meshlet to the output if full
1467
meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles, split);
1468
meshlet_pending -= boundary[i];
1469
}
1470
1471
if (meshlet.triangle_count)
1472
meshlets[meshlet_offset++] = meshlet;
1473
1474
assert(meshlet_offset <= meshlet_bound);
1475
assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count);
1476
return meshlet_offset;
1477
}
1478
1479
// Computes a bounding sphere and a normal cone for a cluster of at most kMeshletMaxTriangles triangles.
// indices holds index_count (a multiple of 3) vertex indices; vertex_positions_stride is in bytes,
// a multiple of sizeof(float) in [12..256]. Zero-area triangles are ignored.
// Returns zero-initialized bounds if no triangle has non-zero area; returns the sphere with cone_cutoff = 1
// (cone_cutoff_s8 = 127) and no apex/axis when the normal cone is too wide to be useful.
meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);
	assert(index_count / 3 <= kMeshletMaxTriangles);
	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
	assert(vertex_positions_stride % sizeof(float) == 0);

	(void)vertex_count; // only referenced by the assert in the loop below

	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);

	// compute triangle normals and gather triangle corners
	float normals[kMeshletMaxTriangles][3];
	float corners[kMeshletMaxTriangles][3][3];
	size_t triangles = 0;

	for (size_t i = 0; i < index_count; i += 3)
	{
		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
		assert(a < vertex_count && b < vertex_count && c < vertex_count);

		const float* p0 = vertex_positions + vertex_stride_float * a;
		const float* p1 = vertex_positions + vertex_stride_float * b;
		const float* p2 = vertex_positions + vertex_stride_float * c;

		float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
		float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};

		// cross product of the two edges; its length is used to normalize and to detect degenerate triangles
		float normalx = p10[1] * p20[2] - p10[2] * p20[1];
		float normaly = p10[2] * p20[0] - p10[0] * p20[2];
		float normalz = p10[0] * p20[1] - p10[1] * p20[0];

		float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);

		// no need to include degenerate triangles - they will be invisible anyway
		if (area == 0.f)
			continue;

		// record triangle normals & corners for future use; normal and corner 0 define a plane equation
		normals[triangles][0] = normalx / area;
		normals[triangles][1] = normaly / area;
		normals[triangles][2] = normalz / area;
		memcpy(corners[triangles][0], p0, 3 * sizeof(float));
		memcpy(corners[triangles][1], p1, 3 * sizeof(float));
		memcpy(corners[triangles][2], p2, 3 * sizeof(float));
		triangles++;
	}

	meshopt_Bounds bounds = {};

	// degenerate cluster, no valid triangles => trivial reject (cone data is 0)
	if (triangles == 0)
		return bounds;

	const float rzero = 0.f;

	// compute cluster bounding sphere; we'll use the center to determine normal cone apex as well
	float psphere[4] = {};
	computeBoundingSphere(psphere, corners[0][0], triangles * 3, sizeof(float) * 3, &rzero, 0, 7);

	float center[3] = {psphere[0], psphere[1], psphere[2]};

	// treating triangle normals as points, find the bounding sphere - the sphere center determines the optimal cone axis
	float nsphere[4] = {};
	computeBoundingSphere(nsphere, normals[0], triangles, sizeof(float) * 3, &rzero, 0, 3);

	float axis[3] = {nsphere[0], nsphere[1], nsphere[2]};
	float axislength = sqrtf(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]);
	// a zero-length axis stays zero, which yields mindp = 0 and takes the degenerate cone path below
	float invaxislength = axislength == 0.f ? 0.f : 1.f / axislength;

	axis[0] *= invaxislength;
	axis[1] *= invaxislength;
	axis[2] *= invaxislength;

	// compute a tight cone around all normals, mindp = cos(angle/2)
	float mindp = 1.f;

	for (size_t i = 0; i < triangles; ++i)
	{
		float dp = normals[i][0] * axis[0] + normals[i][1] * axis[1] + normals[i][2] * axis[2];

		mindp = (dp < mindp) ? dp : mindp;
	}

	// fill bounding sphere info; note that below we can return bounds without cone information for degenerate cones
	bounds.center[0] = center[0];
	bounds.center[1] = center[1];
	bounds.center[2] = center[2];
	bounds.radius = psphere[3];

	// degenerate cluster, normal cone is larger than a hemisphere => trivial accept
	// note that if mindp is positive but close to 0, the triangle intersection code below gets less stable
	// we arbitrarily decide that if a normal cone is ~168 degrees wide or more, the cone isn't useful
	if (mindp <= 0.1f)
	{
		bounds.cone_cutoff = 1;
		bounds.cone_cutoff_s8 = 127;
		return bounds;
	}

	float maxt = 0;

	// we need to find the point on center-t*axis ray that lies in negative half-space of all triangles
	for (size_t i = 0; i < triangles; ++i)
	{
		// dot(center-t*axis-corner, trinormal) = 0
		// dot(center-corner, trinormal) - t * dot(axis, trinormal) = 0
		float cx = center[0] - corners[i][0][0];
		float cy = center[1] - corners[i][0][1];
		float cz = center[2] - corners[i][0][2];

		float dc = cx * normals[i][0] + cy * normals[i][1] + cz * normals[i][2];
		float dn = axis[0] * normals[i][0] + axis[1] * normals[i][1] + axis[2] * normals[i][2];

		// dn should be larger than mindp cutoff above
		assert(dn > 0.f);
		float t = dc / dn;

		maxt = (t > maxt) ? t : maxt;
	}

	// cone apex should be in the negative half-space of all cluster triangles by construction
	bounds.cone_apex[0] = center[0] - axis[0] * maxt;
	bounds.cone_apex[1] = center[1] - axis[1] * maxt;
	bounds.cone_apex[2] = center[2] - axis[2] * maxt;

	// note: this axis is the axis of the normal cone, but our test for perspective camera effectively negates the axis
	bounds.cone_axis[0] = axis[0];
	bounds.cone_axis[1] = axis[1];
	bounds.cone_axis[2] = axis[2];

	// cos(a) for normal cone is mindp; we need to add 90 degrees on both sides and invert the cone
	// which gives us -cos(a+90) = -(-sin(a)) = sin(a) = sqrt(1 - cos^2(a))
	bounds.cone_cutoff = sqrtf(1 - mindp * mindp);

	// quantize axis & cutoff to 8-bit SNORM format
	bounds.cone_axis_s8[0] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[0], 8));
	bounds.cone_axis_s8[1] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[1], 8));
	bounds.cone_axis_s8[2] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[2], 8));

	// for the 8-bit test to be conservative, we need to adjust the cutoff by measuring the max. error
	float cone_axis_s8_e0 = fabsf(bounds.cone_axis_s8[0] / 127.f - bounds.cone_axis[0]);
	float cone_axis_s8_e1 = fabsf(bounds.cone_axis_s8[1] / 127.f - bounds.cone_axis[1]);
	float cone_axis_s8_e2 = fabsf(bounds.cone_axis_s8[2] / 127.f - bounds.cone_axis[2]);

	// note that we need to round this up instead of rounding to nearest, hence +1
	int cone_cutoff_s8 = int(127 * (bounds.cone_cutoff + cone_axis_s8_e0 + cone_axis_s8_e1 + cone_axis_s8_e2) + 1);

	bounds.cone_cutoff_s8 = (cone_cutoff_s8 > 127) ? 127 : (signed char)(cone_cutoff_s8);

	return bounds;
}
1633
1634
// Computes bounds for a meshlet given its local index data.
// meshlet_triangles holds triangle_count * 3 local indices into meshlet_vertices, which in turn holds indices into
// the vertex buffer; the indices are resolved into a temporary buffer and passed to meshopt_computeClusterBounds.
// triangle_count must not exceed kMeshletMaxTriangles.
meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
	using namespace meshopt;

	assert(triangle_count <= kMeshletMaxTriangles);
	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
	assert(vertex_positions_stride % sizeof(float) == 0);

	unsigned int indices[kMeshletMaxTriangles * 3];

	for (size_t i = 0; i < triangle_count * 3; ++i)
	{
		// translate the meshlet-local index into a vertex buffer index
		unsigned int index = meshlet_vertices[meshlet_triangles[i]];
		assert(index < vertex_count);

		indices[i] = index;
	}

	return meshopt_computeClusterBounds(indices, triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride);
}
1654
1655
// Computes a bounding sphere for count input positions with optional per-element radii.
// positions_stride and radii_stride are in bytes and must be multiples of sizeof(float);
// when radii is NULL, every element is treated as having radius 0.
// Only center and radius of the result are filled; the remaining fields stay zero. Returns zeroed bounds for count == 0.
meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride)
{
	using namespace meshopt;

	assert(positions_stride >= 12 && positions_stride <= 256);
	assert(positions_stride % sizeof(float) == 0);
	assert((radii_stride >= 4 && radii_stride <= 256) || radii == NULL);
	assert(radii_stride % sizeof(float) == 0);

	meshopt_Bounds bounds = {};

	if (count == 0)
		return bounds;

	const float rzero = 0.f;

	// without radii, point every element at the same zero radius by using a stride of 0
	float psphere[4] = {};
	computeBoundingSphere(psphere, positions, count, positions_stride, radii ? radii : &rzero, radii ? radii_stride : 0, 7);

	bounds.center[0] = psphere[0];
	bounds.center[1] = psphere[1];
	bounds.center[2] = psphere[2];
	bounds.radius = psphere[3];

	return bounds;
}
1681
1682
void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count)
1683
{
1684
using namespace meshopt;
1685
1686
assert(triangle_count <= kMeshletMaxTriangles);
1687
assert(vertex_count <= kMeshletMaxVertices);
1688
1689
unsigned char* indices = meshlet_triangles;
1690
unsigned int* vertices = meshlet_vertices;
1691
1692
// cache tracks vertex timestamps (corresponding to triangle index! all 3 vertices are added at the same time and never removed)
1693
unsigned char cache[kMeshletMaxVertices];
1694
memset(cache, 0, vertex_count);
1695
1696
// note that we start from a value that means all vertices aren't in cache
1697
unsigned char cache_last = 128;
1698
const unsigned char cache_cutoff = 3; // 3 triangles = ~5..9 vertices depending on reuse
1699
1700
for (size_t i = 0; i < triangle_count; ++i)
1701
{
1702
int next = -1;
1703
int next_match = -1;
1704
1705
for (size_t j = i; j < triangle_count; ++j)
1706
{
1707
unsigned char a = indices[j * 3 + 0], b = indices[j * 3 + 1], c = indices[j * 3 + 2];
1708
assert(a < vertex_count && b < vertex_count && c < vertex_count);
1709
1710
// score each triangle by how many vertices are in cache
1711
// note: the distance is computed using unsigned 8-bit values, so cache timestamp overflow is handled gracefully
1712
int aok = (unsigned char)(cache_last - cache[a]) < cache_cutoff;
1713
int bok = (unsigned char)(cache_last - cache[b]) < cache_cutoff;
1714
int cok = (unsigned char)(cache_last - cache[c]) < cache_cutoff;
1715
1716
if (aok + bok + cok > next_match)
1717
{
1718
next = (int)j;
1719
next_match = aok + bok + cok;
1720
1721
// note that we could end up with all 3 vertices in the cache, but 2 is enough for ~strip traversal
1722
if (next_match >= 2)
1723
break;
1724
}
1725
}
1726
1727
assert(next >= 0);
1728
1729
unsigned char a = indices[next * 3 + 0], b = indices[next * 3 + 1], c = indices[next * 3 + 2];
1730
1731
// shift triangles before the next one forward so that we always keep an ordered partition
1732
// note: this could have swapped triangles [i] and [next] but that distorts the order and may skew the output sequence
1733
memmove(indices + (i + 1) * 3, indices + i * 3, (next - i) * 3 * sizeof(unsigned char));
1734
1735
indices[i * 3 + 0] = a;
1736
indices[i * 3 + 1] = b;
1737
indices[i * 3 + 2] = c;
1738
1739
// cache timestamp is the same between all vertices of each triangle to reduce overflow
1740
cache_last++;
1741
cache[a] = cache_last;
1742
cache[b] = cache_last;
1743
cache[c] = cache_last;
1744
}
1745
1746
// reorder meshlet vertices for access locality assuming index buffer is scanned sequentially
1747
unsigned int order[kMeshletMaxVertices];
1748
1749
short remap[kMeshletMaxVertices];
1750
memset(remap, -1, vertex_count * sizeof(short));
1751
1752
size_t vertex_offset = 0;
1753
1754
for (size_t i = 0; i < triangle_count * 3; ++i)
1755
{
1756
short& r = remap[indices[i]];
1757
1758
if (r < 0)
1759
{
1760
r = short(vertex_offset);
1761
order[vertex_offset] = vertices[indices[i]];
1762
vertex_offset++;
1763
}
1764
1765
indices[i] = (unsigned char)r;
1766
}
1767
1768
assert(vertex_offset <= vertex_count);
1769
memcpy(vertices, order, vertex_offset * sizeof(unsigned int));
1770
}
1771
1772
// SIMD_SSE may have been defined by the SIMD auto-detection block at the top of this file; drop it at end of file
// NOTE(review): presumably this keeps the macro from leaking into other sources built in the same translation unit - confirm
#undef SIMD_SSE
1773
// SIMD_NEON may have been defined by the SIMD auto-detection block at the top of this file; drop it at end of file
// NOTE(review): presumably this keeps the macro from leaking into other sources built in the same translation unit - confirm
#undef SIMD_NEON
1774
1775