Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/meshoptimizer/clusterizer.cpp
9903 views
1
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
2
#include "meshoptimizer.h"
3
4
#include <assert.h>
5
#include <float.h>
6
#include <math.h>
7
#include <string.h>
8
9
// This work is based on:
// Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016
// Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016
// Jack Ritter. An Efficient Bounding Sphere. 1990
// Thomas Larsson. Fast and Tight Fitting Bounding Spheres. 2008
// Ingo Wald, Vlastimil Havran. On building fast kd-Trees for Ray Tracing, and on doing that in O(N log N). 2006
15
namespace meshopt
16
{
17
18
// This must be <= 256 since meshlet indices are stored as bytes
const size_t kMeshletMaxVertices = 256;

// A reasonable limit is around 2*max_vertices or less
const size_t kMeshletMaxTriangles = 512;

// We keep a limited number of seed triangles and add a few triangles per finished meshlet
const size_t kMeshletMaxSeeds = 256;
const size_t kMeshletAddSeeds = 4;

// To avoid excessive recursion for malformed inputs, we limit the maximum depth of the tree
const int kMeshletMaxTreeDepth = 50;
30
31
// Vertex -> triangle adjacency in a flat layout.
// For vertex v, the incident triangle ids are data[offsets[v]] .. data[offsets[v] + counts[v] - 1].
struct TriangleAdjacency2
{
	unsigned int* counts;  // per-vertex number of index references
	unsigned int* offsets; // per-vertex start position inside data
	unsigned int* data;    // triangle ids grouped by vertex
};
37
38
// Builds vertex -> triangle adjacency for an indexed triangle list.
// counts/offsets are allocated with vertex_count entries and data with index_count entries, all from `allocator`.
// On return, triangles referencing vertex v are adjacency.data[adjacency.offsets[v]] .. [adjacency.offsets[v] + adjacency.counts[v] - 1].
// Every index must be < vertex_count; this is only checked via assert.
static void buildTriangleAdjacency(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
{
	size_t face_count = index_count / 3;

	// allocate arrays
	adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
	adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
	adjacency.data = allocator.allocate<unsigned int>(index_count);

	// fill triangle counts
	memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));

	for (size_t i = 0; i < index_count; ++i)
	{
		assert(indices[i] < vertex_count);

		adjacency.counts[indices[i]]++;
	}

	// fill offset table (exclusive prefix sum of counts)
	unsigned int offset = 0;

	for (size_t i = 0; i < vertex_count; ++i)
	{
		adjacency.offsets[i] = offset;
		offset += adjacency.counts[i];
	}

	assert(offset == index_count);

	// fill triangle data; offsets[] is temporarily used as a write cursor per vertex
	for (size_t i = 0; i < face_count; ++i)
	{
		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];

		adjacency.data[adjacency.offsets[a]++] = unsigned(i);
		adjacency.data[adjacency.offsets[b]++] = unsigned(i);
		adjacency.data[adjacency.offsets[c]++] = unsigned(i);
	}

	// fix offsets that have been disturbed by the previous pass
	for (size_t i = 0; i < vertex_count; ++i)
	{
		assert(adjacency.offsets[i] >= adjacency.counts[i]);
		adjacency.offsets[i] -= adjacency.counts[i];
	}
}
85
86
// Variant of buildTriangleAdjacency that only touches counts/offsets entries for vertices referenced by `indices`.
// Entries for unreferenced vertices are never written here, so callers must not read them.
// The top bit of counts[] is used as a temporary "visited" flag, which is why index_count must be < 2^31.
// Every index must be < vertex_count; this is only checked via assert.
static void buildTriangleAdjacencySparse(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
{
	size_t face_count = index_count / 3;

	// sparse mode can build adjacency more quickly by ignoring unused vertices, using a bit to mark visited vertices
	const unsigned int sparse_seen = 1u << 31;
	assert(index_count < sparse_seen);

	// allocate arrays
	adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
	adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
	adjacency.data = allocator.allocate<unsigned int>(index_count);

	// fill triangle counts
	for (size_t i = 0; i < index_count; ++i)
		assert(indices[i] < vertex_count);

	// clear only the counts that will be used
	for (size_t i = 0; i < index_count; ++i)
		adjacency.counts[indices[i]] = 0;

	for (size_t i = 0; i < index_count; ++i)
		adjacency.counts[indices[i]]++;

	// fill offset table; uses sparse_seen bit to tag visited vertices
	unsigned int offset = 0;

	for (size_t i = 0; i < index_count; ++i)
	{
		unsigned int v = indices[i];

		if ((adjacency.counts[v] & sparse_seen) == 0)
		{
			adjacency.offsets[v] = offset;
			offset += adjacency.counts[v];
			adjacency.counts[v] |= sparse_seen;
		}
	}

	assert(offset == index_count);

	// fill triangle data; offsets[] is temporarily used as a write cursor per vertex
	for (size_t i = 0; i < face_count; ++i)
	{
		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];

		adjacency.data[adjacency.offsets[a]++] = unsigned(i);
		adjacency.data[adjacency.offsets[b]++] = unsigned(i);
		adjacency.data[adjacency.offsets[c]++] = unsigned(i);
	}

	// fix offsets that have been disturbed by the previous pass
	// also fix counts (that were marked with sparse_seen by the first pass)
	for (size_t i = 0; i < index_count; ++i)
	{
		unsigned int v = indices[i];

		if (adjacency.counts[v] & sparse_seen)
		{
			adjacency.counts[v] &= ~sparse_seen;

			assert(adjacency.offsets[v] >= adjacency.counts[v]);
			adjacency.offsets[v] -= adjacency.counts[v];
		}
	}
}
151
152
// Computes a sphere that encloses `count` input spheres (point + radius) and writes it as result = {cx, cy, cz, radius}.
// points_stride and radii_stride are in bytes (both are divided by sizeof(float) before indexing).
// axis_count selects how many of the 7 candidate axes are used to pick the initial extremal pair; it must be <= 7.
// count must be > 0 (assert only).
static void computeBoundingSphere(float result[4], const float* points, size_t count, size_t points_stride, const float* radii, size_t radii_stride, size_t axis_count)
{
	static const float kAxes[7][3] = {
	    // X, Y, Z
	    {1, 0, 0},
	    {0, 1, 0},
	    {0, 0, 1},

	    // XYZ, -XYZ, X-YZ, XY-Z; normalized to unit length
	    {0.57735026f, 0.57735026f, 0.57735026f},
	    {-0.57735026f, 0.57735026f, 0.57735026f},
	    {0.57735026f, -0.57735026f, 0.57735026f},
	    {0.57735026f, 0.57735026f, -0.57735026f},
	};

	assert(count > 0);
	assert(axis_count <= sizeof(kAxes) / sizeof(kAxes[0]));

	size_t points_stride_float = points_stride / sizeof(float);
	size_t radii_stride_float = radii_stride / sizeof(float);

	// find extremum points along all axes; for each axis we get a pair of points with min/max coordinates
	size_t pmin[7], pmax[7];
	float tmin[7], tmax[7];

	for (size_t axis = 0; axis < axis_count; ++axis)
	{
		pmin[axis] = pmax[axis] = 0;
		tmin[axis] = FLT_MAX;
		tmax[axis] = -FLT_MAX;
	}

	for (size_t i = 0; i < count; ++i)
	{
		const float* p = points + i * points_stride_float;
		float r = radii[i * radii_stride_float];

		for (size_t axis = 0; axis < axis_count; ++axis)
		{
			const float* ax = kAxes[axis];

			// projected extent of sphere i along the axis is [tp - r, tp + r]
			float tp = ax[0] * p[0] + ax[1] * p[1] + ax[2] * p[2];
			float tpmin = tp - r, tpmax = tp + r;

			pmin[axis] = (tpmin < tmin[axis]) ? i : pmin[axis];
			pmax[axis] = (tpmax > tmax[axis]) ? i : pmax[axis];
			tmin[axis] = (tpmin < tmin[axis]) ? tpmin : tmin[axis];
			tmax[axis] = (tpmax > tmax[axis]) ? tpmax : tmax[axis];
		}
	}

	// find the pair of points with largest distance
	size_t paxis = 0;
	float paxisdr = 0;

	for (size_t axis = 0; axis < axis_count; ++axis)
	{
		const float* p1 = points + pmin[axis] * points_stride_float;
		const float* p2 = points + pmax[axis] * points_stride_float;
		float r1 = radii[pmin[axis] * radii_stride_float];
		float r2 = radii[pmax[axis] * radii_stride_float];

		float d2 = (p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]);
		float dr = sqrtf(d2) + r1 + r2;

		if (dr > paxisdr)
		{
			paxisdr = dr;
			paxis = axis;
		}
	}

	// use the longest segment as the initial sphere diameter
	const float* p1 = points + pmin[paxis] * points_stride_float;
	const float* p2 = points + pmax[paxis] * points_stride_float;
	float r1 = radii[pmin[paxis] * radii_stride_float];
	float r2 = radii[pmax[paxis] * radii_stride_float];

	float paxisd = sqrtf((p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]));
	// interpolation factor for the center between p1 and p2, shifted by the radius difference; 0 when the points coincide
	float paxisk = paxisd > 0 ? (paxisd + r2 - r1) / (2 * paxisd) : 0.f;

	float center[3] = {p1[0] + (p2[0] - p1[0]) * paxisk, p1[1] + (p2[1] - p1[1]) * paxisk, p1[2] + (p2[2] - p1[2]) * paxisk};
	float radius = paxisdr / 2;

	// iteratively adjust the sphere up until all points fit
	for (size_t i = 0; i < count; ++i)
	{
		const float* p = points + i * points_stride_float;
		float r = radii[i * radii_stride_float];

		float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]);
		float d = sqrtf(d2);

		if (d + r > radius)
		{
			// move the center towards the outlier and grow the radius to the average of old radius and required reach
			float k = d > 0 ? (d + r - radius) / (2 * d) : 0.f;

			center[0] += k * (p[0] - center[0]);
			center[1] += k * (p[1] - center[1]);
			center[2] += k * (p[2] - center[2]);
			radius = (radius + d + r) / 2;
		}
	}

	result[0] = center[0];
	result[1] = center[1];
	result[2] = center[2];
	result[3] = radius;
}
261
262
// Position + direction pair used for triangle/meshlet scoring.
// computeTriangleCones stores the triangle centroid in p* and the normalized triangle normal in n*.
struct Cone
{
	float px, py, pz;
	float nx, ny, nz;
};
267
268
// Returns the length of (dx, dy, dz).
// When aa is false this is the Euclidean length; when aa is true it is the largest absolute component.
static float getDistance(float dx, float dy, float dz, bool aa)
{
	if (!aa)
		return sqrtf(dx * dx + dy * dy + dz * dz);

	float rx = fabsf(dx), ry = fabsf(dy), rz = fabsf(dz);
	float rxy = rx > ry ? rx : ry;
	return rxy > rz ? rxy : rz;
}
277
278
// Scores a candidate triangle for a meshlet; getNeighborTriangle picks the candidate with the smallest score.
// distance is normalized by expected_radius; spread is the dot product of the triangle and meshlet normals.
// A negative cone_weight ignores the spread entirely and scores by distance only.
static float getMeshletScore(float distance, float spread, float cone_weight, float expected_radius)
{
	if (cone_weight < 0)
		return 1 + distance / expected_radius;

	float cone = 1.f - spread * cone_weight;
	// clamp so that the score stays positive when normals are fully aligned and cone_weight is 1
	float cone_clamped = cone < 1e-3f ? 1e-3f : cone;

	return (1 + distance / expected_radius * (1 - cone_weight)) * cone_clamped;
}
288
289
// Converts an accumulated cone (sums over triangle_count triangles) into an average:
// the position is divided by triangle_count and the direction is normalized to unit length.
// A zero triangle_count yields a zero position; a zero-length direction stays zero.
static Cone getMeshletCone(const Cone& acc, unsigned int triangle_count)
{
	Cone result = acc;

	float center_scale = triangle_count == 0 ? 0.f : 1.f / float(triangle_count);

	result.px *= center_scale;
	result.py *= center_scale;
	result.pz *= center_scale;

	// axis_length holds the squared length; the square root is taken only when it is non-zero
	float axis_length = result.nx * result.nx + result.ny * result.ny + result.nz * result.nz;
	float axis_scale = axis_length == 0.f ? 0.f : 1.f / sqrtf(axis_length);

	result.nx *= axis_scale;
	result.ny *= axis_scale;
	result.nz *= axis_scale;

	return result;
}
308
309
// Fills triangles[i] with the centroid and normalized normal of each triangle in `indices`.
// Degenerate triangles (zero-length cross product) get a zero normal.
// vertex_positions_stride is in bytes; vertex_count is only used for assert-based index validation.
// Returns the sum of per-triangle cross-product magnitudes (i.e. twice the geometric surface area).
static float computeTriangleCones(Cone* triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
	(void)vertex_count;

	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
	size_t face_count = index_count / 3;

	float mesh_area = 0;

	for (size_t i = 0; i < face_count; ++i)
	{
		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
		assert(a < vertex_count && b < vertex_count && c < vertex_count);

		const float* p0 = vertex_positions + vertex_stride_float * a;
		const float* p1 = vertex_positions + vertex_stride_float * b;
		const float* p2 = vertex_positions + vertex_stride_float * c;

		float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
		float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};

		// cross(p10, p20)
		float normalx = p10[1] * p20[2] - p10[2] * p20[1];
		float normaly = p10[2] * p20[0] - p10[0] * p20[2];
		float normalz = p10[0] * p20[1] - p10[1] * p20[0];

		float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);
		float invarea = (area == 0.f) ? 0.f : 1.f / area;

		triangles[i].px = (p0[0] + p1[0] + p2[0]) / 3.f;
		triangles[i].py = (p0[1] + p1[1] + p2[1]) / 3.f;
		triangles[i].pz = (p0[2] + p1[2] + p2[2]) / 3.f;

		triangles[i].nx = normalx * invarea;
		triangles[i].ny = normaly * invarea;
		triangles[i].nz = normalz * invarea;

		mesh_area += area;
	}

	return mesh_area;
}
350
351
// Zero-fills the bytes after the meshlet's last triangle up to the next multiple of 4 in meshlet_triangles.
static void finishMeshlet(meshopt_Meshlet& meshlet, unsigned char* meshlet_triangles)
{
	size_t offset = meshlet.triangle_offset + meshlet.triangle_count * 3;

	// fill 4b padding with 0
	while (offset & 3)
		meshlet_triangles[offset++] = 0;
}
359
360
// Adds triangle (a, b, c) to the meshlet under construction.
// used[v] holds the meshlet-local index of vertex v, or a negative value if v is not part of the current meshlet.
// If the triangle does not fit within max_vertices/max_triangles, or split is set, the current meshlet is first
// written to meshlets[meshlet_offset], its used[] entries are reset to -1, and `meshlet` is restarted as an empty
// meshlet placed right after it (triangle offset rounded up to 4 bytes).
// Returns true if the previous meshlet was flushed this way, false otherwise.
static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, short* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles, bool split = false)
{
	short& av = used[a];
	short& bv = used[b];
	short& cv = used[c];

	bool result = false;

	// number of vertices this triangle would add to the meshlet
	int used_extra = (av < 0) + (bv < 0) + (cv < 0);

	if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles || split)
	{
		meshlets[meshlet_offset] = meshlet;

		for (size_t j = 0; j < meshlet.vertex_count; ++j)
			used[meshlet_vertices[meshlet.vertex_offset + j]] = -1;

		finishMeshlet(meshlet, meshlet_triangles);

		meshlet.vertex_offset += meshlet.vertex_count;
		meshlet.triangle_offset += (meshlet.triangle_count * 3 + 3) & ~3; // 4b padding
		meshlet.vertex_count = 0;
		meshlet.triangle_count = 0;

		result = true;
	}

	// av/bv/cv are references into used[], so the reset above is visible here
	if (av < 0)
	{
		av = short(meshlet.vertex_count);
		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = a;
	}

	if (bv < 0)
	{
		bv = short(meshlet.vertex_count);
		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = b;
	}

	if (cv < 0)
	{
		cv = short(meshlet.vertex_count);
		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = c;
	}

	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 0] = (unsigned char)av;
	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 1] = (unsigned char)bv;
	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 2] = (unsigned char)cv;
	meshlet.triangle_count++;

	return result;
}
412
413
static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone& meshlet_cone, const unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const short* used, float meshlet_expected_radius, float cone_weight)
414
{
415
unsigned int best_triangle = ~0u;
416
int best_priority = 5;
417
float best_score = FLT_MAX;
418
419
for (size_t i = 0; i < meshlet.vertex_count; ++i)
420
{
421
unsigned int index = meshlet_vertices[meshlet.vertex_offset + i];
422
423
unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
424
size_t neighbors_size = adjacency.counts[index];
425
426
for (size_t j = 0; j < neighbors_size; ++j)
427
{
428
unsigned int triangle = neighbors[j];
429
unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
430
431
int extra = (used[a] < 0) + (used[b] < 0) + (used[c] < 0);
432
assert(extra <= 2);
433
434
int priority = -1;
435
436
// triangles that don't add new vertices to meshlets are max. priority
437
if (extra == 0)
438
priority = 0;
439
// artificially increase the priority of dangling triangles as they're expensive to add to new meshlets
440
else if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1)
441
priority = 1;
442
// if two vertices have live count of 2, removing this triangle will make another triangle dangling which is good for overall flow
443
else if ((live_triangles[a] == 2) + (live_triangles[b] == 2) + (live_triangles[c] == 2) >= 2)
444
priority = 1 + extra;
445
// otherwise adjust priority to be after the above cases, 3 or 4 based on used[] count
446
else
447
priority = 2 + extra;
448
449
// since topology-based priority is always more important than the score, we can skip scoring in some cases
450
if (priority > best_priority)
451
continue;
452
453
const Cone& tri_cone = triangles[triangle];
454
455
float dx = tri_cone.px - meshlet_cone.px, dy = tri_cone.py - meshlet_cone.py, dz = tri_cone.pz - meshlet_cone.pz;
456
float distance = getDistance(dx, dy, dz, cone_weight < 0);
457
float spread = tri_cone.nx * meshlet_cone.nx + tri_cone.ny * meshlet_cone.ny + tri_cone.nz * meshlet_cone.nz;
458
459
float score = getMeshletScore(distance, spread, cone_weight, meshlet_expected_radius);
460
461
// note that topology-based priority is always more important than the score
462
// this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost
463
if (priority < best_priority || score < best_score)
464
{
465
best_triangle = triangle;
466
best_priority = priority;
467
best_score = score;
468
}
469
}
470
}
471
472
return best_triangle;
473
}
474
475
static size_t appendSeedTriangles(unsigned int* seeds, const meshopt_Meshlet& meshlet, const unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, float cornerx, float cornery, float cornerz)
476
{
477
unsigned int best_seeds[kMeshletAddSeeds];
478
unsigned int best_live[kMeshletAddSeeds];
479
float best_score[kMeshletAddSeeds];
480
481
for (size_t i = 0; i < kMeshletAddSeeds; ++i)
482
{
483
best_seeds[i] = ~0u;
484
best_live[i] = ~0u;
485
best_score[i] = FLT_MAX;
486
}
487
488
for (size_t i = 0; i < meshlet.vertex_count; ++i)
489
{
490
unsigned int index = meshlet_vertices[meshlet.vertex_offset + i];
491
492
unsigned int best_neighbor = ~0u;
493
unsigned int best_neighbor_live = ~0u;
494
495
// find the neighbor with the smallest live metric
496
unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
497
size_t neighbors_size = adjacency.counts[index];
498
499
for (size_t j = 0; j < neighbors_size; ++j)
500
{
501
unsigned int triangle = neighbors[j];
502
unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
503
504
unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c];
505
506
if (live < best_neighbor_live)
507
{
508
best_neighbor = triangle;
509
best_neighbor_live = live;
510
}
511
}
512
513
// add the neighbor to the list of seeds; the list is unsorted and the replacement criteria is approximate
514
if (best_neighbor == ~0u)
515
continue;
516
517
float best_neighbor_score = getDistance(triangles[best_neighbor].px - cornerx, triangles[best_neighbor].py - cornery, triangles[best_neighbor].pz - cornerz, false);
518
519
for (size_t j = 0; j < kMeshletAddSeeds; ++j)
520
{
521
// non-strict comparison reduces the number of duplicate seeds (triangles adjacent to multiple vertices)
522
if (best_neighbor_live < best_live[j] || (best_neighbor_live == best_live[j] && best_neighbor_score <= best_score[j]))
523
{
524
best_seeds[j] = best_neighbor;
525
best_live[j] = best_neighbor_live;
526
best_score[j] = best_neighbor_score;
527
break;
528
}
529
}
530
}
531
532
// add surviving seeds to the meshlet
533
size_t seed_count = 0;
534
535
for (size_t i = 0; i < kMeshletAddSeeds; ++i)
536
if (best_seeds[i] != ~0u)
537
seeds[seed_count++] = best_seeds[i];
538
539
return seed_count;
540
}
541
542
// Compacts seeds[] in place, keeping only entries whose emitted_flags[] value is 0 (relative order is preserved).
// Returns the number of seeds kept; entries past the returned count are unspecified leftovers.
static size_t pruneSeedTriangles(unsigned int* seeds, size_t seed_count, const unsigned char* emitted_flags)
{
	size_t result = 0;

	for (size_t i = 0; i < seed_count; ++i)
	{
		unsigned int index = seeds[i];

		// write unconditionally and only advance the output cursor when the seed is kept (branchless filter)
		seeds[result] = index;
		result += emitted_flags[index] == 0;
	}

	return result;
}
556
557
static unsigned int selectSeedTriangle(const unsigned int* seeds, size_t seed_count, const unsigned int* indices, const Cone* triangles, const unsigned int* live_triangles, float cornerx, float cornery, float cornerz)
558
{
559
unsigned int best_seed = ~0u;
560
unsigned int best_live = ~0u;
561
float best_score = FLT_MAX;
562
563
for (size_t i = 0; i < seed_count; ++i)
564
{
565
unsigned int index = seeds[i];
566
unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
567
568
unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c];
569
float score = getDistance(triangles[index].px - cornerx, triangles[index].py - cornery, triangles[index].pz - cornerz, false);
570
571
if (live < best_live || (live == best_live && score < best_score))
572
{
573
best_seed = index;
574
best_live = live;
575
best_score = score;
576
}
577
}
578
579
return best_seed;
580
}
581
582
// Node of a KD tree stored in a flat array; branches carry a split plane, leaves carry a point index.
struct KDNode
{
	union
	{
		float split;        // branches: split position along `axis`
		unsigned int index; // leaves: point index
	};

	// leaves: axis = 3, children = number of extra points after this one (0 if 'index' is the only point)
	// branches: axis != 3, left subtree = skip 1, right subtree = skip 1+children
	unsigned int axis : 2;
	unsigned int children : 30;
};
595
596
// Partitions indices[0..count) in place so that points whose `axis` coordinate is < pivot come first.
// stride is in floats (points are read as points[index * stride + axis]).
// Returns the number of points that are < pivot.
static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, unsigned int axis, float pivot)
{
	size_t m = 0;

	// invariant: elements in range [0, m) are < pivot, elements in range [m, i) are >= pivot
	for (size_t i = 0; i < count; ++i)
	{
		float v = points[indices[i] * stride + axis];

		// swap(m, i) unconditionally
		unsigned int t = indices[m];
		indices[m] = indices[i];
		indices[i] = t;

		// when v >= pivot, we swap i with m without advancing it, preserving invariants
		m += v < pivot;
	}

	return m;
}
616
617
static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, unsigned int* indices, size_t count)
618
{
619
assert(offset + count <= node_count);
620
(void)node_count;
621
622
KDNode& result = nodes[offset];
623
624
result.index = indices[0];
625
result.axis = 3;
626
result.children = unsigned(count - 1);
627
628
// all remaining points are stored in nodes immediately following the leaf
629
for (size_t i = 1; i < count; ++i)
630
{
631
KDNode& tail = nodes[offset + i];
632
633
tail.index = indices[i];
634
tail.axis = 3;
635
tail.children = ~0u >> 2; // bogus value to prevent misuse
636
}
637
638
return offset + count;
639
}
640
641
static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size)
642
{
643
assert(count > 0);
644
assert(offset < node_count);
645
646
if (count <= leaf_size)
647
return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);
648
649
float mean[3] = {};
650
float vars[3] = {};
651
float runc = 1, runs = 1;
652
653
// gather statistics on the points in the subtree using Welford's algorithm
654
for (size_t i = 0; i < count; ++i, runc += 1.f, runs = 1.f / runc)
655
{
656
const float* point = points + indices[i] * stride;
657
658
for (int k = 0; k < 3; ++k)
659
{
660
float delta = point[k] - mean[k];
661
mean[k] += delta * runs;
662
vars[k] += delta * (point[k] - mean[k]);
663
}
664
}
665
666
// split axis is one where the variance is largest
667
unsigned int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2);
668
669
float split = mean[axis];
670
size_t middle = kdtreePartition(indices, count, points, stride, axis, split);
671
672
// when the partition is degenerate simply consolidate the points into a single node
673
if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2)
674
return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);
675
676
KDNode& result = nodes[offset];
677
678
result.split = split;
679
result.axis = axis;
680
681
// left subtree is right after our node
682
size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size);
683
684
// distance to the right subtree is represented explicitly
685
result.children = unsigned(next_offset - offset - 1);
686
687
return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size);
688
}
689
690
// Searches the subtree at nodes[root] for the point closest to `position`, skipping points with a non-zero emitted_flags[] entry.
// result/limit are in-out: they are only updated when a point strictly closer than the current limit is found.
// Distances use getDistance with the given aa flag; stride is in floats.
static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, bool aa, unsigned int& result, float& limit)
{
	const KDNode& node = nodes[root];

	if (node.axis == 3)
	{
		// leaf
		for (unsigned int i = 0; i <= node.children; ++i)
		{
			unsigned int index = nodes[root + i].index;

			if (emitted_flags[index])
				continue;

			const float* point = points + index * stride;

			float dx = point[0] - position[0], dy = point[1] - position[1], dz = point[2] - position[2];
			float distance = getDistance(dx, dy, dz, aa);

			if (distance < limit)
			{
				result = index;
				limit = distance;
			}
		}
	}
	else
	{
		// branch; we order recursion to process the node that search position is in first
		float delta = position[node.axis] - node.split;
		unsigned int first = (delta <= 0) ? 0 : node.children;
		// first is either 0 or children, so the xor yields the other one
		unsigned int second = first ^ node.children;

		kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, aa, result, limit);

		// only process the other node if it can have a match based on closest distance so far
		if (fabsf(delta) <= limit)
			kdtreeNearest(nodes, root + 1 + second, points, stride, emitted_flags, position, aa, result, limit);
	}
}
730
731
// Axis-aligned bounding box given by its per-axis minimum and maximum.
struct BVHBox
{
	float min[3];
	float max[3];
};
736
737
// Grows `box` in place so that it also contains `other`.
static void boxMerge(BVHBox& box, const BVHBox& other)
{
	for (int k = 0; k < 3; ++k)
	{
		box.min[k] = other.min[k] < box.min[k] ? other.min[k] : box.min[k];
		box.max[k] = other.max[k] > box.max[k] ? other.max[k] : box.max[k];
	}
}
745
746
inline float boxSurface(const BVHBox& box)
747
{
748
float sx = box.max[0] - box.min[0], sy = box.max[1] - box.min[1], sz = box.max[2] - box.min[2];
749
return sx * sy + sx * sz + sy * sz;
750
}
751
752
// Remaps the bit pattern of a 32-bit float so that unsigned integer comparison of the results
// orders values the same way as the original floats (negative values before positive ones).
inline unsigned int radixFloat(unsigned int v)
{
	// if sign bit is 0, flip sign bit
	// if sign bit is 1, flip everything
	// relies on arithmetic right shift of a negative int replicating the sign bit
	unsigned int mask = (int(v) >> 31) | 0x80000000;
	return v ^ mask;
}
759
760
static void computeHistogram(unsigned int (&hist)[1024][3], const float* data, size_t count)
761
{
762
memset(hist, 0, sizeof(hist));
763
764
const unsigned int* bits = reinterpret_cast<const unsigned int*>(data);
765
766
// compute 3 10-bit histograms in parallel (dropping 2 LSB)
767
for (size_t i = 0; i < count; ++i)
768
{
769
unsigned int id = radixFloat(bits[i]);
770
771
hist[(id >> 2) & 1023][0]++;
772
hist[(id >> 12) & 1023][1]++;
773
hist[(id >> 22) & 1023][2]++;
774
}
775
776
unsigned int sum0 = 0, sum1 = 0, sum2 = 0;
777
778
// replace histogram data with prefix histogram sums in-place
779
for (int i = 0; i < 1024; ++i)
780
{
781
unsigned int hx = hist[i][0], hy = hist[i][1], hz = hist[i][2];
782
783
hist[i][0] = sum0;
784
hist[i][1] = sum1;
785
hist[i][2] = sum2;
786
787
sum0 += hx;
788
sum1 += hy;
789
sum2 += hz;
790
}
791
792
assert(sum0 == count && sum1 == count && sum2 == count);
793
}
794
795
static void radixPass(unsigned int* destination, const unsigned int* source, const float* keys, size_t count, unsigned int (&hist)[1024][3], int pass)
796
{
797
const unsigned int* bits = reinterpret_cast<const unsigned int*>(keys);
798
int bitoff = pass * 10 + 2; // drop 2 LSB to be able to use 3 10-bit passes
799
800
for (size_t i = 0; i < count; ++i)
801
{
802
unsigned int id = (radixFloat(bits[source[i]]) >> bitoff) & 1023;
803
804
destination[hist[id][pass]++] = source[i];
805
}
806
}
807
808
// Computes the bounding box of every triangle and the center of that box.
// Centers are stored per axis: centroids[i + face_count * k] is the axis-k center of triangle i,
// so centroids must have room for 3 * face_count floats.
// vertex_stride_float is in floats; vertex_count is only used for assert-based index validation.
static void bvhPrepare(BVHBox* boxes, float* centroids, const unsigned int* indices, size_t face_count, const float* vertex_positions, size_t vertex_count, size_t vertex_stride_float)
{
	(void)vertex_count;

	for (size_t i = 0; i < face_count; ++i)
	{
		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
		assert(a < vertex_count && b < vertex_count && c < vertex_count);

		const float* va = vertex_positions + vertex_stride_float * a;
		const float* vb = vertex_positions + vertex_stride_float * b;
		const float* vc = vertex_positions + vertex_stride_float * c;

		BVHBox& box = boxes[i];

		for (int k = 0; k < 3; ++k)
		{
			box.min[k] = va[k] < vb[k] ? va[k] : vb[k];
			box.min[k] = vc[k] < box.min[k] ? vc[k] : box.min[k];

			box.max[k] = va[k] > vb[k] ? va[k] : vb[k];
			box.max[k] = vc[k] > box.max[k] ? vc[k] : box.max[k];

			centroids[i + face_count * k] = (box.min[k] + box.max[k]) / 2.f;
		}
	}
}
835
836
// Checks whether the `count` triangles listed in order[] reference at most max_vertices unique vertices.
// If so, marks them as one meshlet (boundary[0] = 1, boundary[1..count) = 0) and returns true;
// otherwise returns false and leaves boundary[] untouched.
// used[] is scratch: entries for the referenced vertices must be negative on entry and are reset to -1 before returning.
// count must be > 0.
static bool bvhPackLeaf(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices)
{
	// count number of unique vertices
	size_t used_vertices = 0;
	for (size_t i = 0; i < count; ++i)
	{
		unsigned int index = order[i];
		unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];

		used_vertices += (used[a] < 0) + (used[b] < 0) + (used[c] < 0);
		used[a] = used[b] = used[c] = 1;
	}

	// reset used[] for future invocations
	for (size_t i = 0; i < count; ++i)
	{
		unsigned int index = order[i];
		unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];

		used[a] = used[b] = used[c] = -1;
	}

	if (used_vertices > max_vertices)
		return false;

	// mark meshlet boundary for future reassembly
	assert(count > 0);

	boundary[0] = 1;
	memset(boundary + 1, 0, count - 1);

	return true;
}
869
870
static void bvhPackTail(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices, size_t max_triangles)
871
{
872
for (size_t i = 0; i < count;)
873
{
874
size_t chunk = i + max_triangles <= count ? max_triangles : count - i;
875
876
if (bvhPackLeaf(boundary + i, order + i, chunk, used, indices, max_vertices))
877
{
878
i += chunk;
879
continue;
880
}
881
882
// chunk is vertex bound, split it into smaller meshlets
883
assert(chunk > max_vertices / 3);
884
885
bvhPackLeaf(boundary + i, order + i, max_vertices / 3, used, indices, max_vertices);
886
i += max_vertices / 3;
887
}
888
}
889
890
// Returns true if `count` can be written as a sum of one or more values that each lie in [min..max].
static bool bvhDivisible(size_t count, size_t min, size_t max)
{
	// count is representable as a sum of k values in [min..max] if it is in range of [k*min..k*min+k*(max-min)]
	// equivalent to ceil(count / max) <= floor(count / min), but the form below allows using idiv (see nv_cluster_builder)
	// we avoid expensive integer divisions in the common case where min is <= max/2
	return min * 2 <= max ? count >= min : count % min <= (count / min) * (max - min);
}
897
898
static size_t bvhPivot(const BVHBox* boxes, const unsigned int* order, size_t count, void* scratch, size_t step, size_t min, size_t max, float fill, float* out_cost)
899
{
900
BVHBox accuml = boxes[order[0]], accumr = boxes[order[count - 1]];
901
float* costs = static_cast<float*>(scratch);
902
903
// accumulate SAH cost in forward and backward directions
904
for (size_t i = 0; i < count; ++i)
905
{
906
boxMerge(accuml, boxes[order[i]]);
907
boxMerge(accumr, boxes[order[count - 1 - i]]);
908
909
costs[i] = boxSurface(accuml);
910
costs[i + count] = boxSurface(accumr);
911
}
912
913
bool aligned = count >= min * 2 && bvhDivisible(count, min, max);
914
size_t end = aligned ? count - min : count - 1;
915
916
float rmaxf = 1.f / float(int(max));
917
918
// find best split that minimizes SAH
919
size_t bestsplit = 0;
920
float bestcost = FLT_MAX;
921
922
for (size_t i = min - 1; i < end; i += step)
923
{
924
size_t lsplit = i + 1, rsplit = count - (i + 1);
925
926
if (!bvhDivisible(lsplit, min, max))
927
continue;
928
if (aligned && !bvhDivisible(rsplit, min, max))
929
continue;
930
931
// costs[x] = inclusive surface area of boxes[0..x]
932
// costs[count-1-x] = inclusive surface area of boxes[x..count-1]
933
float larea = costs[i], rarea = costs[(count - 1 - (i + 1)) + count];
934
float cost = larea * float(int(lsplit)) + rarea * float(int(rsplit));
935
936
if (cost > bestcost)
937
continue;
938
939
// fill cost; use floating point math to avoid expensive integer modulo
940
int lrest = int(float(int(lsplit + max - 1)) * rmaxf) * int(max) - int(lsplit);
941
int rrest = int(float(int(rsplit + max - 1)) * rmaxf) * int(max) - int(rsplit);
942
943
cost += fill * (float(lrest) * larea + float(rrest) * rarea);
944
945
if (cost < bestcost)
946
{
947
bestcost = cost;
948
bestsplit = i + 1;
949
}
950
}
951
952
*out_cost = bestcost;
953
return bestsplit;
954
}
955
956
// Distributes the elements of order[0..count) into target, preserving their relative order:
// elements with sides[element] == 0 go to target[0..split), elements with sides[element] == 1 go to target[split..count).
static void bvhPartition(unsigned int* target, const unsigned int* order, const unsigned char* sides, size_t split, size_t count)
{
	// write cursors for the two output ranges
	size_t lpos = 0;
	size_t rpos = split;

	for (const unsigned int* it = order; it != order + count; ++it)
	{
		unsigned int id = *it;
		unsigned char side = sides[id];

		target[side ? rpos : lpos] = id;

		// branchless cursor update: with side being 0 or 1, exactly one of the cursors advances
		rpos += side;
		lpos += 1;
		lpos -= side;
	}

	assert(lpos == split && rpos == count);
}
971
972
static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, unsigned char* boundary, size_t count, int depth, void* scratch, short* used, const unsigned int* indices, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
973
{
974
if (depth >= kMeshletMaxTreeDepth)
975
return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles);
976
977
if (count <= max_triangles && bvhPackLeaf(boundary, orderx, count, used, indices, max_vertices))
978
return;
979
980
unsigned int* axes[3] = {orderx, ordery, orderz};
981
982
// we can use step=1 unconditionally but to reduce the cost for min=max case we use step=max
983
size_t step = min_triangles == max_triangles && count > max_triangles ? max_triangles : 1;
984
985
// if we could not pack the meshlet, we must be vertex bound
986
size_t mint = count <= max_triangles && max_vertices / 3 < min_triangles ? max_vertices / 3 : min_triangles;
987
988
// only use fill weight if we are optimizing for triangle count
989
float fill = count <= max_triangles ? 0.f : fill_weight;
990
991
// find best split that minimizes SAH
992
int bestk = -1;
993
size_t bestsplit = 0;
994
float bestcost = FLT_MAX;
995
996
for (int k = 0; k < 3; ++k)
997
{
998
float axiscost = FLT_MAX;
999
size_t axissplit = bvhPivot(boxes, axes[k], count, scratch, step, mint, max_triangles, fill, &axiscost);
1000
1001
if (axissplit && axiscost < bestcost)
1002
{
1003
bestk = k;
1004
bestcost = axiscost;
1005
bestsplit = axissplit;
1006
}
1007
}
1008
1009
// this may happen if SAH costs along the admissible splits are NaN
1010
if (bestk < 0)
1011
return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles);
1012
1013
// mark sides of split for partitioning
1014
unsigned char* sides = static_cast<unsigned char*>(scratch) + count * sizeof(unsigned int);
1015
1016
for (size_t i = 0; i < bestsplit; ++i)
1017
sides[axes[bestk][i]] = 0;
1018
1019
for (size_t i = bestsplit; i < count; ++i)
1020
sides[axes[bestk][i]] = 1;
1021
1022
// partition all axes into two sides, maintaining order
1023
unsigned int* temp = static_cast<unsigned int*>(scratch);
1024
1025
for (int k = 0; k < 3; ++k)
1026
{
1027
if (k == bestk)
1028
continue;
1029
1030
unsigned int* axis = axes[k];
1031
memcpy(temp, axis, sizeof(unsigned int) * count);
1032
bvhPartition(axis, temp, sides, bestsplit, count);
1033
}
1034
1035
bvhSplit(boxes, orderx, ordery, orderz, boundary, bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
1036
bvhSplit(boxes, orderx + bestsplit, ordery + bestsplit, orderz + bestsplit, boundary + bestsplit, count - bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
1037
}
1038
1039
} // namespace meshopt
1040
1041
size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles)
1042
{
1043
using namespace meshopt;
1044
1045
assert(index_count % 3 == 0);
1046
assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
1047
assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
1048
assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
1049
1050
(void)kMeshletMaxVertices;
1051
(void)kMeshletMaxTriangles;
1052
1053
// meshlet construction is limited by max vertices and max triangles per meshlet
1054
// the worst case is that the input is an unindexed stream since this equally stresses both limits
1055
// note that we assume that in the worst case, we leave 2 vertices unpacked in each meshlet - if we have space for 3 we can pack any triangle
1056
size_t max_vertices_conservative = max_vertices - 2;
1057
size_t meshlet_limit_vertices = (index_count + max_vertices_conservative - 1) / max_vertices_conservative;
1058
size_t meshlet_limit_triangles = (index_count / 3 + max_triangles - 1) / max_triangles;
1059
1060
return meshlet_limit_vertices > meshlet_limit_triangles ? meshlet_limit_vertices : meshlet_limit_triangles;
1061
}
1062
1063
// Builds meshlets by greedily growing each meshlet from adjacent triangles, scored by connectivity and spatial/cone metrics.
// When no adjacent triangle is available, the closest remaining triangle is found via a kd-tree; if it is further than
// meshlet_expected_radius * split_factor and the meshlet has at least min_triangles, the meshlet is split early.
// Returns the number of meshlets written to meshlets[].
size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);
	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
	assert(vertex_positions_stride % sizeof(float) == 0);

	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
	assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles);
	assert(min_triangles % 4 == 0 && max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned

	assert(cone_weight <= 1); // negative cone weight switches metric to optimize for axis-aligned meshlets
	assert(split_factor >= 0);

	if (index_count == 0)
		return 0;

	meshopt_Allocator allocator;

	// the sparse builder is used when there are more vertices than indices
	TriangleAdjacency2 adjacency = {};
	bool sparse = vertex_count > index_count && index_count < (1u << 31);

	if (sparse)
		buildTriangleAdjacencySparse(adjacency, indices, index_count, vertex_count, allocator);
	else
		buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);

	// live triangle counts alias adjacency.counts: triangles are removed from adjacency once emitted, so the counts always match
	unsigned int* live_triangles = adjacency.counts;

	size_t face_count = index_count / 3;

	unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
	memset(emitted_flags, 0, face_count);

	// per-triangle centroid & normal, used for scoring
	Cone* triangles = allocator.allocate<Cone>(face_count);
	float mesh_area = computeTriangleCones(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride);

	// assuming each meshlet is a square patch, expected radius is sqrt(expected area)
	float triangle_area_avg = face_count == 0 ? 0.f : mesh_area / float(face_count) * 0.5f;
	float meshlet_expected_radius = sqrtf(triangle_area_avg * max_triangles) * 0.5f;

	// kd-tree over triangle centroids for nearest neighbor lookup
	const size_t cone_stride = sizeof(Cone) / sizeof(float);

	unsigned int* kdindices = allocator.allocate<unsigned int>(face_count);
	for (size_t i = 0; i < face_count; ++i)
		kdindices[i] = unsigned(i);

	KDNode* nodes = allocator.allocate<KDNode>(face_count * 2);
	kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, cone_stride, kdindices, face_count, /* leaf_size= */ 8);

	// the minimum corner of all centroids serves as the starting point for meshlet flow
	float cornerx = FLT_MAX, cornery = FLT_MAX, cornerz = FLT_MAX;

	for (size_t i = 0; i < face_count; ++i)
	{
		const Cone& tri = triangles[i];

		if (cornerx > tri.px)
			cornerx = tri.px;
		if (cornery > tri.py)
			cornery = tri.py;
		if (cornerz > tri.pz)
			cornerz = tri.pz;
	}

	// index of the vertex in the meshlet, -1 if the vertex isn't used
	short* used = allocator.allocate<short>(vertex_count);
	memset(used, -1, vertex_count * sizeof(short));

	// the first meshlet starts at the triangle closest to the corner
	unsigned int initial_seed = ~0u;
	float initial_score = FLT_MAX;

	for (size_t i = 0; i < face_count; ++i)
	{
		const Cone& tri = triangles[i];
		float score = getDistance(tri.px - cornerx, tri.py - cornery, tri.pz - cornerz, false);

		if (initial_seed != ~0u && !(score < initial_score))
			continue;

		initial_seed = unsigned(i);
		initial_score = score;
	}

	// seed triangles to continue meshlet flow
	unsigned int seeds[kMeshletMaxSeeds] = {};
	size_t seed_count = 0;

	meshopt_Meshlet meshlet = {};
	size_t meshlet_offset = 0;

	Cone meshlet_cone_acc = {};

	for (;;)
	{
		Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count);

		// the very first triangle has no meshlet cone to score against, so it comes from the initial seed
		// afterwards we pick an adjacent triangle based on connectivity and spatial scoring
		bool first = meshlet_offset == 0 && meshlet.triangle_count == 0;
		unsigned int best_triangle = first ? initial_seed : getNeighborTriangle(meshlet, meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight);

		bool split = false;

		// out of adjacent triangles: fall back to spatial search and pick the closest triangle irrespective of connectivity
		if (best_triangle == ~0u)
		{
			float position[3] = {meshlet_cone.px, meshlet_cone.py, meshlet_cone.pz};
			unsigned int index = ~0u;
			float distance = FLT_MAX;

			kdtreeNearest(nodes, 0, &triangles[0].px, cone_stride, emitted_flags, position, cone_weight < 0.f, index, distance);

			best_triangle = index;
			split = meshlet.triangle_count >= min_triangles && split_factor > 0 && distance > meshlet_expected_radius * split_factor;
		}

		// no triangle found by either method: we are done
		if (best_triangle == ~0u)
			break;

		// number of vertices the candidate would add to the current meshlet
		const unsigned int* candidate = &indices[best_triangle * 3];
		int best_extra = (used[candidate[0]] < 0) + (used[candidate[1]] < 0) + (used[candidate[2]] < 0);

		bool full = meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles;

		// if the candidate doesn't fit into the current meshlet, re-select using seeds to maintain global flow
		if (split || full)
		{
			seed_count = pruneSeedTriangles(seeds, seed_count, emitted_flags);

			// keep room for the seeds we are about to add
			if (seed_count + kMeshletAddSeeds > kMeshletMaxSeeds)
				seed_count = kMeshletMaxSeeds - kMeshletAddSeeds;

			seed_count += appendSeedTriangles(seeds + seed_count, meshlet, meshlet_vertices, indices, adjacency, triangles, live_triangles, cornerx, cornery, cornerz);

			unsigned int best_seed = selectSeedTriangle(seeds, seed_count, indices, triangles, live_triangles, cornerx, cornery, cornerz);

			// seeds are based on adjacency, so a disconnected mesh may not yield a valid seed; keep the candidate in that case
			if (best_seed != ~0u)
				best_triangle = best_seed;
		}

		const unsigned int* chosen = &indices[best_triangle * 3];
		unsigned int a = chosen[0], b = chosen[1], c = chosen[2];
		assert(a < vertex_count && b < vertex_count && c < vertex_count);

		// add the triangle; when this finishes the previous meshlet, the accumulated bounds are reset
		bool flushed = appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles, split);

		if (flushed)
		{
			meshlet_offset++;
			memset(&meshlet_cone_acc, 0, sizeof(meshlet_cone_acc));
		}

		// remove the emitted triangle from adjacency data so subsequent iterations traverse shorter lists
		// live triangle counts are updated as a byproduct
		for (size_t k = 0; k < 3; ++k)
		{
			unsigned int vertex = chosen[k];

			unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[vertex];
			size_t neighbors_size = adjacency.counts[vertex];

			for (size_t i = 0; i < neighbors_size; ++i)
			{
				if (neighbors[i] != best_triangle)
					continue;

				// swap-remove: overwrite with the last entry and shrink the list
				neighbors[i] = neighbors[neighbors_size - 1];
				adjacency.counts[vertex]--;
				break;
			}
		}

		// accumulate cone data for scoring subsequent triangles
		const Cone& emitted = triangles[best_triangle];

		meshlet_cone_acc.px += emitted.px;
		meshlet_cone_acc.py += emitted.py;
		meshlet_cone_acc.pz += emitted.pz;
		meshlet_cone_acc.nx += emitted.nx;
		meshlet_cone_acc.ny += emitted.ny;
		meshlet_cone_acc.nz += emitted.nz;

		assert(!emitted_flags[best_triangle]);
		emitted_flags[best_triangle] = 1;
	}

	// flush the last, partially filled meshlet
	if (meshlet.triangle_count)
	{
		finishMeshlet(meshlet, meshlet_triangles);

		meshlets[meshlet_offset] = meshlet;
		meshlet_offset++;
	}

	assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles));
	return meshlet_offset;
}
1256
1257
// Convenience wrapper over meshopt_buildMeshletsFlex with min_triangles == max_triangles and splitting disabled.
size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
{
	assert(cone_weight >= 0); // to use negative cone weight, use meshopt_buildMeshletsFlex

	// a split factor of 0 disables distance-based meshlet splits
	const float split_factor = 0.0f;
	const size_t min_triangles = max_triangles;

	return meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, min_triangles, max_triangles, cone_weight, split_factor);
}
1263
1264
// Builds meshlets by scanning the index buffer in order and starting a new meshlet whenever the current one is full.
// Returns the number of meshlets written to meshlets[].
size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);

	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned

	meshopt_Allocator allocator;

	// index of the vertex in the meshlet, -1 if the vertex isn't used
	short* used = allocator.allocate<short>(vertex_count);
	memset(used, -1, vertex_count * sizeof(short));

	meshopt_Meshlet meshlet = {};
	size_t meshlet_offset = 0;

	for (size_t i = 0; i < index_count; i += 3)
	{
		const unsigned int* tri = indices + i;
		unsigned int a = tri[0], b = tri[1], c = tri[2];
		assert(a < vertex_count && b < vertex_count && c < vertex_count);

		// the triangle goes into the current meshlet; if that one was full, it is written to the output first
		if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles))
			meshlet_offset++;
	}

	// flush the last, partially filled meshlet
	if (meshlet.triangle_count)
	{
		finishMeshlet(meshlet, meshlet_triangles);

		meshlets[meshlet_offset] = meshlet;
		meshlet_offset++;
	}

	assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
	return meshlet_offset;
}
1302
1303
size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
1304
{
1305
using namespace meshopt;
1306
1307
assert(index_count % 3 == 0);
1308
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
1309
assert(vertex_positions_stride % sizeof(float) == 0);
1310
1311
assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
1312
assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles);
1313
assert(min_triangles % 4 == 0 && max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
1314
1315
if (index_count == 0)
1316
return 0;
1317
1318
size_t face_count = index_count / 3;
1319
size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
1320
1321
meshopt_Allocator allocator;
1322
1323
// 3 floats plus 1 uint for sorting, or
1324
// 2 floats for SAH costs, or
1325
// 1 uint plus 1 byte for partitioning
1326
float* scratch = allocator.allocate<float>(face_count * 4);
1327
1328
// compute bounding boxes and centroids for sorting
1329
BVHBox* boxes = allocator.allocate<BVHBox>(face_count);
1330
bvhPrepare(boxes, scratch, indices, face_count, vertex_positions, vertex_count, vertex_stride_float);
1331
1332
unsigned int* axes = allocator.allocate<unsigned int>(face_count * 3);
1333
unsigned int* temp = reinterpret_cast<unsigned int*>(scratch) + face_count * 3;
1334
1335
for (int k = 0; k < 3; ++k)
1336
{
1337
unsigned int* order = axes + k * face_count;
1338
const float* keys = scratch + k * face_count;
1339
1340
unsigned int hist[1024][3];
1341
computeHistogram(hist, keys, face_count);
1342
1343
// 3-pass radix sort computes the resulting order into axes
1344
for (size_t i = 0; i < face_count; ++i)
1345
temp[i] = unsigned(i);
1346
1347
radixPass(order, temp, keys, face_count, hist, 0);
1348
radixPass(temp, order, keys, face_count, hist, 1);
1349
radixPass(order, temp, keys, face_count, hist, 2);
1350
}
1351
1352
// index of the vertex in the meshlet, -1 if the vertex isn't used
1353
short* used = allocator.allocate<short>(vertex_count);
1354
memset(used, -1, vertex_count * sizeof(short));
1355
1356
unsigned char* boundary = allocator.allocate<unsigned char>(face_count);
1357
1358
bvhSplit(boxes, &axes[0], &axes[face_count], &axes[face_count * 2], boundary, face_count, 0, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
1359
1360
// compute the desired number of meshlets; note that on some meshes with a lot of vertex bound clusters this might go over the bound
1361
size_t meshlet_count = 0;
1362
for (size_t i = 0; i < face_count; ++i)
1363
{
1364
assert(boundary[i] <= 1);
1365
meshlet_count += boundary[i];
1366
}
1367
1368
size_t meshlet_bound = meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles);
1369
1370
// pack triangles into meshlets according to the order and boundaries marked by bvhSplit
1371
meshopt_Meshlet meshlet = {};
1372
size_t meshlet_offset = 0;
1373
size_t meshlet_pending = meshlet_count;
1374
1375
for (size_t i = 0; i < face_count; ++i)
1376
{
1377
assert(boundary[i] <= 1);
1378
bool split = i > 0 && boundary[i] == 1;
1379
1380
// while we are over the limit, we ignore boundary[] data and disable splits until we free up enough space
1381
if (split && meshlet_count > meshlet_bound && meshlet_offset + meshlet_pending >= meshlet_bound)
1382
split = false;
1383
1384
unsigned int index = axes[i];
1385
assert(index < face_count);
1386
1387
unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
1388
1389
// appends triangle to the meshlet and writes previous meshlet to the output if full
1390
meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles, split);
1391
meshlet_pending -= boundary[i];
1392
}
1393
1394
if (meshlet.triangle_count)
1395
{
1396
finishMeshlet(meshlet, meshlet_triangles);
1397
1398
meshlets[meshlet_offset++] = meshlet;
1399
}
1400
1401
assert(meshlet_offset <= meshlet_bound);
1402
return meshlet_offset;
1403
}
1404
1405
// Computes a bounding sphere and a normal cone (apex, axis, cutoff, plus 8-bit quantized axis/cutoff) for a cluster of triangles.
// Degenerate triangles are ignored; a cluster with no valid triangles returns zero-initialized bounds.
meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);
	assert(index_count / 3 <= kMeshletMaxTriangles);
	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
	assert(vertex_positions_stride % sizeof(float) == 0);

	// vertex_count is only referenced by asserts
	(void)vertex_count;

	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);

	// unit normals and corner positions of all non-degenerate triangles
	float normals[kMeshletMaxTriangles][3];
	float corners[kMeshletMaxTriangles][3][3];
	size_t valid = 0;

	for (size_t i = 0; i < index_count; i += 3)
	{
		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
		assert(a < vertex_count && b < vertex_count && c < vertex_count);

		const float* p0 = vertex_positions + vertex_stride_float * a;
		const float* p1 = vertex_positions + vertex_stride_float * b;
		const float* p2 = vertex_positions + vertex_stride_float * c;

		float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
		float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};

		// cross product of the two edges; its length is twice the triangle area
		float normalx = p10[1] * p20[2] - p10[2] * p20[1];
		float normaly = p10[2] * p20[0] - p10[0] * p20[2];
		float normalz = p10[0] * p20[1] - p10[1] * p20[0];

		float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);

		// degenerate triangles are invisible anyway, so they don't need to be included
		if (area == 0.f)
			continue;

		// the normal together with corner 0 defines the plane equation used below
		float* normal = normals[valid];

		normal[0] = normalx / area;
		normal[1] = normaly / area;
		normal[2] = normalz / area;

		memcpy(corners[valid][0], p0, 3 * sizeof(float));
		memcpy(corners[valid][1], p1, 3 * sizeof(float));
		memcpy(corners[valid][2], p2, 3 * sizeof(float));

		valid++;
	}

	meshopt_Bounds bounds = {};

	// degenerate cluster, no valid triangles => trivial reject (cone data is 0)
	if (valid == 0)
		return bounds;

	const float rzero = 0.f;

	// bounding sphere of all corners; the center is also used to determine the normal cone apex
	float psphere[4] = {};
	computeBoundingSphere(psphere, corners[0][0], valid * 3, sizeof(float) * 3, &rzero, 0, 7);

	float center[3] = {psphere[0], psphere[1], psphere[2]};

	// treating triangle normals as points, the center of their bounding sphere determines the optimal cone axis
	float nsphere[4] = {};
	computeBoundingSphere(nsphere, normals[0], valid, sizeof(float) * 3, &rzero, 0, 3);

	float axis[3] = {nsphere[0], nsphere[1], nsphere[2]};
	float axislength = sqrtf(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]);
	float invaxislength = axislength == 0.f ? 0.f : 1.f / axislength;

	for (int k = 0; k < 3; ++k)
		axis[k] *= invaxislength;

	// tight cone around all normals, mindp = cos(angle/2)
	float mindp = 1.f;

	for (size_t i = 0; i < valid; ++i)
	{
		float dp = normals[i][0] * axis[0] + normals[i][1] * axis[1] + normals[i][2] * axis[2];

		if (dp < mindp)
			mindp = dp;
	}

	// sphere info is always filled; for degenerate cones we return bounds without cone information below
	for (int k = 0; k < 3; ++k)
		bounds.center[k] = center[k];

	bounds.radius = psphere[3];

	// degenerate cluster, normal cone is larger than a hemisphere => trivial accept
	// note that if mindp is positive but close to 0, the triangle intersection code below gets less stable
	// we arbitrarily decide that if a normal cone is ~168 degrees wide or more, the cone isn't useful
	if (mindp <= 0.1f)
	{
		bounds.cone_cutoff = 1;
		bounds.cone_cutoff_s8 = 127;
		return bounds;
	}

	// find the point on the center-t*axis ray that lies in the negative half-space of all triangles
	float maxt = 0;

	for (size_t i = 0; i < valid; ++i)
	{
		const float* normal = normals[i];
		const float* corner = corners[i][0];

		// dot(center-t*axis-corner, trinormal) = 0
		// dot(center-corner, trinormal) - t * dot(axis, trinormal) = 0
		float cx = center[0] - corner[0];
		float cy = center[1] - corner[1];
		float cz = center[2] - corner[2];

		float dc = cx * normal[0] + cy * normal[1] + cz * normal[2];
		float dn = axis[0] * normal[0] + axis[1] * normal[1] + axis[2] * normal[2];

		// dn should be larger than mindp cutoff above
		assert(dn > 0.f);
		float t = dc / dn;

		if (t > maxt)
			maxt = t;
	}

	for (int k = 0; k < 3; ++k)
	{
		// cone apex should be in the negative half-space of all cluster triangles by construction
		bounds.cone_apex[k] = center[k] - axis[k] * maxt;

		// note: this axis is the axis of the normal cone, but our test for perspective camera effectively negates the axis
		bounds.cone_axis[k] = axis[k];
	}

	// cos(a) for normal cone is mindp; we need to add 90 degrees on both sides and invert the cone
	// which gives us -cos(a+90) = -(-sin(a)) = sin(a) = sqrt(1 - cos^2(a))
	bounds.cone_cutoff = sqrtf(1 - mindp * mindp);

	// quantize axis to 8-bit SNORM format and measure the per-component quantization error
	// the error is needed to adjust the cutoff so that the 8-bit test stays conservative
	float axis_error[3];

	for (int k = 0; k < 3; ++k)
	{
		bounds.cone_axis_s8[k] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[k], 8));
		axis_error[k] = fabsf(bounds.cone_axis_s8[k] / 127.f - bounds.cone_axis[k]);
	}

	// note that we need to round this up instead of rounding to nearest, hence +1
	int cone_cutoff_s8 = int(127 * (bounds.cone_cutoff + axis_error[0] + axis_error[1] + axis_error[2]) + 1);

	if (cone_cutoff_s8 > 127)
		bounds.cone_cutoff_s8 = 127;
	else
		bounds.cone_cutoff_s8 = (signed char)(cone_cutoff_s8);

	return bounds;
}
1559
1560
// Computes cluster bounds for a meshlet by expanding its local (byte) indices into mesh vertex indices
// and forwarding the result to meshopt_computeClusterBounds.
meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
	using namespace meshopt;

	assert(triangle_count <= kMeshletMaxTriangles);
	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
	assert(vertex_positions_stride % sizeof(float) == 0);

	// meshlet-local indices translated to mesh vertex indices
	unsigned int indices[kMeshletMaxTriangles * 3];
	size_t index_count = triangle_count * 3;

	for (size_t i = 0; i < index_count; ++i)
	{
		unsigned char local = meshlet_triangles[i];
		unsigned int index = meshlet_vertices[local];
		assert(index < vertex_count);

		indices[i] = index;
	}

	return meshopt_computeClusterBounds(indices, triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride);
}
1580
1581
// Computes a bounding sphere for a set of points, or a set of spheres when radii is provided.
// Only center and radius of the returned bounds are filled; an empty input returns zero-initialized bounds.
meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride)
{
	using namespace meshopt;

	assert(positions_stride >= 12 && positions_stride <= 256);
	assert(positions_stride % sizeof(float) == 0);
	assert((radii_stride >= 4 && radii_stride <= 256) || radii == NULL);
	assert(radii_stride % sizeof(float) == 0);

	meshopt_Bounds bounds = {};

	if (count == 0)
		return bounds;

	// without radii, every element reads the same zero radius through a stride of 0
	const float rzero = 0.f;

	const float* sphere_radii = radii ? radii : &rzero;
	size_t sphere_radii_stride = radii ? radii_stride : 0;

	float psphere[4] = {};
	computeBoundingSphere(psphere, positions, count, positions_stride, sphere_radii, sphere_radii_stride, 7);

	for (int k = 0; k < 3; ++k)
		bounds.center[k] = psphere[k];

	bounds.radius = psphere[3];

	return bounds;
}
1607
1608
void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count)
1609
{
1610
using namespace meshopt;
1611
1612
assert(triangle_count <= kMeshletMaxTriangles);
1613
assert(vertex_count <= kMeshletMaxVertices);
1614
1615
unsigned char* indices = meshlet_triangles;
1616
unsigned int* vertices = meshlet_vertices;
1617
1618
// cache tracks vertex timestamps (corresponding to triangle index! all 3 vertices are added at the same time and never removed)
1619
unsigned char cache[kMeshletMaxVertices];
1620
memset(cache, 0, vertex_count);
1621
1622
// note that we start from a value that means all vertices aren't in cache
1623
unsigned char cache_last = 128;
1624
const unsigned char cache_cutoff = 3; // 3 triangles = ~5..9 vertices depending on reuse
1625
1626
for (size_t i = 0; i < triangle_count; ++i)
1627
{
1628
int next = -1;
1629
int next_match = -1;
1630
1631
for (size_t j = i; j < triangle_count; ++j)
1632
{
1633
unsigned char a = indices[j * 3 + 0], b = indices[j * 3 + 1], c = indices[j * 3 + 2];
1634
assert(a < vertex_count && b < vertex_count && c < vertex_count);
1635
1636
// score each triangle by how many vertices are in cache
1637
// note: the distance is computed using unsigned 8-bit values, so cache timestamp overflow is handled gracefully
1638
int aok = (unsigned char)(cache_last - cache[a]) < cache_cutoff;
1639
int bok = (unsigned char)(cache_last - cache[b]) < cache_cutoff;
1640
int cok = (unsigned char)(cache_last - cache[c]) < cache_cutoff;
1641
1642
if (aok + bok + cok > next_match)
1643
{
1644
next = (int)j;
1645
next_match = aok + bok + cok;
1646
1647
// note that we could end up with all 3 vertices in the cache, but 2 is enough for ~strip traversal
1648
if (next_match >= 2)
1649
break;
1650
}
1651
}
1652
1653
assert(next >= 0);
1654
1655
unsigned char a = indices[next * 3 + 0], b = indices[next * 3 + 1], c = indices[next * 3 + 2];
1656
1657
// shift triangles before the next one forward so that we always keep an ordered partition
1658
// note: this could have swapped triangles [i] and [next] but that distorts the order and may skew the output sequence
1659
memmove(indices + (i + 1) * 3, indices + i * 3, (next - i) * 3 * sizeof(unsigned char));
1660
1661
indices[i * 3 + 0] = a;
1662
indices[i * 3 + 1] = b;
1663
indices[i * 3 + 2] = c;
1664
1665
// cache timestamp is the same between all vertices of each triangle to reduce overflow
1666
cache_last++;
1667
cache[a] = cache_last;
1668
cache[b] = cache_last;
1669
cache[c] = cache_last;
1670
}
1671
1672
// reorder meshlet vertices for access locality assuming index buffer is scanned sequentially
1673
unsigned int order[kMeshletMaxVertices];
1674
1675
short remap[kMeshletMaxVertices];
1676
memset(remap, -1, vertex_count * sizeof(short));
1677
1678
size_t vertex_offset = 0;
1679
1680
for (size_t i = 0; i < triangle_count * 3; ++i)
1681
{
1682
short& r = remap[indices[i]];
1683
1684
if (r < 0)
1685
{
1686
r = short(vertex_offset);
1687
order[vertex_offset] = vertices[indices[i]];
1688
vertex_offset++;
1689
}
1690
1691
indices[i] = (unsigned char)r;
1692
}
1693
1694
assert(vertex_offset <= vertex_count);
1695
memcpy(vertices, order, vertex_offset * sizeof(unsigned int));
1696
}
1697
1698