Path: blob/master/thirdparty/meshoptimizer/overdrawoptimizer.cpp
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"

#include <assert.h>
#include <math.h>
#include <string.h>

// This work is based on:
// Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007
namespace meshopt
{

static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
{
	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);

	float mesh_centroid[3] = {};

	for (size_t i = 0; i < index_count; ++i)
	{
		const float* p = vertex_positions + vertex_stride_float * indices[i];

		mesh_centroid[0] += p[0];
		mesh_centroid[1] += p[1];
		mesh_centroid[2] += p[2];
	}

	mesh_centroid[0] /= index_count;
	mesh_centroid[1] /= index_count;
	mesh_centroid[2] /= index_count;

	for (size_t cluster = 0; cluster < cluster_count; ++cluster)
	{
		size_t cluster_begin = clusters[cluster] * 3;
		size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count;
		assert(cluster_begin < cluster_end);

		float cluster_area = 0;
		float cluster_centroid[3] = {};
		float cluster_normal[3] = {};

		for (size_t i = cluster_begin; i < cluster_end; i += 3)
		{
			const float* p0 = vertex_positions + vertex_stride_float * indices[i + 0];
			const float* p1 = vertex_positions + vertex_stride_float * indices[i + 1];
			const float* p2 = vertex_positions + vertex_stride_float * indices[i + 2];

			float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
			float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};

			float normalx = p10[1] * p20[2] - p10[2] * p20[1];
			float normaly = p10[2] * p20[0] - p10[0] * p20[2];
			float normalz = p10[0] * p20[1] - p10[1] * p20[0];

			float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);

			cluster_centroid[0] += (p0[0] + p1[0] + p2[0]) * (area / 3);
			cluster_centroid[1] += (p0[1] + p1[1] + p2[1]) * (area / 3);
			cluster_centroid[2] += (p0[2] + p1[2] + p2[2]) * (area / 3);
			cluster_normal[0] += normalx;
			cluster_normal[1] += normaly;
			cluster_normal[2] += normalz;
			cluster_area += area;
		}

		float inv_cluster_area = cluster_area == 0 ? 0 : 1 / cluster_area;

		cluster_centroid[0] *= inv_cluster_area;
		cluster_centroid[1] *= inv_cluster_area;
		cluster_centroid[2] *= inv_cluster_area;

		float cluster_normal_length = sqrtf(cluster_normal[0] * cluster_normal[0] + cluster_normal[1] * cluster_normal[1] + cluster_normal[2] * cluster_normal[2]);
		float inv_cluster_normal_length = cluster_normal_length == 0 ? 0 : 1 / cluster_normal_length;

		cluster_normal[0] *= inv_cluster_normal_length;
		cluster_normal[1] *= inv_cluster_normal_length;
		cluster_normal[2] *= inv_cluster_normal_length;

		float centroid_vector[3] = {cluster_centroid[0] - mesh_centroid[0], cluster_centroid[1] - mesh_centroid[1], cluster_centroid[2] - mesh_centroid[2]};

		sort_data[cluster] = centroid_vector[0] * cluster_normal[0] + centroid_vector[1] * cluster_normal[1] + centroid_vector[2] * cluster_normal[2];
	}
}
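
// Note on the sort key above: it is the signed distance of the area-weighted cluster
// centroid from the mesh centroid, measured along the cluster normal, in the spirit of
// the Sander et al. paper cited at the top of this file. Outward-facing clusters that
// sit far along their own normal are likely occluders, so they should be drawn first.
// A hypothetical single-triangle sketch (illustrative only, not library code):
//
//   p0 = (0, 0, 1), p1 = (1, 0, 1), p2 = (0, 1, 1)    // a +Z-facing triangle
//   cross(p1 - p0, p2 - p0) = (0, 0, 1)               // unit cluster normal
//   cluster centroid = (1/3, 1/3, 1)                  // area-weighted == plain centroid here
//   sort_data = dot(centroid - mesh_centroid, normal) // = 1 with mesh_centroid at the origin
//                                                     // large value -> drawn early (see below)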

static void calculateSortOrderRadix(unsigned int* sort_order, const float* sort_data, unsigned short* sort_keys, size_t cluster_count)
{
	// compute sort data bounds and renormalize, using fixed point snorm
	float sort_data_max = 1e-3f;

	for (size_t i = 0; i < cluster_count; ++i)
	{
		float dpa = fabsf(sort_data[i]);

		sort_data_max = (sort_data_max < dpa) ? dpa : sort_data_max;
	}

	const int sort_bits = 11;

	for (size_t i = 0; i < cluster_count; ++i)
	{
		// note that we flip distribution since high dot product should come first
		float sort_key = 0.5f - 0.5f * (sort_data[i] / sort_data_max);

		sort_keys[i] = meshopt_quantizeUnorm(sort_key, sort_bits) & ((1 << sort_bits) - 1);
	}

	// fill histogram for counting sort
	unsigned int histogram[1 << sort_bits];
	memset(histogram, 0, sizeof(histogram));

	for (size_t i = 0; i < cluster_count; ++i)
	{
		histogram[sort_keys[i]]++;
	}

	// compute offsets based on histogram data
	size_t histogram_sum = 0;

	for (size_t i = 0; i < 1 << sort_bits; ++i)
	{
		size_t count = histogram[i];
		histogram[i] = unsigned(histogram_sum);
		histogram_sum += count;
	}

	assert(histogram_sum == cluster_count);

	// compute sort order based on offsets
	for (size_t i = 0; i < cluster_count; ++i)
	{
		sort_order[histogram[sort_keys[i]]++] = unsigned(i);
	}
}
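
// calculateSortOrderRadix above boils down to a single-pass counting sort: 11-bit keys
// fit in one histogram, so no multi-pass radix loop is needed. The same pattern in
// miniature, with hypothetical 2-bit keys (a standalone sketch, not library code):
//
//   unsigned short keys[4] = {3, 1, 3, 0};
//   unsigned int hist[4] = {};                    // 2-bit keys -> 4 buckets
//   for (int i = 0; i < 4; ++i)
//       hist[keys[i]]++;                          // counts: {1, 1, 0, 2}
//   unsigned int sum = 0;
//   for (int i = 0; i < 4; ++i)
//   {
//       unsigned int c = hist[i];
//       hist[i] = sum;                            // exclusive prefix sum: {0, 1, 2, 2}
//       sum += c;
//   }
//   unsigned int order[4];
//   for (int i = 0; i < 4; ++i)
//       order[hist[keys[i]]++] = i;               // stable scatter: order = {3, 1, 0, 2}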

static unsigned int updateCache(unsigned int a, unsigned int b, unsigned int c, unsigned int cache_size, unsigned int* cache_timestamps, unsigned int& timestamp)
{
	unsigned int cache_misses = 0;

	// if vertex is not in cache, put it in cache
	if (timestamp - cache_timestamps[a] > cache_size)
	{
		cache_timestamps[a] = timestamp++;
		cache_misses++;
	}

	if (timestamp - cache_timestamps[b] > cache_size)
	{
		cache_timestamps[b] = timestamp++;
		cache_misses++;
	}

	if (timestamp - cache_timestamps[c] > cache_size)
	{
		cache_timestamps[c] = timestamp++;
		cache_misses++;
	}

	return cache_misses;
}

static size_t generateHardBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int* cache_timestamps)
{
	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));

	unsigned int timestamp = cache_size + 1;

	size_t face_count = index_count / 3;

	size_t result = 0;

	for (size_t i = 0; i < face_count; ++i)
	{
		unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);

		// when all three vertices are not in the cache it's usually relatively safe to assume that this is a new patch in the mesh
		// that is disjoint from previous vertices; sometimes it might come back to reference existing vertices but that frequently
		// suggests an inefficiency in the vertex cache optimization algorithm
		// usually the first triangle has 3 misses unless it's degenerate - thus we make sure the first cluster always starts with 0
		if (i == 0 || m == 3)
		{
			destination[result++] = unsigned(i);
		}
	}

	assert(result <= index_count / 3);

	return result;
}

static size_t generateSoftBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const unsigned int* clusters, size_t cluster_count, unsigned int cache_size, float threshold, unsigned int* cache_timestamps)
{
	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));

	unsigned int timestamp = 0;

	size_t result = 0;

	for (size_t it = 0; it < cluster_count; ++it)
	{
		size_t start = clusters[it];
		size_t end = (it + 1 < cluster_count) ? clusters[it + 1] : index_count / 3;
		assert(start < end);

		// reset cache
		timestamp += cache_size + 1;

		// measure cluster ACMR
		unsigned int cluster_misses = 0;

		for (size_t i = start; i < end; ++i)
		{
			unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);

			cluster_misses += m;
		}

		float cluster_threshold = threshold * (float(cluster_misses) / float(end - start));

		// first cluster always starts from the hard cluster boundary
		destination[result++] = unsigned(start);

		// reset cache
		timestamp += cache_size + 1;

		unsigned int running_misses = 0;
		unsigned int running_faces = 0;

		for (size_t i = start; i < end; ++i)
		{
			unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);

			running_misses += m;
			running_faces += 1;

			if (float(running_misses) / float(running_faces) <= cluster_threshold)
			{
				// we have reached the target ACMR with the current triangle so we need to start a new cluster on the next one
				// note that this may mean that we add 'end' to destination for the last triangle, which will imply that the last
				// cluster is empty; however, the 'pop_back' after the loop will clean it up
				destination[result++] = unsigned(i + 1);

				// reset cache
				timestamp += cache_size + 1;

				running_misses = 0;
				running_faces = 0;
			}
		}

		// each time we reach the target ACMR we flush the cluster
		// this means that the last cluster is by definition not very good - there are frequent cases where we are left with a few triangles
		// in the last cluster, producing a very bad ACMR and significantly penalizing the overall results
		// thus we remove the last cluster boundary, merging the last complete cluster with the last incomplete one
		// there are sometimes cases when the last cluster is actually good enough - in which case the code above would have added 'end'
		// to the cluster boundary array which we need to remove anyway - this code will do that automatically
		if (destination[result - 1] != start)
		{
			result--;
		}
	}

	assert(result >= cluster_count);
	assert(result <= index_count / 3);

	return result;
}
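
// Worked example of the ACMR (average cache miss ratio) math above, with hypothetical
// numbers: a hard cluster of 100 triangles that incurs 70 cache misses has ACMR 0.7;
// with threshold = 1.05 the per-cluster target becomes 0.735, so a soft boundary is
// emitted whenever the running miss ratio dips to 0.735 or below. This splits the hard
// cluster into cache-coherent runs that the sorter can reorder independently without
// giving up more than the requested fraction of vertex cache efficiency.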

} // namespace meshopt

void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);
	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
	assert(vertex_positions_stride % sizeof(float) == 0);

	meshopt_Allocator allocator;

	// guard for empty meshes
	if (index_count == 0 || vertex_count == 0)
		return;

	// support in-place optimization
	if (destination == indices)
	{
		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
		indices = indices_copy;
	}

	unsigned int cache_size = 16;

	unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);

	// generate hard boundaries from full-triangle cache misses
	unsigned int* hard_clusters = allocator.allocate<unsigned int>(index_count / 3);
	size_t hard_cluster_count = generateHardBoundaries(hard_clusters, indices, index_count, vertex_count, cache_size, cache_timestamps);

	// generate soft boundaries
	unsigned int* soft_clusters = allocator.allocate<unsigned int>(index_count / 3 + 1);
	size_t soft_cluster_count = generateSoftBoundaries(soft_clusters, indices, index_count, vertex_count, hard_clusters, hard_cluster_count, cache_size, threshold, cache_timestamps);

	const unsigned int* clusters = soft_clusters;
	size_t cluster_count = soft_cluster_count;

	// fill sort data
	float* sort_data = allocator.allocate<float>(cluster_count);
	calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_positions_stride, clusters, cluster_count);

	// sort clusters using sort data
	unsigned short* sort_keys = allocator.allocate<unsigned short>(cluster_count);
	unsigned int* sort_order = allocator.allocate<unsigned int>(cluster_count);
	calculateSortOrderRadix(sort_order, sort_data, sort_keys, cluster_count);

	// fill output buffer
	size_t offset = 0;

	for (size_t it = 0; it < cluster_count; ++it)
	{
		unsigned int cluster = sort_order[it];
		assert(cluster < cluster_count);

		size_t cluster_begin = clusters[cluster] * 3;
		size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count;
		assert(cluster_begin < cluster_end);

		memcpy(destination + offset, indices + cluster_begin, (cluster_end - cluster_begin) * sizeof(unsigned int));
		offset += cluster_end - cluster_begin;
	}

	assert(offset == index_count);
}
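
// Example usage (a sketch, not part of the library; `Vertex`, `vertices` and `indices`
// are the caller's hypothetical mesh data, with position as the first three floats):
//
//   meshopt_optimizeVertexCache(indices.data(), indices.data(), indices.size(), vertices.size());
//   // threshold 1.05 tolerates up to a 5% ACMR regression in exchange for less overdraw
//   meshopt_optimizeOverdraw(indices.data(), indices.data(), indices.size(),
//       &vertices[0].x, vertices.size(), sizeof(Vertex), 1.05f);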