Path: blob/master/thirdparty/meshoptimizer/overdrawoptimizer.cpp
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"

#include <assert.h>
#include <math.h>
#include <string.h>

// This work is based on:
// Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007
namespace meshopt
{

static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
{
	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);

	float mesh_centroid[3] = {};

	for (size_t i = 0; i < index_count; ++i)
	{
		const float* p = vertex_positions + vertex_stride_float * indices[i];

		mesh_centroid[0] += p[0];
		mesh_centroid[1] += p[1];
		mesh_centroid[2] += p[2];
	}

	mesh_centroid[0] /= index_count;
	mesh_centroid[1] /= index_count;
	mesh_centroid[2] /= index_count;

	for (size_t cluster = 0; cluster < cluster_count; ++cluster)
	{
		size_t cluster_begin = clusters[cluster] * 3;
		size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count;
		assert(cluster_begin < cluster_end);

		float cluster_area = 0;
		float cluster_centroid[3] = {};
		float cluster_normal[3] = {};

		for (size_t i = cluster_begin; i < cluster_end; i += 3)
		{
			const float* p0 = vertex_positions + vertex_stride_float * indices[i + 0];
			const float* p1 = vertex_positions + vertex_stride_float * indices[i + 1];
			const float* p2 = vertex_positions + vertex_stride_float * indices[i + 2];

			float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
			float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};

			float normalx = p10[1] * p20[2] - p10[2] * p20[1];
			float normaly = p10[2] * p20[0] - p10[0] * p20[2];
			float normalz = p10[0] * p20[1] - p10[1] * p20[0];

			float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);

			cluster_centroid[0] += (p0[0] + p1[0] + p2[0]) * (area / 3);
			cluster_centroid[1] += (p0[1] + p1[1] + p2[1]) * (area / 3);
			cluster_centroid[2] += (p0[2] + p1[2] + p2[2]) * (area / 3);
			cluster_normal[0] += normalx;
			cluster_normal[1] += normaly;
			cluster_normal[2] += normalz;
			cluster_area += area;
		}

		float inv_cluster_area = cluster_area == 0 ? 0 : 1 / cluster_area;

		cluster_centroid[0] *= inv_cluster_area;
		cluster_centroid[1] *= inv_cluster_area;
		cluster_centroid[2] *= inv_cluster_area;

		float cluster_normal_length = sqrtf(cluster_normal[0] * cluster_normal[0] + cluster_normal[1] * cluster_normal[1] + cluster_normal[2] * cluster_normal[2]);
		float inv_cluster_normal_length = cluster_normal_length == 0 ? 0 : 1 / cluster_normal_length;

		cluster_normal[0] *= inv_cluster_normal_length;
		cluster_normal[1] *= inv_cluster_normal_length;
		cluster_normal[2] *= inv_cluster_normal_length;

		float centroid_vector[3] = {cluster_centroid[0] - mesh_centroid[0], cluster_centroid[1] - mesh_centroid[1], cluster_centroid[2] - mesh_centroid[2]};

		sort_data[cluster] = centroid_vector[0] * cluster_normal[0] + centroid_vector[1] * cluster_normal[1] + centroid_vector[2] * cluster_normal[2];
	}
}
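
// Note on the sort key above: it is the signed distance of the area-weighted cluster
// centroid from the mesh centroid, measured along the cluster normal, in the spirit of
// the Sander et al. paper cited at the top of this file. Outward-facing clusters that
// sit far along their own normal are likely occluders, so they should be drawn first.
// A hypothetical single-triangle sketch (illustrative only, not library code):
//
//   p0 = (0, 0, 1), p1 = (1, 0, 1), p2 = (0, 1, 1)    // a +Z-facing triangle
//   cross(p1 - p0, p2 - p0) = (0, 0, 1)               // unit cluster normal
//   cluster centroid = (1/3, 1/3, 1)                  // area-weighted == plain centroid here
//   sort_data = dot(centroid - mesh_centroid, normal) // = 1 with mesh_centroid at the origin
//                                                     // large value -> drawn early (see below)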

static void calculateSortOrderRadix(unsigned int* sort_order, const float* sort_data, unsigned short* sort_keys, size_t cluster_count)
{
	// compute sort data bounds and renormalize, using fixed point snorm
	float sort_data_max = 1e-3f;

	for (size_t i = 0; i < cluster_count; ++i)
	{
		float dpa = fabsf(sort_data[i]);

		sort_data_max = (sort_data_max < dpa) ? dpa : sort_data_max;
	}

	const int sort_bits = 11;

	for (size_t i = 0; i < cluster_count; ++i)
	{
		// note that we flip distribution since high dot product should come first
		float sort_key = 0.5f - 0.5f * (sort_data[i] / sort_data_max);

		sort_keys[i] = meshopt_quantizeUnorm(sort_key, sort_bits) & ((1 << sort_bits) - 1);
	}

	// fill histogram for counting sort
	unsigned int histogram[1 << sort_bits];
	memset(histogram, 0, sizeof(histogram));

	for (size_t i = 0; i < cluster_count; ++i)
	{
		histogram[sort_keys[i]]++;
	}

	// compute offsets based on histogram data
	size_t histogram_sum = 0;

	for (size_t i = 0; i < 1 << sort_bits; ++i)
	{
		size_t count = histogram[i];
		histogram[i] = unsigned(histogram_sum);
		histogram_sum += count;
	}

	assert(histogram_sum == cluster_count);

	// compute sort order based on offsets
	for (size_t i = 0; i < cluster_count; ++i)
	{
		sort_order[histogram[sort_keys[i]]++] = unsigned(i);
	}
}
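
// calculateSortOrderRadix above boils down to a single-pass counting sort: 11-bit keys
// fit in one histogram, so no multi-pass radix loop is needed. The same pattern in
// miniature, with hypothetical 2-bit keys (a standalone sketch, not library code):
//
//   unsigned short keys[4] = {3, 1, 3, 0};
//   unsigned int hist[4] = {};                    // 2-bit keys -> 4 buckets
//   for (int i = 0; i < 4; ++i)
//       hist[keys[i]]++;                          // counts: {1, 1, 0, 2}
//   unsigned int sum = 0;
//   for (int i = 0; i < 4; ++i)
//   {
//       unsigned int c = hist[i];
//       hist[i] = sum;                            // exclusive prefix sum: {0, 1, 2, 2}
//       sum += c;
//   }
//   unsigned int order[4];
//   for (int i = 0; i < 4; ++i)
//       order[hist[keys[i]]++] = i;               // stable scatter: order = {3, 1, 0, 2}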

static unsigned int updateCache(unsigned int a, unsigned int b, unsigned int c, unsigned int cache_size, unsigned int* cache_timestamps, unsigned int& timestamp)
{
	unsigned int cache_misses = 0;

	// if vertex is not in cache, put it in cache
	if (timestamp - cache_timestamps[a] > cache_size)
	{
		cache_timestamps[a] = timestamp++;
		cache_misses++;
	}

	if (timestamp - cache_timestamps[b] > cache_size)
	{
		cache_timestamps[b] = timestamp++;
		cache_misses++;
	}

	if (timestamp - cache_timestamps[c] > cache_size)
	{
		cache_timestamps[c] = timestamp++;
		cache_misses++;
	}

	return cache_misses;
}

static size_t generateHardBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int* cache_timestamps)
{
	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));

	unsigned int timestamp = cache_size + 1;

	size_t face_count = index_count / 3;

	size_t result = 0;

	for (size_t i = 0; i < face_count; ++i)
	{
		unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);

		// when all three vertices are not in the cache it's usually relatively safe to assume that this is a new patch in the mesh
		// that is disjoint from previous vertices; sometimes it might come back to reference existing vertices but that frequently
		// suggests an inefficiency in the vertex cache optimization algorithm
		// usually the first triangle has 3 misses unless it's degenerate - thus we make sure the first cluster always starts with 0
		if (i == 0 || m == 3)
		{
			destination[result++] = unsigned(i);
		}
	}

	assert(result <= index_count / 3);

	return result;
}

static size_t generateSoftBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const unsigned int* clusters, size_t cluster_count, unsigned int cache_size, float threshold, unsigned int* cache_timestamps)
{
	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));

	unsigned int timestamp = 0;

	size_t result = 0;

	for (size_t it = 0; it < cluster_count; ++it)
	{
		size_t start = clusters[it];
		size_t end = (it + 1 < cluster_count) ? clusters[it + 1] : index_count / 3;
		assert(start < end);

		// reset cache
		timestamp += cache_size + 1;

		// measure cluster ACMR
		unsigned int cluster_misses = 0;

		for (size_t i = start; i < end; ++i)
		{
			unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);

			cluster_misses += m;
		}

		float cluster_threshold = threshold * (float(cluster_misses) / float(end - start));

		// first cluster always starts from the hard cluster boundary
		destination[result++] = unsigned(start);

		// reset cache
		timestamp += cache_size + 1;

		unsigned int running_misses = 0;
		unsigned int running_faces = 0;

		for (size_t i = start; i < end; ++i)
		{
			unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);

			running_misses += m;
			running_faces += 1;

			if (float(running_misses) / float(running_faces) <= cluster_threshold)
			{
				// we have reached the target ACMR with the current triangle so we need to start a new cluster on the next one
				// note that this may mean that we add 'end' to destination for the last triangle, which will imply that the last
				// cluster is empty; however, the 'pop_back' after the loop will clean it up
				destination[result++] = unsigned(i + 1);

				// reset cache
				timestamp += cache_size + 1;

				running_misses = 0;
				running_faces = 0;
			}
		}

		// each time we reach the target ACMR we flush the cluster
		// this means that the last cluster is by definition not very good - there are frequent cases where we are left with a few triangles
		// in the last cluster, producing a very bad ACMR and significantly penalizing the overall results
		// thus we remove the last cluster boundary, merging the last complete cluster with the last incomplete one
		// there are sometimes cases when the last cluster is actually good enough - in which case the code above would have added 'end'
		// to the cluster boundary array which we need to remove anyway - this code will do that automatically
		if (destination[result - 1] != start)
		{
			result--;
		}
	}

	assert(result >= cluster_count);
	assert(result <= index_count / 3);

	return result;
}
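
// Worked example of the ACMR (average cache miss ratio) math above, with hypothetical
// numbers: a hard cluster of 100 triangles that incurs 70 cache misses has ACMR 0.7;
// with threshold = 1.05 the per-cluster target becomes 0.735, so a soft boundary is
// emitted whenever the running miss ratio dips to 0.735 or below. This splits the hard
// cluster into cache-coherent runs that the sorter can reorder independently without
// giving up more than the requested fraction of vertex cache efficiency.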

} // namespace meshopt

void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);
	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
	assert(vertex_positions_stride % sizeof(float) == 0);

	meshopt_Allocator allocator;

	// guard for empty meshes
	if (index_count == 0 || vertex_count == 0)
		return;

	// support in-place optimization
	if (destination == indices)
	{
		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
		indices = indices_copy;
	}

	unsigned int cache_size = 16;

	unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);

	// generate hard boundaries from full-triangle cache misses
	unsigned int* hard_clusters = allocator.allocate<unsigned int>(index_count / 3);
	size_t hard_cluster_count = generateHardBoundaries(hard_clusters, indices, index_count, vertex_count, cache_size, cache_timestamps);

	// generate soft boundaries
	unsigned int* soft_clusters = allocator.allocate<unsigned int>(index_count / 3 + 1);
	size_t soft_cluster_count = generateSoftBoundaries(soft_clusters, indices, index_count, vertex_count, hard_clusters, hard_cluster_count, cache_size, threshold, cache_timestamps);

	const unsigned int* clusters = soft_clusters;
	size_t cluster_count = soft_cluster_count;

	// fill sort data
	float* sort_data = allocator.allocate<float>(cluster_count);
	calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_positions_stride, clusters, cluster_count);

	// sort clusters using sort data
	unsigned short* sort_keys = allocator.allocate<unsigned short>(cluster_count);
	unsigned int* sort_order = allocator.allocate<unsigned int>(cluster_count);
	calculateSortOrderRadix(sort_order, sort_data, sort_keys, cluster_count);

	// fill output buffer
	size_t offset = 0;

	for (size_t it = 0; it < cluster_count; ++it)
	{
		unsigned int cluster = sort_order[it];
		assert(cluster < cluster_count);

		size_t cluster_begin = clusters[cluster] * 3;
		size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count;
		assert(cluster_begin < cluster_end);

		memcpy(destination + offset, indices + cluster_begin, (cluster_end - cluster_begin) * sizeof(unsigned int));
		offset += cluster_end - cluster_begin;
	}

	assert(offset == index_count);
}
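
// Example usage (a sketch, not part of the library; `Vertex`, `vertices` and `indices`
// are the caller's hypothetical mesh data, with position as the first three floats):
//
//   meshopt_optimizeVertexCache(indices.data(), indices.data(), indices.size(), vertices.size());
//   // threshold 1.05 tolerates up to a 5% ACMR regression in exchange for less overdraw
//   meshopt_optimizeOverdraw(indices.data(), indices.data(), indices.size(),
//       &vertices[0].x, vertices.size(), sizeof(Vertex), 1.05f);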