Path: blob/master/thirdparty/astcenc/astcenc_find_best_partitioning.cpp
9902 views
// SPDX-License-Identifier: Apache-2.01// ----------------------------------------------------------------------------2// Copyright 2011-2025 Arm Limited3//4// Licensed under the Apache License, Version 2.0 (the "License"); you may not5// use this file except in compliance with the License. You may obtain a copy6// of the License at:7//8// http://www.apache.org/licenses/LICENSE-2.09//10// Unless required by applicable law or agreed to in writing, software11// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT12// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the13// License for the specific language governing permissions and limitations14// under the License.15// ----------------------------------------------------------------------------1617#if !defined(ASTCENC_DECOMPRESS_ONLY)1819/**20* @brief Functions for finding best partition for a block.21*22* The partition search operates in two stages. The first pass uses kmeans clustering to group23* texels into an ideal partitioning for the requested partition count, and then compares that24* against the 1024 partitionings generated by the ASTC partition hash function. The generated25* partitions are then ranked by the number of texels in the wrong partition, compared to the ideal26* clustering. All 1024 partitions are tested for similarity and ranked, apart from duplicates and27* partitionings that actually generate fewer than the requested partition count, but only the top28* N candidates are actually put through a more detailed search. N is determined by the compressor29* quality preset.30*31* For the detailed search, each candidate is checked against two possible encoding methods:32*33* - The best partitioning assuming different chroma colors (RGB + RGB or RGB + delta endpoints).34* - The best partitioning assuming same chroma colors (RGB + scale endpoints).35*36* This is implemented by computing the compute mean color and dominant direction for each37* partition. This defines two lines, both of which go through the mean color value.38*39* - One line has a direction defined by the dominant direction; this is used to assess the error40* from using an uncorrelated color representation.41* - The other line goes through (0,0,0,1) and is used to assess the error from using a same chroma42* (RGB + scale) color representation.43*44* The best candidate is selected by computing the squared-errors that result from using these45* lines for endpoint selection.46*/4748#include <limits>49#include "astcenc_internal.h"5051/**52* @brief Pick some initial kmeans cluster centers.53*54* @param blk The image block color data to compress.55* @param texel_count The number of texels in the block.56* @param partition_count The number of partitions in the block.57* @param[out] cluster_centers The initial partition cluster center colors.58*/59static void kmeans_init(60const image_block& blk,61unsigned int texel_count,62unsigned int partition_count,63vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS]64) {65promise(texel_count > 0);66promise(partition_count > 0);6768unsigned int clusters_selected = 0;69float distances[BLOCK_MAX_TEXELS];7071// Pick a random sample as first cluster center; 145897 from random.org72unsigned int sample = 145897 % texel_count;73vfloat4 center_color = blk.texel(sample);74cluster_centers[clusters_selected] = center_color;75clusters_selected++;7677// Compute the distance to the first cluster center78float distance_sum = 0.0f;79for (unsigned int i = 0; i < texel_count; i++)80{81vfloat4 color = blk.texel(i);82vfloat4 diff = color - center_color;83float distance = dot_s(diff * diff, blk.channel_weight);84distance_sum += distance;85distances[i] = distance;86}8788// More numbers from random.org for weighted-random center selection89const float cluster_cutoffs[9] {900.626220f, 0.932770f, 0.275454f,910.318558f, 0.240113f, 0.009190f,920.347661f, 0.731960f, 0.156391f93};9495unsigned int cutoff = (clusters_selected - 1) + 3 * (partition_count - 2);9697// Pick the remaining samples as needed98while (true)99{100// Pick the next center in a weighted-random fashion.101float summa = 0.0f;102float distance_cutoff = distance_sum * cluster_cutoffs[cutoff++];103for (sample = 0; sample < texel_count; sample++)104{105summa += distances[sample];106if (summa >= distance_cutoff)107{108break;109}110}111112// Clamp to a valid range and store the selected cluster center113sample = astc::min(sample, texel_count - 1);114115center_color = blk.texel(sample);116cluster_centers[clusters_selected++] = center_color;117if (clusters_selected >= partition_count)118{119break;120}121122// Compute the distance to the new cluster center, keep the min dist123distance_sum = 0.0f;124for (unsigned int i = 0; i < texel_count; i++)125{126vfloat4 color = blk.texel(i);127vfloat4 diff = color - center_color;128float distance = dot_s(diff * diff, blk.channel_weight);129distance = astc::min(distance, distances[i]);130distance_sum += distance;131distances[i] = distance;132}133}134}135136/**137* @brief Assign texels to clusters, based on a set of chosen center points.138*139* @param blk The image block color data to compress.140* @param texel_count The number of texels in the block.141* @param partition_count The number of partitions in the block.142* @param cluster_centers The partition cluster center colors.143* @param[out] partition_of_texel The partition assigned for each texel.144*/145static void kmeans_assign(146const image_block& blk,147unsigned int texel_count,148unsigned int partition_count,149const vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],150uint8_t partition_of_texel[BLOCK_MAX_TEXELS]151) {152promise(texel_count > 0);153promise(partition_count > 0);154155uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };156157// Find the best partition for every texel158for (unsigned int i = 0; i < texel_count; i++)159{160float best_distance = std::numeric_limits<float>::max();161unsigned int best_partition = 0;162163vfloat4 color = blk.texel(i);164for (unsigned int j = 0; j < partition_count; j++)165{166vfloat4 diff = color - cluster_centers[j];167float distance = dot_s(diff * diff, blk.channel_weight);168if (distance < best_distance)169{170best_distance = distance;171best_partition = j;172}173}174175partition_of_texel[i] = static_cast<uint8_t>(best_partition);176partition_texel_count[best_partition]++;177}178179// It is possible to get a situation where a partition ends up without any texels. In this case,180// assign texel N to partition N. This is silly, but ensures that every partition retains at181// least one texel. Reassigning a texel in this manner may cause another partition to go empty,182// so if we actually did a reassignment, run the whole loop over again.183bool problem_case;184do185{186problem_case = false;187for (unsigned int i = 0; i < partition_count; i++)188{189if (partition_texel_count[i] == 0)190{191partition_texel_count[partition_of_texel[i]]--;192partition_texel_count[i]++;193partition_of_texel[i] = static_cast<uint8_t>(i);194problem_case = true;195}196}197} while (problem_case);198}199200/**201* @brief Compute new cluster centers based on their center of gravity.202*203* @param blk The image block color data to compress.204* @param texel_count The number of texels in the block.205* @param partition_count The number of partitions in the block.206* @param[out] cluster_centers The new cluster center colors.207* @param partition_of_texel The partition assigned for each texel.208*/209static void kmeans_update(210const image_block& blk,211unsigned int texel_count,212unsigned int partition_count,213vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],214const uint8_t partition_of_texel[BLOCK_MAX_TEXELS]215) {216promise(texel_count > 0);217promise(partition_count > 0);218219vfloat4 color_sum[BLOCK_MAX_PARTITIONS] {220vfloat4::zero(),221vfloat4::zero(),222vfloat4::zero(),223vfloat4::zero()224};225226uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };227228// Find the center of gravity in each cluster229for (unsigned int i = 0; i < texel_count; i++)230{231uint8_t partition = partition_of_texel[i];232color_sum[partition] += blk.texel(i);233partition_texel_count[partition]++;234}235236// Set the center of gravity to be the new cluster center237for (unsigned int i = 0; i < partition_count; i++)238{239float scale = 1.0f / static_cast<float>(partition_texel_count[i]);240cluster_centers[i] = color_sum[i] * scale;241}242}243244/**245* @brief Compute bit-mismatch for partitioning in 2-partition mode.246*247* @param a The texel assignment bitvector for the block.248* @param b The texel assignment bitvector for the partition table.249*250* @return The number of bit mismatches.251*/252static inline uint8_t partition_mismatch2(253const uint64_t a[2],254const uint64_t b[2]255) {256int v1 = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]);257int v2 = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]);258259// Divide by 2 because XOR always counts errors twice, once when missing260// in the expected position, and again when present in the wrong partition261return static_cast<uint8_t>(astc::min(v1, v2) / 2);262}263264/**265* @brief Compute bit-mismatch for partitioning in 3-partition mode.266*267* @param a The texel assignment bitvector for the block.268* @param b The texel assignment bitvector for the partition table.269*270* @return The number of bit mismatches.271*/272static inline uint8_t partition_mismatch3(273const uint64_t a[3],274const uint64_t b[3]275) {276int p00 = popcount(a[0] ^ b[0]);277int p01 = popcount(a[0] ^ b[1]);278int p02 = popcount(a[0] ^ b[2]);279280int p10 = popcount(a[1] ^ b[0]);281int p11 = popcount(a[1] ^ b[1]);282int p12 = popcount(a[1] ^ b[2]);283284int p20 = popcount(a[2] ^ b[0]);285int p21 = popcount(a[2] ^ b[1]);286int p22 = popcount(a[2] ^ b[2]);287288int s0 = p11 + p22;289int s1 = p12 + p21;290int v0 = astc::min(s0, s1) + p00;291292int s2 = p10 + p22;293int s3 = p12 + p20;294int v1 = astc::min(s2, s3) + p01;295296int s4 = p10 + p21;297int s5 = p11 + p20;298int v2 = astc::min(s4, s5) + p02;299300// Divide by 2 because XOR always counts errors twice, once when missing301// in the expected position, and again when present in the wrong partition302return static_cast<uint8_t>(astc::min(v0, v1, v2) / 2);303}304305/**306* @brief Compute bit-mismatch for partitioning in 4-partition mode.307*308* @param a The texel assignment bitvector for the block.309* @param b The texel assignment bitvector for the partition table.310*311* @return The number of bit mismatches.312*/313static inline uint8_t partition_mismatch4(314const uint64_t a[4],315const uint64_t b[4]316) {317int p00 = popcount(a[0] ^ b[0]);318int p01 = popcount(a[0] ^ b[1]);319int p02 = popcount(a[0] ^ b[2]);320int p03 = popcount(a[0] ^ b[3]);321322int p10 = popcount(a[1] ^ b[0]);323int p11 = popcount(a[1] ^ b[1]);324int p12 = popcount(a[1] ^ b[2]);325int p13 = popcount(a[1] ^ b[3]);326327int p20 = popcount(a[2] ^ b[0]);328int p21 = popcount(a[2] ^ b[1]);329int p22 = popcount(a[2] ^ b[2]);330int p23 = popcount(a[2] ^ b[3]);331332int p30 = popcount(a[3] ^ b[0]);333int p31 = popcount(a[3] ^ b[1]);334int p32 = popcount(a[3] ^ b[2]);335int p33 = popcount(a[3] ^ b[3]);336337int mx23 = astc::min(p22 + p33, p23 + p32);338int mx13 = astc::min(p21 + p33, p23 + p31);339int mx12 = astc::min(p21 + p32, p22 + p31);340int mx03 = astc::min(p20 + p33, p23 + p30);341int mx02 = astc::min(p20 + p32, p22 + p30);342int mx01 = astc::min(p21 + p30, p20 + p31);343344int v0 = p00 + astc::min(p11 + mx23, p12 + mx13, p13 + mx12);345int v1 = p01 + astc::min(p10 + mx23, p12 + mx03, p13 + mx02);346int v2 = p02 + astc::min(p11 + mx03, p10 + mx13, p13 + mx01);347int v3 = p03 + astc::min(p11 + mx02, p12 + mx01, p10 + mx12);348349// Divide by 2 because XOR always counts errors twice, once when missing350// in the expected position, and again when present in the wrong partition351return static_cast<uint8_t>(astc::min(v0, v1, v2, v3) / 2);352}353354using mismatch_dispatch = unsigned int (*)(const uint64_t*, const uint64_t*);355356/**357* @brief Count the partition table mismatches vs the data clustering.358*359* @param bsd The block size information.360* @param partition_count The number of partitions in the block.361* @param bitmaps The block texel partition assignment patterns.362* @param[out] mismatch_counts The array storing per partitioning mismatch counts.363*/364static void count_partition_mismatch_bits(365const block_size_descriptor& bsd,366unsigned int partition_count,367const uint64_t bitmaps[BLOCK_MAX_PARTITIONS],368uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS]369) {370unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1];371promise(active_count > 0);372373if (partition_count == 2)374{375for (unsigned int i = 0; i < active_count; i++)376{377mismatch_counts[i] = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]);378assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);379assert(mismatch_counts[i] < bsd.texel_count);380}381}382else if (partition_count == 3)383{384for (unsigned int i = 0; i < active_count; i++)385{386mismatch_counts[i] = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]);387assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);388assert(mismatch_counts[i] < bsd.texel_count);389}390}391else392{393for (unsigned int i = 0; i < active_count; i++)394{395mismatch_counts[i] = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]);396assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);397assert(mismatch_counts[i] < bsd.texel_count);398}399}400}401402/**403* @brief Use counting sort on the mismatch array to sort partition candidates.404*405* @param partitioning_count The number of packed partitionings.406* @param mismatch_count Partitioning mismatch counts, in index order.407* @param[out] partition_ordering Partition index values, in mismatch order.408*409* @return The number of active partitions in this selection.410*/411static unsigned int get_partition_ordering_by_mismatch_bits(412unsigned int texel_count,413unsigned int partitioning_count,414const uint8_t mismatch_count[BLOCK_MAX_PARTITIONINGS],415uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]416) {417promise(partitioning_count > 0);418uint16_t mscount[BLOCK_MAX_KMEANS_TEXELS] { 0 };419420// Create the histogram of mismatch counts421for (unsigned int i = 0; i < partitioning_count; i++)422{423mscount[mismatch_count[i]]++;424}425426// Create a running sum from the histogram array427// Indices store previous values only; i.e. exclude self after sum428uint16_t sum = 0;429for (unsigned int i = 0; i < texel_count; i++)430{431uint16_t cnt = mscount[i];432mscount[i] = sum;433sum += cnt;434}435436// Use the running sum as the index, incrementing after read to allow437// sequential entries with the same count438for (unsigned int i = 0; i < partitioning_count; i++)439{440unsigned int idx = mscount[mismatch_count[i]]++;441partition_ordering[idx] = static_cast<uint16_t>(i);442}443444return partitioning_count;445}446447/**448* @brief Use k-means clustering to compute a partition ordering for a block..449*450* @param bsd The block size information.451* @param blk The image block color data to compress.452* @param partition_count The desired number of partitions in the block.453* @param[out] partition_ordering The list of recommended partition indices, in priority order.454*455* @return The number of active partitionings in this selection.456*/457static unsigned int compute_kmeans_partition_ordering(458const block_size_descriptor& bsd,459const image_block& blk,460unsigned int partition_count,461uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]462) {463vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS];464uint8_t texel_partitions[BLOCK_MAX_TEXELS];465466// Use three passes of k-means clustering to partition the block data467for (unsigned int i = 0; i < 3; i++)468{469if (i == 0)470{471kmeans_init(blk, bsd.texel_count, partition_count, cluster_centers);472}473else474{475kmeans_update(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);476}477478kmeans_assign(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);479}480481// Construct the block bitmaps of texel assignments to each partition482uint64_t bitmaps[BLOCK_MAX_PARTITIONS] { 0 };483unsigned int texels_to_process = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS);484promise(texels_to_process > 0);485for (unsigned int i = 0; i < texels_to_process; i++)486{487unsigned int idx = bsd.kmeans_texels[i];488bitmaps[texel_partitions[idx]] |= 1ULL << i;489}490491// Count the mismatch between the block and the format's partition tables492uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS];493count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts);494495// Sort the partitions based on the number of mismatched bits496return get_partition_ordering_by_mismatch_bits(497texels_to_process,498bsd.partitioning_count_selected[partition_count - 1],499mismatch_counts, partition_ordering);500}501502/**503* @brief Insert a partitioning into an order list of results, sorted by error.504*505* @param max_values The max number of entries in the best result arrays.506* @param this_error The error of the new entry.507* @param this_partition The partition ID of the new entry.508* @param[out] best_errors The array of best error values.509* @param[out] best_partitions The array of best partition values.510*/511static void insert_result(512unsigned int max_values,513float this_error,514unsigned int this_partition,515float* best_errors,516unsigned int* best_partitions)517{518promise(max_values > 0);519520// Don't bother searching if the current worst error beats the new error521if (this_error >= best_errors[max_values - 1])522{523return;524}525526// Else insert into the list in error-order527for (unsigned int i = 0; i < max_values; i++)528{529// Existing result is better - move on ...530if (this_error > best_errors[i])531{532continue;533}534535// Move existing results down one536for (unsigned int j = max_values - 1; j > i; j--)537{538best_errors[j] = best_errors[j - 1];539best_partitions[j] = best_partitions[j - 1];540}541542// Insert new result543best_errors[i] = this_error;544best_partitions[i] = this_partition;545break;546}547}548549/* See header for documentation. */550unsigned int find_best_partition_candidates(551const block_size_descriptor& bsd,552const image_block& blk,553unsigned int partition_count,554unsigned int partition_search_limit,555unsigned int best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES],556unsigned int requested_candidates557) {558// Constant used to estimate quantization error for a given partitioning; the optimal value for559// this depends on bitrate. These values have been determined empirically.560unsigned int texels_per_block = bsd.texel_count;561float weight_imprecision_estim = 0.055f;562if (texels_per_block <= 20)563{564weight_imprecision_estim = 0.03f;565}566else if (texels_per_block <= 31)567{568weight_imprecision_estim = 0.04f;569}570else if (texels_per_block <= 41)571{572weight_imprecision_estim = 0.05f;573}574575promise(partition_count > 0);576promise(partition_search_limit > 0);577578weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim;579580uint16_t partition_sequence[BLOCK_MAX_PARTITIONINGS];581unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence);582partition_search_limit = astc::min(partition_search_limit, sequence_len);583requested_candidates = astc::min(partition_search_limit, requested_candidates);584585bool uses_alpha = !blk.is_constant_channel(3);586587// Partitioning errors assuming uncorrelated-chrominance endpoints588float uncor_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];589unsigned int uncor_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];590591// Partitioning errors assuming same-chrominance endpoints592float samec_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];593unsigned int samec_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];594595for (unsigned int i = 0; i < requested_candidates; i++)596{597uncor_best_errors[i] = ERROR_CALC_DEFAULT;598samec_best_errors[i] = ERROR_CALC_DEFAULT;599}600601if (uses_alpha)602{603for (unsigned int i = 0; i < partition_search_limit; i++)604{605unsigned int partition = partition_sequence[i];606const auto& pi = bsd.get_raw_partition_info(partition_count, partition);607608// Compute weighting to give to each component in each partition609partition_metrics pms[BLOCK_MAX_PARTITIONS];610611compute_avgs_and_dirs_4_comp(pi, blk, pms);612613line4 uncor_lines[BLOCK_MAX_PARTITIONS];614line4 samec_lines[BLOCK_MAX_PARTITIONS];615616processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS];617processed_line4 samec_plines[BLOCK_MAX_PARTITIONS];618619float line_lengths[BLOCK_MAX_PARTITIONS];620621for (unsigned int j = 0; j < partition_count; j++)622{623partition_metrics& pm = pms[j];624625uncor_lines[j].a = pm.avg;626uncor_lines[j].b = normalize_safe(pm.dir, unit4());627628uncor_plines[j].amod = uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b);629uncor_plines[j].bs = uncor_lines[j].b;630631samec_lines[j].a = vfloat4::zero();632samec_lines[j].b = normalize_safe(pm.avg, unit4());633634samec_plines[j].amod = vfloat4::zero();635samec_plines[j].bs = samec_lines[j].b;636}637638float uncor_error = 0.0f;639float samec_error = 0.0f;640641compute_error_squared_rgba(pi,642blk,643uncor_plines,644samec_plines,645line_lengths,646uncor_error,647samec_error);648649// Compute an estimate of error introduced by weight quantization imprecision.650// This error is computed as follows, for each partition651// 1: compute the principal-axis vector (full length) in error-space652// 2: convert the principal-axis vector to regular RGB-space653// 3: scale the vector by a constant that estimates average quantization error654// 4: for each texel, square the vector, then do a dot-product with the texel's655// error weight; sum up the results across all texels.656// 4(optimized): square the vector once, then do a dot-product with the average657// texel error, then multiply by the number of texels.658659for (unsigned int j = 0; j < partition_count; j++)660{661float tpp = static_cast<float>(pi.partition_texel_count[j]);662vfloat4 error_weights(tpp * weight_imprecision_estim);663664vfloat4 uncor_vector = uncor_lines[j].b * line_lengths[j];665vfloat4 samec_vector = samec_lines[j].b * line_lengths[j];666667uncor_error += dot_s(uncor_vector * uncor_vector, error_weights);668samec_error += dot_s(samec_vector * samec_vector, error_weights);669}670671insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);672insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);673}674}675else676{677for (unsigned int i = 0; i < partition_search_limit; i++)678{679unsigned int partition = partition_sequence[i];680const auto& pi = bsd.get_raw_partition_info(partition_count, partition);681682// Compute weighting to give to each component in each partition683partition_metrics pms[BLOCK_MAX_PARTITIONS];684compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);685686partition_lines3 plines[BLOCK_MAX_PARTITIONS];687688for (unsigned int j = 0; j < partition_count; j++)689{690partition_metrics& pm = pms[j];691partition_lines3& pl = plines[j];692693pl.uncor_line.a = pm.avg;694pl.uncor_line.b = normalize_safe(pm.dir, unit3());695696pl.samec_line.a = vfloat4::zero();697pl.samec_line.b = normalize_safe(pm.avg, unit3());698699pl.uncor_pline.amod = pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b);700pl.uncor_pline.bs = pl.uncor_line.b;701702pl.samec_pline.amod = vfloat4::zero();703pl.samec_pline.bs = pl.samec_line.b;704}705706float uncor_error = 0.0f;707float samec_error = 0.0f;708709compute_error_squared_rgb(pi,710blk,711plines,712uncor_error,713samec_error);714715// Compute an estimate of error introduced by weight quantization imprecision.716// This error is computed as follows, for each partition717// 1: compute the principal-axis vector (full length) in error-space718// 2: convert the principal-axis vector to regular RGB-space719// 3: scale the vector by a constant that estimates average quantization error720// 4: for each texel, square the vector, then do a dot-product with the texel's721// error weight; sum up the results across all texels.722// 4(optimized): square the vector once, then do a dot-product with the average723// texel error, then multiply by the number of texels.724725for (unsigned int j = 0; j < partition_count; j++)726{727partition_lines3& pl = plines[j];728729float tpp = static_cast<float>(pi.partition_texel_count[j]);730vfloat4 error_weights(tpp * weight_imprecision_estim);731732vfloat4 uncor_vector = pl.uncor_line.b * pl.line_length;733vfloat4 samec_vector = pl.samec_line.b * pl.line_length;734735uncor_error += dot3_s(uncor_vector * uncor_vector, error_weights);736samec_error += dot3_s(samec_vector * samec_vector, error_weights);737}738739insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);740insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);741}742}743744unsigned int interleave[2 * TUNE_MAX_PARTITIONING_CANDIDATES];745for (unsigned int i = 0; i < requested_candidates; i++)746{747interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;748interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;749}750751uint64_t bitmasks[1024/64] { 0 };752unsigned int emitted = 0;753754// Deduplicate the first "requested" entries755for (unsigned int i = 0; i < requested_candidates * 2; i++)756{757unsigned int partition = interleave[i];758759unsigned int word = partition / 64;760unsigned int bit = partition % 64;761762bool written = bitmasks[word] & (1ull << bit);763764if (!written)765{766best_partitions[emitted] = partition;767bitmasks[word] |= 1ull << bit;768emitted++;769770if (emitted == requested_candidates)771{772break;773}774}775}776777return emitted;778}779780#endif781782783