// Path: blob/master/thirdparty/astcenc/astcenc_compress_symbolic.cpp
// 9896 views
// SPDX-License-Identifier: Apache-2.01// ----------------------------------------------------------------------------2// Copyright 2011-2025 Arm Limited3//4// Licensed under the Apache License, Version 2.0 (the "License"); you may not5// use this file except in compliance with the License. You may obtain a copy6// of the License at:7//8// http://www.apache.org/licenses/LICENSE-2.09//10// Unless required by applicable law or agreed to in writing, software11// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT12// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the13// License for the specific language governing permissions and limitations14// under the License.15// ----------------------------------------------------------------------------1617#if !defined(ASTCENC_DECOMPRESS_ONLY)1819/**20* @brief Functions to compress a symbolic block.21*/2223#include "astcenc_internal.h"24#include "astcenc_diagnostic_trace.h"2526#include <cassert>2728/**29* @brief Merge two planes of endpoints into a single vector.30*31* @param ep_plane1 The endpoints for plane 1.32* @param ep_plane2 The endpoints for plane 2.33* @param component_plane2 The color component for plane 2.34* @param[out] result The merged output.35*/36static void merge_endpoints(37const endpoints& ep_plane1,38const endpoints& ep_plane2,39unsigned int component_plane2,40endpoints& result41) {42unsigned int partition_count = ep_plane1.partition_count;43assert(partition_count == 1);4445vmask4 sep_mask = vint4::lane_id() == vint4(component_plane2);4647result.partition_count = partition_count;48result.endpt0[0] = select(ep_plane1.endpt0[0], ep_plane2.endpt0[0], sep_mask);49result.endpt1[0] = select(ep_plane1.endpt1[0], ep_plane2.endpt1[0], sep_mask);50}5152/**53* @brief Attempt to improve weights given a chosen configuration.54*55* Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per56* partition and per plane) and attempt to improve 
image quality by moving each weight up by one or57* down by one quantization step.58*59* This is a specialized function which only supports operating on undecimated weight grids,60* therefore primarily improving the performance of 4x4 and 5x5 blocks where grid decimation61* is needed less often.62*63* @param decode_mode The decode mode (LDR, HDR).64* @param bsd The block size information.65* @param blk The image block color data to compress.66* @param[out] scb The symbolic compressed block output.67*/68static bool realign_weights_undecimated(69astcenc_profile decode_mode,70const block_size_descriptor& bsd,71const image_block& blk,72symbolic_compressed_block& scb73) {74// Get the partition descriptor75unsigned int partition_count = scb.partition_count;76const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);7778// Get the quantization table79const block_mode& bm = bsd.get_block_mode(scb.block_mode);80unsigned int weight_quant_level = bm.quant_mode;81const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];8283unsigned int max_plane = bm.is_dual_plane;84int plane2_component = scb.plane2_component;85vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);8687// Decode the color endpoints88bool rgb_hdr;89bool alpha_hdr;90vint4 endpnt0[BLOCK_MAX_PARTITIONS];91vint4 endpnt1[BLOCK_MAX_PARTITIONS];92vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];93vfloat4 offset[BLOCK_MAX_PARTITIONS];9495promise(partition_count > 0);9697for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)98{99unpack_color_endpoints(decode_mode,100scb.color_formats[pa_idx],101scb.color_values[pa_idx],102rgb_hdr, alpha_hdr,103endpnt0[pa_idx],104endpnt1[pa_idx]);105}106107uint8_t* dec_weights_uquant = scb.weights;108bool adjustments = false;109110// For each plane and partition ...111for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)112{113for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)114{115// Compute the endpoint delta 
for all components in current plane116vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];117epd = select(epd, vint4::zero(), plane_mask);118119endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);120offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);121}122123// For each weight compute previous, current, and next errors124promise(bsd.texel_count > 0);125for (unsigned int texel = 0; texel < bsd.texel_count; texel++)126{127int uqw = dec_weights_uquant[texel];128129uint32_t prev_and_next = qat.prev_next_values[uqw];130int uqw_down = prev_and_next & 0xFF;131int uqw_up = (prev_and_next >> 8) & 0xFF;132133// Interpolate the colors to create the diffs134float weight_base = static_cast<float>(uqw);135float weight_down = static_cast<float>(uqw_down - uqw);136float weight_up = static_cast<float>(uqw_up - uqw);137138unsigned int partition = pi.partition_of_texel[texel];139vfloat4 color_offset = offset[partition];140vfloat4 color_base = endpnt0f[partition];141142vfloat4 color = color_base + color_offset * weight_base;143vfloat4 orig_color = blk.texel(texel);144vfloat4 error_weight = blk.channel_weight;145146vfloat4 color_diff = color - orig_color;147vfloat4 color_diff_down = color_diff + color_offset * weight_down;148vfloat4 color_diff_up = color_diff + color_offset * weight_up;149150float error_base = dot_s(color_diff * color_diff, error_weight);151float error_down = dot_s(color_diff_down * color_diff_down, error_weight);152float error_up = dot_s(color_diff_up * color_diff_up, error_weight);153154// Check if the prev or next error is better, and if so use it155if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))156{157dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up);158adjustments = true;159}160else if ((error_down < error_base) && (uqw > 0))161{162dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down);163adjustments = true;164}165}166167// Prepare iteration for plane 2168dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;169plane_mask = 
~plane_mask;170}171172return adjustments;173}174175/**176* @brief Attempt to improve weights given a chosen configuration.177*178* Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per179* partition and per plane) and attempt to improve image quality by moving each weight up by one or180* down by one quantization step.181*182* @param decode_mode The decode mode (LDR, HDR).183* @param bsd The block size information.184* @param blk The image block color data to compress.185* @param[out] scb The symbolic compressed block output.186*/187static bool realign_weights_decimated(188astcenc_profile decode_mode,189const block_size_descriptor& bsd,190const image_block& blk,191symbolic_compressed_block& scb192) {193// Get the partition descriptor194unsigned int partition_count = scb.partition_count;195const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);196197// Get the quantization table198const block_mode& bm = bsd.get_block_mode(scb.block_mode);199unsigned int weight_quant_level = bm.quant_mode;200const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];201202// Get the decimation table203const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);204unsigned int weight_count = di.weight_count;205assert(weight_count != bsd.texel_count);206207unsigned int max_plane = bm.is_dual_plane;208int plane2_component = scb.plane2_component;209vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);210211// Decode the color endpoints212bool rgb_hdr;213bool alpha_hdr;214vint4 endpnt0[BLOCK_MAX_PARTITIONS];215vint4 endpnt1[BLOCK_MAX_PARTITIONS];216vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];217vfloat4 offset[BLOCK_MAX_PARTITIONS];218219promise(partition_count > 0);220promise(weight_count > 0);221222for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)223{224unpack_color_endpoints(decode_mode,225scb.color_formats[pa_idx],226scb.color_values[pa_idx],227rgb_hdr, 
alpha_hdr,228endpnt0[pa_idx],229endpnt1[pa_idx]);230}231232uint8_t* dec_weights_uquant = scb.weights;233bool adjustments = false;234235// For each plane and partition ...236for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)237{238for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)239{240// Compute the endpoint delta for all components in current plane241vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];242epd = select(epd, vint4::zero(), plane_mask);243244endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);245offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);246}247248// Create an unquantized weight grid for this decimation level249ASTCENC_ALIGNAS float uq_weightsf[BLOCK_MAX_WEIGHTS];250for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH)251{252vint unquant_value(dec_weights_uquant + we_idx);253vfloat unquant_valuef = int_to_float(unquant_value);254storea(unquant_valuef, uq_weightsf + we_idx);255}256257// For each weight compute previous, current, and next errors258for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++)259{260int uqw = dec_weights_uquant[we_idx];261uint32_t prev_and_next = qat.prev_next_values[uqw];262263float uqw_base = uq_weightsf[we_idx];264float uqw_down = static_cast<float>(prev_and_next & 0xFF);265float uqw_up = static_cast<float>((prev_and_next >> 8) & 0xFF);266267float uqw_diff_down = uqw_down - uqw_base;268float uqw_diff_up = uqw_up - uqw_base;269270vfloat4 error_basev = vfloat4::zero();271vfloat4 error_downv = vfloat4::zero();272vfloat4 error_upv = vfloat4::zero();273274// Interpolate the colors to create the diffs275unsigned int texels_to_evaluate = di.weight_texel_count[we_idx];276promise(texels_to_evaluate > 0);277for (unsigned int te_idx = 0; te_idx < texels_to_evaluate; te_idx++)278{279unsigned int texel = di.weight_texels_tr[te_idx][we_idx];280281float tw_base = di.texel_contrib_for_weight[te_idx][we_idx];282283float weight_base = (uq_weightsf[di.texel_weights_tr[0][texel]] * 
di.texel_weight_contribs_float_tr[0][texel]284+ uq_weightsf[di.texel_weights_tr[1][texel]] * di.texel_weight_contribs_float_tr[1][texel])285+ (uq_weightsf[di.texel_weights_tr[2][texel]] * di.texel_weight_contribs_float_tr[2][texel]286+ uq_weightsf[di.texel_weights_tr[3][texel]] * di.texel_weight_contribs_float_tr[3][texel]);287288// Ideally this is integer rounded, but IQ gain it isn't worth the overhead289// float weight = astc::flt_rd(weight_base + 0.5f);290// float weight_down = astc::flt_rd(weight_base + 0.5f + uqw_diff_down * tw_base) - weight;291// float weight_up = astc::flt_rd(weight_base + 0.5f + uqw_diff_up * tw_base) - weight;292float weight_down = weight_base + uqw_diff_down * tw_base - weight_base;293float weight_up = weight_base + uqw_diff_up * tw_base - weight_base;294295unsigned int partition = pi.partition_of_texel[texel];296vfloat4 color_offset = offset[partition];297vfloat4 color_base = endpnt0f[partition];298299vfloat4 color = color_base + color_offset * weight_base;300vfloat4 orig_color = blk.texel(texel);301302vfloat4 color_diff = color - orig_color;303vfloat4 color_down_diff = color_diff + color_offset * weight_down;304vfloat4 color_up_diff = color_diff + color_offset * weight_up;305306error_basev += color_diff * color_diff;307error_downv += color_down_diff * color_down_diff;308error_upv += color_up_diff * color_up_diff;309}310311vfloat4 error_weight = blk.channel_weight;312float error_base = hadd_s(error_basev * error_weight);313float error_down = hadd_s(error_downv * error_weight);314float error_up = hadd_s(error_upv * error_weight);315316// Check if the prev or next error is better, and if so use it317if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))318{319uq_weightsf[we_idx] = uqw_up;320dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_up);321adjustments = true;322}323else if ((error_down < error_base) && (uqw > 0))324{325uq_weightsf[we_idx] = uqw_down;326dec_weights_uquant[we_idx] = 
static_cast<uint8_t>(uqw_down);327adjustments = true;328}329}330331// Prepare iteration for plane 2332dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;333plane_mask = ~plane_mask;334}335336return adjustments;337}338339/**340* @brief Compress a block using a chosen partitioning and 1 plane of weights.341*342* @param config The compressor configuration.343* @param bsd The block size information.344* @param blk The image block color data to compress.345* @param only_always True if we only use "always" percentile block modes.346* @param tune_errorval_threshold The error value threshold.347* @param partition_count The partition count.348* @param partition_index The partition index if @c partition_count is 2-4.349* @param[out] scb The symbolic compressed block output.350* @param[out] tmpbuf The quantized weights for plane 1.351*/352static float compress_symbolic_block_for_partition_1plane(353const astcenc_config& config,354const block_size_descriptor& bsd,355const image_block& blk,356bool only_always,357float tune_errorval_threshold,358unsigned int partition_count,359unsigned int partition_index,360symbolic_compressed_block& scb,361compression_working_buffers& tmpbuf,362int quant_limit363) {364promise(partition_count > 0);365promise(config.tune_candidate_limit > 0);366promise(config.tune_refinement_limit > 0);367368int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);369370auto compute_difference = &compute_symbolic_block_difference_1plane;371if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM))372{373compute_difference = &compute_symbolic_block_difference_1plane_1partition;374}375376const auto& pi = bsd.get_partition_info(partition_count, partition_index);377378// Compute ideal weights and endpoint colors, with no quantization or decimation379endpoints_and_weights& ei = tmpbuf.ei1;380compute_ideal_colors_and_weights_1plane(blk, pi, ei);381382// Compute ideal weights and endpoint colors for every decimation383float* dec_weights_ideal = 
tmpbuf.dec_weights_ideal;384uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;385386// For each decimation mode, compute an ideal set of weights with no quantization387unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always388: bsd.decimation_mode_count_selected;389promise(max_decimation_modes > 0);390for (unsigned int i = 0; i < max_decimation_modes; i++)391{392const auto& dm = bsd.get_decimation_mode(i);393if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))394{395continue;396}397398const auto& di = bsd.get_decimation_info(i);399400compute_ideal_weights_for_decimation(401ei,402di,403dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);404}405406// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal407// weight pair, compute the smallest weight that will result in a color value greater than 1408vfloat4 min_ep(10.0f);409for (unsigned int i = 0; i < partition_count; i++)410{411vfloat4 ep = (vfloat4(1.0f) - ei.ep.endpt0[i]) / (ei.ep.endpt1[i] - ei.ep.endpt0[i]);412413vmask4 use_ep = (ep > vfloat4(0.5f)) & (ep < min_ep);414min_ep = select(min_ep, ep, use_ep);415}416417float min_wt_cutoff = hmin_s(min_ep);418419// For each mode, use the angular method to compute a shift420compute_angular_endpoints_1plane(421only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);422423float* weight_low_value = tmpbuf.weight_low_value1;424float* weight_high_value = tmpbuf.weight_high_value1;425int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;426float* qwt_errors = tmpbuf.qwt_errors;427428// For each mode (which specifies a decimation and a quantization):429// * Compute number of bits needed for the quantized weights430// * Generate an optimized set of quantized weights431// * Compute quantization errors for the mode432433434static const int8_t free_bits_for_partition_count[4] {435115 - 4, 111 - 4 - PARTITION_INDEX_BITS, 108 - 4 - PARTITION_INDEX_BITS, 105 - 4 - PARTITION_INDEX_BITS436};437438unsigned int 
max_block_modes = only_always ? bsd.block_mode_count_1plane_always439: bsd.block_mode_count_1plane_selected;440promise(max_block_modes > 0);441for (unsigned int i = 0; i < max_block_modes; i++)442{443const block_mode& bm = bsd.block_modes[i];444445if (bm.quant_mode > max_weight_quant)446{447qwt_errors[i] = 1e38f;448continue;449}450451assert(!bm.is_dual_plane);452int bitcount = free_bits_for_partition_count[partition_count - 1] - bm.weight_bits;453if (bitcount <= 0)454{455qwt_errors[i] = 1e38f;456continue;457}458459if (weight_high_value[i] > 1.02f * min_wt_cutoff)460{461weight_high_value[i] = 1.0f;462}463464int decimation_mode = bm.decimation_mode;465const auto& di = bsd.get_decimation_info(decimation_mode);466467qwt_bitcounts[i] = static_cast<int8_t>(bitcount);468469ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];470471// Generate the optimized set of weights for the weight mode472compute_quantized_weights_for_decimation(473di,474weight_low_value[i], weight_high_value[i],475dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,476dec_weights_uquantf,477dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,478bm.get_weight_quant_mode());479480// Compute weight quantization errors for the block mode481qwt_errors[i] = compute_error_of_weight_set_1plane(482ei,483di,484dec_weights_uquantf);485}486487// Decide the optimal combination of color endpoint encodings and weight encodings488uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];489int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];490491quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];492quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];493494unsigned int candidate_count = compute_ideal_endpoint_formats(495pi, blk, ei.ep, qwt_bitcounts, qwt_errors,496config.tune_candidate_limit, 0, max_block_modes,497partition_format_specifiers, block_mode_index,498color_quant_level, color_quant_level_mod, tmpbuf);499500// Iterate over the N believed-to-be-best modes to find 
out which one is actually best501float best_errorval_in_mode = ERROR_CALC_DEFAULT;502float best_errorval_in_scb = scb.errorval;503504for (unsigned int i = 0; i < candidate_count; i++)505{506TRACE_NODE(node0, "candidate");507508const int bm_packed_index = block_mode_index[i];509assert(bm_packed_index >= 0 && bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_selected));510const block_mode& qw_bm = bsd.block_modes[bm_packed_index];511512int decimation_mode = qw_bm.decimation_mode;513const auto& di = bsd.get_decimation_info(decimation_mode);514promise(di.weight_count > 0);515516trace_add_data("weight_x", di.weight_x);517trace_add_data("weight_y", di.weight_y);518trace_add_data("weight_z", di.weight_z);519trace_add_data("weight_quant", qw_bm.quant_mode);520521// Recompute the ideal color endpoints before storing them522vfloat4 rgbs_colors[BLOCK_MAX_PARTITIONS];523vfloat4 rgbo_colors[BLOCK_MAX_PARTITIONS];524525symbolic_compressed_block workscb;526endpoints workep = ei.ep;527528uint8_t* u8_weight_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;529530for (unsigned int j = 0; j < di.weight_count; j++)531{532workscb.weights[j] = u8_weight_src[j];533}534535for (unsigned int l = 0; l < config.tune_refinement_limit; l++)536{537recompute_ideal_colors_1plane(538blk, pi, di, workscb.weights,539workep, rgbs_colors, rgbo_colors);540541// Quantize the chosen color, tracking if worth trying the mod value542bool all_same = color_quant_level[i] != color_quant_level_mod[i];543for (unsigned int j = 0; j < partition_count; j++)544{545workscb.color_formats[j] = pack_color_endpoints(546workep.endpt0[j],547workep.endpt1[j],548rgbs_colors[j],549rgbo_colors[j],550partition_format_specifiers[i][j],551workscb.color_values[j],552color_quant_level[i]);553554all_same = all_same && workscb.color_formats[j] == workscb.color_formats[0];555}556557// If all the color endpoint modes are the same, we get a few more bits to store colors;558// let's see if we can take advantage 
of this: requantize all the colors and see if the559// endpoint modes remain the same.560workscb.color_formats_matched = 0;561if (partition_count >= 2 && all_same)562{563uint8_t colorvals[BLOCK_MAX_PARTITIONS][8];564uint8_t color_formats_mod[BLOCK_MAX_PARTITIONS] { 0 };565bool all_same_mod = true;566for (unsigned int j = 0; j < partition_count; j++)567{568color_formats_mod[j] = pack_color_endpoints(569workep.endpt0[j],570workep.endpt1[j],571rgbs_colors[j],572rgbo_colors[j],573partition_format_specifiers[i][j],574colorvals[j],575color_quant_level_mod[i]);576577// Early out as soon as it's no longer possible to use mod578if (color_formats_mod[j] != color_formats_mod[0])579{580all_same_mod = false;581break;582}583}584585if (all_same_mod)586{587workscb.color_formats_matched = 1;588for (unsigned int j = 0; j < BLOCK_MAX_PARTITIONS; j++)589{590for (unsigned int k = 0; k < 8; k++)591{592workscb.color_values[j][k] = colorvals[j][k];593}594595workscb.color_formats[j] = color_formats_mod[j];596}597}598}599600// Store header fields601workscb.partition_count = static_cast<uint8_t>(partition_count);602workscb.partition_index = static_cast<uint16_t>(partition_index);603workscb.plane2_component = -1;604workscb.quant_mode = workscb.color_formats_matched ? color_quant_level_mod[i] : color_quant_level[i];605workscb.block_mode = qw_bm.mode_index;606workscb.block_type = SYM_BTYPE_NONCONST;607608// Pre-realign test609if (l == 0)610{611float errorval = compute_difference(config, bsd, workscb, blk);612if (errorval == -ERROR_CALC_DEFAULT)613{614errorval = -errorval;615workscb.block_type = SYM_BTYPE_ERROR;616}617618trace_add_data("error_prerealign", errorval);619best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);620621// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first622// iteration can help more so we give it a extra 8% leeway. 
Use this knowledge to623// drive a heuristic to skip blocks that are unlikely to catch up with the best624// block we have already.625unsigned int iters_remaining = config.tune_refinement_limit - l;626float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;627if (errorval > (threshold * best_errorval_in_scb))628{629break;630}631632if (errorval < best_errorval_in_scb)633{634best_errorval_in_scb = errorval;635workscb.errorval = errorval;636scb = workscb;637638if (errorval < tune_errorval_threshold)639{640// Skip remaining candidates - this is "good enough"641i = candidate_count;642break;643}644}645}646647bool adjustments;648if (di.weight_count != bsd.texel_count)649{650adjustments = realign_weights_decimated(651config.profile, bsd, blk, workscb);652}653else654{655adjustments = realign_weights_undecimated(656config.profile, bsd, blk, workscb);657}658659// Post-realign test660float errorval = compute_difference(config, bsd, workscb, blk);661if (errorval == -ERROR_CALC_DEFAULT)662{663errorval = -errorval;664workscb.block_type = SYM_BTYPE_ERROR;665}666667trace_add_data("error_postrealign", errorval);668best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);669670// Average refinement improvement is 3.5% per iteration, so skip blocks that are671// unlikely to catch up with the best block we have already. 
Assume a 4.5% per step to672// give benefit of the doubt ...673unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;674float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;675if (errorval > (threshold * best_errorval_in_scb))676{677break;678}679680if (errorval < best_errorval_in_scb)681{682best_errorval_in_scb = errorval;683workscb.errorval = errorval;684scb = workscb;685686if (errorval < tune_errorval_threshold)687{688// Skip remaining candidates - this is "good enough"689i = candidate_count;690break;691}692}693694if (!adjustments)695{696break;697}698}699}700701return best_errorval_in_mode;702}703704/**705* @brief Compress a block using a chosen partitioning and 2 planes of weights.706*707* @param config The compressor configuration.708* @param bsd The block size information.709* @param blk The image block color data to compress.710* @param tune_errorval_threshold The error value threshold.711* @param plane2_component The component index for the second plane of weights.712* @param[out] scb The symbolic compressed block output.713* @param[out] tmpbuf The quantized weights for plane 1.714*/715static float compress_symbolic_block_for_partition_2planes(716const astcenc_config& config,717const block_size_descriptor& bsd,718const image_block& blk,719float tune_errorval_threshold,720unsigned int plane2_component,721symbolic_compressed_block& scb,722compression_working_buffers& tmpbuf,723int quant_limit724) {725promise(config.tune_candidate_limit > 0);726promise(config.tune_refinement_limit > 0);727promise(bsd.decimation_mode_count_selected > 0);728729int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);730731// Compute ideal weights and endpoint colors, with no quantization or decimation732endpoints_and_weights& ei1 = tmpbuf.ei1;733endpoints_and_weights& ei2 = tmpbuf.ei2;734735compute_ideal_colors_and_weights_2planes(bsd, blk, plane2_component, ei1, ei2);736737// Compute ideal weights and endpoint colors for every 
decimation738float* dec_weights_ideal = tmpbuf.dec_weights_ideal;739uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;740741// For each decimation mode, compute an ideal set of weights with no quantization742for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)743{744const auto& dm = bsd.get_decimation_mode(i);745if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))746{747continue;748}749750const auto& di = bsd.get_decimation_info(i);751752compute_ideal_weights_for_decimation(753ei1,754di,755dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);756757compute_ideal_weights_for_decimation(758ei2,759di,760dec_weights_ideal + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET);761}762763// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal764// weight pair, compute the smallest weight that will result in a color value greater than 1765vfloat4 min_ep1(10.0f);766vfloat4 min_ep2(10.0f);767768vfloat4 ep1 = (vfloat4(1.0f) - ei1.ep.endpt0[0]) / (ei1.ep.endpt1[0] - ei1.ep.endpt0[0]);769vmask4 use_ep1 = (ep1 > vfloat4(0.5f)) & (ep1 < min_ep1);770min_ep1 = select(min_ep1, ep1, use_ep1);771772vfloat4 ep2 = (vfloat4(1.0f) - ei2.ep.endpt0[0]) / (ei2.ep.endpt1[0] - ei2.ep.endpt0[0]);773vmask4 use_ep2 = (ep2 > vfloat4(0.5f)) & (ep2 < min_ep2);774min_ep2 = select(min_ep2, ep2, use_ep2);775776vfloat4 err_max(ERROR_CALC_DEFAULT);777vmask4 err_mask = vint4::lane_id() == vint4(plane2_component);778779// Set the plane2 component to max error in ep1780min_ep1 = select(min_ep1, err_max, err_mask);781782float min_wt_cutoff1 = hmin_s(min_ep1);783784// Set the minwt2 to the plane2 component min in ep2785float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask));786787compute_angular_endpoints_2planes(788bsd, dec_weights_ideal, max_weight_quant, tmpbuf);789790// For each mode (which specifies a decimation and a quantization):791// * Compute number of bits needed for the quantized weights792// * Generate an optimized set of 
quantized weights793// * Compute quantization errors for the mode794795float* weight_low_value1 = tmpbuf.weight_low_value1;796float* weight_high_value1 = tmpbuf.weight_high_value1;797float* weight_low_value2 = tmpbuf.weight_low_value2;798float* weight_high_value2 = tmpbuf.weight_high_value2;799800int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;801float* qwt_errors = tmpbuf.qwt_errors;802803unsigned int start_2plane = bsd.block_mode_count_1plane_selected;804unsigned int end_2plane = bsd.block_mode_count_1plane_2plane_selected;805806for (unsigned int i = start_2plane; i < end_2plane; i++)807{808const block_mode& bm = bsd.block_modes[i];809assert(bm.is_dual_plane);810811if (bm.quant_mode > max_weight_quant)812{813qwt_errors[i] = 1e38f;814continue;815}816817qwt_bitcounts[i] = static_cast<int8_t>(109 - bm.weight_bits);818819if (weight_high_value1[i] > 1.02f * min_wt_cutoff1)820{821weight_high_value1[i] = 1.0f;822}823824if (weight_high_value2[i] > 1.02f * min_wt_cutoff2)825{826weight_high_value2[i] = 1.0f;827}828829unsigned int decimation_mode = bm.decimation_mode;830const auto& di = bsd.get_decimation_info(decimation_mode);831832ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];833834// Generate the optimized set of weights for the mode835compute_quantized_weights_for_decimation(836di,837weight_low_value1[i],838weight_high_value1[i],839dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,840dec_weights_uquantf,841dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,842bm.get_weight_quant_mode());843844compute_quantized_weights_for_decimation(845di,846weight_low_value2[i],847weight_high_value2[i],848dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode + WEIGHTS_PLANE2_OFFSET,849dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET,850dec_weights_uquant + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET,851bm.get_weight_quant_mode());852853// Compute weight quantization errors for the block mode854qwt_errors[i] = 
compute_error_of_weight_set_2planes(855ei1,856ei2,857di,858dec_weights_uquantf,859dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET);860}861862// Decide the optimal combination of color endpoint encodings and weight encodings863uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];864int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];865866quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];867quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];868869endpoints epm;870merge_endpoints(ei1.ep, ei2.ep, plane2_component, epm);871872const auto& pi = bsd.get_partition_info(1, 0);873unsigned int candidate_count = compute_ideal_endpoint_formats(874pi, blk, epm, qwt_bitcounts, qwt_errors,875config.tune_candidate_limit,876bsd.block_mode_count_1plane_selected, bsd.block_mode_count_1plane_2plane_selected,877partition_format_specifiers, block_mode_index,878color_quant_level, color_quant_level_mod, tmpbuf);879880// Iterate over the N believed-to-be-best modes to find out which one is actually best881float best_errorval_in_mode = ERROR_CALC_DEFAULT;882float best_errorval_in_scb = scb.errorval;883884for (unsigned int i = 0; i < candidate_count; i++)885{886TRACE_NODE(node0, "candidate");887888const int bm_packed_index = block_mode_index[i];889assert(bm_packed_index >= static_cast<int>(bsd.block_mode_count_1plane_selected) &&890bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_2plane_selected));891const block_mode& qw_bm = bsd.block_modes[bm_packed_index];892893int decimation_mode = qw_bm.decimation_mode;894const auto& di = bsd.get_decimation_info(decimation_mode);895promise(di.weight_count > 0);896897trace_add_data("weight_x", di.weight_x);898trace_add_data("weight_y", di.weight_y);899trace_add_data("weight_z", di.weight_z);900trace_add_data("weight_quant", qw_bm.quant_mode);901902vfloat4 rgbs_color;903vfloat4 rgbo_color;904905symbolic_compressed_block workscb;906endpoints workep = epm;907908uint8_t* u8_weight1_src = 
dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;909uint8_t* u8_weight2_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index + WEIGHTS_PLANE2_OFFSET;910911for (int j = 0; j < di.weight_count; j++)912{913workscb.weights[j] = u8_weight1_src[j];914workscb.weights[j + WEIGHTS_PLANE2_OFFSET] = u8_weight2_src[j];915}916917for (unsigned int l = 0; l < config.tune_refinement_limit; l++)918{919recompute_ideal_colors_2planes(920blk, bsd, di,921workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET,922workep, rgbs_color, rgbo_color, plane2_component);923924// Quantize the chosen color925workscb.color_formats[0] = pack_color_endpoints(926workep.endpt0[0],927workep.endpt1[0],928rgbs_color, rgbo_color,929partition_format_specifiers[i][0],930workscb.color_values[0],931color_quant_level[i]);932933// Store header fields934workscb.partition_count = 1;935workscb.partition_index = 0;936workscb.quant_mode = color_quant_level[i];937workscb.color_formats_matched = 0;938workscb.block_mode = qw_bm.mode_index;939workscb.plane2_component = static_cast<int8_t>(plane2_component);940workscb.block_type = SYM_BTYPE_NONCONST;941942// Pre-realign test943if (l == 0)944{945float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);946if (errorval == -ERROR_CALC_DEFAULT)947{948errorval = -errorval;949workscb.block_type = SYM_BTYPE_ERROR;950}951952trace_add_data("error_prerealign", errorval);953best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);954955// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first956// iteration can help more so we give it a extra 8% leeway. 
Use this knowledge to957// drive a heuristic to skip blocks that are unlikely to catch up with the best958// block we have already.959unsigned int iters_remaining = config.tune_refinement_limit - l;960float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;961if (errorval > (threshold * best_errorval_in_scb))962{963break;964}965966if (errorval < best_errorval_in_scb)967{968best_errorval_in_scb = errorval;969workscb.errorval = errorval;970scb = workscb;971972if (errorval < tune_errorval_threshold)973{974// Skip remaining candidates - this is "good enough"975i = candidate_count;976break;977}978}979}980981// Perform a final pass over the weights to try to improve them.982bool adjustments;983if (di.weight_count != bsd.texel_count)984{985adjustments = realign_weights_decimated(986config.profile, bsd, blk, workscb);987}988else989{990adjustments = realign_weights_undecimated(991config.profile, bsd, blk, workscb);992}993994// Post-realign test995float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);996if (errorval == -ERROR_CALC_DEFAULT)997{998errorval = -errorval;999workscb.block_type = SYM_BTYPE_ERROR;1000}10011002trace_add_data("error_postrealign", errorval);1003best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);10041005// Average refinement improvement is 3.5% per iteration, so skip blocks that are1006// unlikely to catch up with the best block we have already. 
Assume a 4.5% per step to1007// give benefit of the doubt ...1008unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;1009float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;1010if (errorval > (threshold * best_errorval_in_scb))1011{1012break;1013}10141015if (errorval < best_errorval_in_scb)1016{1017best_errorval_in_scb = errorval;1018workscb.errorval = errorval;1019scb = workscb;10201021if (errorval < tune_errorval_threshold)1022{1023// Skip remaining candidates - this is "good enough"1024i = candidate_count;1025break;1026}1027}10281029if (!adjustments)1030{1031break;1032}1033}1034}10351036return best_errorval_in_mode;1037}10381039/**1040* @brief Determine the lowest cross-channel correlation factor.1041*1042* @param texels_per_block The number of texels in a block.1043* @param blk The image block color data to compress.1044*1045* @return Return the lowest correlation factor.1046*/1047static float prepare_block_statistics(1048int texels_per_block,1049const image_block& blk1050) {1051// Compute covariance matrix, as a collection of 10 scalars that form the upper-triangular row1052// of the matrix. 
The matrix is symmetric, so this is all we need for this use case.1053float rs = 0.0f;1054float gs = 0.0f;1055float bs = 0.0f;1056float as = 0.0f;1057float rr_var = 0.0f;1058float gg_var = 0.0f;1059float bb_var = 0.0f;1060float aa_var = 0.0f;1061float rg_cov = 0.0f;1062float rb_cov = 0.0f;1063float ra_cov = 0.0f;1064float gb_cov = 0.0f;1065float ga_cov = 0.0f;1066float ba_cov = 0.0f;10671068float weight_sum = 0.0f;10691070promise(texels_per_block > 0);1071for (int i = 0; i < texels_per_block; i++)1072{1073float weight = hadd_s(blk.channel_weight) / 4.0f;1074assert(weight >= 0.0f);1075weight_sum += weight;10761077float r = blk.data_r[i];1078float g = blk.data_g[i];1079float b = blk.data_b[i];1080float a = blk.data_a[i];10811082float rw = r * weight;1083rs += rw;1084rr_var += r * rw;1085rg_cov += g * rw;1086rb_cov += b * rw;1087ra_cov += a * rw;10881089float gw = g * weight;1090gs += gw;1091gg_var += g * gw;1092gb_cov += b * gw;1093ga_cov += a * gw;10941095float bw = b * weight;1096bs += bw;1097bb_var += b * bw;1098ba_cov += a * bw;10991100float aw = a * weight;1101as += aw;1102aa_var += a * aw;1103}11041105float rpt = 1.0f / astc::max(weight_sum, 1e-7f);11061107rr_var -= rs * (rs * rpt);1108rg_cov -= gs * (rs * rpt);1109rb_cov -= bs * (rs * rpt);1110ra_cov -= as * (rs * rpt);11111112gg_var -= gs * (gs * rpt);1113gb_cov -= bs * (gs * rpt);1114ga_cov -= as * (gs * rpt);11151116bb_var -= bs * (bs * rpt);1117ba_cov -= as * (bs * rpt);11181119aa_var -= as * (as * rpt);11201121// These will give a NaN if a channel is constant - these are fixed up in the next step1122rg_cov *= astc::rsqrt(rr_var * gg_var);1123rb_cov *= astc::rsqrt(rr_var * bb_var);1124ra_cov *= astc::rsqrt(rr_var * aa_var);1125gb_cov *= astc::rsqrt(gg_var * bb_var);1126ga_cov *= astc::rsqrt(gg_var * aa_var);1127ba_cov *= astc::rsqrt(bb_var * aa_var);11281129if (astc::isnan(rg_cov)) rg_cov = 1.0f;1130if (astc::isnan(rb_cov)) rb_cov = 1.0f;1131if (astc::isnan(ra_cov)) ra_cov = 1.0f;1132if 
(astc::isnan(gb_cov)) gb_cov = 1.0f;1133if (astc::isnan(ga_cov)) ga_cov = 1.0f;1134if (astc::isnan(ba_cov)) ba_cov = 1.0f;11351136float lowest_correlation = astc::min(fabsf(rg_cov), fabsf(rb_cov));1137lowest_correlation = astc::min(lowest_correlation, fabsf(ra_cov));1138lowest_correlation = astc::min(lowest_correlation, fabsf(gb_cov));1139lowest_correlation = astc::min(lowest_correlation, fabsf(ga_cov));1140lowest_correlation = astc::min(lowest_correlation, fabsf(ba_cov));11411142// Diagnostic trace points1143trace_add_data("min_r", blk.data_min.lane<0>());1144trace_add_data("max_r", blk.data_max.lane<0>());1145trace_add_data("min_g", blk.data_min.lane<1>());1146trace_add_data("max_g", blk.data_max.lane<1>());1147trace_add_data("min_b", blk.data_min.lane<2>());1148trace_add_data("max_b", blk.data_max.lane<2>());1149trace_add_data("min_a", blk.data_min.lane<3>());1150trace_add_data("max_a", blk.data_max.lane<3>());1151trace_add_data("cov_rg", fabsf(rg_cov));1152trace_add_data("cov_rb", fabsf(rb_cov));1153trace_add_data("cov_ra", fabsf(ra_cov));1154trace_add_data("cov_gb", fabsf(gb_cov));1155trace_add_data("cov_ga", fabsf(ga_cov));1156trace_add_data("cov_ba", fabsf(ba_cov));11571158return lowest_correlation;1159}11601161/* See header for documentation. */1162void compress_block(1163const astcenc_contexti& ctx,1164const image_block& blk,1165uint8_t pcb[16],1166compression_working_buffers& tmpbuf)1167{1168astcenc_profile decode_mode = ctx.config.profile;1169symbolic_compressed_block scb;1170const block_size_descriptor& bsd = *ctx.bsd;1171float lowest_correl;11721173TRACE_NODE(node0, "block");1174trace_add_data("pos_x", blk.xpos);1175trace_add_data("pos_y", blk.ypos);1176trace_add_data("pos_z", blk.zpos);11771178// Set stricter block targets for luminance data as we have more bits to play with1179bool block_is_l = blk.is_luminance();1180float block_is_l_scale = block_is_l ? 
1.0f / 1.5f : 1.0f;11811182// Set slightly stricter block targets for lumalpha data as we have more bits to play with1183bool block_is_la = blk.is_luminancealpha();1184float block_is_la_scale = block_is_la ? 1.0f / 1.05f : 1.0f;11851186bool block_skip_two_plane = false;1187int max_partitions = ctx.config.tune_partition_count_limit;11881189unsigned int requested_partition_indices[3] {1190ctx.config.tune_2partition_index_limit,1191ctx.config.tune_3partition_index_limit,1192ctx.config.tune_4partition_index_limit1193};11941195unsigned int requested_partition_trials[3] {1196ctx.config.tune_2partitioning_candidate_limit,1197ctx.config.tune_3partitioning_candidate_limit,1198ctx.config.tune_4partitioning_candidate_limit1199};12001201#if defined(ASTCENC_DIAGNOSTICS)1202// Do this early in diagnostic builds so we can dump uniform metrics1203// for every block. Do it later in release builds to avoid redundant work!1204float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;1205float error_threshold = ctx.config.tune_db_limit1206* error_weight_sum1207* block_is_l_scale1208* block_is_la_scale;12091210lowest_correl = prepare_block_statistics(bsd.texel_count, blk);1211trace_add_data("lowest_correl", lowest_correl);1212trace_add_data("tune_error_threshold", error_threshold);1213#endif12141215// Detected a constant-color block1216if (all(blk.data_min == blk.data_max))1217{1218TRACE_NODE(node1, "pass");1219trace_add_data("partition_count", 0);1220trace_add_data("plane_count", 1);12211222scb.partition_count = 0;12231224// Encode as FP16 if using HDR1225if ((decode_mode == ASTCENC_PRF_HDR) ||1226(decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A))1227{1228scb.block_type = SYM_BTYPE_CONST_F16;1229vint4 color_f16 = float_to_float16(blk.origin_texel);1230store(color_f16, scb.constant_color);1231}1232// Encode as UNORM16 if NOT using HDR1233else1234{1235scb.block_type = SYM_BTYPE_CONST_U16;1236vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;1237vint4 color_u16 = 
float_to_int_rtn(color_f32);1238store(color_u16, scb.constant_color);1239}12401241trace_add_data("exit", "quality hit");12421243symbolic_to_physical(bsd, scb, pcb);1244return;1245}12461247#if !defined(ASTCENC_DIAGNOSTICS)1248float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;1249float error_threshold = ctx.config.tune_db_limit1250* error_weight_sum1251* block_is_l_scale1252* block_is_la_scale;1253#endif12541255// Set SCB and mode errors to a very high error value1256scb.errorval = ERROR_CALC_DEFAULT;1257scb.block_type = SYM_BTYPE_ERROR;12581259float best_errorvals_for_pcount[BLOCK_MAX_PARTITIONS] {1260ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT1261};12621263float exit_thresholds_for_pcount[BLOCK_MAX_PARTITIONS] {12640.0f,1265ctx.config.tune_2partition_early_out_limit_factor,1266ctx.config.tune_3partition_early_out_limit_factor,12670.0f1268};12691270// Trial using 1 plane of weights and 1 partition.12711272// Most of the time we test it twice, first with a mode cutoff of 0 and then with the specified1273// mode cutoff. This causes an early-out that speeds up encoding of easy blocks. 
However, this1274// optimization is disabled for 4x4 and 5x4 blocks where it nearly always slows down the1275// compression and slightly reduces image quality.12761277float errorval_mult[2] {12781.0f / ctx.config.tune_mse_overshoot,12791.0f1280};12811282const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;12831284// Only enable MODE0 fast path if enabled1285// Never enable for 3D blocks as no "always" block modes are available1286int start_trial = 1;1287if ((ctx.config.tune_search_mode0_enable >= TUNE_MIN_SEARCH_MODE0) && (bsd.zdim == 1))1288{1289start_trial = 0;1290}12911292int quant_limit = QUANT_32;1293for (int i = start_trial; i < 2; i++)1294{1295TRACE_NODE(node1, "pass");1296trace_add_data("partition_count", 1);1297trace_add_data("plane_count", 1);1298trace_add_data("search_mode", i);12991300float errorval = compress_symbolic_block_for_partition_1plane(1301ctx.config, bsd, blk, i == 0,1302error_threshold * errorval_mult[i] * errorval_overshoot,13031, 0, scb, tmpbuf, QUANT_32);13041305// Record the quant level so we can use the filter later searches1306const auto& bm = bsd.get_block_mode(scb.block_mode);1307quant_limit = bm.get_weight_quant_mode();13081309best_errorvals_for_pcount[0] = astc::min(best_errorvals_for_pcount[0], errorval);1310if (errorval < (error_threshold * errorval_mult[i]))1311{1312trace_add_data("exit", "quality hit");1313goto END_OF_TESTS;1314}1315}13161317#if !defined(ASTCENC_DIAGNOSTICS)1318lowest_correl = prepare_block_statistics(bsd.texel_count, blk);1319#endif13201321block_skip_two_plane = lowest_correl > ctx.config.tune_2plane_early_out_limit_correlation;13221323// Test the four possible 1-partition, 2-planes modes. 
Do this in reverse, as1324// alpha is the most likely to be non-correlated if it is present in the data.1325for (int i = BLOCK_MAX_COMPONENTS - 1; i >= 0; i--)1326{1327TRACE_NODE(node1, "pass");1328trace_add_data("partition_count", 1);1329trace_add_data("plane_count", 2);1330trace_add_data("plane_component", i);13311332if (block_skip_two_plane)1333{1334trace_add_data("skip", "tune_2plane_early_out_limit_correlation");1335continue;1336}13371338if (blk.grayscale && i != 3)1339{1340trace_add_data("skip", "grayscale block");1341continue;1342}13431344if (blk.is_constant_channel(i))1345{1346trace_add_data("skip", "constant component");1347continue;1348}13491350float errorval = compress_symbolic_block_for_partition_2planes(1351ctx.config, bsd, blk, error_threshold * errorval_overshoot,1352i, scb, tmpbuf, quant_limit);13531354// If attempting two planes is much worse than the best one plane result1355// then further two plane searches are unlikely to help so move on ...1356if (errorval > (best_errorvals_for_pcount[0] * 1.85f))1357{1358break;1359}13601361if (errorval < error_threshold)1362{1363trace_add_data("exit", "quality hit");1364goto END_OF_TESTS;1365}1366}13671368// Find best blocks for 2, 3 and 4 partitions1369for (int partition_count = 2; partition_count <= max_partitions; partition_count++)1370{1371unsigned int partition_indices[TUNE_MAX_PARTITIONING_CANDIDATES];13721373unsigned int requested_indices = requested_partition_indices[partition_count - 2];13741375unsigned int requested_trials = requested_partition_trials[partition_count - 2];1376requested_trials = astc::min(requested_trials, requested_indices);13771378unsigned int actual_trials = find_best_partition_candidates(1379bsd, blk, partition_count, requested_indices, partition_indices, requested_trials);13801381float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];13821383for (unsigned int i = 0; i < actual_trials; i++)1384{1385TRACE_NODE(node1, "pass");1386trace_add_data("partition_count", 
partition_count);1387trace_add_data("partition_index", partition_indices[i]);1388trace_add_data("plane_count", 1);1389trace_add_data("search_mode", i);13901391float errorval = compress_symbolic_block_for_partition_1plane(1392ctx.config, bsd, blk, false,1393error_threshold * errorval_overshoot,1394partition_count, partition_indices[i],1395scb, tmpbuf, quant_limit);13961397best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval);13981399// If using N partitions doesn't improve much over using N-1 partitions then skip trying1400// N+1. Error can dramatically improve if the data is correlated or non-correlated and1401// aligns with a partitioning that suits that encoding, so for this inner loop check add1402// a large error scale because the "other" trial could be a lot better.1403float best_error = best_errorvals_for_pcount[partition_count - 1];1404float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 1.85f;1405if (best_error > (best_error_in_prev * best_error_scale))1406{1407trace_add_data("skip", "tune_partition_early_out_limit_factor");1408goto END_OF_TESTS;1409}14101411if (errorval < error_threshold)1412{1413trace_add_data("exit", "quality hit");1414goto END_OF_TESTS;1415}1416}14171418// If using N partitions doesn't improve much over using N-1 partitions then skip trying N+11419float best_error = best_errorvals_for_pcount[partition_count - 1];1420float best_error_scale = exit_thresholds_for_pcount[partition_count - 1];1421if (best_error > (best_error_in_prev * best_error_scale))1422{1423trace_add_data("skip", "tune_partition_early_out_limit_factor");1424goto END_OF_TESTS;1425}1426}14271428trace_add_data("exit", "quality not hit");14291430END_OF_TESTS:1431// If we still have an error block then convert to something we can encode1432// TODO: Do something more sensible here, such as average color block1433if (scb.block_type == SYM_BTYPE_ERROR)1434{1435#if 
defined(ASTCENC_DIAGNOSTICS)1436static bool printed_once = false;1437if (!printed_once)1438{1439printed_once = true;1440printf("WARN: At least one block failed to find a valid encoding.\n"1441" Try increasing compression quality settings.\n\n");1442}1443#endif14441445scb.block_type = SYM_BTYPE_CONST_U16;1446vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;1447vint4 color_u16 = float_to_int_rtn(color_f32);1448store(color_u16, scb.constant_color);1449}14501451// Compress to a physical block1452symbolic_to_physical(bsd, scb, pcb);1453}14541455#endif145614571458