// Scraped from: blob/master/thirdparty/basis_universal/encoder/basisu_frontend.cpp (viewer page chrome: "9902 views")
// basisu_frontend.cpp1// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.2//3// Licensed under the Apache License, Version 2.0 (the "License");4// you may not use this file except in compliance with the License.5// You may obtain a copy of the License at6//7// http://www.apache.org/licenses/LICENSE-2.08//9// Unless required by applicable law or agreed to in writing, software10// distributed under the License is distributed on an "AS IS" BASIS,11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.12// See the License for the specific language governing permissions and13// limitations under the License.14//15// TODO:16// This code originally supported full ETC1 and ETC1S, so there's some legacy stuff to be cleaned up in here.17// Add endpoint tiling support (where we force adjacent blocks to use the same endpoints during quantization), for a ~10% or more increase in bitrate at same SSIM. The backend already supports this.18//19#include "../transcoder/basisu.h"20#include "basisu_frontend.h"21#include "basisu_opencl.h"22#include <unordered_set>23#include <unordered_map>2425#if BASISU_SUPPORT_SSE26#define CPPSPMD_NAME(a) a##_sse4127#include "basisu_kernels_declares.h"28#endif2930#define BASISU_FRONTEND_VERIFY(c) do { if (!(c)) handle_verify_failure(__LINE__); } while(0)3132namespace basisu33{34const uint32_t cMaxCodebookCreationThreads = 8;3536const uint32_t BASISU_MAX_ENDPOINT_REFINEMENT_STEPS = 3;37//const uint32_t BASISU_MAX_SELECTOR_REFINEMENT_STEPS = 3;3839const uint32_t BASISU_ENDPOINT_PARENT_CODEBOOK_SIZE = 16;40const uint32_t BASISU_SELECTOR_PARENT_CODEBOOK_SIZE_COMP_LEVEL_01 = 32;41const uint32_t BASISU_SELECTOR_PARENT_CODEBOOK_SIZE_COMP_LEVEL_DEFAULT = 16;4243// TODO - How to handle internal verifies in the basisu lib44static inline void handle_verify_failure(int line)45{46error_printf("basisu_frontend: verify check failed at line %i!\n", line);47abort();48}4950bool basisu_frontend::init(const params 
&p)51{52debug_printf("basisu_frontend::init: Multithreaded: %u, Job pool total threads: %u, NumEndpointClusters: %u, NumSelectorClusters: %u, Perceptual: %u, CompressionLevel: %u\n",53p.m_multithreaded, p.m_pJob_pool ? p.m_pJob_pool->get_total_threads() : 0,54p.m_max_endpoint_clusters, p.m_max_selector_clusters, p.m_perceptual, p.m_compression_level);5556if ((p.m_max_endpoint_clusters < 1) || (p.m_max_endpoint_clusters > cMaxEndpointClusters))57return false;58if ((p.m_max_selector_clusters < 1) || (p.m_max_selector_clusters > cMaxSelectorClusters))59return false;6061m_source_blocks.resize(0);62append_vector(m_source_blocks, p.m_pSource_blocks, p.m_num_source_blocks);6364m_params = p;6566if (m_params.m_pOpenCL_context)67{68BASISU_ASSUME(sizeof(cl_pixel_block) == sizeof(pixel_block));6970// Upload the RGBA pixel blocks a single time.71if (!opencl_set_pixel_blocks(m_params.m_pOpenCL_context, m_source_blocks.size(), (cl_pixel_block*)m_source_blocks.data()))72{73// This is not fatal, we just won't use OpenCL.74error_printf("basisu_frontend::init: opencl_set_pixel_blocks() failed\n");75m_params.m_pOpenCL_context = nullptr;76m_opencl_failed = true;77}78}7980m_encoded_blocks.resize(m_params.m_num_source_blocks);81memset(&m_encoded_blocks[0], 0, m_encoded_blocks.size() * sizeof(m_encoded_blocks[0]));8283m_num_endpoint_codebook_iterations = 1;84m_num_selector_codebook_iterations = 1;8586switch (p.m_compression_level)87{88case 0:89{90m_endpoint_refinement = false;91m_use_hierarchical_endpoint_codebooks = true;92m_use_hierarchical_selector_codebooks = true;93break;94}95case 1:96{97m_endpoint_refinement = true;98m_use_hierarchical_endpoint_codebooks = true;99m_use_hierarchical_selector_codebooks = true;100101break;102}103case 2:104{105m_endpoint_refinement = true;106m_use_hierarchical_endpoint_codebooks = true;107m_use_hierarchical_selector_codebooks = true;108109break;110}111case 3:112{113m_endpoint_refinement = true;114m_use_hierarchical_endpoint_codebooks = 
false;115m_use_hierarchical_selector_codebooks = false;116break;117}118case 4:119{120m_endpoint_refinement = true;121m_use_hierarchical_endpoint_codebooks = true;122m_use_hierarchical_selector_codebooks = true;123m_num_endpoint_codebook_iterations = BASISU_MAX_ENDPOINT_REFINEMENT_STEPS;124m_num_selector_codebook_iterations = BASISU_MAX_ENDPOINT_REFINEMENT_STEPS;125break;126}127case 5:128{129m_endpoint_refinement = true;130m_use_hierarchical_endpoint_codebooks = false;131m_use_hierarchical_selector_codebooks = false;132m_num_endpoint_codebook_iterations = BASISU_MAX_ENDPOINT_REFINEMENT_STEPS;133m_num_selector_codebook_iterations = BASISU_MAX_ENDPOINT_REFINEMENT_STEPS;134break;135}136case 6:137default:138{139m_endpoint_refinement = true;140m_use_hierarchical_endpoint_codebooks = false;141m_use_hierarchical_selector_codebooks = false;142m_num_endpoint_codebook_iterations = BASISU_MAX_ENDPOINT_REFINEMENT_STEPS*2;143m_num_selector_codebook_iterations = BASISU_MAX_ENDPOINT_REFINEMENT_STEPS*2;144break;145}146147}148149if (m_params.m_disable_hierarchical_endpoint_codebooks)150m_use_hierarchical_endpoint_codebooks = false;151152debug_printf("Endpoint refinement: %u, Hierarchical endpoint codebooks: %u, Hierarchical selector codebooks: %u, Endpoint codebook iters: %u, Selector codebook iters: %u\n",153m_endpoint_refinement, m_use_hierarchical_endpoint_codebooks, m_use_hierarchical_selector_codebooks, m_num_endpoint_codebook_iterations, m_num_selector_codebook_iterations);154155return true;156}157158bool basisu_frontend::compress()159{160debug_printf("basisu_frontend::compress\n");161162m_total_blocks = m_params.m_num_source_blocks;163m_total_pixels = m_total_blocks * cPixelBlockTotalPixels;164165// Encode the initial high quality ETC1S texture166167init_etc1_images();168169// First quantize the ETC1S endpoints170171if (m_params.m_pGlobal_codebooks)172{173init_global_codebooks();174}175else176{177init_endpoint_training_vectors();178179generate_endpoint_clusters();180181for 
(uint32_t refine_endpoint_step = 0; refine_endpoint_step < m_num_endpoint_codebook_iterations; refine_endpoint_step++)182{183if (m_params.m_validate)184{185BASISU_FRONTEND_VERIFY(check_etc1s_constraints());186187BASISU_FRONTEND_VERIFY(validate_endpoint_cluster_hierarchy(false));188}189190if (refine_endpoint_step)191{192introduce_new_endpoint_clusters();193}194195if (m_params.m_validate)196{197BASISU_FRONTEND_VERIFY(validate_endpoint_cluster_hierarchy(false));198}199200generate_endpoint_codebook(refine_endpoint_step);201202if ((m_params.m_debug_images) && (m_params.m_dump_endpoint_clusterization))203{204char buf[256];205snprintf(buf, sizeof(buf), "endpoint_cluster_vis_pre_%u.png", refine_endpoint_step);206dump_endpoint_clusterization_visualization(buf, false);207}208209bool early_out = false;210211if (m_endpoint_refinement)212{213//dump_endpoint_clusterization_visualization("endpoint_clusters_before_refinement.png");214215if (!refine_endpoint_clusterization())216early_out = true;217218if ((m_params.m_tex_type == basist::cBASISTexTypeVideoFrames) && (!refine_endpoint_step) && (m_num_endpoint_codebook_iterations == 1))219{220eliminate_redundant_or_empty_endpoint_clusters();221generate_endpoint_codebook(basisu::maximum(1U, refine_endpoint_step));222}223224if ((m_params.m_debug_images) && (m_params.m_dump_endpoint_clusterization))225{226char buf[256];227snprintf(buf, sizeof(buf), "endpoint_cluster_vis_post_%u.png", refine_endpoint_step);228229dump_endpoint_clusterization_visualization(buf, false);230snprintf(buf, sizeof(buf), "endpoint_cluster_colors_vis_post_%u.png", refine_endpoint_step);231232dump_endpoint_clusterization_visualization(buf, true);233}234}235236if (m_params.m_validate)237{238BASISU_FRONTEND_VERIFY(validate_endpoint_cluster_hierarchy(false));239}240241eliminate_redundant_or_empty_endpoint_clusters();242243if (m_params.m_validate)244{245BASISU_FRONTEND_VERIFY(validate_endpoint_cluster_hierarchy(false));246}247248if 
(m_params.m_debug_stats)249debug_printf("Total endpoint clusters: %u\n", (uint32_t)m_endpoint_clusters.size());250251if (early_out)252break;253}254255if (m_params.m_validate)256{257BASISU_FRONTEND_VERIFY(check_etc1s_constraints());258}259260generate_block_endpoint_clusters();261262create_initial_packed_texture();263264// Now quantize the ETC1S selectors265266generate_selector_clusters();267268if (m_use_hierarchical_selector_codebooks)269compute_selector_clusters_within_each_parent_cluster();270271if (m_params.m_compression_level == 0)272{273create_optimized_selector_codebook(0);274275find_optimal_selector_clusters_for_each_block();276277introduce_special_selector_clusters();278}279else280{281const uint32_t num_refine_selector_steps = m_num_selector_codebook_iterations;282for (uint32_t refine_selector_steps = 0; refine_selector_steps < num_refine_selector_steps; refine_selector_steps++)283{284create_optimized_selector_codebook(refine_selector_steps);285286find_optimal_selector_clusters_for_each_block();287288introduce_special_selector_clusters();289290if ((m_params.m_compression_level >= 4) || (m_params.m_tex_type == basist::cBASISTexTypeVideoFrames))291{292if (!refine_block_endpoints_given_selectors())293break;294}295}296}297298optimize_selector_codebook();299300if (m_params.m_debug_stats)301debug_printf("Total selector clusters: %u\n", (uint32_t)m_selector_cluster_block_indices.size());302}303304finalize();305306if (m_params.m_validate)307{308if (!validate_output())309return false;310}311312debug_printf("basisu_frontend::compress: Done\n");313314return true;315}316317bool basisu_frontend::init_global_codebooks()318{319const basist::basisu_lowlevel_etc1s_transcoder* pTranscoder = m_params.m_pGlobal_codebooks;320321const basist::basisu_lowlevel_etc1s_transcoder::endpoint_vec& endpoints = pTranscoder->get_endpoints();322const basist::basisu_lowlevel_etc1s_transcoder::selector_vec& selectors = 
pTranscoder->get_selectors();323324m_endpoint_cluster_etc_params.resize(endpoints.size());325for (uint32_t i = 0; i < endpoints.size(); i++)326{327m_endpoint_cluster_etc_params[i].m_inten_table[0] = endpoints[i].m_inten5;328m_endpoint_cluster_etc_params[i].m_inten_table[1] = endpoints[i].m_inten5;329330m_endpoint_cluster_etc_params[i].m_color_unscaled[0].set(endpoints[i].m_color5.r, endpoints[i].m_color5.g, endpoints[i].m_color5.b, 255);331m_endpoint_cluster_etc_params[i].m_color_used[0] = true;332m_endpoint_cluster_etc_params[i].m_valid = true;333}334335m_optimized_cluster_selectors.resize(selectors.size());336for (uint32_t i = 0; i < m_optimized_cluster_selectors.size(); i++)337{338for (uint32_t y = 0; y < 4; y++)339for (uint32_t x = 0; x < 4; x++)340m_optimized_cluster_selectors[i].set_selector(x, y, selectors[i].get_selector(x, y));341}342343m_block_endpoint_clusters_indices.resize(m_total_blocks);344345m_orig_encoded_blocks.resize(m_total_blocks);346347m_block_selector_cluster_index.resize(m_total_blocks);348349#if 0350for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)351{352const uint32_t first_index = block_index_iter;353const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);354355m_params.m_pJob_pool->add_job([this, first_index, last_index] {356357for (uint32_t block_index = first_index; block_index < last_index; block_index++)358{359const etc_block& blk = m_etc1_blocks_etc1s[block_index];360361const uint32_t block_endpoint_index = m_block_endpoint_clusters_indices[block_index][0];362363etc_block trial_blk;364trial_blk.set_block_color5_etc1s(blk.m_color_unscaled[0]);365trial_blk.set_flip_bit(true);366367uint64_t best_err = UINT64_MAX;368uint32_t best_index = 0;369370for (uint32_t i = 0; i < m_optimized_cluster_selectors.size(); i++)371{372trial_blk.set_raw_selector_bits(m_optimized_cluster_selectors[i].get_raw_selector_bits());373374const uint64_t cur_err = 
trial_blk.evaluate_etc1_error(get_source_pixel_block(block_index).get_ptr(), m_params.m_perceptual);375if (cur_err < best_err)376{377best_err = cur_err;378best_index = i;379if (!cur_err)380break;381}382383} // block_index384385m_block_selector_cluster_index[block_index] = best_index;386}387388});389390}391392m_params.m_pJob_pool->wait_for_all();393394m_encoded_blocks.resize(m_total_blocks);395for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)396{397const uint32_t endpoint_index = m_block_endpoint_clusters_indices[block_index][0];398const uint32_t selector_index = m_block_selector_cluster_index[block_index];399400etc_block& blk = m_encoded_blocks[block_index];401402blk.set_block_color5_etc1s(m_endpoint_cluster_etc_params[endpoint_index].m_color_unscaled[0]);403blk.set_inten_tables_etc1s(m_endpoint_cluster_etc_params[endpoint_index].m_inten_table[0]);404blk.set_flip_bit(true);405blk.set_raw_selector_bits(m_optimized_cluster_selectors[selector_index].get_raw_selector_bits());406}407#endif408409// HACK HACK410const uint32_t NUM_PASSES = 3;411for (uint32_t pass = 0; pass < NUM_PASSES; pass++)412{413debug_printf("init_global_codebooks: pass %u\n", pass);414415const uint32_t N = 128;416for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)417{418const uint32_t first_index = block_index_iter;419const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);420421m_params.m_pJob_pool->add_job([this, first_index, last_index, pass] {422423for (uint32_t block_index = first_index; block_index < last_index; block_index++)424{425const etc_block& blk = pass ? 
m_encoded_blocks[block_index] : m_etc1_blocks_etc1s[block_index];426const uint32_t blk_raw_selector_bits = blk.get_raw_selector_bits();427428etc_block trial_blk(blk);429trial_blk.set_raw_selector_bits(blk_raw_selector_bits);430trial_blk.set_flip_bit(true);431432uint64_t best_err = UINT64_MAX;433uint32_t best_index = 0;434etc_block best_block(trial_blk);435436for (uint32_t i = 0; i < m_endpoint_cluster_etc_params.size(); i++)437{438if (m_endpoint_cluster_etc_params[i].m_inten_table[0] > blk.get_inten_table(0))439continue;440441trial_blk.set_block_color5_etc1s(m_endpoint_cluster_etc_params[i].m_color_unscaled[0]);442trial_blk.set_inten_tables_etc1s(m_endpoint_cluster_etc_params[i].m_inten_table[0]);443444const color_rgba* pSource_pixels = get_source_pixel_block(block_index).get_ptr();445uint64_t cur_err;446if (!pass)447cur_err = trial_blk.determine_selectors(pSource_pixels, m_params.m_perceptual);448else449cur_err = trial_blk.evaluate_etc1_error(pSource_pixels, m_params.m_perceptual);450451if (cur_err < best_err)452{453best_err = cur_err;454best_index = i;455best_block = trial_blk;456457if (!cur_err)458break;459}460}461462m_block_endpoint_clusters_indices[block_index][0] = best_index;463m_block_endpoint_clusters_indices[block_index][1] = best_index;464465m_orig_encoded_blocks[block_index] = best_block;466467} // block_index468469});470471}472473m_params.m_pJob_pool->wait_for_all();474475m_endpoint_clusters.resize(0);476m_endpoint_clusters.resize(endpoints.size());477for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)478{479const uint32_t endpoint_cluster_index = m_block_endpoint_clusters_indices[block_index][0];480m_endpoint_clusters[endpoint_cluster_index].push_back(block_index * 2);481m_endpoint_clusters[endpoint_cluster_index].push_back(block_index * 2 + 1);482}483484m_block_selector_cluster_index.resize(m_total_blocks);485486for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)487{488const uint32_t 
first_index = block_index_iter;489const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);490491m_params.m_pJob_pool->add_job([this, first_index, last_index] {492493for (uint32_t block_index = first_index; block_index < last_index; block_index++)494{495const uint32_t block_endpoint_index = m_block_endpoint_clusters_indices[block_index][0];496497etc_block trial_blk;498trial_blk.set_block_color5_etc1s(m_endpoint_cluster_etc_params[block_endpoint_index].m_color_unscaled[0]);499trial_blk.set_inten_tables_etc1s(m_endpoint_cluster_etc_params[block_endpoint_index].m_inten_table[0]);500trial_blk.set_flip_bit(true);501502uint64_t best_err = UINT64_MAX;503uint32_t best_index = 0;504505for (uint32_t i = 0; i < m_optimized_cluster_selectors.size(); i++)506{507trial_blk.set_raw_selector_bits(m_optimized_cluster_selectors[i].get_raw_selector_bits());508509const uint64_t cur_err = trial_blk.evaluate_etc1_error(get_source_pixel_block(block_index).get_ptr(), m_params.m_perceptual);510if (cur_err < best_err)511{512best_err = cur_err;513best_index = i;514if (!cur_err)515break;516}517518} // block_index519520m_block_selector_cluster_index[block_index] = best_index;521}522523});524525}526527m_params.m_pJob_pool->wait_for_all();528529m_encoded_blocks.resize(m_total_blocks);530for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)531{532const uint32_t endpoint_index = m_block_endpoint_clusters_indices[block_index][0];533const uint32_t selector_index = m_block_selector_cluster_index[block_index];534535etc_block& blk = m_encoded_blocks[block_index];536537blk.set_block_color5_etc1s(m_endpoint_cluster_etc_params[endpoint_index].m_color_unscaled[0]);538blk.set_inten_tables_etc1s(m_endpoint_cluster_etc_params[endpoint_index].m_inten_table[0]);539blk.set_flip_bit(true);540blk.set_raw_selector_bits(m_optimized_cluster_selectors[selector_index].get_raw_selector_bits());541}542543} // 
pass544545m_selector_cluster_block_indices.resize(selectors.size());546for (uint32_t block_index = 0; block_index < m_etc1_blocks_etc1s.size(); block_index++)547m_selector_cluster_block_indices[m_block_selector_cluster_index[block_index]].push_back(block_index);548549return true;550}551552void basisu_frontend::introduce_special_selector_clusters()553{554debug_printf("introduce_special_selector_clusters\n");555556uint32_t total_blocks_relocated = 0;557const uint32_t initial_selector_clusters = m_selector_cluster_block_indices.size_u32();558559bool_vec block_relocated_flags(m_total_blocks);560561// Make sure the selector codebook always has pure flat blocks for each possible selector, to avoid obvious artifacts.562// optimize_selector_codebook() will clean up any redundant clusters we create here.563for (uint32_t sel = 0; sel < 4; sel++)564{565etc_block blk;566clear_obj(blk);567for (uint32_t j = 0; j < 16; j++)568blk.set_selector(j & 3, j >> 2, sel);569570int k;571for (k = 0; k < (int)m_optimized_cluster_selectors.size(); k++)572if (m_optimized_cluster_selectors[k].get_raw_selector_bits() == blk.get_raw_selector_bits())573break;574if (k < (int)m_optimized_cluster_selectors.size())575continue;576577debug_printf("Introducing sel %u\n", sel);578579const uint32_t new_selector_cluster_index = m_optimized_cluster_selectors.size_u32();580581m_optimized_cluster_selectors.push_back(blk);582583vector_ensure_element_is_valid(m_selector_cluster_block_indices, new_selector_cluster_index);584585for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)586{587if (m_orig_encoded_blocks[block_index].get_raw_selector_bits() != blk.get_raw_selector_bits())588continue;589590// See if using flat selectors actually decreases the block's error.591const uint32_t old_selector_cluster_index = m_block_selector_cluster_index[block_index];592593etc_block cur_blk;594const uint32_t endpoint_cluster_index = get_subblock_endpoint_cluster_index(block_index, 
0);595cur_blk.set_block_color5_etc1s(get_endpoint_cluster_unscaled_color(endpoint_cluster_index, false));596cur_blk.set_inten_tables_etc1s(get_endpoint_cluster_inten_table(endpoint_cluster_index, false));597cur_blk.set_raw_selector_bits(get_selector_cluster_selector_bits(old_selector_cluster_index).get_raw_selector_bits());598cur_blk.set_flip_bit(true);599600const uint64_t cur_err = cur_blk.evaluate_etc1_error(get_source_pixel_block(block_index).get_ptr(), m_params.m_perceptual);601602cur_blk.set_raw_selector_bits(blk.get_raw_selector_bits());603604const uint64_t new_err = cur_blk.evaluate_etc1_error(get_source_pixel_block(block_index).get_ptr(), m_params.m_perceptual);605606if (new_err >= cur_err)607continue;608609// Change the block to use the new cluster610m_block_selector_cluster_index[block_index] = new_selector_cluster_index;611612m_selector_cluster_block_indices[new_selector_cluster_index].push_back(block_index);613614block_relocated_flags[block_index] = true;615616#if 0617int j = vector_find(m_selector_cluster_block_indices[old_selector_cluster_index], block_index);618if (j >= 0)619m_selector_cluster_block_indices[old_selector_cluster_index].erase(m_selector_cluster_block_indices[old_selector_cluster_index].begin() + j);620#endif621622total_blocks_relocated++;623624m_encoded_blocks[block_index].set_raw_selector_bits(blk.get_raw_selector_bits());625626} // block_index627628} // sel629630if (total_blocks_relocated)631{632debug_printf("Fixing selector codebook\n");633634for (int selector_cluster_index = 0; selector_cluster_index < (int)initial_selector_clusters; selector_cluster_index++)635{636uint_vec& block_indices = m_selector_cluster_block_indices[selector_cluster_index];637638uint32_t dst_ofs = 0;639640for (uint32_t i = 0; i < block_indices.size(); i++)641{642const uint32_t block_index = block_indices[i];643if (!block_relocated_flags[block_index])644block_indices[dst_ofs++] = 
block_index;645}646647block_indices.resize(dst_ofs);648}649}650651debug_printf("Total blocks relocated to new flat selector clusters: %u\n", total_blocks_relocated);652}653654// This method will change the number and ordering of the selector codebook clusters.655void basisu_frontend::optimize_selector_codebook()656{657debug_printf("optimize_selector_codebook\n");658659const uint32_t orig_total_selector_clusters = m_optimized_cluster_selectors.size_u32();660661bool_vec selector_cluster_was_used(m_optimized_cluster_selectors.size());662for (uint32_t i = 0; i < m_total_blocks; i++)663selector_cluster_was_used[m_block_selector_cluster_index[i]] = true;664665int_vec old_to_new(m_optimized_cluster_selectors.size());666int_vec new_to_old;667uint32_t total_new_entries = 0;668669std::unordered_map<uint32_t, uint32_t> selector_hashmap;670671for (int i = 0; i < static_cast<int>(m_optimized_cluster_selectors.size()); i++)672{673if (!selector_cluster_was_used[i])674{675old_to_new[i] = -1;676continue;677}678679const uint32_t raw_selector_bits = m_optimized_cluster_selectors[i].get_raw_selector_bits();680681auto find_res = selector_hashmap.insert(std::make_pair(raw_selector_bits, total_new_entries));682if (!find_res.second)683{684old_to_new[i] = (find_res.first)->second;685continue;686}687688old_to_new[i] = total_new_entries++;689new_to_old.push_back(i);690}691692debug_printf("Original selector clusters: %u, new cluster selectors: %u\n", orig_total_selector_clusters, total_new_entries);693694for (uint32_t i = 0; i < m_block_selector_cluster_index.size(); i++)695{696BASISU_FRONTEND_VERIFY((old_to_new[m_block_selector_cluster_index[i]] >= 0) && (old_to_new[m_block_selector_cluster_index[i]] < (int)total_new_entries));697m_block_selector_cluster_index[i] = old_to_new[m_block_selector_cluster_index[i]];698}699700basisu::vector<etc_block> new_optimized_cluster_selectors(m_optimized_cluster_selectors.size() ? 
total_new_entries : 0);701basisu::vector<uint_vec> new_selector_cluster_indices(m_selector_cluster_block_indices.size() ? total_new_entries : 0);702703for (uint32_t i = 0; i < total_new_entries; i++)704{705if (m_optimized_cluster_selectors.size())706new_optimized_cluster_selectors[i] = m_optimized_cluster_selectors[new_to_old[i]];707708//if (m_selector_cluster_block_indices.size())709// new_selector_cluster_indices[i] = m_selector_cluster_block_indices[new_to_old[i]];710}711712for (uint32_t i = 0; i < m_block_selector_cluster_index.size(); i++)713{714new_selector_cluster_indices[m_block_selector_cluster_index[i]].push_back(i);715}716717m_optimized_cluster_selectors.swap(new_optimized_cluster_selectors);718m_selector_cluster_block_indices.swap(new_selector_cluster_indices);719720// This isn't strictly necessary - doing it for completeness/future sanity.721if (m_selector_clusters_within_each_parent_cluster.size())722{723for (uint32_t i = 0; i < m_selector_clusters_within_each_parent_cluster.size(); i++)724for (uint32_t j = 0; j < m_selector_clusters_within_each_parent_cluster[i].size(); j++)725m_selector_clusters_within_each_parent_cluster[i][j] = old_to_new[m_selector_clusters_within_each_parent_cluster[i][j]];726}727728debug_printf("optimize_selector_codebook: Before: %u After: %u\n", orig_total_selector_clusters, total_new_entries);729}730731void basisu_frontend::init_etc1_images()732{733debug_printf("basisu_frontend::init_etc1_images\n");734735interval_timer tm;736tm.start();737738m_etc1_blocks_etc1s.resize(m_total_blocks);739740bool use_cpu = true;741742if (m_params.m_pOpenCL_context)743{744uint32_t total_perms = 64;745if (m_params.m_compression_level == 0)746total_perms = 4;747else if (m_params.m_compression_level == 1)748total_perms = 16;749else if (m_params.m_compression_level == BASISU_MAX_COMPRESSION_LEVEL)750total_perms = OPENCL_ENCODE_ETC1S_MAX_PERMS;751752bool status = opencl_encode_etc1s_blocks(m_params.m_pOpenCL_context, m_etc1_blocks_etc1s.data(), 
m_params.m_perceptual, total_perms);753if (status)754use_cpu = false;755else756{757error_printf("basisu_frontend::init_etc1_images: opencl_encode_etc1s_blocks() failed! Using CPU.\n");758m_params.m_pOpenCL_context = nullptr;759m_opencl_failed = true;760}761}762763if (use_cpu)764{765const uint32_t N = 4096;766for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)767{768const uint32_t first_index = block_index_iter;769const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);770771m_params.m_pJob_pool->add_job([this, first_index, last_index] {772773for (uint32_t block_index = first_index; block_index < last_index; block_index++)774{775const pixel_block& source_blk = get_source_pixel_block(block_index);776777etc1_optimizer optimizer;778etc1_optimizer::params optimizer_params;779etc1_optimizer::results optimizer_results;780781if (m_params.m_compression_level == 0)782optimizer_params.m_quality = cETCQualityFast;783else if (m_params.m_compression_level == 1)784optimizer_params.m_quality = cETCQualityMedium;785else if (m_params.m_compression_level == BASISU_MAX_COMPRESSION_LEVEL)786optimizer_params.m_quality = cETCQualityUber;787788optimizer_params.m_num_src_pixels = 16;789optimizer_params.m_pSrc_pixels = source_blk.get_ptr();790optimizer_params.m_perceptual = m_params.m_perceptual;791792uint8_t selectors[16];793optimizer_results.m_pSelectors = selectors;794optimizer_results.m_n = 16;795796optimizer.init(optimizer_params, optimizer_results);797if (!optimizer.compute())798BASISU_FRONTEND_VERIFY(false);799800etc_block& blk = m_etc1_blocks_etc1s[block_index];801802memset(&blk, 0, sizeof(blk));803blk.set_block_color5_etc1s(optimizer_results.m_block_color_unscaled);804blk.set_inten_tables_etc1s(optimizer_results.m_block_inten_table);805blk.set_flip_bit(true);806807for (uint32_t y = 0; y < 4; y++)808for (uint32_t x = 0; x < 4; x++)809blk.set_selector(x, y, selectors[x + y * 
4]);810}811812});813814}815816m_params.m_pJob_pool->wait_for_all();817818} // use_cpu819820debug_printf("init_etc1_images: Elapsed time: %3.3f secs\n", tm.get_elapsed_secs());821}822823void basisu_frontend::init_endpoint_training_vectors()824{825debug_printf("init_endpoint_training_vectors\n");826827vec6F_quantizer::array_of_weighted_training_vecs &training_vecs = m_endpoint_clusterizer.get_training_vecs();828829training_vecs.resize(m_total_blocks * 2);830831const uint32_t N = 16384;832for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)833{834const uint32_t first_index = block_index_iter;835const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);836837m_params.m_pJob_pool->add_job( [this, first_index, last_index, &training_vecs] {838839for (uint32_t block_index = first_index; block_index < last_index; block_index++)840{841const etc_block &blk = m_etc1_blocks_etc1s[block_index];842843color_rgba block_colors[2];844blk.get_block_low_high_colors(block_colors, 0);845846vec6F v;847v[0] = block_colors[0].r * (1.0f / 255.0f);848v[1] = block_colors[0].g * (1.0f / 255.0f);849v[2] = block_colors[0].b * (1.0f / 255.0f);850v[3] = block_colors[1].r * (1.0f / 255.0f);851v[4] = block_colors[1].g * (1.0f / 255.0f);852v[5] = block_colors[1].b * (1.0f / 255.0f);853854training_vecs[block_index * 2 + 0] = std::make_pair(v, 1);855training_vecs[block_index * 2 + 1] = std::make_pair(v, 1);856857} // block_index;858859} );860861} // block_index_iter862863m_params.m_pJob_pool->wait_for_all();864}865866void basisu_frontend::generate_endpoint_clusters()867{868debug_printf("Begin endpoint quantization\n");869870const uint32_t parent_codebook_size = (m_params.m_max_endpoint_clusters >= 256) ? BASISU_ENDPOINT_PARENT_CODEBOOK_SIZE : 0;871uint32_t max_threads = 0;872max_threads = m_params.m_multithreaded ? 
minimum<int>(std::thread::hardware_concurrency(), cMaxCodebookCreationThreads) : 0;873if (m_params.m_pJob_pool)874max_threads = minimum<int>((int)m_params.m_pJob_pool->get_total_threads(), max_threads);875876debug_printf("max_threads: %u\n", max_threads);877bool status = generate_hierarchical_codebook_threaded(m_endpoint_clusterizer,878m_params.m_max_endpoint_clusters, m_use_hierarchical_endpoint_codebooks ? parent_codebook_size : 0,879m_endpoint_clusters,880m_endpoint_parent_clusters,881max_threads, m_params.m_pJob_pool, true);882BASISU_FRONTEND_VERIFY(status);883884if (m_use_hierarchical_endpoint_codebooks)885{886if (!m_endpoint_parent_clusters.size())887{888m_endpoint_parent_clusters.resize(0);889m_endpoint_parent_clusters.resize(1);890for (uint32_t i = 0; i < m_total_blocks; i++)891{892m_endpoint_parent_clusters[0].push_back(i*2);893m_endpoint_parent_clusters[0].push_back(i*2+1);894}895}896897BASISU_ASSUME(BASISU_ENDPOINT_PARENT_CODEBOOK_SIZE <= UINT8_MAX);898899m_block_parent_endpoint_cluster.resize(0);900m_block_parent_endpoint_cluster.resize(m_total_blocks);901vector_set_all(m_block_parent_endpoint_cluster, 0xFF);902for (uint32_t parent_cluster_index = 0; parent_cluster_index < m_endpoint_parent_clusters.size(); parent_cluster_index++)903{904const uint_vec &cluster = m_endpoint_parent_clusters[parent_cluster_index];905for (uint32_t j = 0; j < cluster.size(); j++)906{907const uint32_t block_index = cluster[j] >> 1;908m_block_parent_endpoint_cluster[block_index] = static_cast<uint8_t>(parent_cluster_index);909}910}911912for (uint32_t i = 0; i < m_total_blocks; i++)913{914BASISU_FRONTEND_VERIFY(m_block_parent_endpoint_cluster[i] != 0xFF);915}916917// Ensure that all the blocks within each cluster are all in the same parent cluster, or something is very wrong.918for (uint32_t cluster_index = 0; cluster_index < m_endpoint_clusters.size(); cluster_index++)919{920const uint_vec &cluster = m_endpoint_clusters[cluster_index];921922uint32_t parent_cluster_index = 
0;923for (uint32_t j = 0; j < cluster.size(); j++)924{925const uint32_t block_index = cluster[j] >> 1;926927BASISU_FRONTEND_VERIFY(block_index < m_block_parent_endpoint_cluster.size());928929if (!j)930{931parent_cluster_index = m_block_parent_endpoint_cluster[block_index];932}933else934{935BASISU_FRONTEND_VERIFY(m_block_parent_endpoint_cluster[block_index] == parent_cluster_index);936}937}938}939}940941if (m_params.m_debug_stats)942debug_printf("Total endpoint clusters: %u, parent clusters: %u\n", m_endpoint_clusters.size_u32(), m_endpoint_parent_clusters.size_u32());943}944945// Iterate through each array of endpoint cluster block indices and set the m_block_endpoint_clusters_indices[][] array to indicaste which cluster index each block uses.946void basisu_frontend::generate_block_endpoint_clusters()947{948m_block_endpoint_clusters_indices.resize(m_total_blocks);949950for (int cluster_index = 0; cluster_index < static_cast<int>(m_endpoint_clusters.size()); cluster_index++)951{952const basisu::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];953954for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)955{956const uint32_t block_index = cluster_indices[cluster_indices_iter] >> 1;957const uint32_t subblock_index = cluster_indices[cluster_indices_iter] & 1;958959m_block_endpoint_clusters_indices[block_index][subblock_index] = cluster_index;960961} // cluster_indices_iter962}963964if (m_params.m_validate)965{966for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)967{968uint32_t cluster_0 = m_block_endpoint_clusters_indices[block_index][0];969uint32_t cluster_1 = m_block_endpoint_clusters_indices[block_index][1];970BASISU_FRONTEND_VERIFY(cluster_0 == cluster_1);971}972}973}974975void 
basisu_frontend::compute_endpoint_clusters_within_each_parent_cluster()976{977generate_block_endpoint_clusters();978979m_endpoint_clusters_within_each_parent_cluster.resize(0);980m_endpoint_clusters_within_each_parent_cluster.resize(m_endpoint_parent_clusters.size());981982// Note: It's possible that some blocks got moved into the same cluster, but live in different parent clusters.983for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)984{985const uint32_t cluster_index = m_block_endpoint_clusters_indices[block_index][0];986const uint32_t parent_cluster_index = m_block_parent_endpoint_cluster[block_index];987988m_endpoint_clusters_within_each_parent_cluster[parent_cluster_index].push_back(cluster_index);989}990991for (uint32_t i = 0; i < m_endpoint_clusters_within_each_parent_cluster.size(); i++)992{993uint_vec &cluster_indices = m_endpoint_clusters_within_each_parent_cluster[i];994995BASISU_FRONTEND_VERIFY(cluster_indices.size());996997vector_sort(cluster_indices);998999auto last = std::unique(cluster_indices.begin(), cluster_indices.end());1000cluster_indices.erase(last, cluster_indices.end());1001}1002}10031004void basisu_frontend::compute_endpoint_subblock_error_vec()1005{1006m_subblock_endpoint_quant_err_vec.resize(0);10071008const uint32_t N = 512;1009for (uint32_t cluster_index_iter = 0; cluster_index_iter < m_endpoint_clusters.size(); cluster_index_iter += N)1010{1011const uint32_t first_index = cluster_index_iter;1012const uint32_t last_index = minimum<uint32_t>(m_endpoint_clusters.size_u32(), cluster_index_iter + N);10131014m_params.m_pJob_pool->add_job( [this, first_index, last_index] {10151016for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)1017{1018const basisu::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];10191020assert(cluster_indices.size());10211022for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); 
cluster_indices_iter++)1023{1024basisu::vector<color_rgba> cluster_pixels(8);10251026const uint32_t block_index = cluster_indices[cluster_indices_iter] >> 1;1027const uint32_t subblock_index = cluster_indices[cluster_indices_iter] & 1;10281029const bool flipped = true;10301031const color_rgba *pSource_block_pixels = get_source_pixel_block(block_index).get_ptr();10321033for (uint32_t pixel_index = 0; pixel_index < 8; pixel_index++)1034{1035cluster_pixels[pixel_index] = pSource_block_pixels[g_etc1_pixel_indices[flipped][subblock_index][pixel_index]];1036}10371038const endpoint_cluster_etc_params &etc_params = m_endpoint_cluster_etc_params[cluster_index];10391040assert(etc_params.m_valid);10411042color_rgba block_colors[4];1043etc_block::get_block_colors5(block_colors, etc_params.m_color_unscaled[0], etc_params.m_inten_table[0], true);10441045uint64_t total_err = 0;10461047for (uint32_t i = 0; i < 8; i++)1048{1049const color_rgba &c = cluster_pixels[i];10501051uint64_t best_err = UINT64_MAX;1052//uint32_t best_index = 0;10531054for (uint32_t s = 0; s < 4; s++)1055{1056uint64_t err = color_distance(m_params.m_perceptual, c, block_colors[s], false);1057if (err < best_err)1058{1059best_err = err;1060//best_index = s;1061}1062}10631064total_err += best_err;1065}10661067subblock_endpoint_quant_err quant_err;1068quant_err.m_total_err = total_err;1069quant_err.m_cluster_index = cluster_index;1070quant_err.m_cluster_subblock_index = cluster_indices_iter;1071quant_err.m_block_index = block_index;1072quant_err.m_subblock_index = subblock_index;10731074{1075std::lock_guard<std::mutex> lock(m_lock);10761077m_subblock_endpoint_quant_err_vec.push_back(quant_err);1078}1079}1080} // cluster_index10811082} );10831084} // cluster_index_iter10851086m_params.m_pJob_pool->wait_for_all();10871088vector_sort(m_subblock_endpoint_quant_err_vec);1089}10901091void 
basisu_frontend::introduce_new_endpoint_clusters()1092{1093debug_printf("introduce_new_endpoint_clusters\n");10941095generate_block_endpoint_clusters();10961097int num_new_endpoint_clusters = m_params.m_max_endpoint_clusters - m_endpoint_clusters.size_u32();1098if (num_new_endpoint_clusters <= 0)1099return;11001101compute_endpoint_subblock_error_vec();11021103const uint32_t num_orig_endpoint_clusters = m_endpoint_clusters.size_u32();11041105std::unordered_set<uint32_t> training_vector_was_relocated;11061107uint_vec cluster_sizes(num_orig_endpoint_clusters);1108for (uint32_t i = 0; i < num_orig_endpoint_clusters; i++)1109cluster_sizes[i] = m_endpoint_clusters[i].size_u32();11101111std::unordered_set<uint32_t> ignore_cluster;11121113uint32_t total_new_clusters = 0;11141115while (num_new_endpoint_clusters)1116{1117if (m_subblock_endpoint_quant_err_vec.size() == 0)1118break;11191120subblock_endpoint_quant_err subblock_to_move(m_subblock_endpoint_quant_err_vec.back());11211122m_subblock_endpoint_quant_err_vec.pop_back();11231124if (unordered_set_contains(ignore_cluster, subblock_to_move.m_cluster_index))1125continue;11261127uint32_t training_vector_index = subblock_to_move.m_block_index * 2 + subblock_to_move.m_subblock_index;11281129if (cluster_sizes[subblock_to_move.m_cluster_index] <= 2)1130continue;11311132if (unordered_set_contains(training_vector_was_relocated, training_vector_index))1133continue;11341135if (unordered_set_contains(training_vector_was_relocated, training_vector_index ^ 1))1136continue;11371138#if 01139const uint32_t block_index = subblock_to_move.m_block_index;1140const etc_block& blk = m_etc1_blocks_etc1s[block_index];1141uint32_t ls, hs;1142blk.get_selector_range(ls, hs);1143if (ls != hs)1144continue;1145#endif11461147//const uint32_t new_endpoint_cluster_index = (uint32_t)m_endpoint_clusters.size();11481149enlarge_vector(m_endpoint_clusters, 1)->push_back(training_vector_index);1150enlarge_vector(m_endpoint_cluster_etc_params, 
1);11511152assert(m_endpoint_clusters.size() == m_endpoint_cluster_etc_params.size());11531154training_vector_was_relocated.insert(training_vector_index);11551156m_endpoint_clusters.back().push_back(training_vector_index ^ 1);1157training_vector_was_relocated.insert(training_vector_index ^ 1);11581159BASISU_FRONTEND_VERIFY(cluster_sizes[subblock_to_move.m_cluster_index] >= 2);1160cluster_sizes[subblock_to_move.m_cluster_index] -= 2;11611162ignore_cluster.insert(subblock_to_move.m_cluster_index);11631164total_new_clusters++;11651166num_new_endpoint_clusters--;1167}11681169debug_printf("Introduced %i new endpoint clusters\n", total_new_clusters);11701171for (uint32_t i = 0; i < num_orig_endpoint_clusters; i++)1172{1173uint_vec &cluster_indices = m_endpoint_clusters[i];11741175uint_vec new_cluster_indices;1176for (uint32_t j = 0; j < cluster_indices.size(); j++)1177{1178uint32_t training_vector_index = cluster_indices[j];11791180if (!unordered_set_contains(training_vector_was_relocated, training_vector_index))1181new_cluster_indices.push_back(training_vector_index);1182}11831184if (cluster_indices.size() != new_cluster_indices.size())1185{1186BASISU_FRONTEND_VERIFY(new_cluster_indices.size() > 0);1187cluster_indices.swap(new_cluster_indices);1188}1189}11901191generate_block_endpoint_clusters();1192}11931194struct color_rgba_hasher1195{1196inline std::size_t operator()(const color_rgba& k) const1197{1198uint32_t v = *(const uint32_t*)&k;11991200//return bitmix32(v);12011202//v ^= (v << 10);1203//v ^= (v >> 12);12041205return v;1206}1207};12081209// Given each endpoint cluster, gather all the block pixels which are in that cluster and compute optimized ETC1S endpoints for them.1210// TODO: Don't optimize endpoint clusters which haven't changed.1211// If step>=1, we check to ensure the new endpoint values actually decrease quantization error.1212void basisu_frontend::generate_endpoint_codebook(uint32_t 
step)1213{1214debug_printf("generate_endpoint_codebook\n");12151216interval_timer tm;1217tm.start();12181219m_endpoint_cluster_etc_params.resize(m_endpoint_clusters.size());12201221bool use_cpu = true;1222// TODO: Get this working when step>01223if (m_params.m_pOpenCL_context && !step)1224{1225const uint32_t total_clusters = (uint32_t)m_endpoint_clusters.size();12261227basisu::vector<cl_pixel_cluster> pixel_clusters(total_clusters);12281229std::vector<color_rgba> input_pixels;1230input_pixels.reserve(m_total_blocks * 16);12311232std::vector<uint32_t> pixel_weights;1233pixel_weights.reserve(m_total_blocks * 16);12341235uint_vec cluster_sizes(total_clusters);12361237//typedef basisu::hash_map<color_rgba, uint32_t, color_rgba_hasher> color_hasher_type;1238//color_hasher_type color_hasher;1239//color_hasher.reserve(2048);12401241interval_timer hash_tm;1242hash_tm.start();12431244basisu::vector<uint32_t> colors, colors2;1245colors.reserve(65536);1246colors2.reserve(65536);12471248for (uint32_t cluster_index = 0; cluster_index < m_endpoint_clusters.size(); cluster_index++)1249{1250const basisu::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];1251assert((cluster_indices.size() & 1) == 0);12521253#if 01254uint64_t first_pixel_index = input_pixels.size();1255const uint32_t total_pixels = 16 * (cluster_indices.size() / 2);12561257input_pixels.resize(input_pixels.size() + total_pixels);1258pixel_weights.resize(pixel_weights.size() + total_pixels);12591260uint64_t dst_ofs = first_pixel_index;12611262uint64_t total_r = 0, total_g = 0, total_b = 0;1263for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)1264{1265const uint32_t subblock_index = cluster_indices[cluster_indices_iter] & 1;1266if (subblock_index)1267continue;12681269const uint32_t block_index = cluster_indices[cluster_indices_iter] >> 1;1270const color_rgba* pBlock_pixels = get_source_pixel_block(block_index).get_ptr();12711272for 
(uint32_t i = 0; i < 16; i++)1273{1274input_pixels[dst_ofs] = pBlock_pixels[i];1275pixel_weights[dst_ofs] = 1;1276dst_ofs++;12771278total_r += pBlock_pixels[i].r;1279total_g += pBlock_pixels[i].g;1280total_b += pBlock_pixels[i].b;1281}1282}12831284//printf("%i %f %f %f\n", cluster_index, total_r / (float)total_pixels, total_g / (float)total_pixels, total_b / (float)total_pixels);12851286pixel_clusters[cluster_index].m_first_pixel_index = first_pixel_index;1287pixel_clusters[cluster_index].m_total_pixels = total_pixels;1288cluster_sizes[cluster_index] = total_pixels;1289#elif 11290colors.resize(cluster_indices.size() * 8);1291colors2.resize(cluster_indices.size() * 8);1292uint32_t dst_ofs = 0;12931294for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)1295{1296const uint32_t subblock_index = cluster_indices[cluster_indices_iter] & 1;1297if (subblock_index)1298continue;12991300const uint32_t block_index = cluster_indices[cluster_indices_iter] >> 1;1301const color_rgba* pBlock_pixels = get_source_pixel_block(block_index).get_ptr();13021303memcpy(colors.data() + dst_ofs, pBlock_pixels, sizeof(color_rgba) * 16);1304dst_ofs += 16;13051306} // cluster_indices_iter13071308uint32_t* pSorted = radix_sort((uint32_t)colors.size(), colors.data(), colors2.data(), 0, 3);13091310const uint64_t first_pixel_index = input_pixels.size();13111312uint32_t prev_color = 0, cur_weight = 0;13131314for (uint32_t i = 0; i < colors.size(); i++)1315{1316uint32_t cur_color = pSorted[i];1317if (cur_color == prev_color)1318{1319if (++cur_weight == 0)1320cur_weight--;1321}1322else1323{1324if (cur_weight)1325{1326input_pixels.push_back(*(const color_rgba*)&prev_color);1327pixel_weights.push_back(cur_weight);1328}13291330prev_color = cur_color;1331cur_weight = 1;1332}1333}13341335if (cur_weight)1336{1337input_pixels.push_back(*(const color_rgba*)&prev_color);1338pixel_weights.push_back(cur_weight);1339}13401341uint32_t total_unique_pixels = 
(uint32_t)(input_pixels.size() - first_pixel_index);13421343pixel_clusters[cluster_index].m_first_pixel_index = first_pixel_index;1344pixel_clusters[cluster_index].m_total_pixels = total_unique_pixels;13451346cluster_sizes[cluster_index] = total_unique_pixels;1347#else1348color_hasher.reset();13491350for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)1351{1352const uint32_t subblock_index = cluster_indices[cluster_indices_iter] & 1;1353if (subblock_index)1354continue;13551356const uint32_t block_index = cluster_indices[cluster_indices_iter] >> 1;1357const color_rgba* pBlock_pixels = get_source_pixel_block(block_index).get_ptr();13581359uint32_t *pPrev_weight = nullptr;1360color_rgba prev_color;13611362{1363color_rgba cur_color = pBlock_pixels[0];1364auto res = color_hasher.insert(cur_color, 0);13651366uint32_t& weight = (res.first)->second;1367if (weight != UINT32_MAX)1368weight++;13691370prev_color = cur_color;1371pPrev_weight = &(res.first)->second;1372}13731374for (uint32_t i = 1; i < 16; i++)1375{1376color_rgba cur_color = pBlock_pixels[i];13771378if (cur_color == prev_color)1379{1380if (*pPrev_weight != UINT32_MAX)1381*pPrev_weight = *pPrev_weight + 1;1382}1383else1384{1385auto res = color_hasher.insert(cur_color, 0);13861387uint32_t& weight = (res.first)->second;1388if (weight != UINT32_MAX)1389weight++;13901391prev_color = cur_color;1392pPrev_weight = &(res.first)->second;1393}1394}13951396} // cluster_indices_iter13971398const uint64_t first_pixel_index = input_pixels.size();1399uint32_t total_unique_pixels = color_hasher.size();14001401pixel_clusters[cluster_index].m_first_pixel_index = first_pixel_index;1402pixel_clusters[cluster_index].m_total_pixels = total_unique_pixels;14031404input_pixels.resize(first_pixel_index + total_unique_pixels);1405pixel_weights.resize(first_pixel_index + total_unique_pixels);14061407uint32_t j = 0;14081409for (auto it = color_hasher.begin(); it != color_hasher.end(); 
++it, ++j)1410{1411input_pixels[first_pixel_index + j] = it->first;1412pixel_weights[first_pixel_index + j] = it->second;1413}14141415cluster_sizes[cluster_index] = total_unique_pixels;1416#endif14171418} // cluster_index14191420debug_printf("Total hash time: %3.3f secs\n", hash_tm.get_elapsed_secs());14211422debug_printf("Total unique colors: %llu\n", input_pixels.size());14231424uint_vec sorted_cluster_indices_new_to_old(total_clusters);1425indirect_sort(total_clusters, sorted_cluster_indices_new_to_old.data(), cluster_sizes.data());1426//for (uint32_t i = 0; i < total_clusters; i++)1427// sorted_cluster_indices_new_to_old[i] = i;14281429uint_vec sorted_cluster_indices_old_to_new(total_clusters);1430for (uint32_t i = 0; i < total_clusters; i++)1431sorted_cluster_indices_old_to_new[sorted_cluster_indices_new_to_old[i]] = i;14321433basisu::vector<cl_pixel_cluster> sorted_pixel_clusters(total_clusters);1434for (uint32_t i = 0; i < total_clusters; i++)1435sorted_pixel_clusters[i] = pixel_clusters[sorted_cluster_indices_new_to_old[i]];14361437uint32_t total_perms = 64;1438if (m_params.m_compression_level <= 1)1439total_perms = 16;1440else if (m_params.m_compression_level == BASISU_MAX_COMPRESSION_LEVEL)1441total_perms = OPENCL_ENCODE_ETC1S_MAX_PERMS;14421443basisu::vector<etc_block> output_blocks(total_clusters);14441445if (opencl_encode_etc1s_pixel_clusters(1446m_params.m_pOpenCL_context,1447output_blocks.data(),1448total_clusters,1449sorted_pixel_clusters.data(),1450input_pixels.size(),1451input_pixels.data(),1452pixel_weights.data(),1453m_params.m_perceptual, total_perms))1454{1455for (uint32_t old_cluster_index = 0; old_cluster_index < m_endpoint_clusters.size(); old_cluster_index++)1456{1457const uint32_t new_cluster_index = sorted_cluster_indices_old_to_new[old_cluster_index];14581459const etc_block& blk = output_blocks[new_cluster_index];14601461endpoint_cluster_etc_params& prev_etc_params = 
m_endpoint_cluster_etc_params[old_cluster_index];14621463prev_etc_params.m_valid = true;1464etc_block::unpack_color5(prev_etc_params.m_color_unscaled[0], blk.get_base5_color(), false);1465prev_etc_params.m_inten_table[0] = blk.get_inten_table(0);1466prev_etc_params.m_color_error[0] = 0; // dummy value - we don't actually use this1467}14681469use_cpu = false;1470}1471else1472{1473error_printf("basisu_frontend::generate_endpoint_codebook: opencl_encode_etc1s_pixel_clusters() failed! Using CPU.\n");1474m_params.m_pOpenCL_context = nullptr;1475m_opencl_failed = true;1476}14771478} // if (opencl_is_available() && m_params.m_use_opencl)14791480if (use_cpu)1481{1482const uint32_t N = 128;1483for (uint32_t cluster_index_iter = 0; cluster_index_iter < m_endpoint_clusters.size(); cluster_index_iter += N)1484{1485const uint32_t first_index = cluster_index_iter;1486const uint32_t last_index = minimum<uint32_t>((uint32_t)m_endpoint_clusters.size(), cluster_index_iter + N);14871488m_params.m_pJob_pool->add_job([this, first_index, last_index, step] {14891490for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)1491{1492const basisu::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];14931494BASISU_FRONTEND_VERIFY(cluster_indices.size());14951496const uint32_t total_pixels = (uint32_t)cluster_indices.size() * 8;14971498basisu::vector<color_rgba> cluster_pixels(total_pixels);14991500for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)1501{1502const uint32_t block_index = cluster_indices[cluster_indices_iter] >> 1;1503const uint32_t subblock_index = cluster_indices[cluster_indices_iter] & 1;15041505const bool flipped = true;15061507const color_rgba* pBlock_pixels = get_source_pixel_block(block_index).get_ptr();15081509for (uint32_t pixel_index = 0; pixel_index < 8; pixel_index++)1510{1511const color_rgba& c = 
pBlock_pixels[g_etc1_pixel_indices[flipped][subblock_index][pixel_index]];1512cluster_pixels[cluster_indices_iter * 8 + pixel_index] = c;1513}1514}15151516endpoint_cluster_etc_params new_subblock_params;15171518{1519etc1_optimizer optimizer;1520etc1_solution_coordinates solutions[2];15211522etc1_optimizer::params cluster_optimizer_params;1523cluster_optimizer_params.m_num_src_pixels = total_pixels;1524cluster_optimizer_params.m_pSrc_pixels = &cluster_pixels[0];15251526cluster_optimizer_params.m_use_color4 = false;1527cluster_optimizer_params.m_perceptual = m_params.m_perceptual;15281529if (m_params.m_compression_level <= 1)1530cluster_optimizer_params.m_quality = cETCQualityMedium;1531else if (m_params.m_compression_level == BASISU_MAX_COMPRESSION_LEVEL)1532cluster_optimizer_params.m_quality = cETCQualityUber;15331534etc1_optimizer::results cluster_optimizer_results;15351536basisu::vector<uint8_t> cluster_selectors(total_pixels);1537cluster_optimizer_results.m_n = total_pixels;1538cluster_optimizer_results.m_pSelectors = &cluster_selectors[0];15391540optimizer.init(cluster_optimizer_params, cluster_optimizer_results);15411542if (!optimizer.compute())1543BASISU_FRONTEND_VERIFY(false);15441545new_subblock_params.m_color_unscaled[0] = cluster_optimizer_results.m_block_color_unscaled;1546new_subblock_params.m_inten_table[0] = cluster_optimizer_results.m_block_inten_table;1547new_subblock_params.m_color_error[0] = cluster_optimizer_results.m_error;1548}15491550endpoint_cluster_etc_params& prev_etc_params = m_endpoint_cluster_etc_params[cluster_index];15511552bool use_new_subblock_params = false;1553if ((!step) || (!prev_etc_params.m_valid))1554use_new_subblock_params = true;1555else1556{1557assert(prev_etc_params.m_valid);15581559uint64_t total_prev_err = 0;15601561{1562color_rgba block_colors[4];15631564etc_block::get_block_colors5(block_colors, prev_etc_params.m_color_unscaled[0], prev_etc_params.m_inten_table[0], false);15651566uint64_t total_err = 0;15671568for 
(uint32_t i = 0; i < total_pixels; i++)1569{1570const color_rgba& c = cluster_pixels[i];15711572uint64_t best_err = UINT64_MAX;1573//uint32_t best_index = 0;15741575for (uint32_t s = 0; s < 4; s++)1576{1577uint64_t err = color_distance(m_params.m_perceptual, c, block_colors[s], false);1578if (err < best_err)1579{1580best_err = err;1581//best_index = s;1582}1583}15841585total_err += best_err;1586}15871588total_prev_err += total_err;1589}15901591// See if we should update this cluster's endpoints (if the error has actually fallen)1592if (total_prev_err > new_subblock_params.m_color_error[0])1593{1594use_new_subblock_params = true;1595}1596}15971598if (use_new_subblock_params)1599{1600new_subblock_params.m_valid = true;16011602prev_etc_params = new_subblock_params;1603}16041605} // cluster_index16061607});16081609} // cluster_index_iter16101611m_params.m_pJob_pool->wait_for_all();1612}16131614debug_printf("Elapsed time: %3.3f secs\n", tm.get_elapsed_secs());1615}16161617bool basisu_frontend::check_etc1s_constraints() const1618{1619basisu::vector<vec2U> block_clusters(m_total_blocks);16201621for (int cluster_index = 0; cluster_index < static_cast<int>(m_endpoint_clusters.size()); cluster_index++)1622{1623const basisu::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];16241625for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)1626{1627const uint32_t block_index = cluster_indices[cluster_indices_iter] >> 1;1628const uint32_t subblock_index = cluster_indices[cluster_indices_iter] & 1;16291630block_clusters[block_index][subblock_index] = cluster_index;16311632} // cluster_indices_iter1633}16341635for (uint32_t i = 0; i < m_total_blocks; i++)1636{1637if (block_clusters[i][0] != block_clusters[i][1])1638return false;1639}16401641return true;1642}16431644// For each block, determine which ETC1S endpoint cluster can encode that block with lowest error.1645// This reassigns blocks to different 
endpoint clusters.1646uint32_t basisu_frontend::refine_endpoint_clusterization()1647{1648debug_printf("refine_endpoint_clusterization\n");16491650if (m_use_hierarchical_endpoint_codebooks)1651compute_endpoint_clusters_within_each_parent_cluster();16521653// Note: It's possible that an endpoint cluster may live in more than one parent cluster after the first refinement step.16541655basisu::vector<vec2U> block_clusters(m_total_blocks);16561657for (int cluster_index = 0; cluster_index < static_cast<int>(m_endpoint_clusters.size()); cluster_index++)1658{1659const basisu::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];16601661for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)1662{1663const uint32_t block_index = cluster_indices[cluster_indices_iter] >> 1;1664const uint32_t subblock_index = cluster_indices[cluster_indices_iter] & 1;16651666block_clusters[block_index][subblock_index] = cluster_index;16671668} // cluster_indices_iter1669}16701671//----------------------------------------------------------16721673// Create a new endpoint clusterization16741675interval_timer tm;1676tm.start();16771678uint_vec best_cluster_indices(m_total_blocks);16791680bool use_cpu = true;1681// TODO: Support non-hierarchical endpoint codebooks here1682if (m_params.m_pOpenCL_context && m_use_hierarchical_endpoint_codebooks)1683{1684// For the OpenCL kernel, we order the parent endpoint clusters by smallest to largest for efficiency.1685// We also prepare an array of block info structs that point into this new parent endpoint cluster array.1686const uint32_t total_parent_clusters = (uint32_t)m_endpoint_clusters_within_each_parent_cluster.size();16871688basisu::vector<cl_block_info_struct> cl_block_info_structs(m_total_blocks);16891690// the size of each parent cluster, in total clusters1691uint_vec parent_cluster_sizes(total_parent_clusters);1692for (uint32_t i = 0; i < total_parent_clusters; 
i++)1693parent_cluster_sizes[i] = (uint32_t)m_endpoint_clusters_within_each_parent_cluster[i].size();16941695uint_vec first_parent_cluster_ofs(total_parent_clusters);1696uint32_t cur_ofs = 0;1697for (uint32_t i = 0; i < total_parent_clusters; i++)1698{1699first_parent_cluster_ofs[i] = cur_ofs;17001701cur_ofs += parent_cluster_sizes[i];1702}17031704// Note: total_actual_endpoint_clusters is not necessarly equal to m_endpoint_clusters.size(), because clusters may live in multiple parent clusters after the first refinement step.1705BASISU_FRONTEND_VERIFY(cur_ofs >= m_endpoint_clusters.size());1706const uint32_t total_actual_endpoint_clusters = cur_ofs;1707basisu::vector<cl_endpoint_cluster_struct> cl_endpoint_cluster_structs(total_actual_endpoint_clusters);17081709for (uint32_t i = 0; i < total_parent_clusters; i++)1710{1711const uint32_t dst_ofs = first_parent_cluster_ofs[i];17121713const uint32_t parent_cluster_size = parent_cluster_sizes[i];17141715assert(m_endpoint_clusters_within_each_parent_cluster[i].size() == parent_cluster_size);17161717for (uint32_t j = 0; j < parent_cluster_size; j++)1718{1719const uint32_t endpoint_cluster_index = m_endpoint_clusters_within_each_parent_cluster[i][j];17201721color_rgba cluster_etc_base_color(m_endpoint_cluster_etc_params[endpoint_cluster_index].m_color_unscaled[0]);1722uint32_t cluster_etc_inten = m_endpoint_cluster_etc_params[endpoint_cluster_index].m_inten_table[0];17231724cl_endpoint_cluster_structs[dst_ofs + j].m_unscaled_color = cluster_etc_base_color;1725cl_endpoint_cluster_structs[dst_ofs + j].m_etc_inten = (uint8_t)cluster_etc_inten;1726cl_endpoint_cluster_structs[dst_ofs + j].m_cluster_index = (uint16_t)endpoint_cluster_index;1727}1728}17291730for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)1731{1732const uint32_t block_parent_endpoint_cluster_index = m_block_parent_endpoint_cluster[block_index];17331734cl_block_info_structs[block_index].m_num_clusters = 
(uint16_t)(parent_cluster_sizes[block_parent_endpoint_cluster_index]);1735cl_block_info_structs[block_index].m_first_cluster_ofs = (uint16_t)(first_parent_cluster_ofs[block_parent_endpoint_cluster_index]);17361737const uint32_t block_cluster_index = block_clusters[block_index][0];1738cl_block_info_structs[block_index].m_cur_cluster_index = (uint16_t)block_cluster_index;1739cl_block_info_structs[block_index].m_cur_cluster_etc_inten = (uint8_t)m_endpoint_cluster_etc_params[block_cluster_index].m_inten_table[0];1740}17411742uint_vec block_cluster_indices(m_total_blocks);1743for (uint32_t i = 0; i < m_total_blocks; i++)1744block_cluster_indices[i] = block_clusters[i][0];17451746uint_vec sorted_block_indices(m_total_blocks);1747indirect_sort(m_total_blocks, sorted_block_indices.data(), block_cluster_indices.data());17481749bool status = opencl_refine_endpoint_clusterization(1750m_params.m_pOpenCL_context,1751cl_block_info_structs.data(),1752total_actual_endpoint_clusters,1753cl_endpoint_cluster_structs.data(),1754sorted_block_indices.data(),1755best_cluster_indices.data(),1756m_params.m_perceptual);17571758if (status)1759{1760use_cpu = false;1761}1762else1763{1764error_printf("basisu_frontend::refine_endpoint_clusterization: opencl_refine_endpoint_clusterization() failed! 
Using CPU.\n");1765m_params.m_pOpenCL_context = nullptr;1766m_opencl_failed = true;1767}1768}17691770if (use_cpu)1771{1772const uint32_t N = 1024;1773for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)1774{1775const uint32_t first_index = block_index_iter;1776const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);17771778m_params.m_pJob_pool->add_job([this, first_index, last_index, &best_cluster_indices, &block_clusters] {17791780for (uint32_t block_index = first_index; block_index < last_index; block_index++)1781{1782const uint32_t cluster_index = block_clusters[block_index][0];1783BASISU_FRONTEND_VERIFY(cluster_index == block_clusters[block_index][1]);17841785const color_rgba* pSubblock_pixels = get_source_pixel_block(block_index).get_ptr();1786const uint32_t num_subblock_pixels = 16;17871788uint64_t best_cluster_err = INT64_MAX;1789uint32_t best_cluster_index = 0;17901791const uint32_t block_parent_endpoint_cluster_index = m_block_parent_endpoint_cluster.size() ? m_block_parent_endpoint_cluster[block_index] : 0;1792const uint_vec* pCluster_indices = m_endpoint_clusters_within_each_parent_cluster.size() ? &m_endpoint_clusters_within_each_parent_cluster[block_parent_endpoint_cluster_index] : nullptr;17931794const uint32_t total_clusters = m_use_hierarchical_endpoint_codebooks ? (uint32_t)pCluster_indices->size() : (uint32_t)m_endpoint_clusters.size();17951796for (uint32_t i = 0; i < total_clusters; i++)1797{1798const uint32_t cluster_iter = m_use_hierarchical_endpoint_codebooks ? 
(*pCluster_indices)[i] : i;17991800color_rgba cluster_etc_base_color(m_endpoint_cluster_etc_params[cluster_iter].m_color_unscaled[0]);1801uint32_t cluster_etc_inten = m_endpoint_cluster_etc_params[cluster_iter].m_inten_table[0];18021803uint64_t total_err = 0;18041805const uint32_t low_selector = 0;//subblock_etc_params_vec[j].m_low_selectors[0];1806const uint32_t high_selector = 3;//subblock_etc_params_vec[j].m_high_selectors[0];1807color_rgba subblock_colors[4];1808// Can't assign it here - may result in too much error when selector quant occurs1809if (cluster_etc_inten > m_endpoint_cluster_etc_params[cluster_index].m_inten_table[0])1810{1811total_err = INT64_MAX;1812goto skip_cluster;1813}18141815etc_block::get_block_colors5(subblock_colors, cluster_etc_base_color, cluster_etc_inten);18161817#if 01818for (uint32_t p = 0; p < num_subblock_pixels; p++)1819{1820uint64_t best_err = UINT64_MAX;18211822for (uint32_t r = low_selector; r <= high_selector; r++)1823{1824uint64_t err = color_distance(m_params.m_perceptual, pSubblock_pixels[p], subblock_colors[r], false);1825best_err = minimum(best_err, err);1826if (!best_err)1827break;1828}18291830total_err += best_err;1831if (total_err > best_cluster_err)1832break;1833} // p1834#else1835if (m_params.m_perceptual)1836{1837if (!g_cpu_supports_sse41)1838{1839for (uint32_t p = 0; p < num_subblock_pixels; p++)1840{1841uint64_t best_err = UINT64_MAX;18421843for (uint32_t r = low_selector; r <= high_selector; r++)1844{1845uint64_t err = color_distance(true, pSubblock_pixels[p], subblock_colors[r], false);1846best_err = minimum(best_err, err);1847if (!best_err)1848break;1849}18501851total_err += best_err;1852if (total_err > best_cluster_err)1853break;1854} // p1855}1856else1857{1858#if BASISU_SUPPORT_SSE1859find_lowest_error_perceptual_rgb_4_N_sse41((int64_t*)&total_err, subblock_colors, pSubblock_pixels, num_subblock_pixels, best_cluster_err);1860#endif1861}1862}1863else1864{1865if (!g_cpu_supports_sse41)1866{1867for (uint32_t p 
= 0; p < num_subblock_pixels; p++)1868{1869uint64_t best_err = UINT64_MAX;18701871for (uint32_t r = low_selector; r <= high_selector; r++)1872{1873uint64_t err = color_distance(false, pSubblock_pixels[p], subblock_colors[r], false);1874best_err = minimum(best_err, err);1875if (!best_err)1876break;1877}18781879total_err += best_err;1880if (total_err > best_cluster_err)1881break;1882} // p1883}1884else1885{1886#if BASISU_SUPPORT_SSE1887find_lowest_error_linear_rgb_4_N_sse41((int64_t*)&total_err, subblock_colors, pSubblock_pixels, num_subblock_pixels, best_cluster_err);1888#endif1889}1890}1891#endif18921893skip_cluster:1894if ((total_err < best_cluster_err) ||1895((cluster_iter == cluster_index) && (total_err == best_cluster_err)))1896{1897best_cluster_err = total_err;1898best_cluster_index = cluster_iter;18991900if (!best_cluster_err)1901break;1902}1903} // j19041905best_cluster_indices[block_index] = best_cluster_index;19061907} // block_index19081909});19101911} // block_index_iter19121913m_params.m_pJob_pool->wait_for_all();19141915} // use_cpu19161917debug_printf("refine_endpoint_clusterization time: %3.3f secs\n", tm.get_elapsed_secs());19181919basisu::vector<typename basisu::vector<uint32_t> > optimized_endpoint_clusters(m_endpoint_clusters.size());1920uint32_t total_subblocks_reassigned = 0;19211922for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)1923{1924const uint32_t training_vector_index = block_index * 2 + 0;19251926const uint32_t orig_cluster_index = block_clusters[block_index][0];1927const uint32_t best_cluster_index = best_cluster_indices[block_index];19281929optimized_endpoint_clusters[best_cluster_index].push_back(training_vector_index);1930optimized_endpoint_clusters[best_cluster_index].push_back(training_vector_index + 1);19311932if (best_cluster_index != orig_cluster_index)1933{1934total_subblocks_reassigned++;1935}1936}19371938debug_printf("total_subblocks_reassigned: %u\n", 
total_subblocks_reassigned);19391940m_endpoint_clusters = optimized_endpoint_clusters;19411942return total_subblocks_reassigned;1943}19441945void basisu_frontend::eliminate_redundant_or_empty_endpoint_clusters()1946{1947debug_printf("eliminate_redundant_or_empty_endpoint_clusters\n");19481949// Step 1: Sort endpoint clusters by the base colors/intens19501951uint_vec sorted_endpoint_cluster_indices(m_endpoint_clusters.size());1952for (uint32_t i = 0; i < m_endpoint_clusters.size(); i++)1953sorted_endpoint_cluster_indices[i] = i;19541955indirect_sort((uint32_t)m_endpoint_clusters.size(), &sorted_endpoint_cluster_indices[0], &m_endpoint_cluster_etc_params[0]);19561957basisu::vector<basisu::vector<uint32_t> > new_endpoint_clusters(m_endpoint_clusters.size());1958basisu::vector<endpoint_cluster_etc_params> new_subblock_etc_params(m_endpoint_clusters.size());19591960for (uint32_t i = 0; i < m_endpoint_clusters.size(); i++)1961{1962uint32_t j = sorted_endpoint_cluster_indices[i];1963new_endpoint_clusters[i] = m_endpoint_clusters[j];1964new_subblock_etc_params[i] = m_endpoint_cluster_etc_params[j];1965}19661967new_endpoint_clusters.swap(m_endpoint_clusters);1968new_subblock_etc_params.swap(m_endpoint_cluster_etc_params);19691970// Step 2: Eliminate redundant endpoint clusters, or empty endpoint clusters19711972new_endpoint_clusters.resize(0);1973new_subblock_etc_params.resize(0);19741975for (int i = 0; i < (int)m_endpoint_clusters.size(); )1976{1977if (!m_endpoint_clusters[i].size())1978{1979i++;1980continue;1981}19821983int j;1984for (j = i + 1; j < (int)m_endpoint_clusters.size(); j++)1985{1986if (!(m_endpoint_cluster_etc_params[i] == m_endpoint_cluster_etc_params[j]))1987break;1988}19891990new_endpoint_clusters.push_back(m_endpoint_clusters[i]);1991new_subblock_etc_params.push_back(m_endpoint_cluster_etc_params[i]);19921993for (int k = i + 1; k < j; k++)1994{1995append_vector(new_endpoint_clusters.back(), m_endpoint_clusters[k]);1996}19971998i = j;1999}20002001if 
(m_endpoint_clusters.size() != new_endpoint_clusters.size())2002{2003if (m_params.m_debug_stats)2004debug_printf("Eliminated %u redundant or empty clusters\n", (uint32_t)(m_endpoint_clusters.size() - new_endpoint_clusters.size()));20052006m_endpoint_clusters.swap(new_endpoint_clusters);20072008m_endpoint_cluster_etc_params.swap(new_subblock_etc_params);2009}2010}20112012void basisu_frontend::create_initial_packed_texture()2013{2014debug_printf("create_initial_packed_texture\n");20152016interval_timer tm;2017tm.start();20182019bool use_cpu = true;20202021if ((m_params.m_pOpenCL_context) && (opencl_is_available()))2022{2023basisu::vector<color_rgba> block_etc5_color_intens(m_total_blocks);20242025for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)2026{2027uint32_t cluster0 = m_block_endpoint_clusters_indices[block_index][0];20282029const color_rgba& color_unscaled = m_endpoint_cluster_etc_params[cluster0].m_color_unscaled[0];2030uint32_t inten = m_endpoint_cluster_etc_params[cluster0].m_inten_table[0];20312032block_etc5_color_intens[block_index].set(color_unscaled.r, color_unscaled.g, color_unscaled.b, inten);2033}20342035bool status = opencl_determine_selectors(m_params.m_pOpenCL_context, block_etc5_color_intens.data(),2036m_encoded_blocks.data(),2037m_params.m_perceptual);2038if (!status)2039{2040error_printf("basisu_frontend::create_initial_packed_texture: opencl_determine_selectors() failed! 
Using CPU.\n");2041m_params.m_pOpenCL_context = nullptr;2042m_opencl_failed = true;2043}2044else2045{2046use_cpu = false;2047}2048}20492050if (use_cpu)2051{2052const uint32_t N = 4096;2053for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)2054{2055const uint32_t first_index = block_index_iter;2056const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);20572058m_params.m_pJob_pool->add_job([this, first_index, last_index] {20592060for (uint32_t block_index = first_index; block_index < last_index; block_index++)2061{2062uint32_t cluster0 = m_block_endpoint_clusters_indices[block_index][0];2063uint32_t cluster1 = m_block_endpoint_clusters_indices[block_index][1];2064BASISU_FRONTEND_VERIFY(cluster0 == cluster1);20652066const color_rgba* pSource_pixels = get_source_pixel_block(block_index).get_ptr();20672068etc_block& blk = m_encoded_blocks[block_index];20692070color_rgba unscaled[2] = { m_endpoint_cluster_etc_params[cluster0].m_color_unscaled[0], m_endpoint_cluster_etc_params[cluster1].m_color_unscaled[0] };2071uint32_t inten[2] = { m_endpoint_cluster_etc_params[cluster0].m_inten_table[0], m_endpoint_cluster_etc_params[cluster1].m_inten_table[0] };20722073blk.set_block_color5(unscaled[0], unscaled[1]);2074blk.set_flip_bit(true);20752076blk.set_inten_table(0, inten[0]);2077blk.set_inten_table(1, inten[1]);20782079blk.determine_selectors(pSource_pixels, m_params.m_perceptual);20802081} // block_index20822083});20842085} // block_index_iter20862087m_params.m_pJob_pool->wait_for_all();20882089} // use_cpu20902091m_orig_encoded_blocks = m_encoded_blocks;20922093debug_printf("Elapsed time: %3.3f secs\n", tm.get_elapsed_secs());2094}20952096void basisu_frontend::compute_selector_clusters_within_each_parent_cluster()2097{2098uint_vec block_selector_cluster_indices(m_total_blocks);20992100for (int cluster_index = 0; cluster_index < static_cast<int>(m_selector_cluster_block_indices.size()); 
cluster_index++)2101{2102const basisu::vector<uint32_t>& cluster_indices = m_selector_cluster_block_indices[cluster_index];21032104for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)2105{2106const uint32_t block_index = cluster_indices[cluster_indices_iter];21072108block_selector_cluster_indices[block_index] = cluster_index;21092110} // cluster_indices_iter21112112} // cluster_index21132114m_selector_clusters_within_each_parent_cluster.resize(0);2115m_selector_clusters_within_each_parent_cluster.resize(m_selector_parent_cluster_block_indices.size());21162117for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)2118{2119const uint32_t cluster_index = block_selector_cluster_indices[block_index];2120const uint32_t parent_cluster_index = m_block_parent_selector_cluster[block_index];21212122m_selector_clusters_within_each_parent_cluster[parent_cluster_index].push_back(cluster_index);2123}21242125for (uint32_t i = 0; i < m_selector_clusters_within_each_parent_cluster.size(); i++)2126{2127uint_vec &cluster_indices = m_selector_clusters_within_each_parent_cluster[i];21282129BASISU_FRONTEND_VERIFY(cluster_indices.size());21302131vector_sort(cluster_indices);21322133auto last = std::unique(cluster_indices.begin(), cluster_indices.end());2134cluster_indices.erase(last, cluster_indices.end());2135}2136}21372138void basisu_frontend::generate_selector_clusters()2139{2140debug_printf("generate_selector_clusters\n");21412142typedef tree_vector_quant<vec16F> vec16F_clusterizer;21432144vec16F_clusterizer::array_of_weighted_training_vecs training_vecs(m_total_blocks);21452146const uint32_t N = 4096;2147for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)2148{2149const uint32_t first_index = block_index_iter;2150const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);21512152m_params.m_pJob_pool->add_job( [this, first_index, last_index, 
&training_vecs] {21532154for (uint32_t block_index = first_index; block_index < last_index; block_index++)2155{2156const etc_block &blk = m_encoded_blocks[block_index];21572158vec16F v;2159for (uint32_t y = 0; y < 4; y++)2160for (uint32_t x = 0; x < 4; x++)2161v[x + y * 4] = static_cast<float>(blk.get_selector(x, y));21622163const uint32_t subblock_index = (blk.get_inten_table(0) > blk.get_inten_table(1)) ? 0 : 1;21642165color_rgba block_colors[2];2166blk.get_block_low_high_colors(block_colors, subblock_index);21672168const uint32_t dist = color_distance(m_params.m_perceptual, block_colors[0], block_colors[1], false);21692170const uint32_t cColorDistToWeight = 300;2171const uint32_t cMaxWeight = 4096;2172uint32_t weight = clamp<uint32_t>(dist / cColorDistToWeight, 1, cMaxWeight);21732174training_vecs[block_index].first = v;2175training_vecs[block_index].second = weight;21762177} // block_index21782179} );21802181} // block_index_iter21822183m_params.m_pJob_pool->wait_for_all();21842185vec16F_clusterizer selector_clusterizer;2186for (uint32_t i = 0; i < m_total_blocks; i++)2187selector_clusterizer.add_training_vec(training_vecs[i].first, training_vecs[i].second);21882189const int selector_parent_codebook_size = (m_params.m_compression_level <= 1) ? BASISU_SELECTOR_PARENT_CODEBOOK_SIZE_COMP_LEVEL_01 : BASISU_SELECTOR_PARENT_CODEBOOK_SIZE_COMP_LEVEL_DEFAULT;2190const uint32_t parent_codebook_size = (m_params.m_max_selector_clusters >= 256) ? selector_parent_codebook_size : 0;2191debug_printf("Using selector parent codebook size %u\n", parent_codebook_size);21922193uint32_t max_threads = 0;2194max_threads = m_params.m_multithreaded ? 
minimum<int>(std::thread::hardware_concurrency(), cMaxCodebookCreationThreads) : 0;2195if (m_params.m_pJob_pool)2196max_threads = minimum<int>((int)m_params.m_pJob_pool->get_total_threads(), max_threads);21972198bool status = generate_hierarchical_codebook_threaded(selector_clusterizer,2199m_params.m_max_selector_clusters, m_use_hierarchical_selector_codebooks ? parent_codebook_size : 0,2200m_selector_cluster_block_indices,2201m_selector_parent_cluster_block_indices,2202max_threads, m_params.m_pJob_pool, false);2203BASISU_FRONTEND_VERIFY(status);22042205if (m_use_hierarchical_selector_codebooks)2206{2207if (!m_selector_parent_cluster_block_indices.size())2208{2209m_selector_parent_cluster_block_indices.resize(0);2210m_selector_parent_cluster_block_indices.resize(1);2211for (uint32_t i = 0; i < m_total_blocks; i++)2212m_selector_parent_cluster_block_indices[0].push_back(i);2213}22142215BASISU_ASSUME(BASISU_SELECTOR_PARENT_CODEBOOK_SIZE_COMP_LEVEL_01 <= UINT8_MAX);2216BASISU_ASSUME(BASISU_SELECTOR_PARENT_CODEBOOK_SIZE_COMP_LEVEL_DEFAULT <= UINT8_MAX);22172218m_block_parent_selector_cluster.resize(0);2219m_block_parent_selector_cluster.resize(m_total_blocks);2220vector_set_all(m_block_parent_selector_cluster, 0xFF);22212222for (uint32_t parent_cluster_index = 0; parent_cluster_index < m_selector_parent_cluster_block_indices.size(); parent_cluster_index++)2223{2224const uint_vec &cluster = m_selector_parent_cluster_block_indices[parent_cluster_index];2225for (uint32_t j = 0; j < cluster.size(); j++)2226m_block_parent_selector_cluster[cluster[j]] = static_cast<uint8_t>(parent_cluster_index);2227}2228for (uint32_t i = 0; i < m_total_blocks; i++)2229{2230BASISU_FRONTEND_VERIFY(m_block_parent_selector_cluster[i] != 0xFF);2231}22322233// Ensure that all the blocks within each cluster are all in the same parent cluster, or something is very wrong.2234for (uint32_t cluster_index = 0; cluster_index < m_selector_cluster_block_indices.size(); cluster_index++)2235{2236const 
uint_vec &cluster = m_selector_cluster_block_indices[cluster_index];22372238uint32_t parent_cluster_index = 0;2239for (uint32_t j = 0; j < cluster.size(); j++)2240{2241const uint32_t block_index = cluster[j];2242if (!j)2243{2244parent_cluster_index = m_block_parent_selector_cluster[block_index];2245}2246else2247{2248BASISU_FRONTEND_VERIFY(m_block_parent_selector_cluster[block_index] == parent_cluster_index);2249}2250}2251}2252}22532254debug_printf("Total selector clusters: %u, total parent selector clusters: %u\n", (uint32_t)m_selector_cluster_block_indices.size(), (uint32_t)m_selector_parent_cluster_block_indices.size());2255}22562257void basisu_frontend::create_optimized_selector_codebook(uint32_t iter)2258{2259debug_printf("create_optimized_selector_codebook\n");22602261interval_timer tm;2262tm.start();22632264const uint32_t total_selector_clusters = (uint32_t)m_selector_cluster_block_indices.size();22652266debug_printf("Total selector clusters (from m_selector_cluster_block_indices.size()): %u\n", (uint32_t)m_selector_cluster_block_indices.size());22672268m_optimized_cluster_selectors.resize(total_selector_clusters);22692270// For each selector codebook entry, and for each of the 4x4 selectors, determine which selector minimizes the error across all the blocks that use that quantized selector.2271const uint32_t N = 256;2272for (uint32_t cluster_index_iter = 0; cluster_index_iter < total_selector_clusters; cluster_index_iter += N)2273{2274const uint32_t first_index = cluster_index_iter;2275const uint32_t last_index = minimum<uint32_t>((uint32_t)total_selector_clusters, cluster_index_iter + N);22762277m_params.m_pJob_pool->add_job([this, first_index, last_index] {22782279for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)2280{2281const basisu::vector<uint32_t>& cluster_block_indices = m_selector_cluster_block_indices[cluster_index];22822283if (!cluster_block_indices.size())2284continue;22852286uint64_t overall_best_err = 
0;2287(void)overall_best_err;22882289uint64_t total_err[4][4][4];2290clear_obj(total_err);22912292for (uint32_t cluster_block_index = 0; cluster_block_index < cluster_block_indices.size(); cluster_block_index++)2293{2294const uint32_t block_index = cluster_block_indices[cluster_block_index];22952296const etc_block& blk = m_encoded_blocks[block_index];22972298color_rgba blk_colors[4];2299blk.get_block_colors(blk_colors, 0);23002301for (uint32_t y = 0; y < 4; y++)2302{2303for (uint32_t x = 0; x < 4; x++)2304{2305const color_rgba& orig_color = get_source_pixel_block(block_index)(x, y);23062307if (m_params.m_perceptual)2308{2309for (uint32_t s = 0; s < 4; s++)2310total_err[y][x][s] += color_distance(true, blk_colors[s], orig_color, false);2311}2312else2313{2314for (uint32_t s = 0; s < 4; s++)2315total_err[y][x][s] += color_distance(false, blk_colors[s], orig_color, false);2316}2317} // x2318} // y23192320} // cluster_block_index23212322for (uint32_t y = 0; y < 4; y++)2323{2324for (uint32_t x = 0; x < 4; x++)2325{2326uint64_t best_err = total_err[y][x][0];2327uint8_t best_sel = 0;23282329for (uint32_t s = 1; s < 4; s++)2330{2331if (total_err[y][x][s] < best_err)2332{2333best_err = total_err[y][x][s];2334best_sel = (uint8_t)s;2335}2336}23372338m_optimized_cluster_selectors[cluster_index].set_selector(x, y, best_sel);23392340overall_best_err += best_err;2341} // x2342} // y23432344} // cluster_index23452346});23472348} // cluster_index_iter23492350m_params.m_pJob_pool->wait_for_all();23512352debug_printf("Elapsed time: %3.3f secs\n", tm.get_elapsed_secs());23532354if (m_params.m_debug_images)2355{2356uint32_t max_selector_cluster_size = 0;23572358for (uint32_t i = 0; i < m_selector_cluster_block_indices.size(); i++)2359max_selector_cluster_size = maximum<uint32_t>(max_selector_cluster_size, (uint32_t)m_selector_cluster_block_indices[i].size());23602361if ((max_selector_cluster_size * 5) < 32768)2362{2363const uint32_t x_spacer_len = 16;2364image 
selector_cluster_vis(x_spacer_len + max_selector_cluster_size * 5, (uint32_t)m_selector_cluster_block_indices.size() * 5);23652366for (uint32_t selector_cluster_index = 0; selector_cluster_index < m_selector_cluster_block_indices.size(); selector_cluster_index++)2367{2368const basisu::vector<uint32_t> &cluster_block_indices = m_selector_cluster_block_indices[selector_cluster_index];23692370for (uint32_t y = 0; y < 4; y++)2371for (uint32_t x = 0; x < 4; x++)2372selector_cluster_vis.set_clipped(x_spacer_len + x - 12, selector_cluster_index * 5 + y, color_rgba((m_optimized_cluster_selectors[selector_cluster_index].get_selector(x, y) * 255) / 3));23732374for (uint32_t i = 0; i < cluster_block_indices.size(); i++)2375{2376uint32_t block_index = cluster_block_indices[i];23772378const etc_block &blk = m_orig_encoded_blocks[block_index];23792380for (uint32_t y = 0; y < 4; y++)2381for (uint32_t x = 0; x < 4; x++)2382selector_cluster_vis.set_clipped(x_spacer_len + x + 5 * i, selector_cluster_index * 5 + y, color_rgba((blk.get_selector(x, y) * 255) / 3));2383}2384}23852386char buf[256];2387snprintf(buf, sizeof(buf), "selector_cluster_vis_%u.png", iter);2388save_png(buf, selector_cluster_vis);2389}2390}2391}23922393// For each block: Determine which quantized selectors best encode that block, given its quantized endpoints.2394// Note that this method may leave some empty clusters (i.e. 
arrays with no block indices), including at the end.2395void basisu_frontend::find_optimal_selector_clusters_for_each_block()2396{2397debug_printf("find_optimal_selector_clusters_for_each_block\n");23982399interval_timer tm;2400tm.start();24012402if (m_params.m_validate)2403{2404// Sanity checks2405BASISU_FRONTEND_VERIFY(m_selector_cluster_block_indices.size() == m_optimized_cluster_selectors.size());2406for (uint32_t i = 0; i < m_selector_clusters_within_each_parent_cluster.size(); i++)2407{2408for (uint32_t j = 0; j < m_selector_clusters_within_each_parent_cluster[i].size(); j++)2409{2410BASISU_FRONTEND_VERIFY(m_selector_clusters_within_each_parent_cluster[i][j] < m_optimized_cluster_selectors.size());2411}2412}2413}24142415m_block_selector_cluster_index.resize(m_total_blocks);24162417if (m_params.m_compression_level == 0)2418{2419// Just leave the blocks in their original selector clusters.2420for (uint32_t selector_cluster_index = 0; selector_cluster_index < m_selector_cluster_block_indices.size(); selector_cluster_index++)2421{2422for (uint32_t j = 0; j < m_selector_cluster_block_indices[selector_cluster_index].size(); j++)2423{2424const uint32_t block_index = m_selector_cluster_block_indices[selector_cluster_index][j];24252426m_block_selector_cluster_index[block_index] = selector_cluster_index;24272428etc_block& blk = m_encoded_blocks[block_index];2429blk.set_raw_selector_bits(m_optimized_cluster_selectors[selector_cluster_index].get_raw_selector_bits());2430}2431}24322433debug_printf("Elapsed time: %3.3f secs\n", tm.get_elapsed_secs());24342435return;2436}24372438bool use_cpu = true;24392440if ((m_params.m_pOpenCL_context) && m_use_hierarchical_selector_codebooks)2441{2442const uint32_t num_parent_clusters = m_selector_clusters_within_each_parent_cluster.size_u32();24432444basisu::vector<fosc_selector_struct> selector_structs;2445selector_structs.reserve(m_optimized_cluster_selectors.size());24462447uint_vec 
parent_selector_cluster_offsets(num_parent_clusters);24482449uint_vec selector_cluster_indices;2450selector_cluster_indices.reserve(m_optimized_cluster_selectors.size());24512452uint32_t cur_ofs = 0;2453for (uint32_t parent_index = 0; parent_index < num_parent_clusters; parent_index++)2454{2455parent_selector_cluster_offsets[parent_index] = cur_ofs;24562457for (uint32_t j = 0; j < m_selector_clusters_within_each_parent_cluster[parent_index].size(); j++)2458{2459const uint32_t selector_cluster_index = m_selector_clusters_within_each_parent_cluster[parent_index][j];24602461uint32_t sel_bits = 0;2462for (uint32_t p = 0; p < 16; p++)2463sel_bits |= (m_optimized_cluster_selectors[selector_cluster_index].get_selector(p & 3, p >> 2) << (p * 2));24642465selector_structs.enlarge(1)->m_packed_selectors = sel_bits;24662467selector_cluster_indices.push_back(selector_cluster_index);2468}24692470cur_ofs += m_selector_clusters_within_each_parent_cluster[parent_index].size_u32();2471}24722473const uint32_t total_input_selectors = cur_ofs;24742475basisu::vector<fosc_block_struct> block_structs(m_total_blocks);2476for (uint32_t i = 0; i < m_total_blocks; i++)2477{2478const uint32_t parent_selector_cluster = m_block_parent_selector_cluster[i];24792480const etc_block& blk = m_encoded_blocks[i];2481blk.unpack_color5(block_structs[i].m_etc_color5_inten, blk.get_base5_color(), false);24822483block_structs[i].m_etc_color5_inten.a = (uint8_t)blk.get_inten_table(0);2484block_structs[i].m_first_selector = parent_selector_cluster_offsets[parent_selector_cluster];2485block_structs[i].m_num_selectors = m_selector_clusters_within_each_parent_cluster[parent_selector_cluster].size_u32();2486}24872488uint_vec output_selector_cluster_indices(m_total_blocks);24892490bool status = 
opencl_find_optimal_selector_clusters_for_each_block(2491m_params.m_pOpenCL_context,2492block_structs.data(),2493total_input_selectors,2494selector_structs.data(),2495selector_cluster_indices.data(),2496output_selector_cluster_indices.data(),2497m_params.m_perceptual);24982499if (!status)2500{2501error_printf("basisu_frontend::find_optimal_selector_clusters_for_each_block: opencl_find_optimal_selector_clusters_for_each_block() failed! Using CPU.\n");2502m_params.m_pOpenCL_context = nullptr;2503m_opencl_failed = true;2504}2505else2506{2507for (uint32_t i = 0; i < m_selector_cluster_block_indices.size(); i++)2508{2509m_selector_cluster_block_indices[i].resize(0);2510m_selector_cluster_block_indices[i].reserve(128);2511}25122513for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)2514{2515etc_block& blk = m_encoded_blocks[block_index];25162517uint32_t best_cluster_index = output_selector_cluster_indices[block_index];25182519blk.set_raw_selector_bits(m_optimized_cluster_selectors[best_cluster_index].get_raw_selector_bits());25202521m_block_selector_cluster_index[block_index] = best_cluster_index;25222523vector_ensure_element_is_valid(m_selector_cluster_block_indices, best_cluster_index);2524m_selector_cluster_block_indices[best_cluster_index].push_back(block_index);2525}25262527use_cpu = false;2528}2529}25302531if (use_cpu)2532{2533basisu::vector<uint8_t> unpacked_optimized_cluster_selectors(16 * m_optimized_cluster_selectors.size());2534for (uint32_t cluster_index = 0; cluster_index < m_optimized_cluster_selectors.size(); cluster_index++)2535{2536for (uint32_t y = 0; y < 4; y++)2537{2538for (uint32_t x = 0; x < 4; x++)2539{2540unpacked_optimized_cluster_selectors[cluster_index * 16 + y * 4 + x] = (uint8_t)m_optimized_cluster_selectors[cluster_index].get_selector(x, y);2541}2542}2543}25442545const uint32_t N = 2048;2546for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)2547{2548const uint32_t first_index = 
block_index_iter;2549const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);25502551m_params.m_pJob_pool->add_job( [this, first_index, last_index, &unpacked_optimized_cluster_selectors] {25522553int prev_best_cluster_index = 0;25542555for (uint32_t block_index = first_index; block_index < last_index; block_index++)2556{2557const pixel_block& block = get_source_pixel_block(block_index);25582559etc_block& blk = m_encoded_blocks[block_index];25602561if ((block_index > first_index) && (block == get_source_pixel_block(block_index - 1)))2562{2563blk.set_raw_selector_bits(m_optimized_cluster_selectors[prev_best_cluster_index].get_raw_selector_bits());25642565m_block_selector_cluster_index[block_index] = prev_best_cluster_index;25662567continue;2568}25692570const color_rgba* pBlock_pixels = block.get_ptr();25712572color_rgba trial_block_colors[4];2573blk.get_block_colors_etc1s(trial_block_colors);25742575// precompute errors for the i-th block pixel and selector sel: [sel][i]2576uint32_t trial_errors[4][16];25772578if (m_params.m_perceptual)2579{2580for (uint32_t sel = 0; sel < 4; ++sel)2581for (uint32_t i = 0; i < 16; ++i)2582trial_errors[sel][i] = color_distance(true, pBlock_pixels[i], trial_block_colors[sel], false);2583}2584else2585{2586for (uint32_t sel = 0; sel < 4; ++sel)2587for (uint32_t i = 0; i < 16; ++i)2588trial_errors[sel][i] = color_distance(false, pBlock_pixels[i], trial_block_colors[sel], false);2589}25902591// Compute the minimum possible errors (given any selectors) for pixels 0-152592uint64_t min_possible_error_0_15 = 0;2593for (uint32_t i = 0; i < 16; i++)2594min_possible_error_0_15 += basisu::minimum(trial_errors[0][i], trial_errors[1][i], trial_errors[2][i], trial_errors[3][i]);25952596// Compute the minimum possible errors (given any selectors) for pixels 4-152597uint64_t min_possible_error_4_15 = 0;2598for (uint32_t i = 4; i < 16; i++)2599min_possible_error_4_15 += basisu::minimum(trial_errors[0][i], trial_errors[1][i], 
trial_errors[2][i], trial_errors[3][i]);26002601// Compute the minimum possible errors (given any selectors) for pixels 8-152602uint64_t min_possible_error_8_15 = 0;2603for (uint32_t i = 8; i < 16; i++)2604min_possible_error_8_15 += basisu::minimum(trial_errors[0][i], trial_errors[1][i], trial_errors[2][i], trial_errors[3][i]);26052606// Compute the minimum possible errors (given any selectors) for pixels 12-152607uint64_t min_possible_error_12_15 = 0;2608for (uint32_t i = 12; i < 16; i++)2609min_possible_error_12_15 += basisu::minimum(trial_errors[0][i], trial_errors[1][i], trial_errors[2][i], trial_errors[3][i]);26102611uint64_t best_cluster_err = INT64_MAX;2612uint32_t best_cluster_index = 0;26132614const uint32_t parent_selector_cluster = m_block_parent_selector_cluster.size() ? m_block_parent_selector_cluster[block_index] : 0;2615const uint_vec *pCluster_indices = m_selector_clusters_within_each_parent_cluster.size() ? &m_selector_clusters_within_each_parent_cluster[parent_selector_cluster] : nullptr;26162617const uint32_t total_clusters = m_use_hierarchical_selector_codebooks ? (uint32_t)pCluster_indices->size() : (uint32_t)m_selector_cluster_block_indices.size();26182619#if 02620for (uint32_t cluster_iter = 0; cluster_iter < total_clusters; cluster_iter++)2621{2622const uint32_t cluster_index = m_use_hierarchical_selector_codebooks ? 
(*pCluster_indices)[cluster_iter] : cluster_iter;26232624const etc_block& cluster_blk = m_optimized_cluster_selectors[cluster_index];26252626uint64_t trial_err = 0;2627for (int y = 0; y < 4; y++)2628{2629for (int x = 0; x < 4; x++)2630{2631const uint32_t sel = cluster_blk.get_selector(x, y);26322633trial_err += color_distance(m_params.m_perceptual, trial_block_colors[sel], pBlock_pixels[x + y * 4], false);2634if (trial_err > best_cluster_err)2635goto early_out;2636}2637}26382639if (trial_err < best_cluster_err)2640{2641best_cluster_err = trial_err;2642best_cluster_index = cluster_index;2643if (!best_cluster_err)2644break;2645}26462647early_out:2648;2649}2650#else2651for (uint32_t cluster_iter = 0; cluster_iter < total_clusters; cluster_iter++)2652{2653const uint32_t cluster_index = m_use_hierarchical_selector_codebooks ? (*pCluster_indices)[cluster_iter] : cluster_iter;26542655const uint8_t* pSels = &unpacked_optimized_cluster_selectors[cluster_index * 16];26562657uint64_t trial_err = (uint64_t)trial_errors[pSels[0]][0] + trial_errors[pSels[1]][1] + trial_errors[pSels[2]][2] + trial_errors[pSels[3]][3];2658if ((trial_err + min_possible_error_4_15) >= best_cluster_err)2659continue;26602661trial_err += (uint64_t)trial_errors[pSels[4]][4] + trial_errors[pSels[5]][5] + trial_errors[pSels[6]][6] + trial_errors[pSels[7]][7];2662if ((trial_err + min_possible_error_8_15) >= best_cluster_err)2663continue;26642665trial_err += (uint64_t)trial_errors[pSels[8]][8] + trial_errors[pSels[9]][9] + trial_errors[pSels[10]][10] + trial_errors[pSels[11]][11];2666if ((trial_err + min_possible_error_12_15) >= best_cluster_err)2667continue;26682669trial_err += (uint64_t)trial_errors[pSels[12]][12] + trial_errors[pSels[13]][13] + trial_errors[pSels[14]][14] + trial_errors[pSels[15]][15];26702671if (trial_err < best_cluster_err)2672{2673best_cluster_err = trial_err;2674best_cluster_index = cluster_index;2675if (best_cluster_err == min_possible_error_0_15)2676break;2677}26782679} // 
cluster_iter2680#endif26812682blk.set_raw_selector_bits(m_optimized_cluster_selectors[best_cluster_index].get_raw_selector_bits());26832684m_block_selector_cluster_index[block_index] = best_cluster_index;26852686prev_best_cluster_index = best_cluster_index;26872688} // block_index26892690} );26912692} // block_index_iter26932694m_params.m_pJob_pool->wait_for_all();26952696for (uint32_t i = 0; i < m_selector_cluster_block_indices.size(); i++)2697{2698m_selector_cluster_block_indices[i].resize(0);2699m_selector_cluster_block_indices[i].reserve(128);2700}27012702for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)2703{2704const uint32_t best_cluster_index = m_block_selector_cluster_index[block_index];27052706vector_ensure_element_is_valid(m_selector_cluster_block_indices, best_cluster_index);2707m_selector_cluster_block_indices[best_cluster_index].push_back(block_index);2708}27092710} // if (use_cpu)27112712debug_printf("Elapsed time: %3.3f secs\n", tm.get_elapsed_secs());2713}27142715// TODO: Remove old ETC1 specific stuff, and thread this.2716uint32_t basisu_frontend::refine_block_endpoints_given_selectors()2717{2718debug_printf("refine_block_endpoints_given_selectors\n");27192720for (int block_index = 0; block_index < static_cast<int>(m_total_blocks); block_index++)2721{2722//uint32_t selector_cluster = m_block_selector_cluster_index(block_x, block_y);2723vec2U &endpoint_clusters = m_block_endpoint_clusters_indices[block_index];27242725m_endpoint_cluster_etc_params[endpoint_clusters[0]].m_subblocks.push_back(block_index * 2);27262727m_endpoint_cluster_etc_params[endpoint_clusters[1]].m_subblocks.push_back(block_index * 2 + 1);2728}27292730uint32_t total_subblocks_refined = 0;2731uint32_t total_subblocks_examined = 0;27322733for (uint32_t endpoint_cluster_index = 0; endpoint_cluster_index < m_endpoint_cluster_etc_params.size(); endpoint_cluster_index++)2734{2735endpoint_cluster_etc_params &subblock_params = 
m_endpoint_cluster_etc_params[endpoint_cluster_index];27362737const uint_vec &subblocks = subblock_params.m_subblocks;2738//uint32_t total_pixels = subblock.m_subblocks.size() * 8;27392740basisu::vector<color_rgba> subblock_colors[2]; // [use_individual_mode]2741uint8_vec subblock_selectors[2];27422743uint64_t cur_subblock_err[2] = { 0, 0 };27442745for (uint32_t subblock_iter = 0; subblock_iter < subblocks.size(); subblock_iter++)2746{2747uint32_t training_vector_index = subblocks[subblock_iter];27482749uint32_t block_index = training_vector_index >> 1;2750uint32_t subblock_index = training_vector_index & 1;2751const bool is_flipped = true;27522753const etc_block &blk = m_encoded_blocks[block_index];27542755const bool use_individual_mode = !blk.get_diff_bit();27562757const color_rgba *pSource_block_pixels = get_source_pixel_block(block_index).get_ptr();27582759color_rgba unpacked_block_pixels[16];2760unpack_etc1(blk, unpacked_block_pixels);27612762for (uint32_t i = 0; i < 8; i++)2763{2764const uint32_t pixel_index = g_etc1_pixel_indices[is_flipped][subblock_index][i];2765const etc_coord2 &coords = g_etc1_pixel_coords[is_flipped][subblock_index][i];27662767subblock_colors[use_individual_mode].push_back(pSource_block_pixels[pixel_index]);27682769cur_subblock_err[use_individual_mode] += color_distance(m_params.m_perceptual, pSource_block_pixels[pixel_index], unpacked_block_pixels[pixel_index], false);27702771subblock_selectors[use_individual_mode].push_back(static_cast<uint8_t>(blk.get_selector(coords.m_x, coords.m_y)));2772}2773} // subblock_iter27742775etc1_optimizer::results cluster_optimizer_results[2];2776bool results_valid[2] = { false, false };27772778clear_obj(cluster_optimizer_results);27792780basisu::vector<uint8_t> cluster_selectors[2];27812782for (uint32_t use_individual_mode = 0; use_individual_mode < 2; use_individual_mode++)2783{2784const uint32_t total_pixels = (uint32_t)subblock_colors[use_individual_mode].size();27852786if 
(!total_pixels)2787continue;27882789total_subblocks_examined += total_pixels / 8;27902791etc1_optimizer optimizer;2792etc1_solution_coordinates solutions[2];27932794etc1_optimizer::params cluster_optimizer_params;2795cluster_optimizer_params.m_num_src_pixels = total_pixels;2796cluster_optimizer_params.m_pSrc_pixels = &subblock_colors[use_individual_mode][0];27972798cluster_optimizer_params.m_use_color4 = use_individual_mode != 0;2799cluster_optimizer_params.m_perceptual = m_params.m_perceptual;28002801cluster_optimizer_params.m_pForce_selectors = &subblock_selectors[use_individual_mode][0];2802cluster_optimizer_params.m_quality = cETCQualityUber;28032804cluster_selectors[use_individual_mode].resize(total_pixels);28052806cluster_optimizer_results[use_individual_mode].m_n = total_pixels;2807cluster_optimizer_results[use_individual_mode].m_pSelectors = &cluster_selectors[use_individual_mode][0];28082809optimizer.init(cluster_optimizer_params, cluster_optimizer_results[use_individual_mode]);28102811if (!optimizer.compute())2812continue;28132814if (cluster_optimizer_results[use_individual_mode].m_error < cur_subblock_err[use_individual_mode])2815results_valid[use_individual_mode] = true;28162817} // use_individual_mode28182819for (uint32_t use_individual_mode = 0; use_individual_mode < 2; use_individual_mode++)2820{2821if (!results_valid[use_individual_mode])2822continue;28232824uint32_t num_passes = use_individual_mode ? 
1 : 2;28252826bool all_passed5 = true;28272828for (uint32_t pass = 0; pass < num_passes; pass++)2829{2830for (uint32_t subblock_iter = 0; subblock_iter < subblocks.size(); subblock_iter++)2831{2832const uint32_t training_vector_index = subblocks[subblock_iter];28332834const uint32_t block_index = training_vector_index >> 1;2835const uint32_t subblock_index = training_vector_index & 1;2836//const bool is_flipped = true;28372838etc_block &blk = m_encoded_blocks[block_index];28392840if (!blk.get_diff_bit() != static_cast<bool>(use_individual_mode != 0))2841continue;28422843if (use_individual_mode)2844{2845blk.set_base4_color(subblock_index, etc_block::pack_color4(cluster_optimizer_results[1].m_block_color_unscaled, false));2846blk.set_inten_table(subblock_index, cluster_optimizer_results[1].m_block_inten_table);28472848subblock_params.m_color_error[1] = cluster_optimizer_results[1].m_error;2849subblock_params.m_inten_table[1] = cluster_optimizer_results[1].m_block_inten_table;2850subblock_params.m_color_unscaled[1] = cluster_optimizer_results[1].m_block_color_unscaled;28512852total_subblocks_refined++;2853}2854else2855{2856const uint16_t base_color5 = blk.get_base5_color();2857const uint16_t delta_color3 = blk.get_delta3_color();28582859uint32_t r[2], g[2], b[2];2860etc_block::unpack_color5(r[0], g[0], b[0], base_color5, false);2861bool success = etc_block::unpack_color5(r[1], g[1], b[1], base_color5, delta_color3, false);2862assert(success);2863BASISU_NOTE_UNUSED(success);28642865r[subblock_index] = cluster_optimizer_results[0].m_block_color_unscaled.r;2866g[subblock_index] = cluster_optimizer_results[0].m_block_color_unscaled.g;2867b[subblock_index] = cluster_optimizer_results[0].m_block_color_unscaled.b;28682869color_rgba colors[2] = { color_rgba(r[0], g[0], b[0], 255), color_rgba(r[1], g[1], b[1], 255) };28702871if (!etc_block::try_pack_color5_delta3(colors))2872{2873all_passed5 = false;2874break;2875}28762877if ((pass == 1) && 
(all_passed5))2878{2879blk.set_block_color5(colors[0], colors[1]);2880blk.set_inten_table(subblock_index, cluster_optimizer_results[0].m_block_inten_table);28812882subblock_params.m_color_error[0] = cluster_optimizer_results[0].m_error;2883subblock_params.m_inten_table[0] = cluster_optimizer_results[0].m_block_inten_table;2884subblock_params.m_color_unscaled[0] = cluster_optimizer_results[0].m_block_color_unscaled;28852886total_subblocks_refined++;2887}2888}28892890} // subblock_iter28912892} // pass28932894} // use_individual_mode28952896} // endpoint_cluster_index28972898if (m_params.m_debug_stats)2899debug_printf("Total subblock endpoints refined: %u (%3.1f%%)\n", total_subblocks_refined, total_subblocks_refined * 100.0f / total_subblocks_examined);29002901return total_subblocks_refined;2902}29032904void basisu_frontend::dump_endpoint_clusterization_visualization(const char *pFilename, bool vis_endpoint_colors)2905{2906debug_printf("dump_endpoint_clusterization_visualization\n");29072908uint32_t max_endpoint_cluster_size = 0;29092910basisu::vector<uint32_t> cluster_sizes(m_endpoint_clusters.size());2911basisu::vector<uint32_t> sorted_cluster_indices(m_endpoint_clusters.size());2912for (uint32_t i = 0; i < m_endpoint_clusters.size(); i++)2913{2914max_endpoint_cluster_size = maximum<uint32_t>(max_endpoint_cluster_size, (uint32_t)m_endpoint_clusters[i].size());2915cluster_sizes[i] = (uint32_t)m_endpoint_clusters[i].size();2916}29172918if (!max_endpoint_cluster_size)2919return;29202921for (uint32_t i = 0; i < m_endpoint_clusters.size(); i++)2922sorted_cluster_indices[i] = i;29232924//indexed_heap_sort(endpoint_clusters.size(), cluster_sizes.get_ptr(), sorted_cluster_indices.get_ptr());29252926image endpoint_cluster_vis(12 + minimum<uint32_t>(max_endpoint_cluster_size, 2048) * 5, (uint32_t)m_endpoint_clusters.size() * 3);29272928for (uint32_t unsorted_cluster_iter = 0; unsorted_cluster_iter < m_endpoint_clusters.size(); unsorted_cluster_iter++)2929{2930const uint32_t 
cluster_iter = sorted_cluster_indices[unsorted_cluster_iter];29312932etc_block blk;2933blk.clear();2934blk.set_flip_bit(false);2935blk.set_diff_bit(true);2936blk.set_inten_tables_etc1s(m_endpoint_cluster_etc_params[cluster_iter].m_inten_table[0]);2937blk.set_base5_color(etc_block::pack_color5(m_endpoint_cluster_etc_params[cluster_iter].m_color_unscaled[0], false));29382939color_rgba blk_colors[4];2940blk.get_block_colors(blk_colors, 0);2941for (uint32_t i = 0; i < 4; i++)2942endpoint_cluster_vis.fill_box(i * 2, 3 * unsorted_cluster_iter, 2, 2, blk_colors[i]);29432944for (uint32_t subblock_iter = 0; subblock_iter < m_endpoint_clusters[cluster_iter].size(); subblock_iter++)2945{2946uint32_t training_vector_index = m_endpoint_clusters[cluster_iter][subblock_iter];29472948const uint32_t block_index = training_vector_index >> 1;2949const uint32_t subblock_index = training_vector_index & 1;29502951const etc_block& blk2 = m_etc1_blocks_etc1s[block_index];29522953const color_rgba *pBlock_pixels = get_source_pixel_block(block_index).get_ptr();29542955color_rgba subblock_pixels[8];29562957if (vis_endpoint_colors)2958{2959color_rgba colors[2];2960blk2.get_block_low_high_colors(colors, subblock_index);2961for (uint32_t i = 0; i < 8; i++)2962subblock_pixels[i] = colors[subblock_index];2963}2964else2965{2966for (uint32_t i = 0; i < 8; i++)2967subblock_pixels[i] = pBlock_pixels[g_etc1_pixel_indices[blk2.get_flip_bit()][subblock_index][i]];2968}29692970endpoint_cluster_vis.set_block_clipped(subblock_pixels, 12 + 5 * subblock_iter, 3 * unsorted_cluster_iter, 4, 2);2971}2972}29732974save_png(pFilename, endpoint_cluster_vis);2975debug_printf("Wrote debug visualization file %s\n", pFilename);2976}29772978void basisu_frontend::finalize()2979{2980for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)2981{2982for (uint32_t subblock_index = 0; subblock_index < 2; subblock_index++)2983{2984const uint32_t endpoint_cluster_index = 
get_subblock_endpoint_cluster_index(block_index, subblock_index);29852986m_endpoint_cluster_etc_params[endpoint_cluster_index].m_color_used[0] = true;2987}2988}2989}29902991// The backend has remapped the block endpoints while optimizing the output symbols for better rate distortion performance, so let's go and reoptimize the endpoint codebook.2992// This is currently the only place where the backend actually goes and changes the quantization and calls the frontend to fix things up.2993// This is basically a bottom up clusterization stage, where some leaves can be combined.2994void basisu_frontend::reoptimize_remapped_endpoints(const uint_vec &new_block_endpoints, int_vec &old_to_new_endpoint_cluster_indices, bool optimize_final_codebook, uint_vec *pBlock_selector_indices)2995{2996debug_printf("reoptimize_remapped_endpoints\n");29972998basisu::vector<uint_vec> new_endpoint_cluster_block_indices(m_endpoint_clusters.size());2999for (uint32_t i = 0; i < new_block_endpoints.size(); i++)3000new_endpoint_cluster_block_indices[new_block_endpoints[i]].push_back(i);30013002basisu::vector<uint8_t> cluster_valid(new_endpoint_cluster_block_indices.size());3003basisu::vector<uint8_t> cluster_improved(new_endpoint_cluster_block_indices.size());30043005const uint32_t N = 256;3006for (uint32_t cluster_index_iter = 0; cluster_index_iter < new_endpoint_cluster_block_indices.size(); cluster_index_iter += N)3007{3008const uint32_t first_index = cluster_index_iter;3009const uint32_t last_index = minimum<uint32_t>((uint32_t)new_endpoint_cluster_block_indices.size(), cluster_index_iter + N);30103011m_params.m_pJob_pool->add_job( [this, first_index, last_index, &cluster_improved, &cluster_valid, &new_endpoint_cluster_block_indices, &pBlock_selector_indices ] {30123013for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)3014{3015const basisu::vector<uint32_t>& cluster_block_indices = new_endpoint_cluster_block_indices[cluster_index];30163017if 
(!cluster_block_indices.size())3018continue;30193020const uint32_t total_pixels = (uint32_t)cluster_block_indices.size() * 16;30213022basisu::vector<color_rgba> cluster_pixels(total_pixels);3023uint8_vec force_selectors(total_pixels);30243025etc_block blk;3026blk.set_block_color5_etc1s(get_endpoint_cluster_unscaled_color(cluster_index, false));3027blk.set_inten_tables_etc1s(get_endpoint_cluster_inten_table(cluster_index, false));3028blk.set_flip_bit(true);30293030uint64_t cur_err = 0;30313032for (uint32_t cluster_block_indices_iter = 0; cluster_block_indices_iter < cluster_block_indices.size(); cluster_block_indices_iter++)3033{3034const uint32_t block_index = cluster_block_indices[cluster_block_indices_iter];30353036const color_rgba *pBlock_pixels = get_source_pixel_block(block_index).get_ptr();30373038memcpy(&cluster_pixels[cluster_block_indices_iter * 16], pBlock_pixels, 16 * sizeof(color_rgba));30393040const uint32_t selector_cluster_index = pBlock_selector_indices ? (*pBlock_selector_indices)[block_index] : get_block_selector_cluster_index(block_index);30413042const etc_block &blk_selectors = get_selector_cluster_selector_bits(selector_cluster_index);30433044blk.set_raw_selector_bits(blk_selectors.get_raw_selector_bits());30453046cur_err += blk.evaluate_etc1_error(pBlock_pixels, m_params.m_perceptual);30473048for (uint32_t y = 0; y < 4; y++)3049for (uint32_t x = 0; x < 4; x++)3050force_selectors[cluster_block_indices_iter * 16 + x + y * 4] = static_cast<uint8_t>(blk_selectors.get_selector(x, y));3051}30523053endpoint_cluster_etc_params new_endpoint_cluster_etc_params;30543055{3056etc1_optimizer optimizer;3057etc1_solution_coordinates solutions[2];30583059etc1_optimizer::params cluster_optimizer_params;3060cluster_optimizer_params.m_num_src_pixels = total_pixels;3061cluster_optimizer_params.m_pSrc_pixels = &cluster_pixels[0];30623063cluster_optimizer_params.m_use_color4 = false;3064cluster_optimizer_params.m_perceptual = 
m_params.m_perceptual;3065cluster_optimizer_params.m_pForce_selectors = &force_selectors[0];30663067if (m_params.m_compression_level == BASISU_MAX_COMPRESSION_LEVEL)3068cluster_optimizer_params.m_quality = cETCQualityUber;3069else3070cluster_optimizer_params.m_quality = cETCQualitySlow;30713072etc1_optimizer::results cluster_optimizer_results;30733074basisu::vector<uint8_t> cluster_selectors(total_pixels);3075cluster_optimizer_results.m_n = total_pixels;3076cluster_optimizer_results.m_pSelectors = &cluster_selectors[0];30773078optimizer.init(cluster_optimizer_params, cluster_optimizer_results);30793080if (!optimizer.compute())3081BASISU_FRONTEND_VERIFY(false);30823083new_endpoint_cluster_etc_params.m_color_unscaled[0] = cluster_optimizer_results.m_block_color_unscaled;3084new_endpoint_cluster_etc_params.m_inten_table[0] = cluster_optimizer_results.m_block_inten_table;3085new_endpoint_cluster_etc_params.m_color_error[0] = cluster_optimizer_results.m_error;3086new_endpoint_cluster_etc_params.m_color_used[0] = true;3087new_endpoint_cluster_etc_params.m_valid = true;3088}30893090if (new_endpoint_cluster_etc_params.m_color_error[0] < cur_err)3091{3092m_endpoint_cluster_etc_params[cluster_index] = new_endpoint_cluster_etc_params;30933094cluster_improved[cluster_index] = true;3095}30963097cluster_valid[cluster_index] = true;30983099} // cluster_index31003101} );31023103} // cluster_index_iter31043105m_params.m_pJob_pool->wait_for_all();31063107uint32_t total_unused_clusters = 0;3108uint32_t total_improved_clusters = 0;31093110old_to_new_endpoint_cluster_indices.resize(m_endpoint_clusters.size());3111vector_set_all(old_to_new_endpoint_cluster_indices, -1);31123113int total_new_endpoint_clusters = 0;31143115for (uint32_t old_cluster_index = 0; old_cluster_index < m_endpoint_clusters.size(); old_cluster_index++)3116{3117if (!cluster_valid[old_cluster_index])3118total_unused_clusters++;3119else3120old_to_new_endpoint_cluster_indices[old_cluster_index] = 
total_new_endpoint_clusters++;31213122if (cluster_improved[old_cluster_index])3123total_improved_clusters++;3124}31253126debug_printf("Total unused clusters: %u\n", total_unused_clusters);3127debug_printf("Total improved_clusters: %u\n", total_improved_clusters);3128debug_printf("Total endpoint clusters: %u\n", total_new_endpoint_clusters);31293130if (optimize_final_codebook)3131{3132cluster_subblock_etc_params_vec new_endpoint_cluster_etc_params(total_new_endpoint_clusters);31333134for (uint32_t old_cluster_index = 0; old_cluster_index < m_endpoint_clusters.size(); old_cluster_index++)3135{3136if (old_to_new_endpoint_cluster_indices[old_cluster_index] >= 0)3137new_endpoint_cluster_etc_params[old_to_new_endpoint_cluster_indices[old_cluster_index]] = m_endpoint_cluster_etc_params[old_cluster_index];3138}31393140debug_printf("basisu_frontend::reoptimize_remapped_endpoints: stage 1\n");31413142basisu::vector<uint_vec> new_endpoint_clusters(total_new_endpoint_clusters);31433144for (uint32_t block_index = 0; block_index < new_block_endpoints.size(); block_index++)3145{3146const uint32_t old_endpoint_cluster_index = new_block_endpoints[block_index];31473148const int new_endpoint_cluster_index = old_to_new_endpoint_cluster_indices[old_endpoint_cluster_index];3149BASISU_FRONTEND_VERIFY(new_endpoint_cluster_index >= 0);31503151BASISU_FRONTEND_VERIFY(new_endpoint_cluster_index < (int)new_endpoint_clusters.size());31523153new_endpoint_clusters[new_endpoint_cluster_index].push_back(block_index * 2 + 0);3154new_endpoint_clusters[new_endpoint_cluster_index].push_back(block_index * 2 + 1);31553156BASISU_FRONTEND_VERIFY(new_endpoint_cluster_index < (int)new_endpoint_cluster_etc_params.size());31573158new_endpoint_cluster_etc_params[new_endpoint_cluster_index].m_subblocks.push_back(block_index * 2 + 0);3159new_endpoint_cluster_etc_params[new_endpoint_cluster_index].m_subblocks.push_back(block_index * 2 + 1);31603161m_block_endpoint_clusters_indices[block_index][0] = 
new_endpoint_cluster_index;3162m_block_endpoint_clusters_indices[block_index][1] = new_endpoint_cluster_index;3163}31643165debug_printf("basisu_frontend::reoptimize_remapped_endpoints: stage 2\n");31663167m_endpoint_clusters = new_endpoint_clusters;3168m_endpoint_cluster_etc_params = new_endpoint_cluster_etc_params;31693170eliminate_redundant_or_empty_endpoint_clusters();31713172debug_printf("basisu_frontend::reoptimize_remapped_endpoints: stage 3\n");31733174for (uint32_t new_cluster_index = 0; new_cluster_index < m_endpoint_clusters.size(); new_cluster_index++)3175{3176for (uint32_t cluster_block_iter = 0; cluster_block_iter < m_endpoint_clusters[new_cluster_index].size(); cluster_block_iter++)3177{3178const uint32_t subblock_index = m_endpoint_clusters[new_cluster_index][cluster_block_iter];3179const uint32_t block_index = subblock_index >> 1;31803181m_block_endpoint_clusters_indices[block_index][0] = new_cluster_index;3182m_block_endpoint_clusters_indices[block_index][1] = new_cluster_index;31833184const uint32_t old_cluster_index = new_block_endpoints[block_index];31853186old_to_new_endpoint_cluster_indices[old_cluster_index] = new_cluster_index;3187}3188}31893190debug_printf("basisu_frontend::reoptimize_remapped_endpoints: stage 4\n");31913192for (uint32_t block_index = 0; block_index < m_encoded_blocks.size(); block_index++)3193{3194const uint32_t endpoint_cluster_index = get_subblock_endpoint_cluster_index(block_index, 0);31953196m_encoded_blocks[block_index].set_block_color5_etc1s(get_endpoint_cluster_unscaled_color(endpoint_cluster_index, false));3197m_encoded_blocks[block_index].set_inten_tables_etc1s(get_endpoint_cluster_inten_table(endpoint_cluster_index, false));3198}31993200debug_printf("Final (post-RDO) endpoint clusters: %u\n", m_endpoint_clusters.size());3201}32023203//debug_printf("validate_output: %u\n", validate_output());3204}32053206// Endpoint clusterization hierarchy integrity checker.3207// Note this doesn't check for empty 
clusters.3208bool basisu_frontend::validate_endpoint_cluster_hierarchy(bool ensure_clusters_have_same_parents) const3209{3210if (!m_endpoint_parent_clusters.size())3211return true;32123213int_vec subblock_parent_indices(m_total_blocks * 2);3214subblock_parent_indices.set_all(-1);32153216int_vec subblock_cluster_indices(m_total_blocks * 2);3217subblock_cluster_indices.set_all(-1);32183219for (uint32_t parent_index = 0; parent_index < m_endpoint_parent_clusters.size(); parent_index++)3220{3221for (uint32_t i = 0; i < m_endpoint_parent_clusters[parent_index].size(); i++)3222{3223uint32_t subblock_index = m_endpoint_parent_clusters[parent_index][i];3224if (subblock_index >= m_total_blocks * 2)3225return false;32263227// If the endpoint cluster lives in more than one parent node, that's wrong.3228if (subblock_parent_indices[subblock_index] != -1)3229return false;32303231subblock_parent_indices[subblock_index] = parent_index;3232}3233}32343235// Make sure all endpoint clusters are present in the parent cluster.3236for (uint32_t i = 0; i < subblock_parent_indices.size(); i++)3237{3238if (subblock_parent_indices[i] == -1)3239return false;3240}32413242for (uint32_t cluster_index = 0; cluster_index < m_endpoint_clusters.size(); cluster_index++)3243{3244int parent_index = 0;32453246for (uint32_t i = 0; i < m_endpoint_clusters[cluster_index].size(); i++)3247{3248uint32_t subblock_index = m_endpoint_clusters[cluster_index][i];3249if (subblock_index >= m_total_blocks * 2)3250return false;32513252if (subblock_cluster_indices[subblock_index] != -1)3253return false;32543255subblock_cluster_indices[subblock_index] = cluster_index;32563257// There are transformations on the endpoint clusters that can break the strict tree requirement3258if (ensure_clusters_have_same_parents)3259{3260// Make sure all the subblocks are in the same parent cluster3261if (!i)3262parent_index = subblock_parent_indices[subblock_index];3263else if (subblock_parent_indices[subblock_index] != 
parent_index)3264return false;3265}3266}3267}32683269// Make sure all endpoint clusters are present in the parent cluster.3270for (uint32_t i = 0; i < subblock_cluster_indices.size(); i++)3271{3272if (subblock_cluster_indices[i] == -1)3273return false;3274}32753276return true;3277}32783279// This is very slow and only intended for debugging/development. It's enabled using the "-validate_etc1s" command line option.3280bool basisu_frontend::validate_output() const3281{3282debug_printf("validate_output\n");32833284if (!check_etc1s_constraints())3285return false;32863287for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)3288{3289//#define CHECK(x) do { if (!(x)) { DebugBreak(); return false; } } while(0)3290#define CHECK(x) BASISU_FRONTEND_VERIFY(x);32913292CHECK(get_output_block(block_index).get_flip_bit() == true);32933294const bool diff_flag = get_diff_flag(block_index);3295CHECK(diff_flag == true);32963297etc_block blk;3298memset(&blk, 0, sizeof(blk));3299blk.set_flip_bit(true);3300blk.set_diff_bit(true);33013302const uint32_t endpoint_cluster0_index = get_subblock_endpoint_cluster_index(block_index, 0);3303const uint32_t endpoint_cluster1_index = get_subblock_endpoint_cluster_index(block_index, 1);33043305// basisu only supports ETC1S, so these must be equal.3306CHECK(endpoint_cluster0_index == endpoint_cluster1_index);33073308CHECK(blk.set_block_color5_check(get_endpoint_cluster_unscaled_color(endpoint_cluster0_index, false), get_endpoint_cluster_unscaled_color(endpoint_cluster1_index, false)));33093310CHECK(get_endpoint_cluster_color_is_used(endpoint_cluster0_index, false));33113312blk.set_inten_table(0, get_endpoint_cluster_inten_table(endpoint_cluster0_index, false));3313blk.set_inten_table(1, get_endpoint_cluster_inten_table(endpoint_cluster1_index, false));33143315const uint32_t selector_cluster_index = get_block_selector_cluster_index(block_index);3316CHECK(selector_cluster_index < 
get_total_selector_clusters());33173318CHECK(vector_find(get_selector_cluster_block_indices(selector_cluster_index), block_index) != -1);33193320blk.set_raw_selector_bits(get_selector_cluster_selector_bits(selector_cluster_index).get_raw_selector_bits());33213322const etc_block &rdo_output_block = get_output_block(block_index);33233324CHECK(rdo_output_block.get_flip_bit() == blk.get_flip_bit());3325CHECK(rdo_output_block.get_diff_bit() == blk.get_diff_bit());3326CHECK(rdo_output_block.get_inten_table(0) == blk.get_inten_table(0));3327CHECK(rdo_output_block.get_inten_table(1) == blk.get_inten_table(1));3328CHECK(rdo_output_block.get_base5_color() == blk.get_base5_color());3329CHECK(rdo_output_block.get_delta3_color() == blk.get_delta3_color());3330CHECK(rdo_output_block.get_raw_selector_bits() == blk.get_raw_selector_bits());33313332#undef CHECK3333}33343335return true;3336}33373338void basisu_frontend::dump_debug_image(const char *pFilename, uint32_t first_block, uint32_t num_blocks_x, uint32_t num_blocks_y, bool output_blocks)3339{3340gpu_image g;3341g.init(texture_format::cETC1, num_blocks_x * 4, num_blocks_y * 4);33423343for (uint32_t y = 0; y < num_blocks_y; y++)3344{3345for (uint32_t x = 0; x < num_blocks_x; x++)3346{3347const uint32_t block_index = first_block + x + y * num_blocks_x;33483349etc_block &blk = *(etc_block *)g.get_block_ptr(x, y);33503351if (output_blocks)3352blk = get_output_block(block_index);3353else3354{3355const bool diff_flag = get_diff_flag(block_index);33563357blk.set_diff_bit(diff_flag);3358blk.set_flip_bit(true);33593360const uint32_t endpoint_cluster0_index = get_subblock_endpoint_cluster_index(block_index, 0);3361const uint32_t endpoint_cluster1_index = get_subblock_endpoint_cluster_index(block_index, 1);33623363if (diff_flag)3364blk.set_block_color5(get_endpoint_cluster_unscaled_color(endpoint_cluster0_index, false), get_endpoint_cluster_unscaled_color(endpoint_cluster1_index, 
false));3365else3366blk.set_block_color4(get_endpoint_cluster_unscaled_color(endpoint_cluster0_index, true), get_endpoint_cluster_unscaled_color(endpoint_cluster1_index, true));33673368blk.set_inten_table(0, get_endpoint_cluster_inten_table(endpoint_cluster0_index, !diff_flag));3369blk.set_inten_table(1, get_endpoint_cluster_inten_table(endpoint_cluster1_index, !diff_flag));33703371const uint32_t selector_cluster_index = get_block_selector_cluster_index(block_index);3372blk.set_raw_selector_bits(get_selector_cluster_selector_bits(selector_cluster_index).get_raw_selector_bits());3373}3374}3375}33763377image img;3378g.unpack(img);33793380save_png(pFilename, img);3381}33823383} // namespace basisu3384338533863387