// Path: blob/master/thirdparty/basis_universal/encoder/basisu_astc_hdr_6x6_enc.cpp
// 21521 views
// File: basisu_astc_hdr_6x6_enc.cpp1#include "basisu_astc_hdr_6x6_enc.h"2#include "basisu_enc.h"3#include "basisu_astc_hdr_common.h"4#include "basisu_math.h"5#include "basisu_resampler.h"6#include "basisu_resampler_filters.h"78#define MINIZ_HEADER_FILE_ONLY9#define MINIZ_NO_ZLIB_COMPATIBLE_NAMES10#include "basisu_miniz.h"1112#include "3rdparty/android_astc_decomp.h"1314#include <array>1516using namespace basisu;17using namespace buminiz;18using namespace basist::astc_6x6_hdr;1920namespace astc_6x6_hdr21{2223static void atomic_max(std::atomic<uint32_t>& atomic_var, uint32_t new_value)24{25uint32_t current = atomic_var.load(std::memory_order_relaxed);26for ( ; ; )27{28uint32_t new_max = std::max(current, new_value);29if (atomic_var.compare_exchange_weak(current, new_max, std::memory_order_relaxed, std::memory_order_relaxed))30break;31}32}3334void astc_hdr_6x6_global_config::set_user_level(int level)35{36level = basisu::clamp<int>(level, 0, ASTC_HDR_6X6_MAX_USER_COMP_LEVEL);3738m_master_comp_level = 0;39m_highest_comp_level = 0;40m_num_reuse_xy_deltas = NUM_REUSE_XY_DELTAS;41m_extra_patterns_flag = false;42m_brute_force_partition_matching = false;4344switch (level)45{46case 0:47{48// Both reduce compression a lot when lambda>049m_favor_higher_compression = false;50m_num_reuse_xy_deltas = NUM_REUSE_XY_DELTAS / 2;51break;52}53case 1:54{55m_master_comp_level = 0;56m_highest_comp_level = 0;57break;58}59case 2:60{61m_master_comp_level = 0;62m_highest_comp_level = 1;63break;64}65case 3:66{67m_master_comp_level = 1;68m_highest_comp_level = 1;69break;70}71case 4:72{73m_master_comp_level = 1;74m_highest_comp_level = 2;75break;76}77case 5:78{79m_master_comp_level = 1;80m_highest_comp_level = 3;81break;82}83case 6:84{85m_master_comp_level = 1;86m_highest_comp_level = 4;87break;88}89case 7:90{91m_master_comp_level = 2;92m_highest_comp_level = 2;93break;94}95case 8:96{97m_master_comp_level = 2;98m_highest_comp_level = 3;99break;100}101case 9:102{103m_master_comp_level = 
2;104m_highest_comp_level = 4;105break;106}107case 10:108{109m_master_comp_level = 3;110m_highest_comp_level = 3;111break;112}113case 11:114{115m_master_comp_level = 3;116m_highest_comp_level = 4;117break;118}119case 12:120default:121{122m_master_comp_level = 4;123m_highest_comp_level = 4;124m_extra_patterns_flag = true;125m_brute_force_partition_matching = true;126break;127}128}129}130131const float m1 = 0.1593017578125f; // (2610 / 2^14) * (1/100)132const float m2 = 78.84375f; // (2523 / 32) * (1/100)133const float c1 = 0.8359375f; // 3424 / (2^12)134const float c2 = 18.8515625f; // (2413 / 128)135const float c3 = 18.6875f; // (2392 / 128)136137static float forwardPQ(float Y)138{139// 10,000 here is an absolute scale - it's in nits (cd per square meter)140float L = Y * (1.0f / 10000.0f);141142float num = powf(L, m1);143float N = powf((c1 + c2 * num) / (1 + c3 * num), m2);144145return N;146}147148#if 0149static float inversePQ(float E)150{151float N = powf(E, 1.0f / m2);152153float num = basisu::maximum<float>((N - c1), 0.0f) / (c2 - c3 * N);154float L = powf(num, 1.0f / m1);155156return L * 10000.0f;157}158#endif159160// PQ function approximation: convert input to bfloat16, look up in tables, bilinear interpolation between table entries.161// max_er: 0.000023007392883, max_rel_er: 0.000023472490284, avg_er: 0.000004330495689, 6-7x faster on x86162// Highest error is for values less than SMALLEST_PQ_VAL_IN.163//164// Approximation is round trip lossless for 10-12 bits at [0,10000] nits:165// for x [0,1024] (SCALE=1023) or for x [0,4095] (SCALE=4096):166// round(forwardPQTab(inversePQ(x / SCALE)) * SCALE) == x167//168// bfloat16 has enough precision to handle 8-bit sRGB to linear conversions:169// round(linear_to_srgb(bfloat16_to_float(float_to_bfloat16(srgb_to_linear(isRGB/255.0f))))*255.0) is lossless170171const int PQ_APPROX_MIN_EXP = -16, PQ_APPROX_MAX_EXP = 16;172const int PQ_APPROX_EXP_RANGE = (PQ_APPROX_MAX_EXP - PQ_APPROX_MIN_EXP + 1);173174const float 
SMALLEST_PQ_VAL_IN = 0.000015258829080f;175const float SMALLEST_PQ_VAL = 0.000551903737f; // forwardPQ(SMALLEST_PQ_VAL_IN)176177const float LARGEST_PQ_VAL = 1.251312f;178179float g_pq_approx_tabs[PQ_APPROX_EXP_RANGE][128];180181static void init_pq_tables()182{183for (int exp = PQ_APPROX_MIN_EXP; exp <= PQ_APPROX_MAX_EXP; exp++)184{185for (int mant = 0; mant < 128; mant++)186{187bfloat16 b = bfloat16_init(1, exp, mant);188float bf = bfloat16_to_float(b);189190float pq = forwardPQ(bf);191192g_pq_approx_tabs[exp - PQ_APPROX_MIN_EXP][mant] = pq;193}194}195196//fmt_printf("{.15} {.15}\n", g_pq_approx_tabs[0][0], inversePQ(g_pq_approx_tabs[0][0]));197//fmt_printf("{.15}\n", forwardPQ(SMALLEST_PQ_VAL_IN));198}199200static inline float forwardPQTab(float v)201{202assert(g_pq_approx_tabs[0][0]);203204assert(v >= 0.0f);205if (v == 0.0f)206return 0.0f;207208bfloat16 bf = float_to_bfloat16(v, false);209assert(v >= bfloat16_to_float(bf));210211int exp = bfloat16_get_exp(bf);212213if (exp < PQ_APPROX_MIN_EXP)214{215// not accurate but should be good enough for our uses216return lerp(0.0f, SMALLEST_PQ_VAL, minimum(1.0f, v / SMALLEST_PQ_VAL_IN));217}218else if (exp > PQ_APPROX_MAX_EXP)219return LARGEST_PQ_VAL;220221int mant = bfloat16_get_mantissa(bf);222223float a = g_pq_approx_tabs[exp - PQ_APPROX_MIN_EXP][mant];224float bf_f32 = bfloat16_to_float(bf);225226int next_mant = mant + 1;227int next_exp = exp;228if (next_mant == 128)229{230next_mant = 0;231next_exp++;232if (next_exp > PQ_APPROX_MAX_EXP)233return a;234}235236float b = g_pq_approx_tabs[next_exp - PQ_APPROX_MIN_EXP][next_mant];237238bfloat16 next_bf = bfloat16_init(1, next_exp, next_mant);239float next_bf_f32 = bfloat16_to_float(next_bf);240assert(v <= next_bf_f32);241242float lerp_factor = (v - bf_f32) / (next_bf_f32 - bf_f32);243assert((lerp_factor >= 0) && (lerp_factor <= 1.0f));244245return lerp(a, b, lerp_factor);246}247248// 100 nits = ~.5 i249// This converts absolute linear RGB light in either REC 709 or 
REC2020/BT2100 color gamut to ICtCp, a coding space where Ct is scaled by 2.250// To convert to perceptual ITP for error/distance calculations, multiply the result Ct by .5 (or set itp_flag to true).251// Assumes REC 709 input, or REC 2020/BT.2100 RGB input if rec2020_bt2100_color_gamut is true.252//253// ITP info:254// https://www.portrait.com/resource-center/ictcp-color-difference-metric/255// https://professional.dolby.com/siteassets/pdfs/measuringperceptualcolorvolume_v07.253.pdf (see scale to JND's)256// This also converts from a ICtCp coding space to threshold or perceptually uniform space ITP.257//258// Linear REC709 to REC2020/BT.2100 gamut conversion:259// rgb_2100[0] = rgb_in[0] * 0.6274f + rgb_in[1] * 0.3293f + rgb_in[2] * 0.0433f;260// rgb_2100[1] = rgb_in[0] * 0.0691f + rgb_in[1] * 0.9195f + rgb_in[2] * 0.0114f;261// rgb_2100[2] = rgb_in[0] * 0.0164f + rgb_in[1] * 0.0880f + rgb_in[2] * 0.8956f;262// const float S = 1.0f / 4096.0f;263// l = (1688.0f * S) * rgb_2100[0] + (2146.0f * S) * rgb_2100[1] + (262.0f * S) * rgb_2100[2];264// m = (683.0f * S) * rgb_2100[0] + (2951.0f * S) * rgb_2100[1] + (462.0f * S) * rgb_2100[2];265// s = (99.0f * S) * rgb_2100[0] + (309.0f * S) * rgb_2100[1] + (3688.0f * S) * rgb_2100[2];266static void linear_rgb_to_ictcp(const vec3F& rgb_in, vec3F& ictcp, bool itp_flag = false, bool rec2020_bt2100_color_gamut = false)267{268vec3F rgb_2100(rgb_in);269270float l, m, s;271if (!rec2020_bt2100_color_gamut)272{273// Assume REC 709 input color gamut274// (REC2020_to_LMS * REC709_to_2020) * input_color275l = rgb_2100[0] * 0.2958097f + rgb_2100[1] * 0.6230863f + rgb_2100[2] * 0.0811040f;276m = rgb_2100[0] * 0.1562512f + rgb_2100[1] * 0.7272980f + rgb_2100[2] * 0.1164508f;277s = rgb_2100[0] * 0.0351435f + rgb_2100[1] * 0.1565601f + rgb_2100[2] * 0.8082964f;278}279else280{281// Assumes REC2020/BT.2100 input color gamut (this is from the spec)282l = 0.412109375f * rgb_2100[0] + 0.52392578125f * rgb_2100[1] + 0.06396484375f * 
rgb_2100[2];283m = 0.166748046875f * rgb_2100[0] + 0.720458984375f * rgb_2100[1] + 0.11279296875f * rgb_2100[2];284s = 0.024169921875f * rgb_2100[0] + 0.075439453125f * rgb_2100[1] + 0.900390625f * rgb_2100[2];285}286287float ld = forwardPQTab(l);288float md = forwardPQTab(m);289float sd = forwardPQTab(s);290291ictcp[0] = .5f * ld + .5f * md;292293// if ITP scale Ct by .5 (the ICtCp spec scaled Ct to better exploit the full scaled output, which is not perceptually linear)294if (itp_flag)295ictcp[1] = 0.806884765625f * ld + -1.6617431640625f * md + 0.8548583984375f * sd;296else297ictcp[1] = 1.61376953125f * ld + -3.323486328125f * md + 1.709716796875f * sd;298299ictcp[2] = 4.378173828125f * ld + -4.24560546875f * md + -0.132568359375f * sd;300}301302static inline void linear_rgb_to_itp(const vec3F& rgb_in, vec3F& itp, const astc_hdr_6x6_global_config &cfg)303{304linear_rgb_to_ictcp(rgb_in, itp, true, cfg.m_rec2020_bt2100_color_gamut);305}306307#if 0308// Outputs rec2020/bt2100 color gamut (i.e. 
this doesn't convert back to REC709 gamut).309static void ictcp_to_linear_rgb(const vec3F& ictcp, vec3F& rgb, bool itp_flag = false)310{311float ct = ictcp[1];312313if (itp_flag)314ct *= 2.0f;315316float ld = ictcp[0] + ct * 0.008609037037932726f + ictcp[2] * 0.11102962500302596f;317float md = ictcp[0] + ct * -0.008609037037932726f + ictcp[2] * -0.11102962500302596f;318float sd = ictcp[0] + ct * 0.5600313357106792f + ictcp[2] * -0.32062717498731885f;319320float l = inversePQ(ld);321float m = inversePQ(md);322float s = inversePQ(sd);323324rgb[0] = l * 3.436606694333079f + m * -2.5064521186562705f + s * 0.06984542432319149f;325rgb[1] = l * -0.7913295555989289f + m * 1.983600451792291f + s * -0.192270896193362f;326rgb[2] = l * -0.025949899690592672f + m * -0.09891371471172646f + s * 1.1248636144023192f;327}328#endif329330struct half_vec3331{332basist::half_float m_vals[3];333334inline half_vec3() { }335336inline half_vec3(basist::half_float x, basist::half_float y, basist::half_float z)337{338m_vals[0] = x;339m_vals[1] = y;340m_vals[2] = z;341}342343inline half_vec3(const half_vec3& other)344{345*this = other;346}347348inline half_vec3& operator= (const half_vec3& rhs)349{350m_vals[0] = rhs.m_vals[0];351m_vals[1] = rhs.m_vals[1];352m_vals[2] = rhs.m_vals[2];353return *this;354}355356inline void clear()357{358clear_obj(m_vals);359}360361inline half_vec3 &set(basist::half_float x, basist::half_float y, basist::half_float z)362{363m_vals[0] = x;364m_vals[1] = y;365m_vals[2] = z;366return *this;367}368369inline half_vec3& set(float x, float y, float z)370{371m_vals[0] = basist::float_to_half(x);372m_vals[1] = basist::float_to_half(y);373m_vals[2] = basist::float_to_half(z);374return *this;375}376377template<typename T>378inline half_vec3& set_vec(const T& vec)379{380m_vals[0] = basist::float_to_half(vec[0]);381m_vals[1] = basist::float_to_half(vec[1]);382m_vals[2] = basist::float_to_half(vec[2]);383return *this;384}385386template<typename T>387inline T get_vec() 
const388{389return T(basist::half_to_float(m_vals[0]), basist::half_to_float(m_vals[1]), basist::half_to_float(m_vals[2]));390}391392inline basist::half_float operator[] (uint32_t c) const { assert(c < 3); return m_vals[c]; }393inline basist::half_float& operator[] (uint32_t c) { assert(c < 3); return m_vals[c]; }394395float get_float_comp(uint32_t c) const396{397assert(c < 3);398return basist::half_to_float(m_vals[c]);399}400401half_vec3& set_float_comp(uint32_t c, float v)402{403assert(c < 3);404m_vals[c] = basist::float_to_half(v);405return *this;406}407};408409struct half_vec4410{411basist::half_float m_vals[4];412413inline half_vec4() { }414415inline half_vec4(basist::half_float x, basist::half_float y, basist::half_float z, basist::half_float w)416{417m_vals[0] = x;418m_vals[1] = y;419m_vals[2] = z;420m_vals[3] = w;421}422423inline half_vec4(const half_vec4& other)424{425*this = other;426}427428inline half_vec4& operator= (const half_vec4& rhs)429{430m_vals[0] = rhs.m_vals[0];431m_vals[1] = rhs.m_vals[1];432m_vals[2] = rhs.m_vals[2];433m_vals[3] = rhs.m_vals[3];434return *this;435}436437inline void clear()438{439clear_obj(m_vals);440}441442inline half_vec4& set(basist::half_float x, basist::half_float y, basist::half_float z, basist::half_float w)443{444m_vals[0] = x;445m_vals[1] = y;446m_vals[2] = z;447m_vals[3] = w;448return *this;449}450451inline half_vec4& set(float x, float y, float z, float w)452{453m_vals[0] = basist::float_to_half(x);454m_vals[1] = basist::float_to_half(y);455m_vals[2] = basist::float_to_half(z);456m_vals[3] = basist::float_to_half(w);457return *this;458}459460template<typename T>461inline half_vec4& set_vec(const T& vec)462{463m_vals[0] = basist::float_to_half(vec[0]);464m_vals[1] = basist::float_to_half(vec[1]);465m_vals[2] = basist::float_to_half(vec[2]);466m_vals[3] = basist::float_to_half(vec[3]);467return *this;468}469470template<typename T>471inline T get_vec() const472{473return T(basist::half_to_float(m_vals[0]), 
basist::half_to_float(m_vals[1]), basist::half_to_float(m_vals[2]), basist::half_to_float(m_vals[3]));474}475476inline basist::half_float operator[] (uint32_t c) const { assert(c < 4); return m_vals[c]; }477inline basist::half_float &operator[] (uint32_t c) { assert(c < 4); return m_vals[c]; }478479float get_float_comp(uint32_t c) const480{481assert(c < 4);482return basist::half_to_float(m_vals[c]);483}484485half_vec4& set_float_comp(uint32_t c, float v)486{487assert(c < 4);488m_vals[c] = basist::float_to_half(v);489return *this;490}491};492493const uint32_t MAX_BLOCK_W = 6, MAX_BLOCK_H = 6;494495struct trial_result496{497astc_helpers::log_astc_block m_log_blk;498double m_err;499bool m_valid;500};501502//----------------------------------------------------------503504const uint32_t NUM_PART3_MAPPINGS = 6;505static uint8_t g_part3_mapping[NUM_PART3_MAPPINGS][3] =506{507{ 0, 1, 2 },508{ 1, 2, 0 },509{ 2, 0, 1 },510{ 0, 2, 1 },511{ 1, 0, 2 },512{ 2, 1, 0 }513};514515struct partition_pattern_vec516{517uint8_t m_parts[6 * 6];518519partition_pattern_vec()520{521clear();522}523524partition_pattern_vec(const partition_pattern_vec& other)525{526*this = other;527}528529void clear()530{531memset(m_parts, 0, sizeof(m_parts));532}533534partition_pattern_vec& operator= (const partition_pattern_vec& rhs)535{536if (this == &rhs)537return *this;538memcpy(m_parts, rhs.m_parts, 36);539return *this;540}541542uint8_t operator[] (uint32_t i) const { assert(i < 36); return m_parts[i]; }543uint8_t& operator[] (uint32_t i) { assert(i < 36); return m_parts[i]; }544545uint8_t operator() (uint32_t x, uint32_t y) const { assert((x < 6) && (y < 6)); return m_parts[x + y * 6]; }546uint8_t& operator() (uint32_t x, uint32_t y) { assert((x < 6) && (y < 6)); return m_parts[x + y * 6]; }547548int get_squared_distance(const partition_pattern_vec& other) const549{550int total_dist = 0;551for (uint32_t i = 0; i < 36; i++)552total_dist += iabs((int)m_parts[i] - (int)other.m_parts[i]);553return 
total_dist;554}555556float get_distance(const partition_pattern_vec& other) const557{558return sqrtf((float)get_squared_distance(other));559}560561partition_pattern_vec get_permuted2(uint32_t permute_index) const562{563assert(permute_index <= 1);564565partition_pattern_vec res;566for (uint32_t i = 0; i < 36; i++)567{568assert(m_parts[i] <= 1);569res.m_parts[i] = (uint8_t)(m_parts[i] ^ permute_index);570}571572return res;573}574575partition_pattern_vec get_permuted3(uint32_t permute_index) const576{577assert(permute_index <= 5);578579partition_pattern_vec res;580for (uint32_t i = 0; i < 36; i++)581{582assert(m_parts[i] <= 2);583res.m_parts[i] = g_part3_mapping[permute_index][m_parts[i]];584}585586return res;587}588589partition_pattern_vec get_canonicalized() const590{591partition_pattern_vec res;592593int new_labels[3] = { -1, -1, -1 };594uint32_t next_index = 0;595for (uint32_t i = 0; i < 36; i++)596{597uint32_t p = m_parts[i];598if (new_labels[p] == -1)599new_labels[p] = next_index++;600601res.m_parts[i] = (uint8_t)new_labels[p];602}603604return res;605}606607bool operator== (const partition_pattern_vec& rhs) const608{609return memcmp(m_parts, rhs.m_parts, sizeof(m_parts)) == 0;610}611612operator size_t() const613{614return basisu::hash_hsieh(m_parts, sizeof(m_parts));615}616};617618struct vp_tree_node619{620partition_pattern_vec m_vantage_point;621uint32_t m_point_index;622float m_dist;623624int m_inner_node, m_outer_node;625};626627#define BRUTE_FORCE_PART_SEARCH (0)628629class vp_tree630{631public:632vp_tree()633{634}635636void clear()637{638m_nodes.clear();639}640641// This requires no redundant patterns, i.e. 
all must be unique.642bool init(uint32_t n, const partition_pattern_vec* pUnique_pats)643{644clear();645646uint_vec pat_indices(n);647for (uint32_t i = 0; i < n; i++)648pat_indices[i] = i;649650std::pair<int, float> root_idx = find_best_vantage_point(n, pUnique_pats, pat_indices);651652if (root_idx.first == -1)653return false;654655m_nodes.resize(1);656m_nodes[0].m_vantage_point = pUnique_pats[root_idx.first];657m_nodes[0].m_point_index = root_idx.first;658m_nodes[0].m_dist = root_idx.second;659m_nodes[0].m_inner_node = -1;660m_nodes[0].m_outer_node = -1;661662uint_vec inner_list, outer_list;663664inner_list.reserve(n / 2);665outer_list.reserve(n / 2);666667for (uint32_t pat_index = 0; pat_index < n; pat_index++)668{669if ((int)pat_index == root_idx.first)670continue;671672const float dist = m_nodes[0].m_vantage_point.get_distance(pUnique_pats[pat_index]);673674if (dist <= root_idx.second)675inner_list.push_back(pat_index);676else677outer_list.push_back(pat_index);678}679680if (inner_list.size())681{682m_nodes[0].m_inner_node = create_node(n, pUnique_pats, inner_list);683if (m_nodes[0].m_inner_node < 0)684return false;685}686687if (outer_list.size())688{689m_nodes[0].m_outer_node = create_node(n, pUnique_pats, outer_list);690if (m_nodes[0].m_outer_node < 0)691return false;692}693694return true;695}696697struct result698{699uint32_t m_pat_index;700uint32_t m_mapping_index;701float m_dist;702703bool operator< (const result& rhs) const { return m_dist < rhs.m_dist; }704bool operator> (const result& rhs) const { return m_dist > rhs.m_dist; }705};706707class result_queue708{709enum { MaxSupportedSize = 256 + 1 };710711public:712result_queue() :713m_cur_size(0)714{715}716717size_t get_size() const718{719return m_cur_size;720}721722bool empty() const723{724return !m_cur_size;725}726727typedef std::array<result, MaxSupportedSize + 1> result_array_type;728729const result_array_type& get_elements() const { return m_elements; }730result_array_type& get_elements() { return 
m_elements; }731732void clear()733{734m_cur_size = 0;735}736737void reserve(uint32_t n)738{739BASISU_NOTE_UNUSED(n);740}741742const result& top() const743{744assert(m_cur_size);745return m_elements[1];746}747748bool insert(const result& val, uint32_t max_size)749{750assert(max_size < MaxSupportedSize);751752if (m_cur_size >= MaxSupportedSize)753return false;754755m_elements[++m_cur_size] = val;756up_heap(m_cur_size);757758if (m_cur_size > max_size)759pop();760761return true;762}763764bool pop()765{766if (m_cur_size == 0)767return false;768769m_elements[1] = m_elements[m_cur_size--];770down_heap(1);771return true;772}773774float get_highest_dist() const775{776if (!m_cur_size)777return 0.0f;778779return top().m_dist;780}781782private:783result_array_type m_elements;784size_t m_cur_size;785786void up_heap(size_t index)787{788while ((index > 1) && (m_elements[index] > m_elements[index >> 1]))789{790std::swap(m_elements[index], m_elements[index >> 1]);791index >>= 1;792}793}794795void down_heap(size_t index)796{797for ( ; ; )798{799size_t largest = index, left_child = 2 * index, right_child = 2 * index + 1;800801if ((left_child <= m_cur_size) && (m_elements[left_child] > m_elements[largest]))802largest = left_child;803804if ((right_child <= m_cur_size) && (m_elements[right_child] > m_elements[largest]))805largest = right_child;806807if (largest == index)808break;809810std::swap(m_elements[index], m_elements[largest]);811index = largest;812}813}814};815816void find_nearest(uint32_t num_subsets, const partition_pattern_vec& desired_pat, result_queue& results, uint32_t max_results)817{818assert((num_subsets >= 2) && (num_subsets <= 3));819820results.clear();821822if (!m_nodes.size())823return;824825uint32_t num_desired_pats;826partition_pattern_vec desired_pats[NUM_PART3_MAPPINGS];827828if (num_subsets == 2)829{830num_desired_pats = 2;831for (uint32_t i = 0; i < 2; i++)832desired_pats[i] = desired_pat.get_permuted2(i);833}834else835{836num_desired_pats = 
NUM_PART3_MAPPINGS;837for (uint32_t i = 0; i < NUM_PART3_MAPPINGS; i++)838desired_pats[i] = desired_pat.get_permuted3(i);839}840841#if 0842find_nearest_at_node(0, num_desired_pats, desired_pats, results, max_results);843#else844find_nearest_at_node_non_recursive(0, num_desired_pats, desired_pats, results, max_results);845#endif846}847848private:849basisu::vector<vp_tree_node> m_nodes;850851void find_nearest_at_node(int node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results)852{853float best_dist_to_vantage = BIG_FLOAT_VAL;854uint32_t best_mapping = 0;855for (uint32_t i = 0; i < num_desired_pats; i++)856{857float dist = pDesired_pats[i].get_distance(m_nodes[node_index].m_vantage_point);858if (dist < best_dist_to_vantage)859{860best_dist_to_vantage = dist;861best_mapping = i;862}863}864865result r;866r.m_dist = best_dist_to_vantage;867r.m_mapping_index = best_mapping;868r.m_pat_index = m_nodes[node_index].m_point_index;869870results.insert(r, max_results);871872if (best_dist_to_vantage <= m_nodes[node_index].m_dist)873{874// inner first875if (m_nodes[node_index].m_inner_node >= 0)876find_nearest_at_node(m_nodes[node_index].m_inner_node, num_desired_pats, pDesired_pats, results, max_results);877878if (m_nodes[node_index].m_outer_node >= 0)879{880if ( (results.get_size() < max_results) ||881((m_nodes[node_index].m_dist - best_dist_to_vantage) <= results.get_highest_dist())882)883{884find_nearest_at_node(m_nodes[node_index].m_outer_node, num_desired_pats, pDesired_pats, results, max_results);885}886}887}888else889{890// outer first891if (m_nodes[node_index].m_outer_node >= 0)892find_nearest_at_node(m_nodes[node_index].m_outer_node, num_desired_pats, pDesired_pats, results, max_results);893894if (m_nodes[node_index].m_inner_node >= 0)895{896if ( (results.get_size() < max_results) ||897((best_dist_to_vantage - m_nodes[node_index].m_dist) <= 
results.get_highest_dist())898)899{900find_nearest_at_node(m_nodes[node_index].m_inner_node, num_desired_pats, pDesired_pats, results, max_results);901}902}903}904}905906void find_nearest_at_node_non_recursive(int init_node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results)907{908uint_vec node_stack;909node_stack.reserve(16);910node_stack.push_back(init_node_index);911912do913{914const uint32_t node_index = node_stack.back();915node_stack.pop_back();916917float best_dist_to_vantage = BIG_FLOAT_VAL;918uint32_t best_mapping = 0;919for (uint32_t i = 0; i < num_desired_pats; i++)920{921float dist = pDesired_pats[i].get_distance(m_nodes[node_index].m_vantage_point);922if (dist < best_dist_to_vantage)923{924best_dist_to_vantage = dist;925best_mapping = i;926}927}928929result r;930r.m_dist = best_dist_to_vantage;931r.m_mapping_index = best_mapping;932r.m_pat_index = m_nodes[node_index].m_point_index;933934results.insert(r, max_results);935936if (best_dist_to_vantage <= m_nodes[node_index].m_dist)937{938if (m_nodes[node_index].m_outer_node >= 0)939{940if ((results.get_size() < max_results) ||941((m_nodes[node_index].m_dist - best_dist_to_vantage) <= results.get_highest_dist())942)943{944node_stack.push_back(m_nodes[node_index].m_outer_node);945}946}947948// inner first949if (m_nodes[node_index].m_inner_node >= 0)950{951node_stack.push_back(m_nodes[node_index].m_inner_node);952}953}954else955{956if (m_nodes[node_index].m_inner_node >= 0)957{958if ((results.get_size() < max_results) ||959((best_dist_to_vantage - m_nodes[node_index].m_dist) <= results.get_highest_dist())960)961{962node_stack.push_back(m_nodes[node_index].m_inner_node);963}964}965966// outer first967if (m_nodes[node_index].m_outer_node >= 0)968{969node_stack.push_back(m_nodes[node_index].m_outer_node);970}971}972973} while (!node_stack.empty());974}975976// returns the index of the new node, or -1 on error977int create_node(uint32_t n, 
const partition_pattern_vec* pUnique_pats, const uint_vec& pat_indices)978{979std::pair<int, float> root_idx = find_best_vantage_point(n, pUnique_pats, pat_indices);980981if (root_idx.first < 0)982return -1;983984m_nodes.resize(m_nodes.size() + 1);985const uint32_t new_node_index = m_nodes.size_u32() - 1;986987m_nodes[new_node_index].m_vantage_point = pUnique_pats[root_idx.first];988m_nodes[new_node_index].m_point_index = root_idx.first;989m_nodes[new_node_index].m_dist = root_idx.second;990m_nodes[new_node_index].m_inner_node = -1;991m_nodes[new_node_index].m_outer_node = -1;992993uint_vec inner_list, outer_list;994995inner_list.reserve(pat_indices.size_u32() / 2);996outer_list.reserve(pat_indices.size_u32() / 2);997998for (uint32_t pat_indices_iter = 0; pat_indices_iter < pat_indices.size(); pat_indices_iter++)999{1000const uint32_t pat_index = pat_indices[pat_indices_iter];10011002if ((int)pat_index == root_idx.first)1003continue;10041005const float dist = m_nodes[new_node_index].m_vantage_point.get_distance(pUnique_pats[pat_index]);10061007if (dist <= root_idx.second)1008inner_list.push_back(pat_index);1009else1010outer_list.push_back(pat_index);1011}10121013if (inner_list.size())1014m_nodes[new_node_index].m_inner_node = create_node(n, pUnique_pats, inner_list);10151016if (outer_list.size())1017m_nodes[new_node_index].m_outer_node = create_node(n, pUnique_pats, outer_list);10181019return new_node_index;1020}10211022// returns the pattern index of the vantage point (-1 on error), and the optimal split distance1023std::pair<int, float> find_best_vantage_point(uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, const uint_vec &pat_indices)1024{1025BASISU_NOTE_UNUSED(num_unique_pats);10261027const uint32_t n = pat_indices.size_u32();10281029assert(n);1030if (n == 1)1031return std::pair(pat_indices[0], 0.0f);10321033float best_split_metric = -1.0f;1034int best_split_pat = -1;1035float best_split_dist = 0.0f;1036float best_split_var = 
0.0f;10371038basisu::vector< std::pair<float, uint32_t> > dists;1039dists.reserve(n);10401041float_vec float_dists;1042float_dists.reserve(n);10431044for (uint32_t pat_indices_iter = 0; pat_indices_iter < n; pat_indices_iter++)1045{1046const uint32_t split_pat_index = pat_indices[pat_indices_iter];1047assert(split_pat_index < num_unique_pats);10481049const partition_pattern_vec& trial_vantage = pUnique_pats[split_pat_index];10501051dists.resize(0);1052float_dists.resize(0);10531054for (uint32_t j = 0; j < n; j++)1055{1056const uint32_t pat_index = pat_indices[j];1057assert(pat_index < num_unique_pats);10581059if (pat_index == split_pat_index)1060continue;10611062float dist = trial_vantage.get_distance(pUnique_pats[pat_index]);1063dists.emplace_back(std::pair(dist, pat_index));10641065float_dists.push_back(dist);1066}10671068stats<double> s;1069s.calc(float_dists.size_u32(), float_dists.data());10701071std::sort(dists.begin(), dists.end(), [](const auto &a, const auto &b) {1072return a.first < b.first;1073});10741075const uint32_t num_dists = dists.size_u32();1076float split_dist = dists[num_dists / 2].first;1077if ((num_dists & 1) == 0)1078split_dist = (split_dist + dists[(num_dists / 2) - 1].first) * .5f;10791080uint32_t total_inner = 0, total_outer = 0;10811082for (uint32_t j = 0; j < n; j++)1083{1084const uint32_t pat_index = pat_indices[j];1085if (pat_index == split_pat_index)1086continue;10871088float dist = trial_vantage.get_distance(pUnique_pats[pat_index]);10891090if (dist <= split_dist)1091total_inner++;1092else1093total_outer++;1094}10951096float split_metric = (float)minimum(total_inner, total_outer) / (float)maximum(total_inner, total_outer);10971098if ( (split_metric > best_split_metric) ||1099((split_metric == best_split_metric) && (s.m_var > best_split_var)) )1100{1101best_split_metric = split_metric;1102best_split_dist = split_dist;1103best_split_pat = split_pat_index;1104best_split_var = (float)s.m_var;1105}1106}11071108return 
std::pair(best_split_pat, best_split_dist);1109}1110};11111112struct partition1113{1114uint64_t m_p;11151116inline partition() :1117m_p(0)1118{1119}11201121inline partition(uint64_t p) :1122m_p(p)1123{1124assert(p < (1ULL << 36));1125}11261127inline partition& operator=(uint64_t p)1128{1129assert(p < (1ULL << 36));1130m_p = p;1131return *this;1132}11331134inline bool operator< (const partition& p) const1135{1136return m_p < p.m_p;1137}11381139inline bool operator== (const partition& p) const1140{1141return m_p == p.m_p;1142}11431144inline operator size_t() const1145{1146return hash_hsieh((const uint8_t *)&m_p, sizeof(m_p));1147}1148};11491150partition_pattern_vec g_partitions2[NUM_UNIQUE_PARTITIONS2];1151int g_part2_seed_to_unique_index[1024];1152vp_tree g_part2_vp_tree;11531154static inline vec3F vec3F_norm_approx(vec3F axis)1155{1156float l = axis.norm();1157axis = (fabs(l) >= SMALL_FLOAT_VAL) ? (axis * bu_math::inv_sqrt(l)) : vec3F(0.577350269f);1158return axis;1159}11601161static void init_partitions2_6x6()1162{1163#if 01164// makes pattern bits to the 10-bit ASTC seed index1165typedef basisu::hash_map<uint64_t, uint32_t> partition2_hash_map;1166partition2_hash_map phash;1167phash.reserve(1024);11681169for (uint32_t i = 0; i < 1024; i++)1170{1171uint64_t p_bits = 0;1172uint64_t p_bits_inv = 0;11731174for (uint32_t y = 0; y < 6; y++)1175{1176for (uint32_t x = 0; x < 6; x++)1177{1178uint64_t p = astc_helpers::compute_texel_partition(i, x, y, 0, 2, false);1179assert(p < 2);11801181p_bits |= (p << (x + y * 6));1182p_bits_inv |= ((1 - p) << (x + y * 6));1183}1184}11851186if (!p_bits)1187continue;1188if (p_bits == ((1ULL << 36) - 1))1189continue;11901191assert(p_bits < (1ULL << 36));1192assert(p_bits_inv < (1ULL << 36));11931194if (phash.contains(p_bits))1195{1196}1197else if (phash.contains(p_bits_inv))1198{1199}1200else1201{1202auto res = phash.insert(p_bits, i);1203assert(res.second);1204BASISU_NOTE_UNUSED(res);1205}1206}12071208uint32_t num_unique_partitions2 = 
0;12091210for (const auto& r : phash)1211{1212assert(r.second < 1024);12131214const uint32_t unique_index = num_unique_partitions2;1215assert(unique_index < NUM_UNIQUE_PARTITIONS2);12161217partition_pattern_vec pat_vec;1218for (uint32_t i = 0; i < 36; i++)1219pat_vec[i] = (uint8_t)((r.first >> i) & 1);12201221g_partitions2[unique_index] = pat_vec;12221223assert(g_part2_unique_index_to_seed[unique_index] == r.second);1224g_part2_seed_to_unique_index[r.second] = unique_index;12251226num_unique_partitions2++;1227}1228assert(num_unique_partitions2 == NUM_UNIQUE_PARTITIONS2);1229#else1230for (uint32_t unique_index = 0; unique_index < NUM_UNIQUE_PARTITIONS2; unique_index++)1231{1232const uint32_t seed_index = g_part2_unique_index_to_seed[unique_index];1233assert(seed_index < 1024);12341235assert(g_part2_seed_to_unique_index[seed_index] == 0);1236g_part2_seed_to_unique_index[seed_index] = unique_index;12371238partition_pattern_vec& pat_vec = g_partitions2[unique_index];12391240for (uint32_t y = 0; y < 6; y++)1241{1242for (uint32_t x = 0; x < 6; x++)1243{1244uint8_t p = (uint8_t)astc_helpers::compute_texel_partition(seed_index, x, y, 0, 2, false);1245assert(p < 2);12461247pat_vec[x + y * 6] = p;1248}1249}1250}1251#endif12521253g_part2_vp_tree.init(NUM_UNIQUE_PARTITIONS2, g_partitions2);1254}12551256static bool estimate_partition2_6x6(1257const basist::half_float pBlock_pixels_half[][3],1258int* pBest_parts, uint32_t num_best_parts)1259{1260const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = BLOCK_W * BLOCK_H;12611262vec3F training_vecs[BLOCK_T], mean(0.0f);12631264for (uint32_t i = 0; i < BLOCK_T; i++)1265{1266vec3F& v = training_vecs[i];12671268v[0] = (float)pBlock_pixels_half[i][0];1269v[1] = (float)pBlock_pixels_half[i][1];1270v[2] = (float)pBlock_pixels_half[i][2];12711272mean += v;1273}1274mean *= (1.0f / (float)BLOCK_T);12751276vec3F max_vals(-BIG_FLOAT_VAL);12771278for (uint32_t i = 0; i < BLOCK_T; i++)1279{1280vec3F& v = training_vecs[i];1281max_vals = 
vec3F::component_max(max_vals, v);1282}12831284// Initialize principle axis approximation1285vec3F axis(max_vals - mean);12861287// Incremental approx. PCA - only viable if we have a reasonably fast approximation for 1.0/sqrt(x).1288for (uint32_t i = 0; i < BLOCK_T; i++)1289{1290axis = vec3F_norm_approx(axis);12911292vec3F color(training_vecs[i] - mean);12931294float d = color.dot(axis);12951296axis += color * d;1297}12981299if (axis.norm() < SMALL_FLOAT_VAL)1300axis.set(0.57735027f);1301else1302axis.normalize_in_place();13031304#if BRUTE_FORCE_PART_SEARCH1305int desired_parts[BLOCK_H][BLOCK_W]; // [y][x]1306for (uint32_t i = 0; i < BLOCK_T; i++)1307{1308float proj = (training_vecs[i] - mean).dot(axis);13091310desired_parts[i / BLOCK_W][i % BLOCK_W] = proj < 0.0f;1311}1312#else1313partition_pattern_vec desired_part;13141315for (uint32_t i = 0; i < BLOCK_T; i++)1316{1317float proj = (training_vecs[i] - mean).dot(axis);13181319desired_part.m_parts[i] = proj < 0.0f;1320}1321#endif13221323//interval_timer tm;1324//tm.start();13251326#if BRUTE_FORCE_PART_SEARCH1327uint32_t part_similarity[NUM_UNIQUE_PARTITIONS2];13281329for (uint32_t part_index = 0; part_index < NUM_UNIQUE_PARTITIONS2; part_index++)1330{1331const partition_pattern_vec &pat_vec = g_partitions2[part_index];13321333int total_sim_non_inv = 0;1334int total_sim_inv = 0;13351336for (uint32_t y = 0; y < BLOCK_H; y++)1337{1338for (uint32_t x = 0; x < BLOCK_W; x++)1339{1340int part = pat_vec[x + y * 6];13411342if (part == desired_parts[y][x])1343total_sim_non_inv++;13441345if ((part ^ 1) == desired_parts[y][x])1346total_sim_inv++;1347}1348}13491350int total_sim = maximum(total_sim_non_inv, total_sim_inv);13511352part_similarity[part_index] = (total_sim << 16) | part_index;13531354} // part_index;13551356std::sort(part_similarity, part_similarity + NUM_UNIQUE_PARTITIONS2);13571358for (uint32_t i = 0; i < num_best_parts; i++)1359pBest_parts[i] = part_similarity[(NUM_UNIQUE_PARTITIONS2 - 1) - i] & 
0xFFFF;1360#else1361vp_tree::result_queue results;1362results.reserve(num_best_parts);1363g_part2_vp_tree.find_nearest(2, desired_part, results, num_best_parts);13641365assert(results.get_size() == num_best_parts);13661367const auto& elements = results.get_elements();13681369for (uint32_t i = 0; i < results.get_size(); i++)1370pBest_parts[i] = elements[1 + i].m_pat_index;1371#endif13721373//fmt_printf("{} ", tm.get_elapsed_ms());13741375return true;1376}13771378const uint32_t MIN_REFINE_LEVEL = 0;13791380static bool encode_block_2_subsets(1381trial_result res[2],1382uint32_t grid_w, uint32_t grid_h,1383uint32_t cem,1384uint32_t weights_ise_range, uint32_t endpoints_ise_range,1385const half_vec3* pBlock_pixels_half, const vec4F* pBlock_pixels_q16,1386astc_hdr_codec_base_options& coptions,1387bool uber_mode_flag,1388int unique_pat_index,1389uint32_t comp_level,1390opt_mode_t mode11_opt_mode,1391bool refine_endpoints_flag)1392{1393const uint32_t num_endpoint_vals = (cem == 11) ? basist::NUM_MODE11_ENDPOINTS : basist::NUM_MODE7_ENDPOINTS;13941395res[0].m_valid = false;1396res[1].m_valid = false;13971398const uint32_t BLOCK_W = 6, BLOCK_H = 6;13991400astc_helpers::log_astc_block best_log_blk;1401clear_obj(best_log_blk);14021403best_log_blk.m_num_partitions = 2;1404best_log_blk.m_color_endpoint_modes[0] = (uint8_t)cem;1405best_log_blk.m_color_endpoint_modes[1] = (uint8_t)cem;1406best_log_blk.m_grid_width = (uint8_t)grid_w;1407best_log_blk.m_grid_height = (uint8_t)grid_h;14081409best_log_blk.m_weight_ise_range = (uint8_t)weights_ise_range;1410best_log_blk.m_endpoint_ise_range = (uint8_t)endpoints_ise_range;14111412partition_pattern_vec* pPat = &g_partitions2[unique_pat_index];1413const uint32_t p_seed = g_part2_unique_index_to_seed[unique_pat_index];14141415vec4F part_pixels_q16[2][64];1416half_vec3 part_half_pixels[2][64];1417uint8_t part_pixel_index[2][64];1418uint32_t part_total_pixels[2] = { 0 };14191420for (uint32_t y = 0; y < BLOCK_H; y++)1421{1422for (uint32_t x = 
0; x < BLOCK_W; x++)1423{1424uint32_t part_index = (*pPat)[x + y * BLOCK_W];14251426uint32_t l = part_total_pixels[part_index];14271428part_pixels_q16[part_index][l] = pBlock_pixels_q16[x + y * BLOCK_W];1429part_half_pixels[part_index][l] = pBlock_pixels_half[x + y * BLOCK_W];1430part_pixel_index[part_index][l] = (uint8_t)(x + y * BLOCK_W);14311432part_total_pixels[part_index] = l + 1;1433} // x1434} // y14351436uint8_t blk_endpoints[2][basist::NUM_MODE11_ENDPOINTS];1437uint8_t blk_weights[2][BLOCK_W * BLOCK_H];1438uint32_t best_submode[2];14391440for (uint32_t part_iter = 0; part_iter < 2; part_iter++)1441{1442assert(part_total_pixels[part_iter]);14431444double e;1445if (cem == 7)1446{1447e = encode_astc_hdr_block_mode_7(1448part_total_pixels[part_iter],1449(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],1450best_log_blk.m_weight_ise_range,1451best_submode[part_iter],1452BIG_FLOAT_VAL,1453blk_endpoints[part_iter],1454blk_weights[part_iter],1455coptions,1456best_log_blk.m_endpoint_ise_range);1457}1458else1459{1460assert(cem == 11);14611462e = encode_astc_hdr_block_mode_11(1463part_total_pixels[part_iter],1464(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],1465best_log_blk.m_weight_ise_range,1466best_submode[part_iter],1467BIG_FLOAT_VAL,1468blk_endpoints[part_iter],1469blk_weights[part_iter],1470coptions,1471false,1472best_log_blk.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false,1473mode11_opt_mode);1474}14751476if (e == BIG_FLOAT_VAL)1477return false;14781479} // part_iter14801481uint8_t ise_weights[BLOCK_W * BLOCK_H];14821483uint32_t src_pixel_index[2] = { 0, 0 };1484for (uint32_t y = 0; y < BLOCK_H; y++)1485{1486for (uint32_t x = 0; x < BLOCK_W; x++)1487{1488uint32_t part_index = (*pPat)[x + y * BLOCK_W];1489ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];1490src_pixel_index[part_index]++;1491} // x1492} // y14931494if ((grid_w == 
BLOCK_W) && (grid_h == BLOCK_H))1495{1496best_log_blk.m_partition_id = (uint16_t)p_seed;14971498memcpy(best_log_blk.m_endpoints, blk_endpoints[0], num_endpoint_vals);1499memcpy(best_log_blk.m_endpoints + num_endpoint_vals, blk_endpoints[1], num_endpoint_vals);1500memcpy(best_log_blk.m_weights, ise_weights, BLOCK_W * BLOCK_H);15011502res[0].m_valid = true;1503res[0].m_log_blk = best_log_blk;1504}1505else1506{1507uint8_t desired_weights[BLOCK_H * BLOCK_W];15081509const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_ISE_to_val;15101511for (uint32_t by = 0; by < BLOCK_H; by++)1512for (uint32_t bx = 0; bx < BLOCK_W; bx++)1513desired_weights[bx + by * BLOCK_W] = dequant_tab[ise_weights[bx + by * BLOCK_W]];15141515uint8_t downsampled_weights[BLOCK_H * BLOCK_W];15161517const float* pDownsample_matrix = get_6x6_downsample_matrix(grid_w, grid_h);1518if (!pDownsample_matrix)1519{1520assert(0);1521return false;1522}15231524downsample_weight_grid(1525pDownsample_matrix,1526BLOCK_W, BLOCK_H, // source/from dimension (block size)1527grid_w, grid_h, // dest/to dimension (grid size)1528desired_weights, // these are dequantized weights, NOT ISE symbols, [by][bx]1529downsampled_weights); // [wy][wx]15301531best_log_blk.m_partition_id = (uint16_t)p_seed;1532memcpy(best_log_blk.m_endpoints, blk_endpoints[0], num_endpoint_vals);1533memcpy(best_log_blk.m_endpoints + num_endpoint_vals, blk_endpoints[1], num_endpoint_vals);15341535const auto& weight_to_ise = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_val_to_ise;15361537for (uint32_t gy = 0; gy < grid_h; gy++)1538for (uint32_t gx = 0; gx < grid_w; gx++)1539best_log_blk.m_weights[gx + gy * grid_w] = weight_to_ise[downsampled_weights[gx + gy * grid_w]];15401541res[0].m_valid = true;1542res[0].m_log_blk = best_log_blk;15431544if ((refine_endpoints_flag) && (comp_level >= MIN_REFINE_LEVEL) && ((grid_w < 6) || (grid_h < 6)))1545{1546bool any_refined = false;15471548for (uint32_t 
part_iter = 0; part_iter < 2; part_iter++)1549{1550bool refine_status = refine_endpoints(1551cem,1552endpoints_ise_range,1553best_log_blk.m_endpoints + part_iter * num_endpoint_vals, // the endpoints to optimize1554BLOCK_W, BLOCK_H, // block dimensions1555grid_w, grid_h, best_log_blk.m_weights, weights_ise_range, // weight grid1556part_total_pixels[part_iter], (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],1557&part_pixel_index[part_iter][0], // maps this subset's pixels to block offsets1558coptions, mode11_opt_mode);15591560if (refine_status)1561any_refined = true;1562}15631564if (any_refined)1565{1566res[1].m_valid = true;1567res[1].m_log_blk = best_log_blk;1568}1569}1570}15711572return true;1573}15741575typedef basisu::hash_map<partition_pattern_vec, std::pair<uint32_t, uint32_t > > partition3_hash_map;15761577partition_pattern_vec g_partitions3[NUM_UNIQUE_PARTITIONS3];1578int g_part3_seed_to_unique_index[1024];1579vp_tree g_part3_vp_tree;15801581static void init_partitions3_6x6()1582{1583uint32_t t = 0;15841585for (uint32_t i = 0; i < 1024; i++)1586g_part3_seed_to_unique_index[i] = -1;15871588partition3_hash_map part3_hash;1589part3_hash.reserve(512);15901591for (uint32_t seed_index = 0; seed_index < 1024; seed_index++)1592{1593partition_pattern_vec p3;1594uint32_t part_hist[3] = { 0 };15951596for (uint32_t y = 0; y < 6; y++)1597{1598for (uint32_t x = 0; x < 6; x++)1599{1600uint64_t p = astc_helpers::compute_texel_partition(seed_index, x, y, 0, 3, false);1601assert(p < 3);16021603p3.m_parts[x + y * 6] = (uint8_t)p;1604part_hist[p]++;1605}1606}16071608if (!part_hist[0] || !part_hist[1] || !part_hist[2])1609continue;16101611uint32_t j;1612for (j = 0; j < NUM_PART3_MAPPINGS; j++)1613{1614partition_pattern_vec temp_part3(p3.get_permuted3(j));16151616if (part3_hash.contains(temp_part3))1617break;1618}1619if (j < NUM_PART3_MAPPINGS)1620continue;16211622part3_hash.insert(p3, std::make_pair(seed_index, t) 
);16231624assert(g_part3_unique_index_to_seed[t] == seed_index);1625g_part3_seed_to_unique_index[seed_index] = t;1626g_partitions3[t] = p3;16271628t++;1629}16301631g_part3_vp_tree.init(NUM_UNIQUE_PARTITIONS3, g_partitions3);1632}16331634static bool estimate_partition3_6x6(1635const basist::half_float pBlock_pixels_half[][3],1636int* pBest_parts, uint32_t num_best_parts)1637{1638const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = BLOCK_W * BLOCK_H, NUM_SUBSETS = 3;16391640assert(num_best_parts && (num_best_parts <= NUM_UNIQUE_PARTITIONS3));16411642vec3F training_vecs[BLOCK_T], mean(0.0f);16431644float brightest_inten = 0.0f, darkest_inten = BIG_FLOAT_VAL;1645vec3F cluster_centroids[NUM_SUBSETS];16461647for (uint32_t i = 0; i < BLOCK_T; i++)1648{1649vec3F& v = training_vecs[i];16501651v.set((float)pBlock_pixels_half[i][0], (float)pBlock_pixels_half[i][1], (float)pBlock_pixels_half[i][2]);16521653float inten = v.dot(vec3F(1.0f));1654if (inten < darkest_inten)1655{1656darkest_inten = inten;1657cluster_centroids[0] = v;1658}16591660if (inten > brightest_inten)1661{1662brightest_inten = inten;1663cluster_centroids[1] = v;1664}1665}16661667if (cluster_centroids[0] == cluster_centroids[1])1668return false;16691670float furthest_dist2 = 0.0f;1671for (uint32_t i = 0; i < BLOCK_T; i++)1672{1673vec3F& v = training_vecs[i];16741675float dist_a = v.squared_distance(cluster_centroids[0]);1676if (dist_a == 0.0f)1677continue;16781679float dist_b = v.squared_distance(cluster_centroids[1]);1680if (dist_b == 0.0f)1681continue;16821683float dist2 = dist_a + dist_b;1684if (dist2 > furthest_dist2)1685{1686furthest_dist2 = dist2;1687cluster_centroids[2] = v;1688}1689}16901691if ((cluster_centroids[0] == cluster_centroids[2]) || (cluster_centroids[1] == cluster_centroids[2]))1692return false;16931694uint32_t cluster_pixels[NUM_SUBSETS][BLOCK_T];1695uint32_t num_cluster_pixels[NUM_SUBSETS];1696vec3F new_cluster_means[NUM_SUBSETS];16971698const uint32_t NUM_ITERS = 4;16991700for (uint32_t s = 
0; s < NUM_ITERS; s++)1701{1702memset(num_cluster_pixels, 0, sizeof(num_cluster_pixels));1703memset(new_cluster_means, 0, sizeof(new_cluster_means));17041705for (uint32_t i = 0; i < BLOCK_T; i++)1706{1707float d[NUM_SUBSETS] = {1708training_vecs[i].squared_distance(cluster_centroids[0]),1709training_vecs[i].squared_distance(cluster_centroids[1]),1710training_vecs[i].squared_distance(cluster_centroids[2]) };17111712float min_d = d[0];1713uint32_t min_idx = 0;1714for (uint32_t j = 1; j < NUM_SUBSETS; j++)1715{1716if (d[j] < min_d)1717{1718min_d = d[j];1719min_idx = j;1720}1721}17221723cluster_pixels[min_idx][num_cluster_pixels[min_idx]] = i;1724new_cluster_means[min_idx] += training_vecs[i];1725num_cluster_pixels[min_idx]++;1726} // i17271728for (uint32_t j = 0; j < NUM_SUBSETS; j++)1729{1730if (!num_cluster_pixels[j])1731return false;17321733cluster_centroids[j] = new_cluster_means[j] / (float)num_cluster_pixels[j];1734}1735} // s17361737partition_pattern_vec desired_part;1738for (uint32_t p = 0; p < NUM_SUBSETS; p++)1739{1740for (uint32_t i = 0; i < num_cluster_pixels[p]; i++)1741{1742const uint32_t pix_index = cluster_pixels[p][i];1743desired_part[pix_index] = (uint8_t)p;1744}1745}17461747#if BRUTE_FORCE_PART_SEARCH1748partition_pattern_vec desired_parts[NUM_PART3_MAPPINGS];1749for (uint32_t j = 0; j < NUM_PART3_MAPPINGS; j++)1750desired_parts[j] = desired_part.get_permuted3(j);17511752uint32_t part_similarity[NUM_UNIQUE_PARTITIONS3];17531754for (uint32_t part_index = 0; part_index < NUM_UNIQUE_PARTITIONS3; part_index++)1755{1756const partition_pattern_vec& pat = g_partitions3[part_index];17571758uint32_t lowest_pat_dist = UINT32_MAX;1759for (uint32_t p = 0; p < NUM_PART3_MAPPINGS; p++)1760{1761uint32_t dist = pat.get_squared_distance(desired_parts[p]);1762if (dist < lowest_pat_dist)1763lowest_pat_dist = dist;1764}17651766part_similarity[part_index] = (lowest_pat_dist << 16) | part_index;17671768} // part_index;17691770std::sort(part_similarity, part_similarity + 
NUM_UNIQUE_PARTITIONS3);17711772for (uint32_t i = 0; i < num_best_parts; i++)1773pBest_parts[i] = part_similarity[i] & 0xFFFF;1774#else1775vp_tree::result_queue results;1776results.reserve(num_best_parts);1777g_part3_vp_tree.find_nearest(3, desired_part, results, num_best_parts);17781779assert(results.get_size() == num_best_parts);17801781const auto& elements = results.get_elements();17821783for (uint32_t i = 0; i < results.get_size(); i++)1784pBest_parts[i] = elements[1 + i].m_pat_index;1785#endif17861787return true;1788}17891790static bool encode_block_3_subsets(1791trial_result& res,1792uint32_t cem,1793uint32_t grid_w, uint32_t grid_h,1794uint32_t weights_ise_range, uint32_t endpoints_ise_range,1795const half_vec3* pBlock_pixels_half, const vec4F* pBlock_pixels_q16,1796astc_hdr_codec_base_options& coptions,1797bool uber_mode_flag,1798const int* pEst_patterns, int num_est_patterns,1799uint32_t comp_level,1800opt_mode_t mode11_opt_mode)1801{1802BASISU_NOTE_UNUSED(uber_mode_flag);1803const uint32_t BLOCK_W = 6, BLOCK_H = 6, NUM_SUBSETS = 3;1804const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem);18051806res.m_valid = false;18071808double best_e = BIG_FLOAT_VAL;18091810astc_helpers::log_astc_block best_log_blk;1811clear_obj(best_log_blk);18121813best_log_blk.m_num_partitions = NUM_SUBSETS;1814best_log_blk.m_color_endpoint_modes[0] = (uint8_t)cem;1815best_log_blk.m_color_endpoint_modes[1] = (uint8_t)cem;1816best_log_blk.m_color_endpoint_modes[2] = (uint8_t)cem;1817best_log_blk.m_grid_width = (uint8_t)grid_w;1818best_log_blk.m_grid_height = (uint8_t)grid_h;18191820best_log_blk.m_weight_ise_range = (uint8_t)weights_ise_range;1821best_log_blk.m_endpoint_ise_range = (uint8_t)endpoints_ise_range;18221823const uint32_t n = num_est_patterns ? num_est_patterns : NUM_UNIQUE_PARTITIONS3;18241825for (uint32_t unique_p_iter = 0; unique_p_iter < n; unique_p_iter++)1826{1827const uint32_t unique_part_index = num_est_patterns ? 
pEst_patterns[unique_p_iter] : unique_p_iter;1828assert(unique_part_index < NUM_UNIQUE_PARTITIONS3);1829const partition_pattern_vec*pPart = &g_partitions3[unique_part_index];18301831vec4F part_pixels_q16[NUM_SUBSETS][64];1832half_vec3 part_half_pixels[NUM_SUBSETS][64];1833uint8_t part_pixel_index[NUM_SUBSETS][64];1834uint32_t part_total_pixels[NUM_SUBSETS] = { 0 };18351836for (uint32_t y = 0; y < BLOCK_H; y++)1837{1838for (uint32_t x = 0; x < BLOCK_W; x++)1839{1840const uint32_t part_index = pPart->m_parts[x + y * BLOCK_W];18411842uint32_t l = part_total_pixels[part_index];18431844part_pixels_q16[part_index][l] = pBlock_pixels_q16[x + y * BLOCK_W];1845part_half_pixels[part_index][l] = pBlock_pixels_half[x + y * BLOCK_W];1846part_pixel_index[part_index][l] = (uint8_t)(x + y * BLOCK_W);18471848part_total_pixels[part_index] = l + 1;1849} // x1850} // y18511852uint8_t blk_endpoints[NUM_SUBSETS][basist::NUM_MODE11_ENDPOINTS];1853uint8_t blk_weights[NUM_SUBSETS][BLOCK_W * BLOCK_H];1854uint32_t best_submode[NUM_SUBSETS];18551856double e = 0.0f;1857for (uint32_t part_iter = 0; part_iter < NUM_SUBSETS; part_iter++)1858{1859assert(part_total_pixels[part_iter]);18601861if (cem == 7)1862{1863e += encode_astc_hdr_block_mode_7(1864part_total_pixels[part_iter],1865(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],1866best_log_blk.m_weight_ise_range,1867best_submode[part_iter],1868BIG_FLOAT_VAL,1869blk_endpoints[part_iter],1870blk_weights[part_iter],1871coptions,1872best_log_blk.m_endpoint_ise_range);1873}1874else1875{1876assert(cem == 11);18771878e += encode_astc_hdr_block_mode_11(1879part_total_pixels[part_iter],1880(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],1881best_log_blk.m_weight_ise_range,1882best_submode[part_iter],1883BIG_FLOAT_VAL,1884blk_endpoints[part_iter],1885blk_weights[part_iter],1886coptions,1887false, best_log_blk.m_endpoint_ise_range, uber_mode_flag, 
false,1888FIRST_MODE11_SUBMODE_INDEX, MAX_MODE11_SUBMODE_INDEX, false, mode11_opt_mode);1889}18901891} // part_iter18921893uint8_t ise_weights[BLOCK_W * BLOCK_H];18941895uint32_t src_pixel_index[NUM_SUBSETS] = { 0 };1896for (uint32_t y = 0; y < BLOCK_H; y++)1897{1898for (uint32_t x = 0; x < BLOCK_W; x++)1899{1900const uint32_t part_index = pPart->m_parts[x + y * BLOCK_W];19011902ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];1903src_pixel_index[part_index]++;1904} // x1905} // y19061907if ((grid_w == BLOCK_W) && (grid_h == BLOCK_H))1908{1909if (e < best_e)1910{1911best_e = e;1912best_log_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_part_index];19131914for (uint32_t p = 0; p < NUM_SUBSETS; p++)1915memcpy(best_log_blk.m_endpoints + num_endpoint_vals * p, blk_endpoints[p], num_endpoint_vals);19161917memcpy(best_log_blk.m_weights, ise_weights, BLOCK_W * BLOCK_H);1918}1919}1920else1921{1922uint8_t desired_weights[BLOCK_H * BLOCK_W];19231924const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_ISE_to_val;19251926for (uint32_t by = 0; by < BLOCK_H; by++)1927for (uint32_t bx = 0; bx < BLOCK_W; bx++)1928desired_weights[bx + by * BLOCK_W] = dequant_tab[ise_weights[bx + by * BLOCK_W]];19291930uint8_t downsampled_weights[BLOCK_H * BLOCK_W];19311932const float* pDownsample_matrix = get_6x6_downsample_matrix(grid_w, grid_h);1933if (!pDownsample_matrix)1934{1935assert(0);1936return false;1937}19381939downsample_weight_grid(1940pDownsample_matrix,1941BLOCK_W, BLOCK_H, // source/from dimension (block size)1942grid_w, grid_h, // dest/to dimension (grid size)1943desired_weights, // these are dequantized weights, NOT ISE symbols, [by][bx]1944downsampled_weights); // [wy][wx]19451946astc_helpers::log_astc_block trial_blk(best_log_blk);19471948trial_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_part_index];19491950for (uint32_t p = 0; p < NUM_SUBSETS; 
p++)1951memcpy(trial_blk.m_endpoints + num_endpoint_vals * p, blk_endpoints[p], num_endpoint_vals);19521953const auto& weight_to_ise = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_val_to_ise;19541955for (uint32_t gy = 0; gy < grid_h; gy++)1956for (uint32_t gx = 0; gx < grid_w; gx++)1957trial_blk.m_weights[gx + gy * grid_w] = weight_to_ise[downsampled_weights[gx + gy * grid_w]];19581959if ((comp_level >= MIN_REFINE_LEVEL) && ((grid_w < 6) || (grid_h < 6)))1960{1961for (uint32_t part_iter = 0; part_iter < NUM_SUBSETS; part_iter++)1962{1963bool refine_status = refine_endpoints(1964cem,1965endpoints_ise_range,1966trial_blk.m_endpoints + part_iter * num_endpoint_vals, // the endpoints to optimize1967BLOCK_W, BLOCK_H, // block dimensions1968grid_w, grid_h, trial_blk.m_weights, weights_ise_range, // weight grid1969part_total_pixels[part_iter], (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],1970&part_pixel_index[part_iter][0], // maps this subset's pixels to block offsets1971coptions, mode11_opt_mode);19721973BASISU_NOTE_UNUSED(refine_status);1974}1975}19761977half_vec4 decoded_pixels_half4[BLOCK_H][BLOCK_W]; // [y][x]1978bool status = astc_helpers::decode_block(trial_blk, decoded_pixels_half4, BLOCK_W, BLOCK_H, astc_helpers::cDecodeModeHDR16);1979assert(status);1980if (!status)1981return false;19821983half_vec3 decoded_pixels_half3[BLOCK_H][BLOCK_W];1984for (uint32_t y = 0; y < BLOCK_H; y++)1985for (uint32_t x = 0; x < BLOCK_W; x++)1986decoded_pixels_half3[y][x].set(decoded_pixels_half4[y][x][0], decoded_pixels_half4[y][x][1], decoded_pixels_half4[y][x][2]);19871988double trial_err = compute_block_error(BLOCK_W * BLOCK_H, (const basist::half_float*)pBlock_pixels_half, (const basist::half_float*)decoded_pixels_half3, coptions);1989if (trial_err < best_e)1990{1991best_e = trial_err;1992best_log_blk = trial_blk;1993}1994}19951996} // unique_p_iter19971998if (best_e < BIG_FLOAT_VAL)1999{2000res.m_log_blk = 
best_log_blk;2001res.m_valid = true;2002res.m_err = best_e;2003}2004else2005{2006res.m_valid = false;2007}20082009return res.m_valid;2010}20112012static uint32_t encode_values(bitwise_coder &coder, uint32_t total_values, const uint8_t *pVals, uint32_t endpoint_range)2013{2014const uint32_t MAX_VALS = 64;2015uint32_t bit_values[MAX_VALS], tq_values[(MAX_VALS + 2) / 3];2016uint32_t total_tq_values = 0, tq_accum = 0, tq_mul = 1;20172018assert((total_values) && (total_values <= MAX_VALS));20192020const uint32_t ep_bits = astc_helpers::g_ise_range_table[endpoint_range][0];2021const uint32_t ep_trits = astc_helpers::g_ise_range_table[endpoint_range][1];2022const uint32_t ep_quints = astc_helpers::g_ise_range_table[endpoint_range][2];20232024for (uint32_t i = 0; i < total_values; i++)2025{2026uint32_t val = pVals[i];20272028uint32_t bits = val & ((1 << ep_bits) - 1);2029uint32_t tq = val >> ep_bits;20302031bit_values[i] = bits;20322033if (ep_trits)2034{2035assert(tq < 3);2036tq_accum += tq * tq_mul;2037tq_mul *= 3;2038if (tq_mul == 243)2039{2040assert(total_tq_values < BASISU_ARRAY_SIZE(tq_values));2041tq_values[total_tq_values++] = tq_accum;2042tq_accum = 0;2043tq_mul = 1;2044}2045}2046else if (ep_quints)2047{2048assert(tq < 5);2049tq_accum += tq * tq_mul;2050tq_mul *= 5;2051if (tq_mul == 125)2052{2053assert(total_tq_values < BASISU_ARRAY_SIZE(tq_values));2054tq_values[total_tq_values++] = tq_accum;2055tq_accum = 0;2056tq_mul = 1;2057}2058}2059}20602061uint32_t total_bits_output = 0;20622063for (uint32_t i = 0; i < total_tq_values; i++)2064{2065const uint32_t num_bits = ep_trits ? 
8 : 7;2066coder.put_bits(tq_values[i], num_bits);2067total_bits_output += num_bits;2068}20692070if (tq_mul > 1)2071{2072uint32_t num_bits;2073if (ep_trits)2074{2075if (tq_mul == 3)2076num_bits = 2;2077else if (tq_mul == 9)2078num_bits = 4;2079else if (tq_mul == 27)2080num_bits = 5;2081else //if (tq_mul == 81)2082num_bits = 7;2083}2084else2085{2086if (tq_mul == 5)2087num_bits = 3;2088else //if (tq_mul == 25)2089num_bits = 5;2090}2091coder.put_bits(tq_accum, num_bits);2092total_bits_output += num_bits;2093}20942095for (uint32_t i = 0; i < total_values; i++)2096{2097coder.put_bits(bit_values[i], ep_bits);2098total_bits_output += ep_bits;2099}21002101return total_bits_output;2102}21032104static inline uint32_t get_num_endpoint_vals(uint32_t cem)2105{2106assert((cem == 7) || (cem == 11));2107return (cem == 11) ? basist::NUM_MODE11_ENDPOINTS : basist::NUM_MODE7_ENDPOINTS;2108}21092110static void code_block(bitwise_coder& coder,2111const astc_helpers::log_astc_block& log_blk,2112block_mode block_mode_index,2113endpoint_mode em, const uint8_t *pEP_deltas)2114{2115coder.put_truncated_binary((uint32_t)block_mode_index, (uint32_t)block_mode::cBMTotalModes);2116coder.put_truncated_binary((uint32_t)em, (uint32_t)endpoint_mode::cTotal);21172118const uint32_t num_endpoint_vals = get_num_endpoint_vals(log_blk.m_color_endpoint_modes[0]);21192120if ((em == endpoint_mode::cUseLeftDelta) || (em == endpoint_mode::cUseUpperDelta))2121{2122assert(log_blk.m_num_partitions == 1);21232124for (uint32_t i = 0; i < num_endpoint_vals; i++)2125coder.put_bits(pEP_deltas[i], NUM_ENDPOINT_DELTA_BITS);2126}2127else if (em == endpoint_mode::cRaw)2128{2129if (log_blk.m_num_partitions == 2)2130{2131const int unique_partition_index = g_part2_seed_to_unique_index[log_blk.m_partition_id];2132assert(unique_partition_index != -1);21332134coder.put_truncated_binary(unique_partition_index, NUM_UNIQUE_PARTITIONS2);2135}2136else if (log_blk.m_num_partitions == 3)2137{2138const int unique_partition_index = 
// Tuning parameters for the smooth-region MSE scale map.
// Each of the three region sizes has a standard deviation threshold and a maximum MSE scale.
struct smooth_map_params
{
	bool m_no_mse_scaling;

	// 3x3 region
	float m_max_smooth_std_dev;
	float m_smooth_max_mse_scale;

	// 7x7 region
	float m_max_med_smooth_std_dev;
	float m_med_smooth_max_mse_scale;

	// 11x11 region
	float m_max_ultra_smooth_std_dev;
	float m_ultra_smooth_max_mse_scale;

	bool m_debug_images;

	// Constructs the parameters with their default values.
	// Values previously tried for m_ultra_smooth_max_mse_scale:
	// 4500, 10000, 50000, 100000, 400000, 800000.
	smooth_map_params() :
		m_no_mse_scaling(false),
		m_max_smooth_std_dev(100.0f),
		m_smooth_max_mse_scale(13000.0f),
		m_max_med_smooth_std_dev(9.0f),
		m_med_smooth_max_mse_scale(15000.0f),
		m_max_ultra_smooth_std_dev(4.0f),
		m_ultra_smooth_max_mse_scale(2000000.0f),
		m_debug_images(true)
	{
	}

	// Restores every field to its default value.
	void clear()
	{
		*this = smooth_map_params();
	}
};
(float)dst_width, 0.0f);2203}22042205#if 02206static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec3F* pSrc_block, half_vec3 *pDst_block_half3, vec4F *pDst_block_q16)2207{2208vec3F temp_block[6][6]; // [y][x]22092210// first filter rows to temp_block2211if (grid_x == 6)2212{2213memcpy(temp_block, pSrc_block, sizeof(vec3F) * 6 * 6);2214}2215else2216{2217Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x];22182219for (uint32_t y = 0; y < 6; y++)2220{2221for (uint32_t x = 0; x < 6; x++)2222{2223vec3F p(0.0f);22242225for (uint32_t i = 0; i < pRow_lists[x].n; i++)2226p += pSrc_block[y * 6 + pRow_lists[x].p[i].pixel] * pRow_lists[x].p[i].weight;22272228p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);22292230temp_block[y][x] = p;2231} // x2232} // y2233}22342235// filter columns2236if (grid_y == 6)2237{2238for (uint32_t y = 0; y < 6; y++)2239{2240for (uint32_t x = 0; x < 6; x++)2241{2242for (uint32_t c = 0; c < 3; c++)2243{2244const basist::half_float h = basist::float_to_half(temp_block[y][x][c]);22452246pDst_block_half3[x + y * 6][c] = h;2247pDst_block_q16[x + y * 6][c] = (float)half_to_qlog16(h);2248}22492250pDst_block_q16[x + y * 6][3] = 0.0f;2251} // x2252} // y2253}2254else2255{2256Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y];22572258for (uint32_t x = 0; x < 6; x++)2259{2260for (uint32_t y = 0; y < 6; y++)2261{2262vec3F p(0.0f);22632264for (uint32_t i = 0; i < pCol_lists[y].n; i++)2265p += temp_block[pCol_lists[y].p[i].pixel][x] * pCol_lists[y].p[i].weight;22662267p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);22682269for (uint32_t c = 0; c < 3; c++)2270{2271const basist::half_float h = basist::float_to_half(p[c]);22722273pDst_block_half3[x + y * 6][c] = h;2274pDst_block_q16[x + y * 6][c] = (float)half_to_qlog16(h);2275}22762277pDst_block_q16[x + y * 6][3] = 0.0f;22782279} // x2280} // y2281}2282}2283#endif22842285static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec4F* pSrc_block, vec4F* pDst_block)2286{2287vec4F 
temp_block[6][6]; // [y][x]22882289// first filter rows to temp_block2290if (grid_x == 6)2291{2292memcpy(temp_block, pSrc_block, sizeof(vec4F) * 6 * 6);2293}2294else2295{2296Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x];22972298for (uint32_t y = 0; y < 6; y++)2299{2300for (uint32_t x = 0; x < 6; x++)2301{2302vec3F p(0.0f);23032304for (uint32_t i = 0; i < pRow_lists[x].n; i++)2305p += vec3F(pSrc_block[y * 6 + pRow_lists[x].p[i].pixel]) * pRow_lists[x].p[i].weight;23062307p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);23082309temp_block[y][x] = p;2310} // x2311} // y2312}23132314// filter columns2315if (grid_y == 6)2316{2317for (uint32_t y = 0; y < 6; y++)2318{2319for (uint32_t x = 0; x < 6; x++)2320{2321for (uint32_t c = 0; c < 3; c++)2322pDst_block[x + y * 6][c] = temp_block[y][x][c];2323} // x2324} // y2325}2326else2327{2328Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y];23292330for (uint32_t x = 0; x < 6; x++)2331{2332for (uint32_t y = 0; y < 6; y++)2333{2334vec3F p(0.0f);23352336for (uint32_t i = 0; i < pCol_lists[y].n; i++)2337p += temp_block[pCol_lists[y].p[i].pixel][x] * pCol_lists[y].p[i].weight;23382339p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);23402341pDst_block[x + y * 6] = p;23422343} // x2344} // y2345}2346}23472348static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec3F* pSrc_block, vec3F* pDst_block)2349{2350vec3F temp_block[6][6]; // [y][x]23512352// first filter rows to temp_block2353if (grid_x == 6)2354{2355memcpy(temp_block, pSrc_block, sizeof(vec3F) * 6 * 6);2356}2357else2358{2359Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x];23602361for (uint32_t y = 0; y < 6; y++)2362{2363for (uint32_t x = 0; x < 6; x++)2364{2365vec3F p(0.0f);23662367for (uint32_t i = 0; i < pRow_lists[x].n; i++)2368p += vec3F(pSrc_block[y * 6 + pRow_lists[x].p[i].pixel]) * pRow_lists[x].p[i].weight;23692370temp_block[y][x] = p;2371} // x2372} // y2373}23742375// filter columns2376if (grid_y == 6)2377{2378memcpy((void 
*)pDst_block, temp_block, sizeof(vec3F) * 6 * 6);2379}2380else2381{2382Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y];23832384for (uint32_t x = 0; x < 6; x++)2385{2386for (uint32_t y = 0; y < 6; y++)2387{2388vec3F& p = pDst_block[x + y * 6];2389p.set(0.0f);23902391for (uint32_t i = 0; i < pCol_lists[y].n; i++)2392p += temp_block[pCol_lists[y].p[i].pixel][x] * pCol_lists[y].p[i].weight;2393} // x2394} // y2395}2396}23972398static float diff_blocks(const vec4F* pA, const vec4F* pB)2399{2400const uint32_t BLOCK_T = 36;24012402float diff = 0.0f;2403for (uint32_t i = 0; i < BLOCK_T; i++)2404diff += square(pA[i][0] - pB[i][0]) + square(pA[i][1] - pB[i][1]) + square(pA[i][2] - pB[i][2]);24052406return diff * (1.0f / (float)BLOCK_T);2407}24082409static float sub_and_compute_std_dev(const vec3F* pA, const vec3F* pB)2410{2411const uint32_t BLOCK_T = 36;24122413vec3F mean(0.0f);24142415for (uint32_t i = 0; i < BLOCK_T; i++)2416{2417vec3F diff(pA[i] - pB[i]);2418mean += diff;2419}24202421mean *= (1.0f / (float)BLOCK_T);24222423vec3F diff_sum(0.0f);2424for (uint32_t i = 0; i < BLOCK_T; i++)2425{2426vec3F diff(pA[i] - pB[i]);2427diff -= mean;2428diff_sum += vec3F::component_mul(diff, diff);2429}24302431vec3F var(diff_sum * (1.0f / (float)BLOCK_T));24322433vec3F std_dev(sqrtf(var[0]), sqrtf(var[1]), sqrtf(var[2]));24342435return maximum(std_dev[0], std_dev[1], std_dev[2]);2436}24372438static void create_smooth_maps2(2439vector2D<float>& smooth_block_mse_scales,2440const image& orig_img,2441smooth_map_params& params, image* pUltra_smooth_img = nullptr)2442{2443const uint32_t width = orig_img.get_width();2444const uint32_t height = orig_img.get_height();2445//const uint32_t total_pixels = orig_img.get_total_pixels();2446const uint32_t num_comps = 3;24472448if (params.m_no_mse_scaling)2449{2450smooth_block_mse_scales.set_all(1.0f);2451return;2452}24532454smooth_block_mse_scales.resize(width, height);24552456image smooth_vis, med_smooth_vis, ultra_smooth_vis;24572458if 
(params.m_debug_images)2459{2460smooth_vis.resize(width, height);2461med_smooth_vis.resize(width, height);2462ultra_smooth_vis.resize(width, height);2463}24642465for (uint32_t y = 0; y < height; y++)2466{2467for (uint32_t x = 0; x < width; x++)2468{2469{2470tracked_stat_dbl comp_stats[4];2471for (int yd = -1; yd <= 1; yd++)2472{2473for (int xd = -1; xd <= 1; xd++)2474{2475const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd);24762477comp_stats[0].update((float)p[0]);2478comp_stats[1].update((float)p[1]);2479comp_stats[2].update((float)p[2]);2480}2481}24822483float max_std_dev = 0.0f;2484for (uint32_t i = 0; i < num_comps; i++)2485max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev());24862487float yl = clampf(max_std_dev / params.m_max_smooth_std_dev, 0.0f, 1.0f);2488//yl = powf(yl, 2.0f);2489yl = powf(yl, 1.0f / 2.0f); // substantially less bits24902491smooth_block_mse_scales(x, y) = lerp(params.m_smooth_max_mse_scale, 1.0f, yl);24922493if (params.m_debug_images)2494{2495//smooth_vis(x, y).set(clamp((int)((smooth_block_mse_scales(x, y) - 1.0f) / (params.m_smooth_max_mse_scale - 1.0f) * 255.0f + .5f), 0, 255));2496// white=high local activity (edges/detail)2497// black=low local activity (smooth - error is amplified)2498smooth_vis(x, y).set(clamp((int)((yl * 255.0f) + .5f), 0, 255));2499}2500}25012502{2503tracked_stat_dbl comp_stats[4];25042505const int S = 3;2506for (int yd = -S; yd < S; yd++)2507{2508for (int xd = -S; xd < S; xd++)2509{2510const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd);25112512comp_stats[0].update((float)p[0]);2513comp_stats[1].update((float)p[1]);2514comp_stats[2].update((float)p[2]);2515}2516}25172518float max_std_dev = 0.0f;2519for (uint32_t i = 0; i < num_comps; i++)2520max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev());25212522float yl = clampf(max_std_dev / params.m_max_med_smooth_std_dev, 0.0f, 1.0f);2523//yl = powf(yl, 
2.0f);25242525smooth_block_mse_scales(x, y) = lerp(params.m_med_smooth_max_mse_scale, smooth_block_mse_scales(x, y), yl);25262527if (params.m_debug_images)2528med_smooth_vis(x, y).set((int)std::round(yl * 255.0f));2529}25302531{2532tracked_stat_dbl comp_stats[4];25332534const int S = 5;2535for (int yd = -S; yd < S; yd++)2536{2537for (int xd = -S; xd < S; xd++)2538{2539const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd);25402541comp_stats[0].update((float)p[0]);2542comp_stats[1].update((float)p[1]);2543comp_stats[2].update((float)p[2]);2544}2545}25462547float max_std_dev = 0.0f;2548for (uint32_t i = 0; i < num_comps; i++)2549max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev());25502551float yl = clampf(max_std_dev / params.m_max_ultra_smooth_std_dev, 0.0f, 1.0f);2552yl = powf(yl, 2.0f);25532554smooth_block_mse_scales(x, y) = lerp(params.m_ultra_smooth_max_mse_scale, smooth_block_mse_scales(x, y), yl);25552556if (params.m_debug_images)2557ultra_smooth_vis(x, y).set((int)std::round(yl * 255.0f));2558}25592560}2561}25622563if (params.m_debug_images)2564{2565save_png("dbg_smooth_vis.png", smooth_vis);2566save_png("dbg_med_smooth_vis.png", med_smooth_vis);2567save_png("dbg_ultra_smooth_vis.png", ultra_smooth_vis);25682569image vis_img(width, height);25702571float max_scale = 0.0f;2572for (uint32_t y = 0; y < height; y++)2573for (uint32_t x = 0; x < width; x++)2574max_scale = basisu::maximumf(max_scale, smooth_block_mse_scales(x, y));25752576for (uint32_t y = 0; y < height; y++)2577for (uint32_t x = 0; x < width; x++)2578vis_img(x, y).set((int)std::round(smooth_block_mse_scales(x, y) * 255.0f / max_scale));25792580save_png("scale_vis.png", vis_img);2581}25822583if (pUltra_smooth_img)2584*pUltra_smooth_img = ultra_smooth_vis;2585}25862587const float REALLY_DARK_I_THRESHOLD = 0.0625f;2588const float REALLY_DARK_MSE_ERR_SCALE = 128.0f;2589const float REALLY_DARK_DELTA_ITP_JND_SCALE = 5.0f;25902591static float 
compute_pixel_mse_itp(const vec3F& orig_pixel_itp, const vec3F& comp_pixel_itp, bool delta_itp_dark_adjustment)2592{2593float delta_i = orig_pixel_itp[0] - comp_pixel_itp[0];2594float delta_t = orig_pixel_itp[1] - comp_pixel_itp[1];2595float delta_p = orig_pixel_itp[2] - comp_pixel_itp[2];25962597float err = (delta_i * delta_i) + (delta_t * delta_t) + (delta_p * delta_p);25982599if (delta_itp_dark_adjustment)2600{2601// We have to process a large range of inputs, including extremely dark inputs.2602// Artifically amplify MSE on very dark pixels - otherwise they'll be overly compressed at higher lambdas.2603// This is to better handle very dark signals which could be explictly overexposed.2604float s = bu_math::smoothstep(0.0f, REALLY_DARK_I_THRESHOLD, orig_pixel_itp[0]);2605s = lerp(REALLY_DARK_MSE_ERR_SCALE, 1.0f, s);2606err *= s;2607}26082609return err;2610}26112612static float compute_block_mse_itp(uint32_t block_w, uint32_t block_h, const vec3F* pOrig_pixels_itp, const vec3F* pComp_pixels_itp, bool delta_itp_dark_adjustment)2613{2614float total_mse = 0.0f;26152616for (uint32_t y = 0; y < block_h; y++)2617{2618for (uint32_t x = 0; x < block_w; x++)2619{2620total_mse += compute_pixel_mse_itp(pOrig_pixels_itp[x + y * block_w], pComp_pixels_itp[x + y * block_w], delta_itp_dark_adjustment);2621} // x2622} // y26232624return total_mse * (1.0f / (float)(block_w * block_h));2625}26262627static float compute_block_ssim_itp(uint32_t block_w, uint32_t block_h, const vec3F* pOrig_pixels_itp, const vec3F* pComp_pixels_itp)2628{2629const uint32_t n = block_w * block_h;2630assert(n <= 36);26312632stats<float> x_stats[3], y_stats[3];2633comparative_stats<float> xy_cov[3];26342635for (uint32_t c = 0; c < 3; c++)2636{2637x_stats[c].calc_simplified(n, &pOrig_pixels_itp[0][c], 3);2638y_stats[c].calc_simplified(n, &pComp_pixels_itp[0][c], 3);2639}26402641for (uint32_t c = 0; c < 3; c++)2642xy_cov[c].calc_cov(n, &pOrig_pixels_itp[0][c], &pComp_pixels_itp[0][c], 3, 3, &x_stats[c], 
&y_stats[c]);26432644float ssim[3];2645const double d = 1.0f, k1 = .01f, k2 = .03f;26462647// weight mean error more highly to reduce blocking2648float ap = 1.5f, bp = 1.0f, cp = 1.0f;26492650const double s_c1 = square(k1 * d), s_c2 = square(k2 * d);2651const double s_c3(s_c2 * .5f);26522653for (uint32_t c = 0; c < 3; c++)2654{2655float lum = (float)((2.0f * x_stats[c].m_avg * y_stats[c].m_avg + s_c1) / (square(x_stats[c].m_avg) + square(y_stats[c].m_avg) + s_c1));2656lum = saturate(lum);26572658float con = (float)((2.0f * x_stats[c].m_std_dev * y_stats[c].m_std_dev + s_c2) / (x_stats[c].m_var + y_stats[c].m_var + s_c2));2659con = saturate(con);26602661float str = (float)((xy_cov[c].m_cov + s_c3) / (x_stats[c].m_std_dev * y_stats[c].m_std_dev + s_c3));2662str = saturate(str);26632664ssim[c] = powf(lum, ap) * powf(con, bp) * powf(str, cp);2665}26662667#if 02668float final_ssim = (ssim[0] * .4f + ssim[1] * .3f + ssim[2] * .3f);2669#elif 12670float final_ssim = ssim[0] * ssim[1] * ssim[2];2671#else2672const float LP = .75f;2673float final_ssim = ssim[0] * powf((ssim[1] + ssim[2]) * .5f, LP);2674#endif26752676return final_ssim;2677}26782679// delta ITP, 1.0 is JND (Rec. 
ITU-R BT.2124), modified for higher error at low light2680static float compute_pixel_delta_itp(const vec3F& a, const vec3F& b, const vec3F& orig, bool delta_itp_dark_adjustment)2681{2682float delta_i = a[0] - b[0];2683float delta_t = a[1] - b[1];2684float delta_p = a[2] - b[2];26852686float err = 720.0f * sqrtf((delta_i * delta_i) + (delta_t * delta_t) + (delta_p * delta_p));26872688float s = bu_math::smoothstep(0.0f, REALLY_DARK_I_THRESHOLD, orig[0]);26892690if (delta_itp_dark_adjustment)2691{2692// This is to better handle very dark signals which could be explictly overexposed.2693s = lerp(REALLY_DARK_DELTA_ITP_JND_SCALE, 1.0f, s);2694err *= s;2695}26962697return err;2698}26992700struct candidate_encoding2701{2702encoding_type m_encoding_type;27032704basist::half_float m_solid_color[3];27052706uint32_t m_run_len;27072708vec3F m_comp_pixels[MAX_BLOCK_H][MAX_BLOCK_W]; // [y][x]2709vec3F m_comp_pixels_itp[MAX_BLOCK_H][MAX_BLOCK_W]; // [y][x]27102711endpoint_mode m_endpoint_mode;2712block_mode m_block_mode;27132714bitwise_coder m_coder;27152716// The block to code, which may not be valid ASTC. 
This may have to be transcoded (by requantizing the weights/endpoints) before it's valid ASTC.2717// Note the endpoints may be coded endpoints OR transcoded endpoints, depending on the encoding type.2718astc_helpers::log_astc_block m_coded_log_blk;27192720// The block the decoder outputs.2721astc_helpers::log_astc_block m_decomp_log_blk;27222723int m_reuse_delta_index;27242725float m_t, m_d, m_bits;27262727candidate_encoding()2728{2729clear();2730}27312732candidate_encoding(const candidate_encoding &other)2733{2734*this = other;2735}27362737candidate_encoding(candidate_encoding&& other)2738{2739*this = std::move(other);2740}27412742candidate_encoding& operator=(const candidate_encoding& rhs)2743{2744if (this == &rhs)2745return *this;27462747m_encoding_type = rhs.m_encoding_type;2748memcpy(m_solid_color, rhs.m_solid_color, sizeof(m_solid_color));2749m_run_len = rhs.m_run_len;2750memcpy(m_comp_pixels, rhs.m_comp_pixels, sizeof(m_comp_pixels));2751m_endpoint_mode = rhs.m_endpoint_mode;2752m_block_mode = rhs.m_block_mode;2753m_coder = rhs.m_coder;2754m_coded_log_blk = rhs.m_coded_log_blk;2755m_decomp_log_blk = rhs.m_decomp_log_blk;2756m_reuse_delta_index = rhs.m_reuse_delta_index;27572758return *this;2759}27602761candidate_encoding& operator=(candidate_encoding&& rhs)2762{2763if (this == &rhs)2764return *this;27652766m_encoding_type = rhs.m_encoding_type;2767memcpy(m_solid_color, rhs.m_solid_color, sizeof(m_solid_color));2768m_run_len = rhs.m_run_len;2769memcpy(m_comp_pixels, rhs.m_comp_pixels, sizeof(m_comp_pixels));2770m_endpoint_mode = rhs.m_endpoint_mode;2771m_block_mode = rhs.m_block_mode;2772m_coder = std::move(rhs.m_coder);2773m_coded_log_blk = rhs.m_coded_log_blk;2774m_decomp_log_blk = rhs.m_decomp_log_blk;2775m_reuse_delta_index = rhs.m_reuse_delta_index;27762777return *this;2778}27792780void clear()2781{2782m_encoding_type = encoding_type::cInvalid;27832784clear_obj(m_solid_color);27852786m_run_len = 0;27872788clear_obj(m_comp_pixels);27892790m_endpoint_mode 
= endpoint_mode::cInvalid;2791m_block_mode = block_mode::cInvalid;27922793m_coder.restart();27942795m_coded_log_blk.clear();2796m_decomp_log_blk.clear();27972798m_t = 0;2799m_d = 0;2800m_bits = 0;28012802m_reuse_delta_index = 0;2803}2804};28052806bool decode_astc_block(uint32_t block_w, uint32_t block_h, astc_helpers::log_astc_block &log_blk, vec3F *pPixels)2807{2808assert((block_w <= 6) && (block_h <= 6));28092810half_vec4 decoded_pixels_half4[6 * 6]; // [y][x]2811bool status = astc_helpers::decode_block(log_blk, decoded_pixels_half4, block_w, block_h, astc_helpers::cDecodeModeHDR16);2812assert(status);28132814if (!status)2815return false;28162817for (uint32_t y = 0; y < block_h; y++)2818{2819for (uint32_t x = 0; x < block_w; x++)2820{2821pPixels[x + y * block_w].set(2822basist::half_to_float(decoded_pixels_half4[x + y * block_w][0]),2823basist::half_to_float(decoded_pixels_half4[x + y * block_w][1]),2824basist::half_to_float(decoded_pixels_half4[x + y * block_w][2]));2825} // x2826} //y28272828return true;2829}28302831static inline bool validate_log_blk(const astc_helpers::log_astc_block &decomp_blk)2832{2833astc_helpers::astc_block phys_blk;2834return astc_helpers::pack_astc_block(phys_blk, decomp_blk);2835}28362837#define SYNC_MARKERS (0)28382839static bool decode_file(const uint8_vec& comp_data, vector2D<astc_helpers::astc_block>& decoded_blocks, uint32_t &width, uint32_t &height)2840{2841interval_timer tm;2842tm.start();28432844const uint32_t BLOCK_W = 6, BLOCK_H = 6;28452846width = 0;2847height = 0;28482849if (comp_data.size() <= 2*3)2850return false;28512852basist::bitwise_decoder decoder;2853if (!decoder.init(comp_data.data(), comp_data.size_u32()))2854return false;28552856if (decoder.get_bits(16) != 0xABCD)2857return false;28582859width = decoder.get_bits(16);2860height = decoder.get_bits(16);28612862if (!width || !height || (width > MAX_ASTC_HDR_6X6_DIM) || (height > MAX_ASTC_HDR_6X6_DIM))2863return false;28642865const uint32_t num_blocks_x = (width + 
BLOCK_W - 1) / BLOCK_W;2866const uint32_t num_blocks_y = (height + BLOCK_H - 1) / BLOCK_H;2867const uint32_t total_blocks = num_blocks_x * num_blocks_y;28682869decoded_blocks.resize(num_blocks_x, num_blocks_y);2870//memset(decoded_blocks.get_ptr(), 0, decoded_blocks.size_in_bytes());28712872vector2D<astc_helpers::log_astc_block> decoded_log_blocks(num_blocks_x, num_blocks_y);2873//memset(decoded_log_blocks.get_ptr(), 0, decoded_log_blocks.size_in_bytes());28742875uint32_t cur_bx = 0, cur_by = 0;2876uint32_t step_counter = 0;2877BASISU_NOTE_UNUSED(step_counter);28782879while (cur_by < num_blocks_y)2880{2881step_counter++;28822883//if ((cur_bx == 9) && (cur_by == 13))2884// printf("!");28852886#if SYNC_MARKERS2887uint32_t mk = decoder.get_bits(16);2888if (mk != 0xDEAD)2889{2890printf("!");2891assert(0);2892return false;2893}2894#endif2895if (decoder.get_bits_remaining() < 1)2896return false;28972898encoding_type et = encoding_type::cBlock;28992900uint32_t b0 = decoder.get_bits(1);2901if (!b0)2902{2903uint32_t b1 = decoder.get_bits(1);2904if (b1)2905et = encoding_type::cReuse;2906else2907{2908uint32_t b2 = decoder.get_bits(1);2909if (b2)2910et = encoding_type::cSolid;2911else2912et = encoding_type::cRun;2913}2914}29152916switch (et)2917{2918case encoding_type::cRun:2919{2920if (!cur_bx && !cur_by)2921return false;29222923const uint32_t run_len = decoder.decode_vlc(5) + 1;29242925uint32_t num_blocks_remaining = total_blocks - (cur_bx + cur_by * num_blocks_x);2926if (run_len > num_blocks_remaining)2927return false;29282929uint32_t prev_bx = cur_bx, prev_by = cur_by;29302931if (cur_bx)2932prev_bx--;2933else2934{2935prev_bx = num_blocks_x - 1;2936prev_by--;2937}29382939const astc_helpers::log_astc_block& prev_log_blk = decoded_log_blocks(prev_bx, prev_by);2940const astc_helpers::astc_block& prev_phys_blk = decoded_blocks(prev_bx, prev_by);29412942for (uint32_t i = 0; i < run_len; i++)2943{2944decoded_log_blocks(cur_bx, cur_by) = prev_log_blk;2945decoded_blocks(cur_bx, 
cur_by) = prev_phys_blk;29462947cur_bx++;2948if (cur_bx == num_blocks_x)2949{2950cur_bx = 0;2951cur_by++;2952}2953}29542955break;2956}2957case encoding_type::cSolid:2958{2959const basist::half_float rh = (basist::half_float)decoder.get_bits(15);2960const basist::half_float gh = (basist::half_float)decoder.get_bits(15);2961const basist::half_float bh = (basist::half_float)decoder.get_bits(15);29622963astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);29642965log_blk.clear();2966log_blk.m_solid_color_flag_hdr = true;2967log_blk.m_solid_color[0] = rh;2968log_blk.m_solid_color[1] = gh;2969log_blk.m_solid_color[2] = bh;2970log_blk.m_solid_color[3] = basist::float_to_half(1.0f);29712972bool status = astc_helpers::pack_astc_block(decoded_blocks(cur_bx, cur_by), log_blk);2973if (!status)2974return false;29752976cur_bx++;2977if (cur_bx == num_blocks_x)2978{2979cur_bx = 0;2980cur_by++;2981}29822983break;2984}2985case encoding_type::cReuse:2986{2987if (!cur_bx && !cur_by)2988return false;29892990const uint32_t reuse_delta_index = decoder.get_bits(REUSE_XY_DELTA_BITS);29912992const int reuse_delta_x = g_reuse_xy_deltas[reuse_delta_index].m_x;2993const int reuse_delta_y = g_reuse_xy_deltas[reuse_delta_index].m_y;29942995const int prev_bx = cur_bx + reuse_delta_x, prev_by = cur_by + reuse_delta_y;2996if ((prev_bx < 0) || (prev_bx >= (int)num_blocks_x))2997return false;2998if (prev_by < 0)2999return false;30003001const astc_helpers::log_astc_block& prev_log_blk = decoded_log_blocks(prev_bx, prev_by);3002const astc_helpers::astc_block& prev_phys_blk = decoded_blocks(prev_bx, prev_by);30033004if (prev_log_blk.m_solid_color_flag_hdr)3005return false;30063007astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);3008astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);30093010log_blk = prev_log_blk;30113012const uint32_t total_grid_weights = log_blk.m_grid_width * log_blk.m_grid_height * (log_blk.m_dual_plane ? 
2 : 1);30133014bool status = basist::astc_6x6_hdr::decode_values(decoder, total_grid_weights, log_blk.m_weight_ise_range, log_blk.m_weights);3015if (!status)3016return false;30173018astc_helpers::log_astc_block decomp_blk;3019status = astc_helpers::unpack_block(&prev_phys_blk, decomp_blk, BLOCK_W, BLOCK_H);3020if (!status)3021return false;30223023uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];3024basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, log_blk.m_weight_ise_range, transcode_weights, decomp_blk.m_weight_ise_range);30253026copy_weight_grid(log_blk.m_dual_plane, log_blk.m_grid_width, log_blk.m_grid_height, transcode_weights, decomp_blk);30273028status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);3029if (!status)3030return false;30313032cur_bx++;3033if (cur_bx == num_blocks_x)3034{3035cur_bx = 0;3036cur_by++;3037}30383039break;3040}3041case encoding_type::cBlock:3042{3043const block_mode bm = (block_mode)decoder.decode_truncated_binary((uint32_t)block_mode::cBMTotalModes);3044const endpoint_mode em = (endpoint_mode)decoder.decode_truncated_binary((uint32_t)endpoint_mode::cTotal);30453046switch (em)3047{3048case endpoint_mode::cUseLeft:3049case endpoint_mode::cUseUpper:3050{3051int neighbor_bx = cur_bx, neighbor_by = cur_by;30523053if (em == endpoint_mode::cUseLeft)3054neighbor_bx--;3055else3056neighbor_by--;30573058if ((neighbor_bx < 0) || (neighbor_by < 0))3059return false;30603061const astc_helpers::log_astc_block& neighbor_blk = decoded_log_blocks(neighbor_bx, neighbor_by);3062if (!neighbor_blk.m_color_endpoint_modes[0])3063return false;30643065const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm];3066const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem);30673068if (bmd.m_cem != neighbor_blk.m_color_endpoint_modes[0])3069return false;30703071astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);3072astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, 
cur_by);30733074log_blk.clear();3075log_blk.m_num_partitions = 1;3076log_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;3077log_blk.m_endpoint_ise_range = neighbor_blk.m_endpoint_ise_range;3078log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range;3079log_blk.m_grid_width = (uint8_t)bmd.m_grid_x;3080log_blk.m_grid_height = (uint8_t)bmd.m_grid_y;3081log_blk.m_dual_plane = (uint8_t)bmd.m_dp;3082log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;30833084memcpy(log_blk.m_endpoints, neighbor_blk.m_endpoints, num_endpoint_values);30853086const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1);30873088bool status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights);3089if (!status)3090return false;30913092astc_helpers::log_astc_block decomp_blk;3093decomp_blk.clear();30943095decomp_blk.m_num_partitions = 1;3096decomp_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;3097decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range;3098decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range;3099decomp_blk.m_dual_plane = bmd.m_dp;3100decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;31013102basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, log_blk.m_endpoint_ise_range, log_blk.m_endpoints, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints);31033104uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];3105basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range);31063107copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk);31083109status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);3110if (!status)3111return false;31123113cur_bx++;3114if (cur_bx == num_blocks_x)3115{3116cur_bx = 0;3117cur_by++;3118}31193120break;3121}3122case endpoint_mode::cUseLeftDelta:3123case 
endpoint_mode::cUseUpperDelta:3124{3125int neighbor_bx = cur_bx, neighbor_by = cur_by;31263127if (em == endpoint_mode::cUseLeftDelta)3128neighbor_bx--;3129else3130neighbor_by--;31313132if ((neighbor_bx < 0) || (neighbor_by < 0))3133return false;31343135const astc_helpers::log_astc_block& neighbor_blk = decoded_log_blocks(neighbor_bx, neighbor_by);3136if (!neighbor_blk.m_color_endpoint_modes[0])3137return false;31383139const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm];3140const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem);31413142if (bmd.m_cem != neighbor_blk.m_color_endpoint_modes[0])3143return false;31443145astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);3146astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);31473148log_blk.clear();3149log_blk.m_num_partitions = 1;3150log_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;3151log_blk.m_dual_plane = bmd.m_dp;3152log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;31533154log_blk.m_endpoint_ise_range = (uint8_t)bmd.m_endpoint_ise_range;3155basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, neighbor_blk.m_endpoint_ise_range, neighbor_blk.m_endpoints, bmd.m_endpoint_ise_range, log_blk.m_endpoints);31563157const int total_endpoint_delta_vals = 1 << NUM_ENDPOINT_DELTA_BITS;3158const int low_delta_limit = -(total_endpoint_delta_vals / 2); // high_delta_limit = (total_endpoint_delta_vals / 2) - 1;31593160const auto& ise_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range).m_ISE_to_rank;3161const auto& rank_to_ise = astc_helpers::g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range).m_rank_to_ISE;3162const int total_endpoint_levels = astc_helpers::get_ise_levels(log_blk.m_endpoint_ise_range);31633164for (uint32_t i = 0; i < num_endpoint_values; i++)3165{3166int cur_val = ise_to_rank[log_blk.m_endpoints[i]];31673168int delta = (int)decoder.get_bits(NUM_ENDPOINT_DELTA_BITS) + 
low_delta_limit;31693170cur_val += delta;3171if ((cur_val < 0) || (cur_val >= total_endpoint_levels))3172return false;31733174log_blk.m_endpoints[i] = rank_to_ise[cur_val];3175}31763177log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range;3178log_blk.m_grid_width = (uint8_t)bmd.m_grid_x;3179log_blk.m_grid_height = (uint8_t)bmd.m_grid_y;31803181const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1);31823183bool status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights);3184if (!status)3185return false;31863187astc_helpers::log_astc_block decomp_blk;3188decomp_blk.clear();31893190decomp_blk.m_num_partitions = 1;3191decomp_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;3192decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range;3193decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range;3194decomp_blk.m_dual_plane = (uint8_t)bmd.m_dp;3195decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;31963197basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, log_blk.m_endpoint_ise_range, log_blk.m_endpoints, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints);31983199uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];3200basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range);32013202copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk);32033204status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);3205if (!status)3206return false;32073208cur_bx++;3209if (cur_bx == num_blocks_x)3210{3211cur_bx = 0;3212cur_by++;3213}32143215break;3216}3217case endpoint_mode::cRaw:3218{3219const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm];32203221const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem);32223223astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, 
cur_by);3224astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);32253226log_blk.clear();3227log_blk.m_num_partitions = (uint8_t)bmd.m_num_partitions;32283229for (uint32_t p = 0; p < bmd.m_num_partitions; p++)3230log_blk.m_color_endpoint_modes[p] = (uint8_t)bmd.m_cem;32313232log_blk.m_endpoint_ise_range = (uint8_t)bmd.m_endpoint_ise_range;3233log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range;32343235log_blk.m_grid_width = (uint8_t)bmd.m_grid_x;3236log_blk.m_grid_height = (uint8_t)bmd.m_grid_y;3237log_blk.m_dual_plane = (uint8_t)bmd.m_dp;3238log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;32393240if (bmd.m_num_partitions == 2)3241{3242const uint32_t unique_partition_index = decoder.decode_truncated_binary(NUM_UNIQUE_PARTITIONS2);3243log_blk.m_partition_id = (uint16_t)g_part2_unique_index_to_seed[unique_partition_index];3244}3245else if (bmd.m_num_partitions == 3)3246{3247const uint32_t unique_partition_index = decoder.decode_truncated_binary(NUM_UNIQUE_PARTITIONS3);3248log_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_partition_index];3249}32503251bool status = decode_values(decoder, num_endpoint_values * bmd.m_num_partitions, bmd.m_endpoint_ise_range, log_blk.m_endpoints);3252if (!status)3253return false;32543255const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 
2 : 1);32563257status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights);3258if (!status)3259return false;32603261astc_helpers::log_astc_block decomp_blk;3262decomp_blk.clear();32633264decomp_blk.m_dual_plane = bmd.m_dp;3265decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;3266decomp_blk.m_partition_id = log_blk.m_partition_id;32673268decomp_blk.m_num_partitions = (uint8_t)bmd.m_num_partitions;32693270for (uint32_t p = 0; p < bmd.m_num_partitions; p++)3271decomp_blk.m_color_endpoint_modes[p] = (uint8_t)bmd.m_cem;32723273decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range;3274decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range;32753276for (uint32_t p = 0; p < bmd.m_num_partitions; p++)3277basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, bmd.m_endpoint_ise_range, log_blk.m_endpoints + num_endpoint_values * p, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints + num_endpoint_values * p);32783279uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];3280basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range);32813282copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk);32833284status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);3285if (!status)3286return false;32873288cur_bx++;3289if (cur_bx == num_blocks_x)3290{3291cur_bx = 0;3292cur_by++;3293}32943295break;3296}3297default:3298{3299assert(0);3300return false;3301}3302}33033304break;3305}3306default:3307{3308assert(0);3309return false;3310}3311}3312}33133314if (decoder.get_bits(16) != 0xA742)3315{3316fmt_error_printf("End marker not found!\n");3317return false;3318}33193320//fmt_printf("Total decode_file() time: {} secs\n", tm.get_elapsed_secs());33213322return true;3323}33243325static bool unpack_physical_astc_block(const void* pBlock, uint32_t 
block_width, uint32_t block_height, vec4F* pPixels)3326{3327astc_helpers::log_astc_block log_blk;3328if (!astc_helpers::unpack_block(pBlock, log_blk, block_width, block_height))3329return false;33303331basist::half_float half_block[MAX_BLOCK_W * MAX_BLOCK_H][4];3332if (!astc_helpers::decode_block(log_blk, half_block, block_width, block_height, astc_helpers::cDecodeModeHDR16))3333return false;33343335const uint32_t total_block_pixels = block_width * block_height;3336for (uint32_t p = 0; p < total_block_pixels; p++)3337{3338pPixels[p][0] = basist::half_to_float(half_block[p][0]);3339pPixels[p][1] = basist::half_to_float(half_block[p][1]);3340pPixels[p][2] = basist::half_to_float(half_block[p][2]);3341pPixels[p][3] = basist::half_to_float(half_block[p][3]);3342}33433344return true;3345}33463347static bool unpack_physical_astc_block_google(const void* pBlock, uint32_t block_width, uint32_t block_height, vec4F* pPixels)3348{3349return basisu_astc::astc::decompress_hdr((float *)pPixels, (uint8_t*)pBlock, block_width, block_height);3350}33513352static bool pack_bc6h_image(const imagef &src_img, vector2D<basist::bc6h_block> &bc6h_blocks, imagef *pPacked_bc6h_img, const fast_bc6h_params &enc_params)3353{3354const uint32_t width = src_img.get_width();3355const uint32_t height = src_img.get_height();33563357if (pPacked_bc6h_img)3358pPacked_bc6h_img->resize(width, height);33593360interval_timer tm;3361double total_enc_time = 0.0f;3362BASISU_NOTE_UNUSED(total_enc_time);33633364const uint32_t num_blocks_x = src_img.get_block_width(4);3365const uint32_t num_blocks_y = src_img.get_block_height(4);33663367bc6h_blocks.resize(num_blocks_x, num_blocks_y);33683369for (uint32_t by = 0; by < num_blocks_y; by++)3370{3371for (uint32_t bx = 0; bx < num_blocks_x; bx++)3372{3373// Extract source image block3374vec4F block_pixels[4][4]; // [y][x]3375src_img.extract_block_clamped(&block_pixels[0][0], bx * 4, by * 4, 4, 4);33763377basist::half_float half_pixels[16 * 3]; // [y][x]33783379for 
(uint32_t y = 0; y < 4; y++)3380{3381for (uint32_t x = 0; x < 4; x++)3382{3383for (uint32_t c = 0; c < 3; c++)3384{3385float v = block_pixels[y][x][c];33863387basist::half_float h = basist::float_to_half(v);33883389half_pixels[(x + y * 4) * 3 + c] = h;33903391} // c33923393} // x3394} // y33953396basist::bc6h_block& bc6h_blk = bc6h_blocks(bx, by);33973398tm.start();33993400basist::astc_6x6_hdr::fast_encode_bc6h(half_pixels, &bc6h_blk, enc_params);34013402total_enc_time += tm.get_elapsed_secs();34033404if (pPacked_bc6h_img)3405{3406basist::half_float unpacked_blk[16 * 3];3407bool status = unpack_bc6h(&bc6h_blk, unpacked_blk, false);3408assert(status);3409if (!status)3410{3411fmt_error_printf("unpack_bc6h() failed\n");3412return false;3413}34143415for (uint32_t y = 0; y < 4; y++)3416{3417for (uint32_t x = 0; x < 4; x++)3418{3419vec4F p;34203421for (uint32_t c = 0; c < 3; c++)3422{3423float v = basist::half_to_float(unpacked_blk[(x + y * 4) * 3 + c]);3424p[c] = v;34253426} // c34273428p[3] = 1.0f;34293430pPacked_bc6h_img->set_clipped(bx * 4 + x, by * 4 + y, p);3431} // x3432} // y3433}34343435} // bx3436} // by34373438//fmt_printf("Total BC6H encode time: {}\n", total_enc_time);34393440return true;3441}34423443static float dist_to_line_squared(const vec3F& p, const vec3F &line_org, const vec3F &line_dir)3444{3445vec3F q(p - line_org);3446vec3F v(q - q.dot(line_dir) * line_dir);3447return v.dot(v);3448}34493450static void estimate_partitions_mode7_and_11(3451uint32_t num_parts, // 2 or 3 partitions3452uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, // list of all unique, canonicalized patterns3453uint32_t num_pats_to_examine, const uint32_t* pUnique_pat_indices_to_examine, // indices of pats to examine3454const vec3F *pHalf_pixels_as_floats, // block's half pixel values casted to floats3455const astc_hdr_codec_base_options& coptions, // options3456uint32_t num_desired_pats,3457int *pDesired_pat_indices_mode11, int *pDesired_pat_indices_mode7) // 
output indices3458{3459BASISU_NOTE_UNUSED(coptions);3460BASISU_NOTE_UNUSED(num_unique_pats);34613462const uint32_t BLOCK_W = 6, BLOCK_H = 6, MAX_PARTS = 3; // BLOCK_T = 6 * 63463assert(num_parts <= MAX_PARTS);34643465struct candidate_res3466{3467float m_total_sq_dist;3468uint32_t m_index;3469bool operator< (const candidate_res& rhs) const { return m_total_sq_dist < rhs.m_total_sq_dist; }3470};34713472const uint32_t MAX_CANDIDATES = 1024;3473assert(num_desired_pats && (num_desired_pats <= MAX_CANDIDATES));34743475candidate_res mode11_candidates[MAX_CANDIDATES];3476candidate_res mode7_candidates[MAX_CANDIDATES];34773478const vec3F grayscale_axis(0.5773502691f);34793480for (uint32_t examine_iter = 0; examine_iter < num_pats_to_examine; examine_iter++)3481{3482const uint32_t unique_part_index = pUnique_pat_indices_to_examine[examine_iter];3483assert(unique_part_index < num_unique_pats);34843485const partition_pattern_vec* pPat = &pUnique_pats[unique_part_index];34863487vec3F part_means[MAX_PARTS];3488uint32_t part_total_texels[MAX_PARTS] = { 0 };34893490for (uint32_t i = 0; i < num_parts; i++)3491part_means[i].clear();34923493for (uint32_t y = 0; y < BLOCK_H; y++)3494{3495for (uint32_t x = 0; x < BLOCK_W; x++)3496{3497const uint32_t part_index = (*pPat)(x, y);3498assert(part_index < num_parts);34993500part_means[part_index] += pHalf_pixels_as_floats[x + y * BLOCK_W];3501part_total_texels[part_index]++;35023503} // x3504} // y35053506for (uint32_t i = 0; i < num_parts; i++)3507{3508assert(part_total_texels[i]);3509part_means[i] /= (float)part_total_texels[i];3510}35113512float part_cov[MAX_PARTS][6];3513memset(part_cov, 0, sizeof(part_cov));35143515for (uint32_t y = 0; y < BLOCK_H; y++)3516{3517for (uint32_t x = 0; x < BLOCK_W; x++)3518{3519const uint32_t part_index = (*pPat)(x, y);3520assert(part_index < num_parts);35213522const vec3F p(pHalf_pixels_as_floats[x + y * BLOCK_W] - part_means[part_index]);35233524const float r = p[0], g = p[1], b = 
p[2];35253526part_cov[part_index][0] += r * r;3527part_cov[part_index][1] += r * g;3528part_cov[part_index][2] += r * b;3529part_cov[part_index][3] += g * g;3530part_cov[part_index][4] += g * b;3531part_cov[part_index][5] += b * b;35323533} // x3534} // y35353536// For each partition compute the total variance of all channels.3537float total_variance[MAX_PARTS];3538for (uint32_t part_index = 0; part_index < num_parts; part_index++)3539total_variance[part_index] = part_cov[part_index][0] + part_cov[part_index][3] + part_cov[part_index][5];35403541vec3F part_axis[MAX_PARTS];3542float mode11_eigenvalue_est[MAX_PARTS]; // For each partition, compute the variance along the principle axis3543float mode7_eigenvalue_est[MAX_PARTS]; // For each partition, compute the variance along the principle axis35443545for (uint32_t part_index = 0; part_index < num_parts; part_index++)3546{3547float* pCov = &part_cov[part_index][0];35483549float xr = .9f, xg = 1.0f, xb = .7f;35503551const uint32_t NUM_POWER_ITERS = 4;3552for (uint32_t iter = 0; iter < NUM_POWER_ITERS; iter++)3553{3554float r = xr * pCov[0] + xg * pCov[1] + xb * pCov[2];3555float g = xr * pCov[1] + xg * pCov[3] + xb * pCov[4];3556float b = xr * pCov[2] + xg * pCov[4] + xb * pCov[5];35573558float m = maximumf(maximumf(fabsf(r), fabsf(g)), fabsf(b));35593560if (m >= 1e-10f)3561{3562m = 1.0f / m;35633564r *= m;3565g *= m;3566b *= m;3567}35683569xr = r;3570xg = g;3571xb = b;3572}35733574float len_sq = xr * xr + xg * xg + xb * xb;35753576if (len_sq < 1e-10f)3577{3578xr = grayscale_axis[0];3579xg = grayscale_axis[0];3580xb = grayscale_axis[0];3581}3582else3583{3584len_sq = 1.0f / sqrtf(len_sq);35853586xr *= len_sq;3587xg *= len_sq;3588xb *= len_sq;3589}35903591{3592// Transform the principle axis by the covariance matrix, which will scale the vector by its eigenvalue (the variance of the dataset projected onto the principle axis).3593float r = xr * pCov[0] + xg * pCov[1] + xb * pCov[2];3594float g = xr * pCov[1] + xg * 
pCov[3] + xb * pCov[4];3595float b = xr * pCov[2] + xg * pCov[4] + xb * pCov[5];35963597// Estimate the principle eigenvalue by computing the magnitude of the transformed vector.3598// The result is the variance along the principle axis.3599//float z1 = sqrtf(r * r + g * g + b * b); // this works with the principle axis3600//float z2 = r * xr + g * xg + b * xb; // compute length projected along xr,xg,xb36013602mode11_eigenvalue_est[part_index] = r * xr + g * xg + b * xb;3603}36043605{3606const float yrgb = grayscale_axis[0];36073608// Transform the grayscale axis by the covariance matrix, which will scale the vector by the eigenvalue (which is the variance of the dataset projected onto this vector).3609float r = yrgb * pCov[0] + yrgb * pCov[1] + yrgb * pCov[2];3610float g = yrgb * pCov[1] + yrgb * pCov[3] + yrgb * pCov[4];3611float b = yrgb * pCov[2] + yrgb * pCov[4] + yrgb * pCov[5];36123613mode7_eigenvalue_est[part_index] = r * yrgb + g * yrgb + b * yrgb;3614}36153616} // part_index36173618// Compute the total variance (squared error) of the other 2 axes by subtracting the total variance of all channels by the variance of the principle axis.3619// TODO: Could also compute the ratio of the principle axis's variance vs. the total variance.3620float mode11_total_sq_dist_to_line_alt = 0.0f;3621for (uint32_t part_index = 0; part_index < num_parts; part_index++)3622{3623float d = maximum(0.0f, total_variance[part_index] - mode11_eigenvalue_est[part_index]);3624mode11_total_sq_dist_to_line_alt += d;3625}36263627{3628#if 03629// TODO: This total distance can be computed rapidly. First compute the total variance of each channel (sum the diag entries of the covar matrix),3630// then compute the principle eigenvalue, and subtract. 
The result is the variance of the projection distances.3631float total_sq_dist_to_line = 0.0f;3632for (uint32_t i = 0; i < BLOCK_T; i++)3633{3634const uint32_t part_index = (*pPat)[i];3635assert(part_index < num_parts);36363637total_sq_dist_to_line += dist_to_line_squared(pHalf_pixels_as_floats[i], part_means[part_index], part_axis[part_index]);3638}36393640mode11_candidates[examine_iter].m_total_sq_dist = total_sq_dist_to_line;3641#else3642mode11_candidates[examine_iter].m_total_sq_dist = mode11_total_sq_dist_to_line_alt;3643#endif3644mode11_candidates[examine_iter].m_index = unique_part_index;3645}36463647{3648float mode7_total_sq_dist_to_line_alt = 0.0f;3649for (uint32_t part_index = 0; part_index < num_parts; part_index++)3650{3651float d = maximum(0.0f, total_variance[part_index] - mode7_eigenvalue_est[part_index]);3652mode7_total_sq_dist_to_line_alt += d;3653}36543655mode7_candidates[examine_iter].m_total_sq_dist = mode7_total_sq_dist_to_line_alt;3656mode7_candidates[examine_iter].m_index = unique_part_index;3657}36583659} // examine_iter36603661std::sort(&mode11_candidates[0], &mode11_candidates[num_pats_to_examine]);3662std::sort(&mode7_candidates[0], &mode7_candidates[num_pats_to_examine]);36633664for (uint32_t i = 0; i < num_desired_pats; i++)3665pDesired_pat_indices_mode11[i] = mode11_candidates[i].m_index;36663667for (uint32_t i = 0; i < num_desired_pats; i++)3668pDesired_pat_indices_mode7[i] = mode7_candidates[i].m_index;3669}36703671static void estimate_partitions_mode7(3672uint32_t num_parts, // 2 or 3 partitions3673uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, // list of all unique, canonicalized patterns3674uint32_t num_pats_to_examine, const uint32_t* pUnique_pat_indices_to_examine, // indices of pats to examine3675const vec3F* pHalf_pixels_as_floats, // block's half pixel values casted to floats3676const astc_hdr_codec_base_options& coptions, // options3677uint32_t num_desired_pats, uint32_t* pDesired_pat_indices) // output 
indices3678{3679BASISU_NOTE_UNUSED(coptions);3680BASISU_NOTE_UNUSED(num_unique_pats);36813682const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = 6 * 6, MAX_PARTS = 3;3683assert(num_parts <= MAX_PARTS);36843685struct candidate_res3686{3687float m_total_sq_dist;3688uint32_t m_index;3689bool operator< (const candidate_res& rhs) const { return m_total_sq_dist < rhs.m_total_sq_dist; }3690};36913692const uint32_t MAX_CANDIDATES = 1024;3693assert(num_desired_pats && (num_desired_pats <= MAX_CANDIDATES));36943695candidate_res candidates[MAX_CANDIDATES];36963697for (uint32_t examine_iter = 0; examine_iter < num_pats_to_examine; examine_iter++)3698{3699const uint32_t unique_part_index = pUnique_pat_indices_to_examine[examine_iter];3700assert(unique_part_index < num_unique_pats);37013702const partition_pattern_vec* pPat = &pUnique_pats[unique_part_index];37033704vec3F part_means[MAX_PARTS];3705uint32_t part_total_texels[MAX_PARTS] = { 0 };37063707for (uint32_t i = 0; i < num_parts; i++)3708part_means[i].clear();37093710for (uint32_t y = 0; y < BLOCK_H; y++)3711{3712for (uint32_t x = 0; x < BLOCK_W; x++)3713{3714const uint32_t part_index = (*pPat)(x, y);3715assert(part_index < num_parts);37163717part_means[part_index] += pHalf_pixels_as_floats[x + y * BLOCK_W];3718part_total_texels[part_index]++;37193720} // x3721} // y37223723for (uint32_t i = 0; i < num_parts; i++)3724{3725assert(part_total_texels[i]);3726part_means[i] /= (float)part_total_texels[i];3727}37283729vec3F part_axis(0.5773502691f);37303731// TODO: This total distance can be computed rapidly. First compute the total variance of each channel (sum the diag entries of the covar matrix),3732// then compute the principle eigenvalue, and subtract. 
The result is the variance of the projection distances.3733float total_sq_dist_to_line = 0.0f;3734for (uint32_t i = 0; i < BLOCK_T; i++)3735{3736const uint32_t part_index = (*pPat)[i];3737assert(part_index < num_parts);37383739total_sq_dist_to_line += dist_to_line_squared(pHalf_pixels_as_floats[i], part_means[part_index], part_axis);3740}37413742candidates[examine_iter].m_total_sq_dist = total_sq_dist_to_line;37433744candidates[examine_iter].m_index = unique_part_index;37453746} // examine_iter37473748std::sort(&candidates[0], &candidates[num_pats_to_examine]);37493750for (uint32_t i = 0; i < num_desired_pats; i++)3751pDesired_pat_indices[i] = candidates[i].m_index;3752}37533754static float calc_deblocking_penalty_itp(3755uint32_t bx, uint32_t by, uint32_t width, uint32_t height,3756const imagef& pass_src_img_itp, const candidate_encoding& candidate)3757{3758float total_deblock_penalty = 0.0f;37593760float total_orig_mse = 0.0f, total_comp_mse = 0.0f;3761uint32_t total_c = 0;37623763for (uint32_t b = 0; b < 4; b++)3764{3765for (uint32_t i = 0; i < 6; i++)3766{3767int ox = 0, oy = 0, qx = 0, qy = 0;37683769switch (b)3770{3771case 0:3772ox = bx * 6 + i; oy = (by - 1) * 6 + 5;3773qx = bx * 6 + i; qy = by * 6;3774break;3775case 1:3776ox = bx * 6 + i; oy = (by + 1) * 6;3777qx = bx * 6 + i; qy = by * 6 + 5;3778break;3779case 2:3780ox = (bx - 1) * 6 + 5; oy = by * 6 + i;3781qx = bx * 6; qy = by * 6 + i;3782break;3783case 3:3784ox = (bx + 1) * 6; oy = by * 6 + i;3785qx = bx * 6 + 5; qy = by * 6 + i;3786break;3787}37883789if ((ox < 0) || (oy < 0) || (ox >= (int)width) || (oy >= (int)height))3790continue;37913792const vec3F& o_pixel_itp = pass_src_img_itp(ox, oy);3793const vec3F& q_pixel_itp = pass_src_img_itp(qx, qy);37943795const vec3F &d_pixel_itp = candidate.m_comp_pixels_itp[qy - by * 6][qx - bx * 6]; // compressed block37963797vec3F orig_delta_v(o_pixel_itp - q_pixel_itp);3798total_orig_mse += square(orig_delta_v[0]) + square(orig_delta_v[1]) + 
square(orig_delta_v[2]);37993800vec3F d_delta_v(o_pixel_itp - d_pixel_itp);3801total_comp_mse += square(d_delta_v[0]) + square(d_delta_v[1]) + square(d_delta_v[2]);38023803total_c++;3804}3805}38063807if (total_c)3808{3809total_orig_mse /= (float)total_c;3810total_comp_mse /= (float)total_c;38113812if (total_orig_mse)3813{3814total_deblock_penalty = fabsf((total_comp_mse - total_orig_mse) / total_orig_mse);3815}3816}38173818return total_deblock_penalty;3819}38203821static bool calc_strip_size(3822float lambda,3823uint32_t num_blocks_y, uint32_t total_threads, bool force_one_strip,3824uint32_t& res_total_strips, uint32_t& res_rows_per_strip, astc_hdr_6x6_global_config &global_cfg)3825{3826uint32_t total_strips = 1;38273828if (lambda == 0.0f)3829{3830if (!force_one_strip)3831{3832total_strips = total_threads;3833}3834}3835else3836{3837const uint32_t MIN_DESIRED_STRIPS = 8;3838const uint32_t MAX_TARGET_STRIPS = 32;3839const uint32_t TARGET_ASTC_6X6_ROWS_PER_STRIP = 12;38403841if (!force_one_strip)3842{3843total_strips = maximum<uint32_t>(1, num_blocks_y / TARGET_ASTC_6X6_ROWS_PER_STRIP);38443845if (num_blocks_y >= MIN_DESIRED_STRIPS * 2)3846total_strips = maximum(total_strips, MIN_DESIRED_STRIPS);3847}38483849total_strips = minimum(total_strips, MAX_TARGET_STRIPS);3850}38513852uint32_t rows_per_strip = 0;3853if (total_strips <= 1)3854{3855rows_per_strip = num_blocks_y;3856}3857else3858{3859rows_per_strip = (num_blocks_y / total_strips) & ~1;38603861if (rows_per_strip < 2)3862rows_per_strip = 2;// num_blocks_y;3863}38643865assert((rows_per_strip == num_blocks_y) || ((rows_per_strip & 1) == 0));38663867total_strips = (num_blocks_y + rows_per_strip - 1) / rows_per_strip;38683869if (global_cfg.m_debug_output)3870{3871fmt_printf("num_blocks_y: {}, total_threads : {}, Total strips : {}\n", num_blocks_y, total_threads, total_strips);3872fmt_printf("ASTC 6x6 block rows per strip: {}\n", rows_per_strip);3873fmt_printf("ASTC 6x6 block rows on final strip: {}\n", num_blocks_y - 
(total_strips - 1) * rows_per_strip);3874}38753876uint32_t total_rows = 0;3877for (uint32_t strip_index = 0; strip_index < total_strips; strip_index++)3878{3879uint32_t strip_first_by = strip_index * rows_per_strip;3880uint32_t strip_last_by = minimum<uint32_t>(strip_first_by + rows_per_strip - 1, num_blocks_y);38813882if (strip_index == (total_strips - 1))3883strip_last_by = num_blocks_y - 1;38843885uint32_t num_strip_block_rows = (strip_last_by - strip_first_by) + 1;3886total_rows += num_strip_block_rows;38873888if (global_cfg.m_debug_output)3889fmt_printf("Strip row: {}, total block rows: {}\n", strip_index, num_strip_block_rows);3890}38913892if (total_rows != num_blocks_y)3893{3894fmt_error_printf("Strip calc failed\n");3895return false;3896}38973898res_total_strips = total_strips;3899res_rows_per_strip = rows_per_strip;39003901return true;3902}39033904static void convet_rgb_image_to_itp(const imagef &src_img, imagef &dst_img, const astc_hdr_6x6_global_config& cfg)3905{3906const uint32_t width = src_img.get_width(), height = src_img.get_height();39073908dst_img.resize(width, height);39093910for (uint32_t y = 0; y < height; y++)3911{3912for (uint32_t x = 0; x < width; x++)3913{3914vec3F src_rgb(src_img(x, y));39153916vec3F src_itp;3917linear_rgb_to_itp(src_rgb, src_itp, cfg);39183919dst_img(x, y) = src_itp;3920}3921}3922}39233924const uint32_t BLOCK_W = 6, BLOCK_H = 6;3925const uint32_t NUM_BLOCK_PIXELS = BLOCK_W * BLOCK_H;39263927const float SOLID_PENALTY = 4.0f;3928const float REUSE_PENALTY = 1.0f;3929const float RUN_PENALTY = 10.0f;39303931const float MSE_WEIGHT = 300000.0f;3932const float SSIM_WEIGHT = 200.0f;3933const float TWO_LEVEL_PENALTY = 1.425f;3934const float SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM = .04f;3935const float SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM = .04f;3936const float COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY = 1.5f;3937const float COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY = 1.25f;3938const float 
COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY = 1.15f;

// Debug/statistics state for a 6x6 HDR encode: plain histograms, per-block-mode statistic vectors, atomic counters,
// and visualization images. init() sizes the images and zeroes the counters; print() dumps a text report.
struct uastc_hdr_6x6_debug_state
{
	// Histograms indexed by encoding type, endpoint mode, and block mode.
	uint32_t m_encoding_type_hist[(uint32_t)encoding_type::cTotal] = { 0 };
	uint32_t m_endpoint_mode_hist[(uint32_t)endpoint_mode::cTotal] = { 0 };
	uint32_t m_block_mode_hist[(uint32_t)block_mode::cBMTotalModes] = { 0 };
	uint64_t m_block_mode_total_bits[(uint32_t)block_mode::cBMTotalModes] = { 0 };

	// Per block mode: [mode][0..2] vectors of statistics. print() takes the max std dev over the 3 entries and
	// labels the 3 comparative entries RG, RB and GB.
	basisu::vector< basisu::stats<float> > m_block_mode_comp_stats[(uint32_t)block_mode::cBMTotalModes][3];
	basisu::vector< basisu::comparative_stats<float> > m_block_mode_comparative_stats[(uint32_t)block_mode::cBMTotalModes][3];

	// Atomic counters; see print() for how each one is labeled in the report.
	std::atomic<uint32_t> m_total_gaussian1_blocks;
	std::atomic<uint32_t> m_total_gaussian2_blocks;
	std::atomic<uint32_t> m_total_filter_horizontal;
	std::atomic<uint32_t> m_detail_stats[5];
	std::atomic<uint32_t> m_total_mode7_skips;

	std::atomic<uint32_t> m_total_blocks_compressed;

	std::atomic<uint32_t> m_total_candidates_considered;
	std::atomic<uint32_t> m_max_candidates_considered;

	std::atomic<uint32_t> m_total_part2_stats[4];
	std::atomic<uint32_t> m_dp_stats[5];

	std::atomic<uint32_t> m_reuse_num_parts[4];
	std::atomic<uint32_t> m_reuse_total_dp;

	// Visualization images (all resized to the same width/height by init()) and mutexes declared alongside them.
	imagef m_stat_vis;
	std::mutex m_stat_vis_mutex;

	image m_part_vis;
	image m_mode_vis;
	image m_mode_vis2;
	image m_grid_vis;
	image m_enc_vis;
	std::mutex m_vis_image_mutex;

	// One counter per compression level, 0..ASTC_HDR_6X6_MAX_COMP_LEVEL inclusive.
	std::atomic<uint32_t> m_comp_level_hist[ASTC_HDR_6X6_MAX_COMP_LEVEL + 1];

	std::atomic<uint32_t> m_total_jnd_replacements;

	std::mutex m_stats_mutex;

	// Pre-reserves 512 entries in every per-block-mode statistics vector.
	uastc_hdr_6x6_debug_state()
	{
		for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++)
		{
			for (uint32_t j = 0; j < 3; j++)
			{
				m_block_mode_comp_stats[i][j].reserve(512);
				m_block_mode_comparative_stats[i][j].reserve(512);
			}
		}
	}

	// Resizes every visualization image to width x height, then clears all histograms, statistic vectors and counters.
	void init(uint32_t width, uint32_t height)
	{
		m_stat_vis.resize(width, height);
		m_part_vis.resize(width, height);
		m_mode_vis.resize(width, height);
		m_mode_vis2.resize(width, height);
		m_grid_vis.resize(width, height);
		m_enc_vis.resize(width, height);

		basisu::clear_obj(m_encoding_type_hist);
		basisu::clear_obj(m_endpoint_mode_hist);
		basisu::clear_obj(m_block_mode_hist);
		basisu::clear_obj(m_block_mode_total_bits);

		for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++)
		{
			for (uint32_t j = 0; j < 3; j++)
			{
				m_block_mode_comp_stats[i][j].clear();
				m_block_mode_comparative_stats[i][j].clear();
			}
		}

		m_total_gaussian1_blocks.store(0);
		m_total_gaussian2_blocks.store(0);
		m_total_filter_horizontal.store(0);
		for (uint32_t i = 0; i < std::size(m_detail_stats); i++)
			m_detail_stats[i].store(0);
		m_total_mode7_skips.store(0);

		for (uint32_t i = 0; i < std::size(m_comp_level_hist); i++)
			m_comp_level_hist[i].store(0);

		m_total_blocks_compressed.store(0);

		m_total_candidates_considered.store(0);
		m_max_candidates_considered.store(0);

		for (uint32_t i = 0; i < std::size(m_total_part2_stats); i++)
			m_total_part2_stats[i].store(0);

		for (uint32_t i = 0; i < std::size(m_dp_stats); i++)
			m_dp_stats[i].store(0);

		for (uint32_t i = 0; i < std::size(m_reuse_num_parts); i++)
			m_reuse_num_parts[i] .store(0);

		m_reuse_total_dp.store(0);

		m_total_jnd_replacements.store(0);
	}

	// Prints the collected statistics with fmt_printf(). total_blocks is used as the denominator for the percentages
	// and the per-block average; it is not checked for zero here.
	// Reads m_comp_level_hist[0..4], so this relies on ASTC_HDR_6X6_MAX_COMP_LEVEL being at least 4.
	void print(uint32_t total_blocks) const
	{
		fmt_printf("Total blocks: {}\n", total_blocks);
		fmt_printf("Total JND replacements: {} {3.2}%\n", m_total_jnd_replacements, (float)m_total_jnd_replacements * 100.0f / (float)total_blocks);
		fmt_printf("Comp level histogram: {} {} {} {} {}\n", m_comp_level_hist[0], m_comp_level_hist[1], m_comp_level_hist[2], m_comp_level_hist[3], m_comp_level_hist[4]);
		fmt_printf("Total gaussian 1 blocks: {} {3.2}%\n", m_total_gaussian1_blocks, (float)m_total_gaussian1_blocks * 100.0f / (float)total_blocks);
		fmt_printf("Total gaussian 2 blocks: {} {3.2}%\n", m_total_gaussian2_blocks, (float)m_total_gaussian2_blocks * 100.0f / (float)total_blocks);
		fmt_printf("Total filter horizontal: {} {3.2}%\n", m_total_filter_horizontal, (float)m_total_filter_horizontal * 100.0f / (float)total_blocks);
		fmt_printf("Detail stats: Detailed block low grid skip: {}, Blurry block skip: {}, Very blurry block skip: {}, NH:{} H:{}\n", m_detail_stats[0], m_detail_stats[1], m_detail_stats[2], m_detail_stats[3], m_detail_stats[4]);
		fmt_printf("Total mode7 skips: {}\n", m_total_mode7_skips);

		fmt_printf("Total candidates: {}, {} avg per block\n", m_total_candidates_considered, (float)m_total_candidates_considered / (float)total_blocks);
		fmt_printf("Max ever candidates: {}\n", m_max_candidates_considered);

		fmt_printf("Part2/3 stats: {} {} {} {}\n", m_total_part2_stats[0], m_total_part2_stats[1], m_total_part2_stats[2], m_total_part2_stats[3]);
		fmt_printf("Dual plane stats: {} {} {} {} {}\n", m_dp_stats[0], m_dp_stats[1], m_dp_stats[2], m_dp_stats[3], m_dp_stats[4]);
		fmt_printf("Reuse total dual plane: {}\n", m_reuse_total_dp);
		fmt_printf("Reuse part stats: {} {} {}\n", m_reuse_num_parts[1], m_reuse_num_parts[2], m_reuse_num_parts[3]);

		fmt_printf("\nEncoding type histogram:\n");
		for (uint32_t i = 0; i < std::size(m_encoding_type_hist); i++)
			fmt_printf("{}: {}\n", i, m_encoding_type_hist[i]);

		fmt_printf("\nEndpoint mode histogram:\n");
		for (uint32_t i = 0; i < std::size(m_endpoint_mode_hist); i++)
			fmt_printf("{}: {}\n", i, m_endpoint_mode_hist[i]);

		fmt_printf("\nBlock mode histogram:\n");

		// Totals accumulated over all block modes while printing the per-mode lines.
		uint32_t total_dp = 0, total_sp = 0;
		uint32_t total_mode11 = 0, total_mode7 = 0;
		uint32_t part_hist[3] = { 0 };
		uint32_t part2_mode7_total = 0, part2_mode11_total = 0;
		uint32_t total_used_modes = 0;
		for (uint32_t i = 0; i < std::size(m_block_mode_hist); i++)
		{
			const auto& bm_desc = g_block_mode_descs[i];

			const uint32_t total_uses = m_block_mode_hist[i];

			if (bm_desc.m_dp)
				total_dp += total_uses;
			else
				total_sp += total_uses;

			if (bm_desc.m_cem == 7)
				total_mode7 += total_uses;
			else
				total_mode11 += total_uses;

			// part_hist has 3 entries, so m_num_partitions is expected to be 1..3.
			part_hist[bm_desc.m_num_partitions - 1] += total_uses;

			if (bm_desc.m_num_partitions == 2)
			{
				if (bm_desc.m_cem == 7)
					part2_mode7_total += total_uses;
				else
				{
					assert(bm_desc.m_cem == 11);
					part2_mode11_total += total_uses;
				}
			}

			float avg_std_dev = 0.0f;
			float avg_cross_correlations[3] = { 0 };

			// The entry count of [i][0] is used for all three vectors of both stat arrays.
			if (m_block_mode_comp_stats[i][0].size())
			{
				const uint32_t num_uses = m_block_mode_comp_stats[i][0].size_u32();

				for (uint32_t j = 0; j < num_uses; j++)
					avg_std_dev += (float)maximum(m_block_mode_comp_stats[i][0][j].m_std_dev, m_block_mode_comp_stats[i][1][j].m_std_dev, m_block_mode_comp_stats[i][2][j].m_std_dev);
				avg_std_dev /= (float)num_uses;

				for (uint32_t j = 0; j < num_uses; j++)
				{
					avg_cross_correlations[0] += fabsf((float)m_block_mode_comparative_stats[i][0][j].m_pearson);
					avg_cross_correlations[1] += fabsf((float)m_block_mode_comparative_stats[i][1][j].m_pearson);
					avg_cross_correlations[2] += fabsf((float)m_block_mode_comparative_stats[i][2][j].m_pearson);
				}

				avg_cross_correlations[0] /= (float)num_uses;
				avg_cross_correlations[1] /= (float)num_uses;
				avg_cross_correlations[2] /= (float)num_uses;
			}

			fmt_printf("{ 2}: uses: { 6}, cem: {}, dp: {} chan: {}, parts: {}, grid: {}x{}, endpoint levels: {}, weight levels: {}, Avg bits: {}, Avg Max Std Dev: {}, RG: {} RB: {} GB: {}\n", i, total_uses,
				bm_desc.m_cem,
				bm_desc.m_dp, bm_desc.m_dp_channel,
				bm_desc.m_num_partitions,
				bm_desc.m_grid_x, bm_desc.m_grid_y,
				astc_helpers::get_ise_levels(bm_desc.m_endpoint_ise_range),
				astc_helpers::get_ise_levels(bm_desc.m_weight_ise_range),
				total_uses ? ((double)m_block_mode_total_bits[i] / total_uses) : 0.0f,
				avg_std_dev, avg_cross_correlations[0], avg_cross_correlations[1], avg_cross_correlations[2]);

			if (total_uses)
				total_used_modes++;
		}

		fmt_printf("Total used modes: {}\n", total_used_modes);

		fmt_printf("Total single plane: {}, total dual plane: {}\n", total_sp, total_dp);
		fmt_printf("Total mode 11: {}, mode 7: {}\n", total_mode11, total_mode7);
		fmt_printf("Partition histogram: {} {} {}\n", part_hist[0], part_hist[1], part_hist[2]);
		fmt_printf("2 subset mode 7 uses: {}, mode 11 uses: {}\n", part2_mode7_total, part2_mode11_total);
	}
};

// State passed to compress_strip_task() alongside the global config and debug state.
struct uastc_hdr_6x6_encode_state
{
	astc_hdr_codec_base_options master_coptions;

	// Source image, plus two filtered versions of it.
	imagef src_img;

	imagef src_img_filtered1;
	imagef src_img_filtered2;

	// ITP-space counterparts of the three images above.
	imagef src_img_itp;
	imagef src_img_filtered1_itp;
	imagef src_img_filtered2_itp;

	vector2D<float> smooth_block_mse_scales;

	imagef packed_img;

	// One bit coder per strip; compress_strip_task() indexes this by strip_index.
	basisu::vector<bitwise_coder> strip_bits;

	basisu::vector2D<astc_helpers::astc_block> final_astc_blocks;

	vector2D<candidate_encoding> coded_blocks;
};

static bool compress_strip_task(
	uint32_t strip_index, uint32_t total_strips, uint32_t strip_first_by, uint32_t strip_last_by,
	uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t total_blocks, uint32_t width, uint32_t height,
	astc_hdr_6x6_global_config &global_cfg, uastc_hdr_6x6_debug_state &debug_state, uastc_hdr_6x6_encode_state &enc_state)
{
	BASISU_NOTE_UNUSED(num_blocks_y);
	BASISU_NOTE_UNUSED(total_strips);

	vec3F prev_comp_pixels[BLOCK_H][BLOCK_W]; // [y][x]
	basisu::clear_obj(prev_comp_pixels);

	uint32_t prev_run_len = 0;

	bitwise_coder prev_encoding;
	candidate_encoding prev_candidate_encoding; // the previous candidate written, which may have been a run extension
	candidate_encoding prev_non_run_candidate_encoding; // the previous *non-run* candidate 
written41964197bitwise_coder& strip_coded_bits = enc_state.strip_bits[strip_index];41984199const uint32_t CANDIDATES_TO_RESERVE = 1536;42004201basisu::vector<candidate_encoding> candidates;4202candidates.reserve(CANDIDATES_TO_RESERVE);42034204for (uint32_t by = strip_first_by; by <= strip_last_by; by++)4205{4206const bool has_upper_neighbor = by > strip_first_by;42074208for (uint32_t bx = 0; bx < num_blocks_x; bx++)4209{4210//if ((bx == 1) && (by == 2))4211// basisu::fmt_printf("!");42124213for (uint32_t outer_pass = 0; outer_pass < 3; outer_pass++)4214{4215const bool has_left_neighbor = bx > 0;4216//const bool has_prev = has_left_neighbor || has_upper_neighbor;42174218// Select either the original source image, or the Gaussian filtered version.4219// From here the encoder *must* use these 2 sources.4220const imagef& pass_src_img = (outer_pass == 2) ? enc_state.src_img_filtered2 :4221((outer_pass == 1) ? enc_state.src_img_filtered1 : enc_state.src_img);42224223const imagef& pass_src_img_itp = (outer_pass == 2) ? enc_state.src_img_filtered2_itp :4224((outer_pass == 1) ? 
enc_state.src_img_filtered1_itp : enc_state.src_img_itp);42254226// Extract source image block4227vec4F block_pixels[BLOCK_H][BLOCK_W]; // [y][x]4228pass_src_img.extract_block_clamped(&block_pixels[0][0], bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H);42294230vec4F block_pixels_itp[BLOCK_H][BLOCK_W]; // [y][x]4231pass_src_img_itp.extract_block_clamped(&block_pixels_itp[0][0], bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H);42324233half_vec3 half_pixels[BLOCK_H][BLOCK_W]; // [y][x] half-float values4234vec3F half_pixels_as_floats[BLOCK_H][BLOCK_W]; // [y][x] half float values, integer bits as floats4235vec4F block_pixels_q16[BLOCK_H][BLOCK_W]; // [y][x], q16 space for low-level ASTC encoding4236vec3F block_pixels_as_itp[BLOCK_H][BLOCK_W]; // [y][x] input converted to itp space, for faster error calculations42374238bool is_grayscale = true;42394240candidates.resize(0);42414242float block_ly = BIG_FLOAT_VAL, block_hy = 0.0f, block_avg_y = 0.0f;42434244for (uint32_t y = 0; y < BLOCK_H; y++)4245{4246for (uint32_t x = 0; x < BLOCK_W; x++)4247{4248vec3F rgb_input;42494250for (uint32_t c = 0; c < 3; c++)4251{4252float v = block_pixels[y][x][c];42534254rgb_input[c] = v;42554256const basist::half_float h = basisu::fast_float_to_half_no_clamp_neg_nan_or_inf(v);4257assert(h == basist::float_to_half(v));42584259half_pixels[y][x][c] = h;42604261block_pixels_q16[y][x][c] = (float)half_to_qlog16(h);42624263half_pixels_as_floats[y][x][c] = (float)h;42644265} // c42664267float py = rgb_input.dot(vec3F(REC_709_R, REC_709_G, REC_709_B));4268if (py < block_ly)4269block_ly = py;4270if (py > block_hy)4271block_hy = py;4272block_avg_y += py;42734274//linear_rgb_to_itp(rgb_input, block_pixels_as_itp[y][x]);42754276block_pixels_as_itp[y][x] = block_pixels_itp[y][x];42774278block_pixels_q16[y][x][3] = 0.0f;42794280if ((half_pixels[y][x][0] != half_pixels[y][x][1]) || (half_pixels[y][x][0] != half_pixels[y][x][2]))4281is_grayscale = false;42824283} // x4284} // y42854286block_avg_y *= (1.0f / 
(float)NUM_BLOCK_PIXELS);42874288encode_astc_block_stats enc_block_stats;4289enc_block_stats.init(NUM_BLOCK_PIXELS, &block_pixels_q16[0][0]);42904291vec4F x_filtered[6][6], y_filtered[6][6];42924293filter_block(3, 6, (vec4F*)block_pixels, (vec4F*)x_filtered); // filter rows (horizontal)4294filter_block(6, 3, (vec4F*)block_pixels, (vec4F*)y_filtered); // filter cols (vertically)42954296const float filtered_x_err = diff_blocks((vec4F*)block_pixels, (vec4F*)x_filtered);4297const float filtered_y_err = diff_blocks((vec4F*)block_pixels, (vec4F*)y_filtered);4298const bool filter_horizontally = filtered_x_err < filtered_y_err;42994300//const float block_mag_gradient_mag = block_max_gradient_mag(bx, by);43014302if (filter_horizontally)4303debug_state.m_total_filter_horizontal.fetch_add(1, std::memory_order_relaxed);43044305vec3F lowpass_filtered[6][6];4306filter_block(3, 3, &half_pixels_as_floats[0][0], &lowpass_filtered[0][0]);4307float lowpass_std_dev = sub_and_compute_std_dev(&lowpass_filtered[0][0], &half_pixels_as_floats[0][0]);43084309const bool very_detailed_block = lowpass_std_dev > 350.0f;4310const bool very_blurry_block = lowpass_std_dev < 30.0f;4311const bool super_blurry_block = lowpass_std_dev < 15.0f;43124313basisu::stats<float> half_comp_stats[3];4314for (uint32_t c = 0; c < 3; c++)4315half_comp_stats[c].calc(NUM_BLOCK_PIXELS, &half_pixels_as_floats[0][0][c], 3);43164317const float SINGLE_PART_HALF_THRESH = 256.0f;4318const float COMPLEX_HALF_THRESH = 1024.0f;4319// HACK HACK4320const float VERY_COMPLEX_HALF_THRESH = 1400.0f; // 1536.0f;43214322const float max_std_dev = (float)maximum(half_comp_stats[0].m_std_dev, half_comp_stats[1].m_std_dev, half_comp_stats[2].m_std_dev);43234324const bool very_simple_block = (max_std_dev < SINGLE_PART_HALF_THRESH);4325const bool complex_block = (max_std_dev > COMPLEX_HALF_THRESH);4326const bool very_complex_block = (max_std_dev > VERY_COMPLEX_HALF_THRESH);43274328// Dynamically choose a comp_level for this 
block.4329astc_hdr_codec_base_options coptions(enc_state.master_coptions);4330uint32_t comp_level = global_cfg.m_master_comp_level;43314332if (very_complex_block)4333comp_level = global_cfg.m_highest_comp_level;4334else if (complex_block)4335comp_level = (global_cfg.m_master_comp_level + global_cfg.m_highest_comp_level + 1) / 2;43364337debug_state.m_comp_level_hist[comp_level].fetch_add(1, std::memory_order_relaxed);43384339bool any_2subset_enabled = false, any_2subset_mode11_enabled = false, any_2subset_mode7_enabled = false, any_3subset_enabled = false;4340BASISU_NOTE_UNUSED(any_2subset_mode11_enabled);43414342for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++)4343{4344if (comp_level == 0)4345{4346if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL0) == 0)4347continue;4348}4349else if (comp_level == 1)4350{4351if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL1) == 0)4352continue;4353}4354else if (comp_level == 2)4355{4356if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL2) == 0)4357continue;4358}43594360if (g_block_mode_descs[i].m_num_partitions == 2)4361{4362any_2subset_enabled = true;43634364if (g_block_mode_descs[i].m_cem == 7)4365{4366any_2subset_mode7_enabled = true;4367}4368else4369{4370assert(g_block_mode_descs[i].m_cem == 11);4371any_2subset_mode11_enabled = true;4372}4373}4374else if (g_block_mode_descs[i].m_num_partitions == 3)4375any_3subset_enabled = true;4376}43774378coptions.m_mode7_full_s_optimization = (comp_level >= 2);43794380const bool uber_mode_flag = (comp_level >= 3);4381coptions.m_allow_uber_mode = uber_mode_flag;43824383coptions.m_ultra_quant = (comp_level >= 4);43844385coptions.m_take_first_non_clamping_mode11_submode = (comp_level <= 2);4386coptions.m_take_first_non_clamping_mode7_submode = (comp_level <= 2);43874388coptions.m_disable_weight_plane_optimization = (comp_level >= 2);43894390// -------------------43914392uint32_t total_used_block_chans = 0;4393for (uint32_t i = 0; i < 3; 
i++)4394total_used_block_chans += (half_comp_stats[i].m_range > 0.0f);43954396const bool is_solid_block = (total_used_block_chans == 0);43974398basisu::comparative_stats<float> half_cross_chan_stats[3];43994400// R vs. G4401half_cross_chan_stats[0].calc_pearson(NUM_BLOCK_PIXELS,4402&half_pixels_as_floats[0][0][0], &half_pixels_as_floats[0][0][1],44033, 3,4404&half_comp_stats[0], &half_comp_stats[1]);44054406// R vs. B4407half_cross_chan_stats[1].calc_pearson(NUM_BLOCK_PIXELS,4408&half_pixels_as_floats[0][0][0], &half_pixels_as_floats[0][0][2],44093, 3,4410&half_comp_stats[0], &half_comp_stats[2]);44114412// G vs. B4413half_cross_chan_stats[2].calc_pearson(NUM_BLOCK_PIXELS,4414&half_pixels_as_floats[0][0][1], &half_pixels_as_floats[0][0][2],44153, 3,4416&half_comp_stats[1], &half_comp_stats[2]);44174418const float rg_corr = fabsf((float)half_cross_chan_stats[0].m_pearson);4419const float rb_corr = fabsf((float)half_cross_chan_stats[1].m_pearson);4420const float gb_corr = fabsf((float)half_cross_chan_stats[2].m_pearson);44214422float min_corr = BIG_FLOAT_VAL, max_corr = -BIG_FLOAT_VAL;4423for (uint32_t i = 0; i < 3; i++)4424{4425#if 04426// 9/5/2025, wrong metric, we're iterating channels pairs here, not individual channels.4427// On 3 active channel blocks this causes no difference.4428if (half_comp_stats[i].m_range > 0.0f)4429#else4430static const uint8_t s_chan_pairs[3][2] = { {0, 1}, {0, 2}, {1, 2} };44314432const uint32_t chanA = s_chan_pairs[i][0];4433const uint32_t chanB = s_chan_pairs[i][1];44344435if ((half_comp_stats[chanA].m_range > 0.0f) && (half_comp_stats[chanB].m_range > 0.0f))4436#endif4437{4438const float c = fabsf((float)half_cross_chan_stats[i].m_pearson);4439min_corr = minimum(min_corr, c);4440max_corr = maximum(max_corr, c);4441}4442}44434444bool use_single_subset_mode7 = true;4445if (comp_level <= 1)4446{4447// TODO: could also compute angle between principle axis and the grayscale axis.4448// TODO: Transform grayscale axis by covar matrix, 
compute variance vs. total variance4449const float MODE7_MIN_CHAN_CORR = .5f;4450const float MODE7_PCA_ANGLE_THRESH = .9f;4451use_single_subset_mode7 = is_grayscale || is_solid_block || ((total_used_block_chans == 1) || (min_corr >= MODE7_MIN_CHAN_CORR));44524453if (use_single_subset_mode7)4454{4455float cos_ang = fabsf(enc_block_stats.m_axis_q16.dot(vec3F(0.5773502691f)));4456if (cos_ang < MODE7_PCA_ANGLE_THRESH)4457use_single_subset_mode7 = false;4458}4459}44604461const float STRONG_CORR_THRESH = (comp_level <= 1) ? .5f : ((comp_level <= 3) ? .75f : .9f);44624463int desired_dp_chan = -1;4464if (total_used_block_chans <= 1)4465{4466// no need for dual plane (except possibly 2x2 weight grids for RDO)4467}4468else4469{4470if (min_corr >= STRONG_CORR_THRESH)4471{4472// all channel pairs strongly correlated, no need for dual plane4473debug_state.m_dp_stats[0].fetch_add(1, std::memory_order_relaxed);4474}4475else4476{4477if (total_used_block_chans == 2)4478{4479if (half_comp_stats[0].m_range == 0.0f)4480{4481// r unused, check for strong gb correlation4482if (gb_corr < STRONG_CORR_THRESH)4483desired_dp_chan = 1;4484}4485else if (half_comp_stats[1].m_range == 0.0f)4486{4487// g unused, check for strong rb correlation4488if (rb_corr < STRONG_CORR_THRESH)4489desired_dp_chan = 0;4490}4491else4492{4493// b unused, check for strong rg correlation4494if (rg_corr < STRONG_CORR_THRESH)4495desired_dp_chan = 0;4496}4497}4498else4499{4500assert(total_used_block_chans == 3);45014502// see if rg/rb is weakly correlated vs. gb4503if ((rg_corr < gb_corr) && (rb_corr < gb_corr))4504desired_dp_chan = 0;4505// see if gr/gb is weakly correlated vs. 
rb4506else if ((rg_corr < rb_corr) && (gb_corr < rb_corr))4507desired_dp_chan = 1;4508// assume b is weakest4509else4510desired_dp_chan = 2;4511}45124513if (desired_dp_chan == -1)4514debug_state.m_dp_stats[1].fetch_add(1, std::memory_order_relaxed);4515else4516debug_state.m_dp_stats[2 + desired_dp_chan].fetch_add(1, std::memory_order_relaxed);4517}4518}45194520// 2x2 is special for RDO at higher lambdas - always pick a preferred channel.4521int desired_dp_chan_2x2 = 0;4522if (total_used_block_chans == 2)4523{4524if (half_comp_stats[0].m_range == 0.0f)4525desired_dp_chan_2x2 = 1;4526}4527else if (total_used_block_chans == 3)4528{4529// see if rg/rb is weakly correlated vs. gb4530if ((rg_corr < gb_corr) && (rb_corr < gb_corr))4531desired_dp_chan_2x2 = 0;4532// see if gr/gb is weakly correlated vs. rb4533else if ((rg_corr < rb_corr) && (gb_corr < rb_corr))4534desired_dp_chan_2x2 = 1;4535// assume b is weakest4536else4537desired_dp_chan_2x2 = 2;4538}45394540// Gather all candidate encodings4541bool status = false;45424543// ---- Run candidate4544if ((global_cfg.m_use_runs) && (has_left_neighbor || has_upper_neighbor))4545{4546candidate_encoding candidate;4547candidate.m_coder.reserve(24);45484549candidate.m_encoding_type = encoding_type::cRun;45504551candidate.m_decomp_log_blk = prev_non_run_candidate_encoding.m_decomp_log_blk;4552candidate.m_coded_log_blk = prev_non_run_candidate_encoding.m_coded_log_blk;45534554memcpy(candidate.m_comp_pixels, prev_comp_pixels, sizeof(prev_comp_pixels));45554556if (!prev_run_len)4557{4558candidate.m_coder.put_bits(RUN_CODE, RUN_CODE_LEN);4559candidate.m_coder.put_vlc(0, 5);4560}4561else4562{4563// extend current run - compute the # of new bits needed for the extension.45644565uint32_t prev_run_bits = prev_encoding.get_total_bits_u32();4566assert(prev_run_bits > 0);45674568// We're not actually going to code this, because the previously emitted run code will be extended.4569bitwise_coder temp_coder;4570temp_coder.put_bits(RUN_CODE, 
RUN_CODE_LEN);4571temp_coder.put_vlc((prev_run_len + 1) - 1, 5);45724573uint32_t cur_run_bits = temp_coder.get_total_bits_u32();4574assert(cur_run_bits >= prev_run_bits);45754576uint32_t total_new_bits = cur_run_bits - prev_run_bits;4577if (total_new_bits > 0)4578candidate.m_coder.put_bits(0, total_new_bits); // dummy bits4579}45804581candidate.m_run_len = prev_run_len + 1;45824583candidates.emplace_back(std::move(candidate));4584}45854586// ---- Reuse candidate4587if ((!is_solid_block) && (global_cfg.m_lambda > 0.0f))4588{4589for (uint32_t reuse_delta_index = 0; reuse_delta_index < global_cfg.m_num_reuse_xy_deltas; reuse_delta_index++)4590{4591const int reuse_delta_x = g_reuse_xy_deltas[reuse_delta_index].m_x;4592const int reuse_delta_y = g_reuse_xy_deltas[reuse_delta_index].m_y;45934594const int reuse_bx = bx + reuse_delta_x, reuse_by = by + reuse_delta_y;4595if ((reuse_bx < 0) || (reuse_bx >= (int)num_blocks_x))4596continue;4597if (reuse_by < (int)strip_first_by)4598break;45994600const candidate_encoding& prev_candidate = enc_state.coded_blocks(reuse_bx, reuse_by);46014602// TODO - support this.4603if (prev_candidate.m_encoding_type == encoding_type::cSolid)4604continue;4605assert((prev_candidate.m_encoding_type == encoding_type::cBlock) || (prev_candidate.m_encoding_type == encoding_type::cReuse));46064607candidate_encoding candidate;4608candidate.m_coder.reserve(24);4609astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;4610astc_helpers::log_astc_block& decomp_log_blk = candidate.m_decomp_log_blk;46114612const astc_helpers::log_astc_block& prev_coded_log_blk = prev_candidate.m_coded_log_blk;46134614const uint32_t grid_x = prev_coded_log_blk.m_grid_width, grid_y = prev_coded_log_blk.m_grid_height;4615const bool dual_plane = prev_candidate.m_coded_log_blk.m_dual_plane;4616const uint32_t num_grid_samples = grid_x * grid_y;4617const uint32_t num_endpoint_vals = 
get_num_endpoint_vals(prev_coded_log_blk.m_color_endpoint_modes[0]);46184619coded_log_blk = prev_candidate.m_coded_log_blk;4620decomp_log_blk = prev_candidate.m_decomp_log_blk;46214622if (prev_coded_log_blk.m_num_partitions == 1)4623{4624// Now encode the block using the transcoded endpoints4625basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];46264627if (prev_coded_log_blk.m_color_endpoint_modes[0] == 7)4628{4629status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints, &decoded_half[0][0], nullptr,4630astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);4631}4632else4633{4634status = get_astc_hdr_mode_11_block_colors(coded_log_blk.m_endpoints, &decoded_half[0][0], nullptr,4635astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);4636}4637assert(status);46384639uint8_t trial_weights0[BLOCK_W * BLOCK_H], trial_weights1[BLOCK_W * BLOCK_H];4640uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];46414642if (dual_plane)4643{4644eval_selectors_dual_plane(prev_candidate.m_coded_log_blk.m_color_component_selector,4645BLOCK_W * BLOCK_H, trial_weights0, trial_weights1, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);46464647downsample_ise_weights_dual_plane(4648coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,4649BLOCK_W, BLOCK_H,4650grid_x, grid_y,4651trial_weights0, trial_weights1, coded_log_blk.m_weights);46524653basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * 2, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);4654}4655else4656{4657eval_selectors(BLOCK_W * BLOCK_H, trial_weights0, coded_log_blk.m_weight_ise_range, (basist::half_float*)&half_pixels[0][0][0], 
astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);46584659downsample_ise_weights(4660coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,4661BLOCK_W, BLOCK_H,4662grid_x, grid_y,4663trial_weights0, coded_log_blk.m_weights);46644665basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);4666}46674668// Create the block the decoder would transcode into.4669copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk);4670}4671else if (prev_coded_log_blk.m_num_partitions == 2)4672{4673assert(!dual_plane);46744675const int unique_pat_index = g_part2_seed_to_unique_index[coded_log_blk.m_partition_id];4676assert((unique_pat_index >= 0) && (unique_pat_index < (int)NUM_UNIQUE_PARTITIONS2));46774678const partition_pattern_vec& pat_vec = g_partitions2[unique_pat_index];46794680vec4F part_pixels_q16[2][64];4681half_vec3 part_half_pixels[2][64];4682uint32_t part_total_pixels[2] = { 0 };46834684for (uint32_t y = 0; y < BLOCK_H; y++)4685{4686for (uint32_t x = 0; x < BLOCK_W; x++)4687{4688const uint32_t part_index = pat_vec[x + y * 6];46894690uint32_t l = part_total_pixels[part_index];46914692part_pixels_q16[part_index][l] = block_pixels_q16[y][x];4693part_half_pixels[part_index][l] = half_pixels[y][x];46944695part_total_pixels[part_index] = l + 1;4696} // x4697} // y46984699uint8_t blk_weights[2][BLOCK_W * BLOCK_H];47004701for (uint32_t part_index = 0; part_index < 2; part_index++)4702{4703basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];47044705if (prev_coded_log_blk.m_color_endpoint_modes[0] == 7)4706{4707status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr,4708astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, 
coded_log_blk.m_endpoint_ise_range);4709}4710else4711{4712status = get_astc_hdr_mode_11_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr,4713astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);4714}4715assert(status);47164717eval_selectors(part_total_pixels[part_index], blk_weights[part_index], coded_log_blk.m_weight_ise_range,4718(basist::half_float*)&part_half_pixels[part_index][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);47194720} // part_index47214722uint8_t ise_weights[BLOCK_W * BLOCK_H];47234724uint32_t src_pixel_index[2] = { 0, 0 };4725for (uint32_t y = 0; y < BLOCK_H; y++)4726{4727for (uint32_t x = 0; x < BLOCK_W; x++)4728{4729const uint32_t part_index = pat_vec[x + y * 6];47304731ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];4732src_pixel_index[part_index]++;4733} // x4734} // y47354736downsample_ise_weights(4737coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,4738BLOCK_W, BLOCK_H,4739grid_x, grid_y,4740ise_weights, coded_log_blk.m_weights);47414742// Transcode these codable weights to ASTC weights.4743uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H];4744basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);47454746// Create the block the decoder would transcode into.4747copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk);4748}4749else if (prev_coded_log_blk.m_num_partitions == 3)4750{4751assert(!dual_plane);47524753const int unique_pat_index = g_part3_seed_to_unique_index[coded_log_blk.m_partition_id];4754assert((unique_pat_index >= 0) && (unique_pat_index < (int)NUM_UNIQUE_PARTITIONS3));47554756const partition_pattern_vec& pat = 
g_partitions3[unique_pat_index];47574758vec4F part_pixels_q16[3][64];4759half_vec3 part_half_pixels[3][64];4760uint32_t part_total_pixels[3] = { 0 };47614762for (uint32_t y = 0; y < BLOCK_H; y++)4763{4764for (uint32_t x = 0; x < BLOCK_W; x++)4765{4766const uint32_t part_index = pat.m_parts[x + y * BLOCK_W];47674768uint32_t l = part_total_pixels[part_index];47694770part_pixels_q16[part_index][l] = block_pixels_q16[y][x];4771part_half_pixels[part_index][l] = half_pixels[y][x];47724773part_total_pixels[part_index] = l + 1;4774} // x4775} // y47764777uint8_t blk_weights[3][BLOCK_W * BLOCK_H];47784779for (uint32_t part_index = 0; part_index < 3; part_index++)4780{4781basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];47824783status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr,4784astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);4785assert(status);47864787eval_selectors(part_total_pixels[part_index], blk_weights[part_index], coded_log_blk.m_weight_ise_range,4788(basist::half_float*)&part_half_pixels[part_index][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);47894790} // part_index47914792uint8_t ise_weights[BLOCK_W * BLOCK_H];47934794uint32_t src_pixel_index[3] = { 0 };4795for (uint32_t y = 0; y < BLOCK_H; y++)4796{4797for (uint32_t x = 0; x < BLOCK_W; x++)4798{4799const uint32_t part_index = pat.m_parts[x + y * BLOCK_W];48004801ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];4802src_pixel_index[part_index]++;4803} // x4804} // y48054806downsample_ise_weights(4807coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,4808BLOCK_W, BLOCK_H,4809grid_x, grid_y,4810ise_weights, coded_log_blk.m_weights);48114812// Transcode these codable weights to ASTC weights.4813uint8_t 
transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H];4814basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);48154816// Create the block the decoder would transcode into.4817copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk);4818}48194820if (!validate_log_blk(decomp_log_blk))4821{4822fmt_error_printf("pack_astc_block() failed\n");4823return false;4824}48254826status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_log_blk, &candidate.m_comp_pixels[0][0]);4827if (!status)4828{4829fmt_error_printf("decode_astc_block() failed\n");4830return false;4831}48324833candidate.m_coder.put_bits(REUSE_CODE, REUSE_CODE_LEN);4834candidate.m_coder.put_bits(reuse_delta_index, REUSE_XY_DELTA_BITS);4835encode_values(candidate.m_coder, num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range);48364837candidate.m_encoding_type = encoding_type::cReuse;4838candidate.m_block_mode = prev_candidate.m_block_mode;4839candidate.m_endpoint_mode = prev_candidate.m_endpoint_mode;4840candidate.m_reuse_delta_index = reuse_delta_index;48414842candidates.emplace_back(std::move(candidate));48434844} // reuse_delta_index4845}48464847// ---- Solid candidate4848if (global_cfg.m_use_solid_blocks)4849{4850candidate_encoding candidate;4851candidate.m_coder.reserve(24);48524853// solid4854candidate.m_encoding_type = encoding_type::cSolid;48554856float r = 0.0f, g = 0.0f, b = 0.0f;4857const float LOG_BIAS = .125f;4858bool solid_block = true;4859for (uint32_t y = 0; y < BLOCK_H; y++)4860{4861for (uint32_t x = 0; x < BLOCK_W; x++)4862{4863if ((block_pixels[0][0][0] != block_pixels[y][x][0]) ||4864(block_pixels[0][0][1] != block_pixels[y][x][1]) ||4865(block_pixels[0][0][2] != block_pixels[y][x][2]))4866{4867solid_block = false;4868}48694870r += log2f(block_pixels[y][x][0] + LOG_BIAS);4871g += log2f(block_pixels[y][x][1] + 
LOG_BIAS);4872b += log2f(block_pixels[y][x][2] + LOG_BIAS);4873}4874}48754876if (solid_block)4877{4878r = block_pixels[0][0][0];4879g = block_pixels[0][0][1];4880b = block_pixels[0][0][2];4881}4882else4883{4884r = maximum<float>(0.0f, powf(2.0f, r * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS);4885g = maximum<float>(0.0f, powf(2.0f, g * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS);4886b = maximum<float>(0.0f, powf(2.0f, b * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS);48874888r = minimum<float>(r, basist::MAX_HALF_FLOAT);4889g = minimum<float>(g, basist::MAX_HALF_FLOAT);4890b = minimum<float>(b, basist::MAX_HALF_FLOAT);4891}48924893basist::half_float rh = float_to_half_non_neg_no_nan_inf(r), gh = float_to_half_non_neg_no_nan_inf(g), bh = float_to_half_non_neg_no_nan_inf(b);48944895candidate.m_solid_color[0] = rh;4896candidate.m_solid_color[1] = gh;4897candidate.m_solid_color[2] = bh;48984899candidate.m_coder.put_bits(SOLID_CODE, SOLID_CODE_LEN);49004901candidate.m_coder.put_bits(rh, 15);4902candidate.m_coder.put_bits(gh, 15);4903candidate.m_coder.put_bits(bh, 15);49044905vec3F cp(basist::half_to_float(rh), basist::half_to_float(gh), basist::half_to_float(bh));49064907for (uint32_t y = 0; y < BLOCK_H; y++)4908for (uint32_t x = 0; x < BLOCK_W; x++)4909candidate.m_comp_pixels[y][x] = cp;49104911astc_helpers::log_astc_block& log_blk = candidate.m_coded_log_blk;49124913log_blk.clear();4914log_blk.m_solid_color_flag_hdr = true;4915log_blk.m_solid_color[0] = rh;4916log_blk.m_solid_color[1] = gh;4917log_blk.m_solid_color[2] = bh;4918log_blk.m_solid_color[3] = basist::float_to_half(1.0f);49194920candidate.m_decomp_log_blk = log_blk;49214922candidates.emplace_back(std::move(candidate));4923}49244925if ((!is_solid_block) || (!global_cfg.m_use_solid_blocks))4926{4927static uint8_t s_parts2_normal[5] = { 0, 2, 4, 6, 8 };4928static uint8_t s_parts3_normal[5] = { 0, 0, 4, 6, 8 };49294930static uint8_t s_parts2_complex[5] = { 0, 4, 8, 10, 16 };4931static uint8_t 
s_parts3_complex[5] = { 0, 0, 8, 10, 16 };49324933static uint8_t s_parts2_very_complex[5] = { 0, 8, 12, 14, 20 };4934static uint8_t s_parts3_very_complex[5] = { 0, 0, 12, 14, 20 };49354936uint32_t total_parts2 = 0, total_parts3 = 0;49374938assert(comp_level < 5);4939if ((very_simple_block) && (comp_level <= 3))4940{4941// Block's std dev is so low that 2-3 subsets are unlikely to help much4942total_parts2 = 0;4943total_parts3 = 0;49444945debug_state.m_total_part2_stats[0].fetch_add(1, std::memory_order_relaxed);4946}4947else if (very_complex_block)4948{4949total_parts2 = s_parts2_very_complex[comp_level];4950total_parts3 = s_parts3_very_complex[comp_level];49514952if (global_cfg.m_extra_patterns_flag)4953{4954total_parts2 += (comp_level == 4) ? 30 : 20;4955total_parts3 += (comp_level == 4) ? 30 : 20;4956}49574958debug_state.m_total_part2_stats[2].fetch_add(1, std::memory_order_relaxed);4959}4960else if (complex_block)4961{4962total_parts2 = s_parts2_complex[comp_level];4963total_parts3 = s_parts3_complex[comp_level];49644965if (global_cfg.m_extra_patterns_flag)4966{4967total_parts2 += (comp_level == 4) ? 15 : 10;4968total_parts3 += (comp_level == 4) ? 
15 : 10;4969}49704971debug_state.m_total_part2_stats[3].fetch_add(1, std::memory_order_relaxed);4972}4973else4974{4975// moderate complexity - use defaults4976total_parts2 = s_parts2_normal[comp_level];4977total_parts3 = s_parts3_normal[comp_level];49784979if (global_cfg.m_extra_patterns_flag)4980{4981total_parts2 += 5;4982total_parts3 += 5;4983}49844985debug_state.m_total_part2_stats[1].fetch_add(1, std::memory_order_relaxed);4986}49874988if (!any_2subset_enabled)4989total_parts2 = 0;49904991if (!any_3subset_enabled)4992total_parts3 = 0;49934994int best_parts2_mode11[NUM_UNIQUE_PARTITIONS2], best_parts2_mode7[NUM_UNIQUE_PARTITIONS2];4995bool has_estimated_parts2 = false;49964997if (total_parts2)4998{4999if (global_cfg.m_brute_force_partition_matching)5000{5001int candidate_pats2[NUM_UNIQUE_PARTITIONS2];5002for (uint32_t i = 0; i < NUM_UNIQUE_PARTITIONS2; i++)5003candidate_pats2[i] = i;50045005if (any_2subset_enabled)5006{5007estimate_partitions_mode7_and_11(50082,5009NUM_UNIQUE_PARTITIONS2, g_partitions2,5010NUM_UNIQUE_PARTITIONS2, (uint32_t*)candidate_pats2,5011&half_pixels_as_floats[0][0],5012coptions,5013total_parts2, best_parts2_mode11, best_parts2_mode7);5014}50155016has_estimated_parts2 = true;5017}5018else5019{5020if (comp_level >= 1)5021{5022const uint32_t MAX_CANDIDATES2 = 48;5023int candidate_pats2[MAX_CANDIDATES2 * 2];50245025uint32_t num_candidate_pats2 = maximum((total_parts2 * 3) / 2, very_complex_block ? 
MAX_CANDIDATES2 : (MAX_CANDIDATES2 / 2));5026num_candidate_pats2 = minimum<uint32_t>(num_candidate_pats2, (uint32_t)std::size(candidate_pats2));50275028has_estimated_parts2 = estimate_partition2_6x6((basist::half_float(*)[3])half_pixels, candidate_pats2, num_candidate_pats2);50295030if (has_estimated_parts2)5031{5032estimate_partitions_mode7_and_11(50332,5034NUM_UNIQUE_PARTITIONS2, g_partitions2,5035num_candidate_pats2, (uint32_t*)candidate_pats2,5036&half_pixels_as_floats[0][0],5037coptions,5038total_parts2, best_parts2_mode11, best_parts2_mode7);5039}5040}5041else5042{5043has_estimated_parts2 = estimate_partition2_6x6((basist::half_float(*)[3])half_pixels, best_parts2_mode11, total_parts2);50445045if ((has_estimated_parts2) && (any_2subset_mode7_enabled))5046memcpy(best_parts2_mode7, best_parts2_mode11, total_parts2 * sizeof(best_parts2_mode7[0]));5047}5048}5049}50505051int best_parts3[NUM_UNIQUE_PARTITIONS3];5052bool has_estimated_parts3 = false;50535054if (total_parts3)5055{5056#if 05057has_estimated_parts3 = estimate_partition3_6x6((basist::half_float(*)[3])half_pixels, best_parts3, total_parts3);5058#elif 15059if (global_cfg.m_brute_force_partition_matching)5060{5061int candidate_pats3[NUM_UNIQUE_PARTITIONS3];5062for (uint32_t i = 0; i < NUM_UNIQUE_PARTITIONS3; i++)5063candidate_pats3[i] = i;50645065estimate_partitions_mode7(50663,5067NUM_UNIQUE_PARTITIONS3, g_partitions3,5068NUM_UNIQUE_PARTITIONS3, (uint32_t*)candidate_pats3,5069&half_pixels_as_floats[0][0],5070coptions,5071total_parts3, (uint32_t*)best_parts3);50725073has_estimated_parts3 = true;5074}5075else5076{5077const uint32_t MAX_CANDIDATES3 = 48;5078int candidate_pats3[MAX_CANDIDATES3 * 2];50795080uint32_t num_candidate_pats3 = maximum((total_parts3 * 3) / 2, very_complex_block ? 
MAX_CANDIDATES3 : (MAX_CANDIDATES3 / 2));5081num_candidate_pats3 = minimum<uint32_t>(num_candidate_pats3, (uint32_t)std::size(candidate_pats3));50825083has_estimated_parts3 = estimate_partition3_6x6((basist::half_float(*)[3])half_pixels, candidate_pats3, num_candidate_pats3);50845085if (has_estimated_parts3)5086{5087estimate_partitions_mode7(50883,5089NUM_UNIQUE_PARTITIONS3, g_partitions3,5090num_candidate_pats3, (uint32_t*)candidate_pats3,5091&half_pixels_as_floats[0][0],5092coptions,5093total_parts3, (uint32_t*)best_parts3);5094}5095}5096#endif5097}50985099const opt_mode_t mode11_opt_mode = complex_block ? cWeightedLeastSquares : cOrdinaryLeastSquares;51005101// ---- Encoded block candidate5102for (uint32_t block_mode_iter = 0; block_mode_iter < (uint32_t)block_mode::cBMTotalModes; block_mode_iter++)5103{5104const block_mode bm = (block_mode)block_mode_iter;51055106if (comp_level == 0)5107{5108if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL0) == 0)5109continue;5110}5111else if (comp_level == 1)5112{5113if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL1) == 0)5114continue;5115}5116else if (comp_level == 2)5117{5118if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL2) == 0)5119continue;5120}51215122if (global_cfg.m_block_stat_optimizations_flag)5123{5124if ((comp_level <= 3) && (g_block_mode_descs[block_mode_iter].m_dp))5125{5126if ((global_cfg.m_lambda > 0.0f) && (!complex_block) && (g_block_mode_descs[block_mode_iter].m_grid_x == 2) && (g_block_mode_descs[block_mode_iter].m_grid_y == 2))5127{5128if (g_block_mode_descs[block_mode_iter].m_dp_channel != desired_dp_chan_2x2)5129continue;5130}5131else5132{5133if (g_block_mode_descs[block_mode_iter].m_dp_channel != desired_dp_chan)5134continue;5135}5136}51375138if (comp_level <= 3)5139{5140const uint32_t grid_x = g_block_mode_descs[block_mode_iter].m_grid_x;5141const uint32_t grid_y = g_block_mode_descs[block_mode_iter].m_grid_y;51425143if 
(!g_block_mode_descs[block_mode_iter].m_dp)5144{5145// Minor gain (.5-1% less canidates)5146if (very_detailed_block)5147{5148if (grid_x * grid_y <= 12)5149{5150debug_state.m_detail_stats[0].fetch_add(1, std::memory_order_relaxed);5151continue;5152}5153}51545155// Major gains (10-25% less candidates)5156if (very_blurry_block)5157{5158if ((grid_x > 4) || (grid_y > 4) || (g_block_mode_descs[block_mode_iter].m_num_partitions > 1))5159{5160debug_state.m_detail_stats[1].fetch_add(1, std::memory_order_relaxed);5161continue;5162}5163}5164if (super_blurry_block)5165{5166if ((grid_x > 3) || (grid_y > 3) || (g_block_mode_descs[block_mode_iter].m_num_partitions > 1))5167{5168debug_state.m_detail_stats[2].fetch_add(1, std::memory_order_relaxed);5169continue;5170}5171}5172}51735174if (grid_x != grid_y)5175{5176if (grid_x < grid_y)5177{5178if (!filter_horizontally)5179{5180debug_state.m_detail_stats[3].fetch_add(1, std::memory_order_relaxed);5181continue;5182}5183}5184else5185{5186if (filter_horizontally)5187{5188debug_state.m_detail_stats[4].fetch_add(1, std::memory_order_relaxed);5189continue;5190}5191}5192}5193}51945195if (global_cfg.m_lambda == 0.0f)5196{5197// Rarely useful if lambda=05198if ((g_block_mode_descs[block_mode_iter].m_grid_x == 2) && (g_block_mode_descs[block_mode_iter].m_grid_y == 2))5199continue;5200}5201} // block_stat_optimizations_flag52025203if ((!use_single_subset_mode7) &&5204(g_block_mode_descs[block_mode_iter].m_cem == 7) &&5205(g_block_mode_descs[block_mode_iter].m_num_partitions == 1))5206{5207debug_state.m_total_mode7_skips.fetch_add(1, std::memory_order_relaxed);5208continue;5209}52105211for (uint32_t endpoint_mode_iter = 0; endpoint_mode_iter < (uint32_t)endpoint_mode::cTotal; endpoint_mode_iter++)5212{5213if (global_cfg.m_lambda == 0.0f)5214{5215// No use trying anything else5216if (endpoint_mode_iter != (uint32_t)endpoint_mode::cRaw)5217continue;5218}52195220if (global_cfg.m_disable_delta_endpoint_usage)5221{5222if ((endpoint_mode_iter == 
(uint32_t)endpoint_mode::cUseUpperDelta) || (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseLeftDelta))5223continue;5224}52255226if (!global_cfg.m_favor_higher_compression)5227{5228if (comp_level == 0)5229{5230if (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpperDelta)5231continue;5232}52335234if (comp_level <= 1)5235{5236if ((endpoint_mode_iter == (uint32_t)endpoint_mode::cUseLeft) || (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpper))5237continue;5238}5239}52405241const endpoint_mode em = (endpoint_mode)endpoint_mode_iter;52425243switch (em)5244{5245case endpoint_mode::cUseLeft:5246case endpoint_mode::cUseUpper:5247{5248const block_mode_desc& local_md = g_block_mode_descs[block_mode_iter];5249const uint32_t cem = local_md.m_cem;52505251if (local_md.m_num_partitions > 1)5252break;52535254if ((em == endpoint_mode::cUseLeft) && (!has_left_neighbor))5255break;5256else if ((em == endpoint_mode::cUseUpper) && (!has_upper_neighbor))5257break;52585259candidate_encoding candidate;5260candidate.m_coder.reserve(24);5261astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;52625263int nx = bx, ny = by;5264if (em == endpoint_mode::cUseLeft)5265nx--;5266else5267ny--;52685269const candidate_encoding& neighbor_blk = enc_state.coded_blocks(nx, ny);5270if (neighbor_blk.m_encoding_type == encoding_type::cSolid)5271break;5272assert((neighbor_blk.m_encoding_type == encoding_type::cBlock) || (neighbor_blk.m_encoding_type == encoding_type::cReuse));52735274const block_mode_desc& neighbor_md = g_block_mode_descs[(uint32_t)neighbor_blk.m_block_mode];52755276if (neighbor_md.m_cem != cem)5277break;52785279assert(neighbor_blk.m_coded_log_blk.m_color_endpoint_modes[0] == cem);52805281const uint32_t grid_x = local_md.m_grid_x, grid_y = local_md.m_grid_y;5282const bool dual_plane = local_md.m_dp;5283const uint32_t num_grid_samples = grid_x * grid_y;5284const uint32_t num_endpoint_vals = 
get_num_endpoint_vals(local_md.m_cem);52855286coded_log_blk.m_grid_width = (uint8_t)grid_x;5287coded_log_blk.m_grid_height = (uint8_t)grid_y;5288coded_log_blk.m_dual_plane = (uint8_t)dual_plane;5289coded_log_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;5290coded_log_blk.m_num_partitions = 1;5291coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)neighbor_md.m_cem;5292coded_log_blk.m_weight_ise_range = (uint8_t)local_md.m_weight_ise_range;52935294// We're not explictly writing any endpoints, just reusing existing ones. So copy the neighbor's endpoints unchanged (so no loss).5295coded_log_blk.m_endpoint_ise_range = neighbor_blk.m_coded_log_blk.m_endpoint_ise_range;5296memcpy(coded_log_blk.m_endpoints, neighbor_blk.m_coded_log_blk.m_endpoints, num_endpoint_vals);52975298uint8_t transcode_endpoints[basist::NUM_MODE11_ENDPOINTS];52995300// Requantize the neighbor's endpoints to whatever we'll have to transcode into to make a valid ASTC encoding.5301basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem,5302neighbor_blk.m_coded_log_blk.m_endpoint_ise_range, neighbor_blk.m_coded_log_blk.m_endpoints,5303local_md.m_transcode_endpoint_ise_range, transcode_endpoints);53045305// Now encode the block using the transcoded endpoints5306basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];53075308if (cem == 7)5309{5310status = get_astc_hdr_mode_7_block_colors(transcode_endpoints, &decoded_half[0][0], nullptr,5311astc_helpers::get_ise_levels(local_md.m_weight_ise_range), local_md.m_weight_ise_range, local_md.m_transcode_endpoint_ise_range);5312}5313else5314{5315status = get_astc_hdr_mode_11_block_colors(transcode_endpoints, &decoded_half[0][0], nullptr,5316astc_helpers::get_ise_levels(local_md.m_weight_ise_range), local_md.m_weight_ise_range, local_md.m_transcode_endpoint_ise_range);5317}5318if (!status)5319break;53205321uint8_t trial_weights0[BLOCK_W * BLOCK_H], trial_weights1[BLOCK_W * BLOCK_H];5322if 
(dual_plane)5323{5324eval_selectors_dual_plane(local_md.m_dp_channel, BLOCK_W * BLOCK_H, trial_weights0, trial_weights1, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(local_md.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);53255326downsample_ise_weights_dual_plane(5327local_md.m_weight_ise_range, local_md.m_weight_ise_range,5328BLOCK_W, BLOCK_H,5329grid_x, grid_y,5330trial_weights0, trial_weights1, coded_log_blk.m_weights);5331}5332else5333{5334eval_selectors(BLOCK_W * BLOCK_H, trial_weights0, local_md.m_weight_ise_range, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(local_md.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);53355336downsample_ise_weights(5337local_md.m_weight_ise_range, local_md.m_weight_ise_range,5338BLOCK_W, BLOCK_H,5339grid_x, grid_y,5340trial_weights0, coded_log_blk.m_weights);5341}53425343// Transcode these codable weights to ASTC weights.5344uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];5345basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * (dual_plane ? 
2 : 1), coded_log_blk.m_weights, local_md.m_weight_ise_range, transcode_weights, local_md.m_transcode_weight_ise_range);53465347// Create the block the decoder would transcode into.5348astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;5349decomp_blk.clear();53505351decomp_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem;5352decomp_blk.m_dual_plane = local_md.m_dp;5353decomp_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;5354decomp_blk.m_num_partitions = 1;5355decomp_blk.m_endpoint_ise_range = (uint8_t)local_md.m_transcode_endpoint_ise_range;5356decomp_blk.m_weight_ise_range = (uint8_t)local_md.m_transcode_weight_ise_range;53575358memcpy(decomp_blk.m_endpoints, transcode_endpoints, num_endpoint_vals);53595360copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk);53615362if (!validate_log_blk(decomp_blk))5363{5364fmt_error_printf("pack_astc_block() failed\n");5365return false;5366}53675368status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);5369if (!status)5370{5371fmt_error_printf("decode_astc_block() failed\n");5372return false;5373}53745375candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);5376code_block(candidate.m_coder, candidate.m_coded_log_blk, (block_mode)block_mode_iter, em, nullptr);53775378candidate.m_encoding_type = encoding_type::cBlock;5379candidate.m_endpoint_mode = em;5380candidate.m_block_mode = bm;53815382candidates.emplace_back(std::move(candidate));53835384break;5385}5386case endpoint_mode::cUseLeftDelta:5387case endpoint_mode::cUseUpperDelta:5388{5389const block_mode_desc& local_md = g_block_mode_descs[block_mode_iter];5390const uint32_t cem = local_md.m_cem;53915392if (local_md.m_num_partitions > 1)5393break;53945395if ((em == endpoint_mode::cUseLeftDelta) && (!has_left_neighbor))5396break;5397else if ((em == endpoint_mode::cUseUpperDelta) && (!has_upper_neighbor))5398break;53995400candidate_encoding 
candidate;5401candidate.m_coder.reserve(24);5402astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;54035404int nx = bx, ny = by;5405if (em == endpoint_mode::cUseLeftDelta)5406nx--;5407else5408ny--;54095410const candidate_encoding& neighbor_blk = enc_state.coded_blocks(nx, ny);5411if (neighbor_blk.m_encoding_type == encoding_type::cSolid)5412break;5413assert((neighbor_blk.m_encoding_type == encoding_type::cBlock) || (neighbor_blk.m_encoding_type == encoding_type::cReuse));54145415const block_mode_desc& neighbor_md = g_block_mode_descs[(uint32_t)neighbor_blk.m_block_mode];54165417if (neighbor_md.m_cem != cem)5418break;54195420assert(neighbor_md.m_cem == local_md.m_cem);54215422const uint32_t grid_x = local_md.m_grid_x, grid_y = local_md.m_grid_y;5423const bool dual_plane = local_md.m_dp;5424const uint32_t num_grid_samples = grid_x * grid_y;5425const uint32_t num_endpoint_vals = get_num_endpoint_vals(local_md.m_cem);54265427// Dequantize neighbor's endpoints to ISE 205428uint8_t neighbor_endpoints_ise20[basist::NUM_MODE11_ENDPOINTS];5429basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem,5430neighbor_blk.m_coded_log_blk.m_endpoint_ise_range, neighbor_blk.m_coded_log_blk.m_endpoints,5431astc_helpers::BISE_256_LEVELS, neighbor_endpoints_ise20);54325433// Requantize neighbor's endpoints to our local desired coding ISE range5434uint8_t neighbor_endpoints_coding_ise_local[basist::NUM_MODE11_ENDPOINTS];5435basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem, astc_helpers::BISE_256_LEVELS, neighbor_endpoints_ise20, local_md.m_endpoint_ise_range, neighbor_endpoints_coding_ise_local);54365437uint8_t blk_endpoints[basist::NUM_MODE11_ENDPOINTS];5438uint8_t blk_weights0[NUM_BLOCK_PIXELS], blk_weights1[NUM_BLOCK_PIXELS];54395440// Now try to encode the current block using the neighbor's endpoints submode.5441double err = 0.0f;5442uint32_t best_submode = 0;54435444if (cem == 7)5445{5446int maj_index, 
submode_index;5447decode_cem_7_config(neighbor_endpoints_ise20, submode_index, maj_index);54485449int first_submode = submode_index, last_submode = submode_index;54505451err = encode_astc_hdr_block_mode_7(5452NUM_BLOCK_PIXELS,5453(basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16,5454local_md.m_weight_ise_range,5455best_submode,5456BIG_FLOAT_VAL,5457blk_endpoints, blk_weights0,5458coptions,5459local_md.m_endpoint_ise_range,5460first_submode, last_submode,5461&enc_block_stats);5462}5463else5464{5465int maj_index, submode_index;5466decode_cem_11_config(neighbor_endpoints_ise20, submode_index, maj_index);54675468int first_submode = -1, last_submode = -1;5469if (maj_index == 3)5470{5471// direct5472}5473else5474{5475first_submode = submode_index;5476last_submode = submode_index;5477}54785479if (dual_plane)5480{5481err = encode_astc_hdr_block_mode_11_dual_plane(5482NUM_BLOCK_PIXELS,5483(basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16,5484local_md.m_dp_channel,5485local_md.m_weight_ise_range,5486best_submode,5487BIG_FLOAT_VAL,5488blk_endpoints, blk_weights0, blk_weights1,5489coptions,5490false,5491local_md.m_endpoint_ise_range,5492false, //uber_mode_flag,5493false,5494first_submode, last_submode, true);5495}5496else5497{5498err = encode_astc_hdr_block_mode_11(5499NUM_BLOCK_PIXELS,5500(basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16,5501local_md.m_weight_ise_range,5502best_submode,5503BIG_FLOAT_VAL,5504blk_endpoints, blk_weights0,5505coptions,5506false,5507local_md.m_endpoint_ise_range,5508false, //uber_mode_flag,5509false,5510first_submode, last_submode, true,5511mode11_opt_mode,5512&enc_block_stats);5513}5514}55155516if (err == BIG_FLOAT_VAL)5517break;55185519uint8_t endpoint_deltas[basist::NUM_MODE11_ENDPOINTS];55205521// TODO: For now, just try 5 bits for each endpoint. 
Can tune later.5522// This isn't right, it's computing the deltas in ISE space.5523//const uint32_t NUM_ENDPOINT_DELTA_BITS = 5;5524const int total_endpoint_delta_vals = 1 << NUM_ENDPOINT_DELTA_BITS;5525const int low_delta_limit = -(total_endpoint_delta_vals / 2), high_delta_limit = (total_endpoint_delta_vals / 2) - 1;55265527const auto& ise_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(local_md.m_endpoint_ise_range).m_ISE_to_rank;55285529bool all_deltas_in_limits = true;5530for (uint32_t i = 0; i < num_endpoint_vals; i++)5531{5532int endpoint_delta = (int)ise_to_rank[blk_endpoints[i]] - (int)ise_to_rank[neighbor_endpoints_coding_ise_local[i]];55335534if ((endpoint_delta < low_delta_limit) || (endpoint_delta > high_delta_limit))5535all_deltas_in_limits = false;55365537endpoint_deltas[i] = (uint8_t)(endpoint_delta + -low_delta_limit);5538}55395540if (all_deltas_in_limits)5541{5542coded_log_blk.m_grid_width = (uint8_t)grid_x;5543coded_log_blk.m_grid_height = (uint8_t)grid_y;5544coded_log_blk.m_dual_plane = (uint8_t)dual_plane;5545coded_log_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;5546coded_log_blk.m_num_partitions = 1;5547coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem;5548coded_log_blk.m_weight_ise_range = (uint8_t)local_md.m_weight_ise_range;5549coded_log_blk.m_endpoint_ise_range = (uint8_t)local_md.m_endpoint_ise_range;55505551memcpy(coded_log_blk.m_endpoints, blk_endpoints, num_endpoint_vals);55525553uint8_t transcode_endpoints[basist::NUM_MODE11_ENDPOINTS];5554uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];55555556basist::astc_6x6_hdr::requantize_ise_endpoints(local_md.m_cem, local_md.m_endpoint_ise_range, blk_endpoints, local_md.m_transcode_endpoint_ise_range, transcode_endpoints);55575558if (dual_plane)5559{5560downsample_ise_weights_dual_plane(5561local_md.m_weight_ise_range, local_md.m_weight_ise_range,5562BLOCK_W, BLOCK_H,5563grid_x, grid_y,5564blk_weights0, 
blk_weights1,5565coded_log_blk.m_weights);5566}5567else5568{5569downsample_ise_weights(5570local_md.m_weight_ise_range, local_md.m_weight_ise_range,5571BLOCK_W, BLOCK_H,5572grid_x, grid_y,5573blk_weights0, coded_log_blk.m_weights);5574}55755576basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, local_md.m_weight_ise_range, transcode_weights, local_md.m_transcode_weight_ise_range);55775578// Create the block the decoder would transcode into.55795580astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;5581decomp_blk.clear();55825583decomp_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem;5584decomp_blk.m_dual_plane = local_md.m_dp;5585decomp_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;5586decomp_blk.m_num_partitions = 1;5587decomp_blk.m_endpoint_ise_range = (uint8_t)local_md.m_transcode_endpoint_ise_range;5588decomp_blk.m_weight_ise_range = (uint8_t)local_md.m_transcode_weight_ise_range;55895590memcpy(decomp_blk.m_endpoints, transcode_endpoints, num_endpoint_vals);55915592copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk);55935594if (!validate_log_blk(decomp_blk))5595{5596fmt_error_printf("pack_astc_block() failed\n");5597return false;5598}55995600status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);5601if (!status)5602{5603fmt_error_printf("decode_astc_block() failed\n");5604return false;5605}56065607candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);5608code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, endpoint_deltas);56095610candidate.m_encoding_type = encoding_type::cBlock;5611candidate.m_endpoint_mode = em;5612candidate.m_block_mode = bm;56135614candidates.emplace_back(std::move(candidate));5615}56165617break;5618}5619case endpoint_mode::cRaw:5620{5621//if (candidates.size() == 339)5622// fmt_printf("!");56235624const auto& mode_desc = g_block_mode_descs[(uint32_t)bm];5625const 
uint32_t cem = mode_desc.m_cem;5626//const uint32_t num_endpoint_vals = get_num_endpoint_vals(cem);5627const bool dual_plane = mode_desc.m_dp;56285629if ((global_cfg.m_disable_twothree_subsets) && (mode_desc.m_num_partitions >= 2))5630break;56315632if (mode_desc.m_num_partitions == 3)5633{5634assert(!dual_plane);56355636if (!has_estimated_parts3)5637break;56385639assert(mode_desc.m_weight_ise_range == mode_desc.m_transcode_weight_ise_range);5640assert(mode_desc.m_endpoint_ise_range == mode_desc.m_transcode_endpoint_ise_range);56415642trial_result res;56435644status = encode_block_3_subsets(5645res,5646cem,5647mode_desc.m_grid_x, mode_desc.m_grid_y,5648mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range,5649&half_pixels[0][0], (vec4F*)block_pixels_q16,5650coptions,5651uber_mode_flag,5652best_parts3, total_parts3, comp_level, mode11_opt_mode);56535654if (!status)5655break;56565657assert(res.m_valid);56585659candidate_encoding candidate;5660candidate.m_coder.reserve(24);5661astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;56625663coded_log_blk = res.m_log_blk;56645665astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;5666decomp_blk = res.m_log_blk;56675668if (!validate_log_blk(decomp_blk))5669{5670fmt_error_printf("pack_astc_block() failed\n");5671return false;5672}56735674status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);5675if (!status)5676{5677fmt_error_printf("decode_astc_block() failed\n");5678return false;5679}56805681candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);5682code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr);56835684candidate.m_encoding_type = encoding_type::cBlock;5685candidate.m_endpoint_mode = em;5686candidate.m_block_mode = bm;56875688candidates.emplace_back(std::move(candidate));5689}5690else if (mode_desc.m_num_partitions == 2)5691{5692assert(!dual_plane);56935694if 
(!has_estimated_parts2)5695break;56965697assert(mode_desc.m_weight_ise_range == mode_desc.m_transcode_weight_ise_range);5698assert(mode_desc.m_endpoint_ise_range == mode_desc.m_transcode_endpoint_ise_range);56995700for (uint32_t est_part_iter = 0; est_part_iter < total_parts2; est_part_iter++)5701{5702trial_result results[2];57035704assert(((cem == 11) && any_2subset_mode11_enabled) || ((cem == 7) && any_2subset_mode7_enabled));57055706status = encode_block_2_subsets(5707results,5708mode_desc.m_grid_x, mode_desc.m_grid_y,5709mode_desc.m_cem,5710mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range,5711&half_pixels[0][0], (vec4F*)block_pixels_q16,5712coptions,5713uber_mode_flag,5714(cem == 11) ? best_parts2_mode11[est_part_iter] : best_parts2_mode7[est_part_iter],5715comp_level,5716mode11_opt_mode,5717true);57185719if (!status)5720continue;57215722for (uint32_t r_iter = 0; r_iter < 2; r_iter++)5723{5724const trial_result& res = results[r_iter];57255726if (!res.m_valid)5727continue;57285729candidate_encoding candidate;5730candidate.m_coder.reserve(24);5731astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;57325733coded_log_blk = res.m_log_blk;57345735astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;5736decomp_blk = res.m_log_blk;57375738if (!validate_log_blk(decomp_blk))5739{5740fmt_error_printf("pack_astc_block() failed\n");5741return false;5742}57435744status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);5745if (!status)5746{5747fmt_error_printf("decode_astc_block() failed\n");5748return false;5749}57505751candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);5752code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr);57535754candidate.m_encoding_type = encoding_type::cBlock;5755candidate.m_endpoint_mode = em;5756candidate.m_block_mode = bm;57575758candidates.emplace_back(std::move(candidate));57595760} // r_iter5761}5762}5763else5764{5765// 1 subset5766uint8_t 
blk_weights0[BLOCK_W * BLOCK_H], blk_weights1[BLOCK_W * BLOCK_H];5767uint32_t best_submode = 0;57685769candidate_encoding candidate;5770candidate.m_coder.reserve(24);5771astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;57725773const uint32_t grid_x = mode_desc.m_grid_x, grid_y = mode_desc.m_grid_y;5774const uint32_t num_grid_samples = grid_x * grid_y;57755776const half_vec3* pBlock_pixels_half = &half_pixels[0][0];5777const vec4F* pBlock_pixels_q16 = &block_pixels_q16[0][0];57785779const uint32_t num_grid_samples_dp = num_grid_samples * (dual_plane ? 2 : 1);57805781uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];57825783coded_log_blk.m_grid_width = (uint8_t)grid_x;5784coded_log_blk.m_grid_height = (uint8_t)grid_y;5785coded_log_blk.m_dual_plane = (uint8_t)dual_plane;5786coded_log_blk.m_color_component_selector = (uint8_t)mode_desc.m_dp_channel;5787coded_log_blk.m_num_partitions = 1;5788coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)mode_desc.m_cem;5789coded_log_blk.m_weight_ise_range = (uint8_t)mode_desc.m_weight_ise_range;5790coded_log_blk.m_endpoint_ise_range = (uint8_t)mode_desc.m_endpoint_ise_range;57915792if ((cem == 11) && (!dual_plane) && ((grid_x < BLOCK_W) || (grid_y < BLOCK_H)))5793{5794double e = encode_astc_hdr_block_downsampled_mode_11(5795BLOCK_W, BLOCK_H, grid_x, grid_y,5796mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range,5797NUM_BLOCK_PIXELS, (basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,5798BIG_FLOAT_VAL,5799FIRST_MODE11_SUBMODE_INDEX, MAX_MODE11_SUBMODE_INDEX, false, mode11_opt_mode,5800coded_log_blk.m_endpoints, coded_log_blk.m_weights, best_submode,5801coptions,5802&enc_block_stats);58035804if (e == BIG_FLOAT_VAL)5805break;5806}5807else5808{5809if (cem == 7)5810{5811assert(!dual_plane);58125813double e = encode_astc_hdr_block_mode_7(5814NUM_BLOCK_PIXELS,5815(basist::half_float(*)[3])pBlock_pixels_half, 
pBlock_pixels_q16,5816mode_desc.m_weight_ise_range,5817best_submode,5818BIG_FLOAT_VAL,5819coded_log_blk.m_endpoints,5820blk_weights0,5821coptions,5822mode_desc.m_endpoint_ise_range,58230, MAX_MODE7_SUBMODE_INDEX,5824&enc_block_stats);5825BASISU_NOTE_UNUSED(e);5826}5827else5828{5829double e;58305831if (dual_plane)5832{5833e = encode_astc_hdr_block_mode_11_dual_plane(5834NUM_BLOCK_PIXELS,5835(basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,5836mode_desc.m_dp_channel,5837mode_desc.m_weight_ise_range,5838best_submode,5839BIG_FLOAT_VAL,5840coded_log_blk.m_endpoints,5841blk_weights0, blk_weights1,5842coptions,5843false,5844mode_desc.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false);5845}5846else5847{5848e = encode_astc_hdr_block_mode_11(5849NUM_BLOCK_PIXELS,5850(basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,5851mode_desc.m_weight_ise_range,5852best_submode,5853BIG_FLOAT_VAL,5854coded_log_blk.m_endpoints,5855blk_weights0,5856coptions,5857false,5858mode_desc.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false,5859mode11_opt_mode,5860&enc_block_stats);5861}58625863if (e == BIG_FLOAT_VAL)5864break;5865}58665867if (dual_plane)5868{5869downsample_ise_weights_dual_plane(5870mode_desc.m_weight_ise_range, mode_desc.m_weight_ise_range,5871BLOCK_W, BLOCK_H,5872grid_x, grid_y,5873blk_weights0, blk_weights1,5874coded_log_blk.m_weights);5875}5876else5877{5878downsample_ise_weights(5879mode_desc.m_weight_ise_range, mode_desc.m_weight_ise_range,5880BLOCK_W, BLOCK_H,5881grid_x, grid_y,5882blk_weights0, coded_log_blk.m_weights);58835884if ((comp_level >= MIN_REFINE_LEVEL) && ((grid_x < BLOCK_W) || (grid_y < BLOCK_H)))5885{5886bool refine_status = refine_endpoints(cem,5887mode_desc.m_endpoint_ise_range, coded_log_blk.m_endpoints,58886, 6, mode_desc.m_grid_x, mode_desc.m_grid_y,5889coded_log_blk.m_weights, mode_desc.m_weight_ise_range,5890BLOCK_W * BLOCK_H,5891(basist::half_float(*)[3])pBlock_pixels_half, 
(vec4F*)pBlock_pixels_q16,5892nullptr,5893coptions, mode11_opt_mode);5894BASISU_NOTE_UNUSED(refine_status);5895}5896}5897}58985899basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples_dp, coded_log_blk.m_weights, mode_desc.m_weight_ise_range, transcode_weights, mode_desc.m_transcode_weight_ise_range);59005901// Create the block the decoder would transcode into.5902astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;5903decomp_blk.clear();59045905decomp_blk.m_color_endpoint_modes[0] = (uint8_t)mode_desc.m_cem;5906decomp_blk.m_dual_plane = mode_desc.m_dp;5907decomp_blk.m_color_component_selector = (uint8_t)mode_desc.m_dp_channel;5908decomp_blk.m_num_partitions = 1;5909decomp_blk.m_endpoint_ise_range = (uint8_t)mode_desc.m_transcode_endpoint_ise_range;5910decomp_blk.m_weight_ise_range = (uint8_t)mode_desc.m_transcode_weight_ise_range;59115912basist::astc_6x6_hdr::requantize_ise_endpoints(mode_desc.m_cem, mode_desc.m_endpoint_ise_range, coded_log_blk.m_endpoints, mode_desc.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints);59135914copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk);59155916if (!validate_log_blk(decomp_blk))5917{5918fmt_error_printf("pack_astc_block() failed\n");5919return false;5920}59215922status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);5923if (!status)5924{5925fmt_error_printf("decode_astc_block() failed\n");5926return false;5927}59285929candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);5930code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr);59315932candidate.m_encoding_type = encoding_type::cBlock;5933candidate.m_endpoint_mode = em;5934candidate.m_block_mode = bm;59355936candidates.emplace_back(std::move(candidate));5937}59385939break;5940}5941default:5942assert(0);5943fmt_debug_printf("Invalid endpoint mode\n");5944return false;59455946} // switch (em)59475948} // endpoint_mode_iter59495950} // block_mode_iter59515952} // 
is_solid_block59535954//------------------------------------------------59555956debug_state.m_total_candidates_considered.fetch_add(candidates.size_u32(), std::memory_order_relaxed);5957atomic_max(debug_state.m_max_candidates_considered, candidates.size_u32());59585959for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)5960{5961auto& candidate = candidates[candidate_iter];59625963for (uint32_t y = 0; y < BLOCK_H; y++)5964for (uint32_t x = 0; x < BLOCK_W; x++)5965linear_rgb_to_itp(candidate.m_comp_pixels[y][x], candidate.m_comp_pixels_itp[y][x], global_cfg);5966}59675968// Find best overall candidate5969double best_t = BIG_FLOAT_VAL;5970int best_candidate_index = -1;59715972float best_d_ssim = BIG_FLOAT_VAL;59735974if (global_cfg.m_lambda == 0.0f)5975{5976for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)5977{5978const auto& candidate = candidates[candidate_iter];59795980float candidate_d_ssim = 1.0f - compute_block_ssim_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0]);59815982if (candidate_d_ssim < best_d_ssim)5983best_d_ssim = candidate_d_ssim;59845985candidate_d_ssim *= SSIM_WEIGHT;59865987float candidate_mse = MSE_WEIGHT * compute_block_mse_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0], global_cfg.m_delta_itp_dark_adjustment);59885989candidate_mse += candidate_d_ssim;59905991float total_deblock_penalty = 0.0f;5992if (global_cfg.m_deblocking_flag)5993{5994total_deblock_penalty = calc_deblocking_penalty_itp(bx, by, width, height, pass_src_img_itp, candidate) * global_cfg.m_deblock_penalty_weight;5995}5996candidate_mse += total_deblock_penalty * SSIM_WEIGHT;59975998if ((candidate.m_encoding_type == encoding_type::cBlock) || (candidate.m_encoding_type == encoding_type::cReuse))5999{6000// Bias the encoder away from 2 level blocks on complex blocks6001// TODO: Perhaps only do this on large or non-interpolated 
grids6002if (complex_block)6003{6004if (candidate.m_coded_log_blk.m_weight_ise_range == astc_helpers::BISE_2_LEVELS)6005{6006candidate_mse *= TWO_LEVEL_PENALTY;6007}6008}60096010// Bias the encoder away from smaller weight grids if the block is very complex6011// TODO: Use the DCT to compute an approximation of the block energy/variance retained vs. lost by downsampling.6012if (complex_block)6013{6014if ((candidate.m_coded_log_blk.m_grid_width == 2) && (candidate.m_coded_log_blk.m_grid_height == 2))6015candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY;6016else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 3)6017candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY;6018else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 4)6019candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY;6020}6021}60226023float candidate_t = candidate_mse;60246025if (candidate_t < best_t)6026{6027best_t = candidate_t;6028best_candidate_index = candidate_iter;6029}60306031} // candidate_iter60326033if (global_cfg.m_gaussian1_fallback && (outer_pass == 0) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM))6034{6035debug_state.m_total_gaussian1_blocks.fetch_add(1, std::memory_order_relaxed);6036continue;6037}60386039const float block_y_contrast_ratio = block_hy / (block_ly + .00000125f);60406041if (global_cfg.m_gaussian2_fallback && (comp_level >= 1) && (outer_pass == 1) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM) &&6042(block_hy >= 18.0f) && (block_y_contrast_ratio > 150.0f) &&6043(block_avg_y >= 1.5f))6044{6045debug_state.m_total_gaussian2_blocks.fetch_add(1, std::memory_order_relaxed);6046continue;6047}6048}6049else6050{6051assert(enc_state.smooth_block_mse_scales.get_width() > 0);60526053// Compute block's perceptual weighting6054float perceptual_scale = 0.0f;6055for (uint32_t y = 0; y < BLOCK_H; 
y++)6056for (uint32_t x = 0; x < BLOCK_W; x++)6057perceptual_scale = basisu::maximumf(perceptual_scale, enc_state.smooth_block_mse_scales.at_clamped(bx * BLOCK_W + x, by * BLOCK_H + y));60586059// Very roughly normalize the computed distortion vs. bits.6060perceptual_scale *= 10.0f;60616062for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)6063{6064auto& candidate = candidates[candidate_iter];60656066float d_ssim = 1.0f - compute_block_ssim_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0]);60676068if (d_ssim < best_d_ssim)6069best_d_ssim = (float)d_ssim;60706071d_ssim *= SSIM_WEIGHT;60726073float candidate_mse = MSE_WEIGHT * compute_block_mse_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0], global_cfg.m_delta_itp_dark_adjustment);60746075candidate_mse += d_ssim;60766077float total_deblock_penalty = 0.0f;6078if (global_cfg.m_deblocking_flag)6079{6080total_deblock_penalty = calc_deblocking_penalty_itp(bx, by, width, height, pass_src_img_itp, candidate) * global_cfg.m_deblock_penalty_weight;6081}6082candidate_mse += total_deblock_penalty * SSIM_WEIGHT;60836084if ((candidate.m_encoding_type == encoding_type::cBlock) || (candidate.m_encoding_type == encoding_type::cReuse))6085{6086// Bias the encoder away from 2 level blocks on complex blocks6087if (complex_block)6088{6089if (candidate.m_coded_log_blk.m_weight_ise_range == astc_helpers::BISE_2_LEVELS)6090{6091candidate_mse *= TWO_LEVEL_PENALTY;6092}6093}60946095// Bias the encoder away from smaller weight grids if the block is very complex6096if (complex_block)6097{6098if ((candidate.m_coded_log_blk.m_grid_width == 2) && (candidate.m_coded_log_blk.m_grid_height == 2))6099candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY;6100else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 3)6101candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY;6102else if 
(minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 4)6103candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY;6104}6105}61066107float mode_penalty = 1.0f;6108if (candidate.m_encoding_type == encoding_type::cSolid)6109mode_penalty *= SOLID_PENALTY;6110else if (candidate.m_encoding_type == encoding_type::cReuse)6111mode_penalty *= REUSE_PENALTY;6112else if (candidate.m_encoding_type == encoding_type::cRun)6113mode_penalty *= (complex_block ? RUN_PENALTY * 2.0f : RUN_PENALTY);61146115float candidate_bits = (float)candidate.m_coder.get_total_bits();6116float candidate_d = candidate_mse * mode_penalty;61176118const float D_POWER = 2.0f;6119float candidate_t = perceptual_scale * powf(candidate_d, D_POWER) + candidate_bits * (global_cfg.m_lambda * 1000.0f);61206121candidate.m_t = candidate_t;6122candidate.m_d = candidate_d;6123candidate.m_bits = candidate_bits;61246125if (candidate_t < best_t)6126{6127best_t = candidate_t;6128best_candidate_index = candidate_iter;6129}61306131} // candidate_iter61326133if (global_cfg.m_gaussian1_fallback && (outer_pass == 0) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM))6134{6135debug_state.m_total_gaussian1_blocks.fetch_add(1, std::memory_order_relaxed);6136continue;6137}61386139const float block_y_contrast_ratio = block_hy / (block_ly + .00000125f);61406141if (global_cfg.m_gaussian2_fallback && (comp_level >= 1) && (outer_pass == 1) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM) &&6142(block_hy >= 18.0f) && (block_y_contrast_ratio > 150.0f) &&6143(block_avg_y >= 1.5f))6144{6145debug_state.m_total_gaussian2_blocks.fetch_add(1, std::memory_order_relaxed);6146continue;6147}61486149if (global_cfg.m_rdo_candidate_diversity_boost)6150{6151// candidate diversity boosting - consider candidates along/near the Pareto front6152const candidate_encoding& comp_candidate = candidates[best_candidate_index];61536154float best_d 
= BIG_FLOAT_VAL;61556156for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)6157{6158const auto& candidate = candidates[candidate_iter];61596160if (candidate.m_bits <= comp_candidate.m_bits * global_cfg.m_rdo_candidate_diversity_boost_bit_window_weight)6161{6162if (candidate.m_d < best_d)6163{6164best_d = candidate.m_d;6165best_candidate_index = candidate_iter;6166}6167}6168}6169}61706171// candidate JND optimization - if there's a cheaper to code candidate that is nearly equivalent visually to the best candidate chose, choose that6172if (global_cfg.m_jnd_optimization)6173{6174const candidate_encoding& cur_comp_candidate = candidates[best_candidate_index];61756176float new_best_candidate_bits = BIG_FLOAT_VAL;6177int new_best_candidate_index = -1;61786179for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)6180{6181if ((int)candidate_iter == best_candidate_index)6182continue;61836184const auto& candidate = candidates[candidate_iter];61856186if (candidate.m_bits >= cur_comp_candidate.m_bits)6187continue;61886189float max_delta_itp = 0.0f;6190for (uint32_t y = 0; y < BLOCK_H; y++)6191{6192for (uint32_t x = 0; x < BLOCK_W; x++)6193{6194float delta_itp = compute_pixel_delta_itp(cur_comp_candidate.m_comp_pixels_itp[y][x], candidate.m_comp_pixels_itp[y][x], block_pixels_as_itp[y][x], global_cfg.m_delta_itp_dark_adjustment);6195max_delta_itp = maximum(max_delta_itp, delta_itp);61966197if (max_delta_itp >= global_cfg.m_jnd_delta_itp_thresh)6198goto skip;6199}6200}62016202skip:6203if (max_delta_itp >= global_cfg.m_jnd_delta_itp_thresh)6204continue;62056206if (candidate.m_bits < new_best_candidate_bits)6207{6208new_best_candidate_bits = candidate.m_bits;6209new_best_candidate_index = candidate_iter;6210}6211}62126213if (new_best_candidate_index != -1)6214{6215best_candidate_index = new_best_candidate_index;6216debug_state.m_total_jnd_replacements.fetch_add(1, 
std::memory_order_relaxed);6217}6218}62196220} // if (lambda == 0.0f)62216222if (global_cfg.m_debug_images)6223{6224std::lock_guard<std::mutex> lck(debug_state.m_stat_vis_mutex);6225debug_state.m_stat_vis.fill_box(bx * 6, by * 6, 6, 6, vec4F(best_d_ssim, max_std_dev, lowpass_std_dev, 1.0f));6226}62276228if (best_candidate_index < 0)6229{6230assert(best_candidate_index >= 0);6231fmt_error_printf("No candidates!\n");6232return false;6233}62346235const auto& best_candidate = candidates[best_candidate_index];62366237assert(best_candidate.m_encoding_type != encoding_type::cInvalid);62386239if (best_candidate.m_encoding_type == encoding_type::cRun)6240{6241if (!prev_run_len)6242{6243if (prev_encoding.get_total_bits())6244{6245#if SYNC_MARKERS6246strip_coded_bits.put_bits(0xDEAD, 16);6247#endif62486249strip_coded_bits.append(prev_encoding);6250}62516252assert(best_candidate.m_coder.get_total_bits());62536254prev_encoding = best_candidate.m_coder;62556256prev_run_len = 1;6257}6258else6259{6260prev_run_len++;62616262const uint32_t prev_run_bits = prev_encoding.get_total_bits_u32();6263assert(prev_run_bits);6264BASISU_NOTE_UNUSED(prev_run_bits);62656266const uint32_t num_dummy_bits = best_candidate.m_coder.get_total_bits_u32();6267BASISU_NOTE_UNUSED(num_dummy_bits);62686269// Rewrite the previous encoding to extend the run length.6270prev_encoding.restart();6271prev_encoding.put_bits(RUN_CODE, RUN_CODE_LEN);6272prev_encoding.put_vlc(prev_run_len - 1, 5);62736274assert(prev_encoding.get_total_bits() == prev_run_bits + num_dummy_bits);6275}6276}6277else6278{6279if (prev_encoding.get_total_bits())6280{6281#if SYNC_MARKERS6282strip_coded_bits.put_bits(0xDEAD, 16);6283#endif62846285strip_coded_bits.append(prev_encoding);6286}62876288prev_encoding = best_candidate.m_coder;6289prev_run_len = 0;6290}62916292memcpy(prev_comp_pixels, best_candidate.m_comp_pixels, sizeof(vec3F) * BLOCK_W * BLOCK_H);62936294prev_candidate_encoding = best_candidate;62956296if 
(best_candidate.m_encoding_type != encoding_type::cRun)6297prev_non_run_candidate_encoding = best_candidate;62986299{6300std::lock_guard<std::mutex> lck(debug_state.m_stats_mutex);63016302debug_state.m_encoding_type_hist[(uint32_t)best_candidate.m_encoding_type]++;63036304if (best_candidate.m_encoding_type == encoding_type::cBlock)6305{6306debug_state.m_endpoint_mode_hist[(uint32_t)best_candidate.m_endpoint_mode]++;6307}63086309if ((best_candidate.m_encoding_type == encoding_type::cReuse) || (best_candidate.m_encoding_type == encoding_type::cBlock))6310{6311const uint32_t bm_index = (uint32_t)best_candidate.m_block_mode;6312assert(bm_index < (uint32_t)block_mode::cBMTotalModes);63136314debug_state.m_block_mode_hist[bm_index]++;6315debug_state.m_block_mode_total_bits[bm_index] += best_candidate.m_coder.get_total_bits();63166317for (uint32_t i = 0; i < 3; i++)6318{6319debug_state.m_block_mode_comp_stats[bm_index][i].push_back(half_comp_stats[i]);6320debug_state.m_block_mode_comparative_stats[bm_index][i].push_back(half_cross_chan_stats[i]);6321}6322}63236324if (best_candidate.m_encoding_type == encoding_type::cReuse)6325{6326debug_state.m_reuse_num_parts[best_candidate.m_coded_log_blk.m_num_partitions].fetch_add(1, std::memory_order_relaxed);63276328if (best_candidate.m_coded_log_blk.m_dual_plane)6329debug_state.m_reuse_total_dp.fetch_add(1, std::memory_order_relaxed);6330}6331}63326333enc_state.coded_blocks(bx, by) = prev_non_run_candidate_encoding;63346335// Update decoded image6336vec4F decoded_float_pixels[BLOCK_H][BLOCK_W];6337for (uint32_t y = 0; y < BLOCK_H; y++)6338for (uint32_t x = 0; x < BLOCK_W; x++)6339decoded_float_pixels[y][x] = best_candidate.m_comp_pixels[y][x];63406341enc_state.packed_img.set_block_clipped((vec4F*)decoded_float_pixels, bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H);63426343status = astc_helpers::pack_astc_block(enc_state.final_astc_blocks(bx, by), best_candidate.m_decomp_log_blk, nullptr, nullptr);6344if 
(!status)6345{6346fmt_error_printf("Failed packing block\n");6347return false;6348}63496350const uint32_t r = debug_state.m_total_blocks_compressed.fetch_add(1, std::memory_order_relaxed);6351if ((r & 2047) == 2047)6352{6353if (global_cfg.m_status_output)6354{6355basisu::fmt_printf("{} of {} total blocks compressed, {3.2}%\n", r, total_blocks, (r * 100.0f) / total_blocks);6356}6357}63586359if ((global_cfg.m_debug_images) &&6360((best_candidate.m_encoding_type != encoding_type::cRun) && (best_candidate.m_encoding_type != encoding_type::cSolid)))6361{6362std::lock_guard<std::mutex> lck(debug_state.m_vis_image_mutex);63636364if (best_candidate.m_decomp_log_blk.m_num_partitions == 2)6365{6366const int part2_unique_index = g_part2_seed_to_unique_index[best_candidate.m_decomp_log_blk.m_partition_id];6367assert((part2_unique_index >= 0) && (part2_unique_index < (int)NUM_UNIQUE_PARTITIONS2));63686369const partition_pattern_vec& pat = g_partitions2[part2_unique_index];63706371for (uint32_t y = 0; y < 6; y++)6372{6373for (uint32_t x = 0; x < 6; x++)6374{6375const uint32_t p = pat[x + y * 6];6376debug_state.m_part_vis.set_clipped(bx * 6 + x, by * 6 + y, color_rgba(p ? 100 : 0, 128, p ? 
100 : 0, 255));6377} // x6378} // y6379}6380else if (best_candidate.m_decomp_log_blk.m_num_partitions == 3)6381{6382//part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(0, 0, 255, 255));63836384const int part3_unique_index = g_part3_seed_to_unique_index[best_candidate.m_decomp_log_blk.m_partition_id];6385assert((part3_unique_index >= 0) && (part3_unique_index < (int)NUM_UNIQUE_PARTITIONS3));63866387const partition_pattern_vec& pat = g_partitions3[part3_unique_index];63886389for (uint32_t y = 0; y < 6; y++)6390{6391for (uint32_t x = 0; x < 6; x++)6392{6393const uint32_t p = pat[x + y * 6];6394color_rgba c(0, 0, 150, 255);6395if (p == 1)6396c.set(100, 0, 150, 255);6397else if (p == 2)6398c.set(0, 100, 150, 255);6399debug_state.m_part_vis.set_clipped(bx * 6 + x, by * 6 + y, c);6400} // x6401} // y6402}6403else if (best_candidate.m_decomp_log_blk.m_dual_plane)6404{6405debug_state.m_part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(255, 0, 255, 255));6406}6407else6408{6409debug_state.m_part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(255, 0, 0, 255));6410}64116412color_rgba c;6413c.set((best_candidate.m_coded_log_blk.m_grid_width * best_candidate.m_coded_log_blk.m_grid_height * 255 + 18) / 36);6414debug_state.m_grid_vis.fill_box(bx * 6, by * 6, 6, 6, c);64156416c.set(0, 0, 0, 255);6417if (complex_block)6418c[0] = 255;64196420if (very_complex_block)6421c[1] = 255;64226423if (outer_pass == 2)6424c[2] = 255;6425else if (outer_pass == 1)6426c[2] = 128;64276428debug_state.m_mode_vis.fill_box(bx * 6, by * 6, 6, 6, c);64296430c.set(0, 255, 0, 255);6431if (best_candidate.m_coded_log_blk.m_color_endpoint_modes[0] == 7)6432c.set(255, 0, 0, 255);6433debug_state.m_mode_vis2.fill_box(bx * 6, by * 6, 6, 6, c);64346435switch (best_candidate.m_encoding_type)6436{6437case encoding_type::cRun:6438c.set(0, 0, 0, 255);6439break;6440case encoding_type::cSolid:6441c.set(128, 128, 128, 255); // dark grey6442break;6443case encoding_type::cReuse:6444c.set(255, 255, 0, 255); // 
yellow6445break;6446case encoding_type::cBlock:6447{6448switch (best_candidate.m_endpoint_mode)6449{6450case endpoint_mode::cRaw:6451c.set(255, 0, 0, 255); // red6452break;6453case endpoint_mode::cUseLeft:6454c.set(0, 0, 255, 255); // blue6455break;6456case endpoint_mode::cUseUpper:6457c.set(0, 0, 192, 255); // darker blue6458break;6459case endpoint_mode::cUseLeftDelta:6460c.set(0, 255, 0, 255); // green6461break;6462case endpoint_mode::cUseUpperDelta:6463c.set(0, 192, 0, 255); // darker green6464break;6465default:6466break;6467}64686469break;6470}6471default:6472break;6473}64746475if (filtered_x_err < filtered_y_err)6476c[3] = 0;6477else6478c[3] = 255;64796480debug_state.m_enc_vis.fill_box(bx * 6, by * 6, 6, 6, c);6481}64826483break;64846485} // outer_pass64866487} // bx64886489} // by64906491if (prev_encoding.get_total_bits())6492{6493#if SYNC_MARKERS6494strip_coded_bits.put_bits(0xDEAD, 16);6495#endif64966497strip_coded_bits.append(prev_encoding);6498}64996500return true;6501}65026503bool g_initialized = false;65046505void global_init()6506{6507if (g_initialized)6508return;65096510interval_timer tm;6511tm.start();65126513init_pq_tables();65146515init_partitions2_6x6();6516init_partitions3_6x6();65176518init_contrib_lists();65196520g_initialized = true;65216522//fmt_printf("astc_6x6_hdr::global_init() total time: {}\n", tm.get_elapsed_secs());6523}65246525bool compress_photo(const basisu::imagef &orig_src_img, const astc_hdr_6x6_global_config &orig_global_cfg, job_pool *pJob_pool,6526basisu::uint8_vec& intermediate_tex_data, basisu::uint8_vec& astc_tex_data, result_metrics& metrics)6527{6528assert(g_initialized);6529if (!g_initialized)6530return false;65316532assert(pJob_pool);65336534if (orig_global_cfg.m_debug_output)6535{6536fmt_debug_printf("------ astc_6x6_hdr::compress_photo:\n");6537fmt_debug_printf("Source image dimensions: {}x{}\n", orig_src_img.get_width(), orig_src_img.get_height());6538fmt_debug_printf("Job pool total threads: {}\n", 
(uint64_t)pJob_pool->get_total_threads());6539orig_global_cfg.print();6540}65416542if (!orig_src_img.get_width() || !orig_src_img.get_height())6543{6544assert(false);6545fmt_error_printf("compress_photo: Invalid source image\n");6546return false;6547}65486549astc_hdr_6x6_global_config global_cfg(orig_global_cfg);65506551uastc_hdr_6x6_encode_state enc_state;6552enc_state.master_coptions.m_q_log_bias = Q_LOG_BIAS_6x6;6553enc_state.src_img = orig_src_img;65546555//src_img.crop(256, 256);65566557const uint32_t width = enc_state.src_img.get_width();6558const uint32_t height = enc_state.src_img.get_height();6559const uint32_t num_blocks_x = enc_state.src_img.get_block_width(BLOCK_W);6560const uint32_t num_blocks_y = enc_state.src_img.get_block_height(BLOCK_H);6561const uint32_t total_blocks = num_blocks_x * num_blocks_y;65626563for (uint32_t y = 0; y < height; y++)6564{6565for (uint32_t x = 0; x < width; x++)6566{6567for (uint32_t c = 0; c < 3; c++)6568{6569float f = enc_state.src_img(x, y)[c];65706571if (std::isinf(f) || std::isnan(f) || (f < 0.0f))6572f = 0;6573else if (f > basist::ASTC_HDR_MAX_VAL)6574f = basist::ASTC_HDR_MAX_VAL;65756576enc_state.src_img(x, y)[c] = f;65776578} // c65796580} // x6581} // y65826583if (global_cfg.m_debug_images)6584{6585write_exr((global_cfg.m_debug_image_prefix + "orig.exr").c_str(), enc_state.src_img, 3, 0);6586}65876588image src_img_compressed;6589tonemap_image_compressive2(src_img_compressed, enc_state.src_img);65906591if (global_cfg.m_debug_images)6592{6593save_png(global_cfg.m_debug_image_prefix + "compressive_tone_map.png", src_img_compressed);6594}65956596smooth_map_params rp;6597rp.m_debug_images = global_cfg.m_debug_images;65986599if (global_cfg.m_lambda != 0.0f)6600{6601if (global_cfg.m_status_output)6602fmt_printf("Creating RDO perceptual weighting maps\n");66036604create_smooth_maps2(enc_state.smooth_block_mse_scales, src_img_compressed, rp);6605}66066607if (global_cfg.m_status_output)6608fmt_printf("Blurring 
image\n");66096610enc_state.src_img_filtered1.resize(width, height);6611image_resample(enc_state.src_img, enc_state.src_img_filtered1, "gaussian", global_cfg.m_gaussian1_strength); //1.45f);66126613enc_state.src_img_filtered2.resize(width, height);6614image_resample(enc_state.src_img, enc_state.src_img_filtered2, "gaussian", global_cfg.m_gaussian2_strength); //1.83f);66156616if (global_cfg.m_debug_images)6617{6618write_exr((global_cfg.m_debug_image_prefix + "blurred1.exr").c_str(), enc_state.src_img_filtered1, 3, 0);6619write_exr((global_cfg.m_debug_image_prefix + "blurred2.exr").c_str(), enc_state.src_img_filtered2, 3, 0);6620}66216622if (global_cfg.m_status_output)6623fmt_printf("Transforming to ITP\n");66246625enc_state.src_img_itp.resize(width, height);6626convet_rgb_image_to_itp(enc_state.src_img, enc_state.src_img_itp, global_cfg);66276628enc_state.src_img_filtered1_itp.resize(width, height);6629convet_rgb_image_to_itp(enc_state.src_img_filtered1, enc_state.src_img_filtered1_itp, global_cfg);66306631enc_state.src_img_filtered2_itp.resize(width, height);6632convet_rgb_image_to_itp(enc_state.src_img_filtered2, enc_state.src_img_filtered2_itp, global_cfg);66336634if (global_cfg.m_lambda == 0.0f)6635global_cfg.m_favor_higher_compression = false;66366637uint32_t total_strips = 0, rows_per_strip = 0;6638if (!calc_strip_size(global_cfg.m_lambda, num_blocks_y, (uint32_t)pJob_pool->get_total_threads(), global_cfg.m_force_one_strip, total_strips, rows_per_strip, global_cfg))6639{6640fmt_error_printf("compress_photo: Failed computing strip sizes\n");6641return false;6642}66436644if (global_cfg.m_debug_output)6645fmt_printf("lambda: {}, comp_level: {}, highest_comp_level: {}, extra patterns: {}\n", global_cfg.m_lambda, global_cfg.m_master_comp_level, global_cfg.m_highest_comp_level, global_cfg.m_extra_patterns_flag);66466647enc_state.coded_blocks.resize(num_blocks_x, num_blocks_y);66486649bitwise_coder coded_bits;66506651coded_bits.put_bits(0xABCD, 
16);6652coded_bits.put_bits(width, 16);6653coded_bits.put_bits(height, 16);66546655enc_state.packed_img.resize(width, height);66566657enc_state.strip_bits.resize(total_strips);66586659enc_state.final_astc_blocks.resize(num_blocks_x, num_blocks_y);66606661uastc_hdr_6x6_debug_state debug_state;66626663if (global_cfg.m_debug_images)6664debug_state.init(width, height);6665else6666debug_state.init(0, 0);66676668interval_timer tm;6669tm.start();66706671std::atomic_bool any_failed_flag;6672any_failed_flag.store(false);66736674for (uint32_t strip_index = 0; strip_index < total_strips; strip_index++)6675{6676const uint32_t strip_first_by = strip_index * rows_per_strip;66776678uint32_t strip_last_by = minimum<uint32_t>(strip_first_by + rows_per_strip - 1, num_blocks_y);6679if (strip_index == (total_strips - 1))6680strip_last_by = num_blocks_y - 1;66816682pJob_pool->add_job([&any_failed_flag, &global_cfg, &debug_state, &enc_state,6683strip_index, total_strips, strip_first_by, strip_last_by,6684num_blocks_x, num_blocks_y, total_blocks, width, height]6685{6686if (!any_failed_flag)6687{6688bool status = compress_strip_task(6689strip_index, total_strips, strip_first_by, strip_last_by,6690num_blocks_x, num_blocks_y, total_blocks, width, height,6691global_cfg, debug_state, enc_state);66926693if (!status)6694{6695fmt_error_printf("compress_photo: compress_strip_task() failed\n");6696any_failed_flag.store(true, std::memory_order_relaxed);6697}6698}6699} );67006701if (any_failed_flag)6702break;67036704} // strip_index67056706pJob_pool->wait_for_all();67076708if (any_failed_flag)6709{6710fmt_error_printf("One or more strips failed during compression\n");6711return false;6712}67136714if (global_cfg.m_debug_output)6715fmt_printf("Encoding time: {} secs\n", tm.get_elapsed_secs());67166717if (global_cfg.m_debug_output)6718debug_state.print(total_blocks);67196720if (global_cfg.m_debug_images)6721{6722save_png(global_cfg.m_debug_image_prefix + "part_vis.png", 
debug_state.m_part_vis);6723save_png(global_cfg.m_debug_image_prefix + "grid_vis.png", debug_state.m_grid_vis);6724save_png(global_cfg.m_debug_image_prefix + "mode_vis.png", debug_state.m_mode_vis);6725save_png(global_cfg.m_debug_image_prefix + "mode_vis2.png", debug_state.m_mode_vis2);6726save_png(global_cfg.m_debug_image_prefix + "enc_vis.png", debug_state.m_enc_vis);6727write_exr((global_cfg.m_debug_image_prefix + "stat_vis.exr").c_str(), debug_state.m_stat_vis, 3, 0);6728}67296730for (uint32_t i = 0; i < total_strips; i++)6731coded_bits.append(enc_state.strip_bits[i]);67326733coded_bits.put_bits(0xA742, 16);67346735coded_bits.flush();67366737if (global_cfg.m_output_images)6738{6739write_exr((global_cfg.m_output_image_prefix + "comp.exr").c_str(), enc_state.packed_img, 3, 0);6740}67416742if (global_cfg.m_debug_output)6743fmt_printf("\nTotal intermediate output bits/pixel: {3.4}\n", (float)coded_bits.get_total_bits() / (float)(width * height));67446745vector2D<astc_helpers::astc_block> decoded_blocks1;6746vector2D<astc_helpers::astc_block> decoded_blocks2;67476748if (global_cfg.m_debug_output)6749fmt_printf("decode_file\n");67506751uint32_t unpacked_width = 0, unpacked_height = 0;6752bool status = decode_file(coded_bits.get_bytes(), decoded_blocks1, unpacked_width, unpacked_height);6753if (!status)6754{6755fmt_error_printf("decode_file() failed\n");6756return false;6757}67586759if (global_cfg.m_debug_output)6760fmt_printf("decode_6x6_hdr\n");67616762status = decode_6x6_hdr(coded_bits.get_bytes().get_ptr(), coded_bits.get_bytes().size_in_bytes_u32(), decoded_blocks2, unpacked_width, unpacked_height);6763if (!status)6764{6765fmt_error_printf("decode_6x6_hdr_file() failed\n");6766return false;6767}67686769if ((enc_state.final_astc_blocks.get_width() != decoded_blocks1.get_width()) ||6770(enc_state.final_astc_blocks.get_height() != decoded_blocks1.get_height()))6771{6772fmt_error_printf("Decode size mismatch with decode_file\n");6773return false;6774}67756776if 
((enc_state.final_astc_blocks.get_width() != decoded_blocks2.get_width()) ||6777(enc_state.final_astc_blocks.get_height() != decoded_blocks2.get_height()))6778{6779fmt_error_printf("Decode size mismatch with decode_6x6_hdr_file\n");6780return false;6781}67826783if (memcmp(decoded_blocks1.get_ptr(), enc_state.final_astc_blocks.get_ptr(), decoded_blocks1.size_in_bytes()) != 0)6784{6785fmt_error_printf("Decoded ASTC blocks verification failed\n");6786return false;6787}67886789if (memcmp(decoded_blocks2.get_ptr(), enc_state.final_astc_blocks.get_ptr(), decoded_blocks2.size_in_bytes()) != 0)6790{6791fmt_error_printf("Decoded ASTC blocks verification failed\n");6792return false;6793}67946795if (global_cfg.m_debug_output)6796basisu::fmt_printf("Decoded ASTC verification checks succeeded\n");67976798if (global_cfg.m_output_images)6799{6800if (write_astc_file((global_cfg.m_output_image_prefix + "decoded.astc").c_str(), decoded_blocks1.get_ptr(), BLOCK_W, BLOCK_H, width, height))6801{6802basisu::platform_sleep(20);68036804uint8_vec astc_file_data;6805if (read_file_to_vec((global_cfg.m_output_image_prefix + "decoded.astc").c_str(), astc_file_data))6806{6807if (astc_file_data.size() > 16)6808{6809astc_file_data.erase(0, 16);68106811size_t comp_size = 0;6812void* pComp_data = tdefl_compress_mem_to_heap(&astc_file_data[0], astc_file_data.size(), &comp_size, TDEFL_MAX_PROBES_MASK);6813mz_free(pComp_data);68146815if (global_cfg.m_debug_output)6816{6817fmt_printf(".ASTC file size (less header): {}, bits/pixel: {}, Deflate bits/pixel: {}\n",6818(uint64_t)astc_file_data.size(),6819(float)astc_file_data.size() * 8.0f / (float)(width * height),6820(float)comp_size * 8.0f / (float)(width * height));6821}6822}6823}6824}6825}68266827// Must decode all the blocks (even padded rows/cols) to match what the transcoder does.6828imagef unpacked_astc_img(num_blocks_x * 6, num_blocks_y * 6);6829imagef unpacked_astc_google_img(num_blocks_x * 6, num_blocks_y * 6);68306831for (uint32_t y = 0; y < 
decoded_blocks1.get_height(); y++)6832{6833for (uint32_t x = 0; x < decoded_blocks1.get_width(); x++)6834{6835const auto& phys_blk = decoded_blocks1(x, y);68366837vec4F pixels[MAX_BLOCK_W * MAX_BLOCK_H];6838status = unpack_physical_astc_block(&phys_blk, BLOCK_W, BLOCK_H, pixels);6839if (!status)6840{6841fmt_error_printf("unpack_physical_astc_block() failed\n");6842return false;6843}68446845unpacked_astc_img.set_block_clipped(pixels, x * BLOCK_W, y * BLOCK_H, BLOCK_W, BLOCK_H);68466847vec4F pixels_google[MAX_BLOCK_W * MAX_BLOCK_H];6848status = unpack_physical_astc_block_google(&phys_blk, BLOCK_W, BLOCK_H, pixels_google);6849if (!status)6850{6851fmt_error_printf("unpack_physical_astc_block_google() failed\n");6852return false;6853}68546855unpacked_astc_google_img.set_block_clipped(pixels_google, x * BLOCK_W, y * BLOCK_H, BLOCK_W, BLOCK_H);68566857for (uint32_t i = 0; i < 36; i++)6858{6859if (pixels[i] != pixels_google[i])6860{6861fmt_error_printf("pixel unpack mismatch\n");6862return false;6863}6864}6865}6866}68676868if (global_cfg.m_debug_output)6869fmt_printf("\nUnpack succeeded\n");68706871imagef unpacked_bc6h_img;68726873{6874vector2D<basist::bc6h_block> bc6h_blocks;68756876fast_bc6h_params enc_params;68776878bool pack_status = pack_bc6h_image(unpacked_astc_img, bc6h_blocks, &unpacked_bc6h_img, enc_params);6879if (!pack_status)6880{6881fmt_error_printf("pack_bc6h_image() failed!");6882return false;6883}68846885unpacked_bc6h_img.crop(width, height);68866887if (global_cfg.m_output_images)6888{6889write_exr((global_cfg.m_output_image_prefix + "unpacked_bc6h.exr").c_str(), unpacked_bc6h_img, 3, 0);6890}6891}68926893unpacked_astc_img.crop(width, height);6894unpacked_astc_google_img.crop(width, height);68956896if (global_cfg.m_output_images)6897{6898write_exr((global_cfg.m_output_image_prefix + "unpacked_astc.exr").c_str(), unpacked_astc_img, 3, 0);6899write_exr((global_cfg.m_output_image_prefix + "unpacked_google_astc.exr").c_str(), unpacked_astc_google_img, 3, 
0);6900}69016902// ASTC metrics6903if (global_cfg.m_image_stats)6904{6905image_metrics im;69066907if (global_cfg.m_debug_output)6908printf("\nASTC log2 float error metrics:\n");69096910for (uint32_t i = 0; i < 3; i++)6911{6912im.calc(enc_state.src_img, unpacked_astc_img, i, 1, true, true);69136914if (global_cfg.m_debug_output)6915{6916printf("%c: ", "RGBA"[i]);6917im.print_hp();6918}6919}69206921metrics.m_im_astc_log2.calc(enc_state.src_img, unpacked_astc_img, 0, 3, true, true);69226923if (global_cfg.m_debug_output)6924{6925printf("RGB: ");6926metrics.m_im_astc_log2.print_hp();69276928printf("\n");6929}6930}69316932if (global_cfg.m_image_stats)6933{6934image_metrics im;69356936if (global_cfg.m_debug_output)6937printf("ASTC half float space error metrics (a piecewise linear approximation of log2 error):\n");69386939for (uint32_t i = 0; i < 3; i++)6940{6941im.calc_half(enc_state.src_img, unpacked_astc_img, i, 1, true);69426943if (global_cfg.m_debug_output)6944{6945printf("%c: ", "RGBA"[i]);6946im.print_hp();6947}6948}69496950metrics.m_im_astc_half.calc_half(enc_state.src_img, unpacked_astc_img, 0, 3, true);69516952if (global_cfg.m_debug_output)6953{6954printf("RGB: ");6955metrics.m_im_astc_half.print_hp();6956}6957}69586959// BC6H metrics6960if (global_cfg.m_image_stats)6961{6962image_metrics im;69636964if (global_cfg.m_debug_output)6965printf("\nBC6H log2 float error metrics:\n");69666967for (uint32_t i = 0; i < 3; i++)6968{6969im.calc(enc_state.src_img, unpacked_bc6h_img, i, 1, true, true);69706971if (global_cfg.m_debug_output)6972{6973printf("%c: ", "RGBA"[i]);6974im.print_hp();6975}6976}69776978metrics.m_im_bc6h_log2.calc(enc_state.src_img, unpacked_bc6h_img, 0, 3, true, true);69796980if (global_cfg.m_debug_output)6981{6982printf("RGB: ");6983metrics.m_im_bc6h_log2.print_hp();69846985printf("\n");6986}6987}69886989if (global_cfg.m_image_stats)6990{6991image_metrics im;69926993if (global_cfg.m_debug_output)6994printf("BC6H half float space error metrics (a 
piecewise linear approximation of log2 error):\n");69956996for (uint32_t i = 0; i < 3; i++)6997{6998im.calc_half(enc_state.src_img, unpacked_bc6h_img, i, 1, true);69997000if (global_cfg.m_debug_output)7001{7002printf("%c: ", "RGBA"[i]);7003im.print_hp();7004}7005}70067007metrics.m_im_bc6h_half.calc_half(enc_state.src_img, unpacked_bc6h_img, 0, 3, true);70087009if (global_cfg.m_debug_output)7010{7011printf("RGB: ");7012metrics.m_im_bc6h_half.print_hp();70137014printf("\n");7015}7016}70177018intermediate_tex_data.swap(coded_bits.get_bytes());70197020astc_tex_data.resize(decoded_blocks1.size_in_bytes());7021memcpy(astc_tex_data.data(), decoded_blocks1.get_ptr(), decoded_blocks1.size_in_bytes());70227023return true;7024}70257026} // namespace astc_6x6_hdr702770287029