Path: blob/master/thirdparty/basis_universal/encoder/basisu_astc_hdr_6x6_enc.cpp
9904 views
// File: basisu_astc_hdr_6x6_enc.cpp1#include "basisu_astc_hdr_6x6_enc.h"2#include "basisu_enc.h"3#include "basisu_astc_hdr_common.h"4#include "basisu_math.h"5#include "basisu_resampler.h"6#include "basisu_resampler_filters.h"78#define MINIZ_HEADER_FILE_ONLY9#define MINIZ_NO_ZLIB_COMPATIBLE_NAMES10#include "basisu_miniz.h"1112#include "3rdparty/android_astc_decomp.h"1314#include <array>1516using namespace basisu;17using namespace buminiz;18using namespace basist::astc_6x6_hdr;1920namespace astc_6x6_hdr21{2223static void atomic_max(std::atomic<uint32_t>& atomic_var, uint32_t new_value)24{25uint32_t current = atomic_var.load(std::memory_order_relaxed);26for ( ; ; )27{28uint32_t new_max = std::max(current, new_value);29if (atomic_var.compare_exchange_weak(current, new_max, std::memory_order_relaxed, std::memory_order_relaxed))30break;31}32}3334void astc_hdr_6x6_global_config::set_user_level(int level)35{36level = basisu::clamp<int>(level, 0, ASTC_HDR_6X6_MAX_USER_COMP_LEVEL);3738m_master_comp_level = 0;39m_highest_comp_level = 0;40m_num_reuse_xy_deltas = NUM_REUSE_XY_DELTAS;41m_extra_patterns_flag = false;42m_brute_force_partition_matching = false;4344switch (level)45{46case 0:47{48// Both reduce compression a lot when lambda>049m_favor_higher_compression = false;50m_num_reuse_xy_deltas = NUM_REUSE_XY_DELTAS / 2;51break;52}53case 1:54{55m_master_comp_level = 0;56m_highest_comp_level = 0;57break;58}59case 2:60{61m_master_comp_level = 0;62m_highest_comp_level = 1;63break;64}65case 3:66{67m_master_comp_level = 1;68m_highest_comp_level = 1;69break;70}71case 4:72{73m_master_comp_level = 1;74m_highest_comp_level = 2;75break;76}77case 5:78{79m_master_comp_level = 1;80m_highest_comp_level = 3;81break;82}83case 6:84{85m_master_comp_level = 1;86m_highest_comp_level = 4;87break;88}89case 7:90{91m_master_comp_level = 2;92m_highest_comp_level = 2;93break;94}95case 8:96{97m_master_comp_level = 2;98m_highest_comp_level = 3;99break;100}101case 9:102{103m_master_comp_level = 2;104m_highest_comp_level = 4;105break;106}107case 10:108{109m_master_comp_level = 3;110m_highest_comp_level = 3;111break;112}113case 11:114{115m_master_comp_level = 3;116m_highest_comp_level = 4;117break;118}119case 12:120default:121{122m_master_comp_level = 4;123m_highest_comp_level = 4;124m_extra_patterns_flag = true;125m_brute_force_partition_matching = true;126break;127}128}129}130131const float m1 = 0.1593017578125f; // (2610 / 2^14) * (1/100)132const float m2 = 78.84375f; // (2523 / 32) * (1/100)133const float c1 = 0.8359375f; // 3424 / (2^12)134const float c2 = 18.8515625f; // (2413 / 128)135const float c3 = 18.6875f; // (2392 / 128)136137static float forwardPQ(float Y)138{139// 10,000 here is an absolute scale - it's in nits (cd per square meter)140float L = Y * (1.0f / 10000.0f);141142float num = powf(L, m1);143float N = powf((c1 + c2 * num) / (1 + c3 * num), m2);144145return N;146}147148#if 0149static float inversePQ(float E)150{151float N = powf(E, 1.0f / m2);152153float num = basisu::maximum<float>((N - c1), 0.0f) / (c2 - c3 * N);154float L = powf(num, 1.0f / m1);155156return L * 10000.0f;157}158#endif159160// PQ function approximation: convert input to bfloat16, look up in tables, bilinear interpolation between table entries.161// max_er: 0.000023007392883, max_rel_er: 0.000023472490284, avg_er: 0.000004330495689, 6-7x faster on x86162// Highest error is for values less than SMALLEST_PQ_VAL_IN.163//164// Approximation is round trip lossless for 10-12 bits at [0,10000] nits:165// for x [0,1024] (SCALE=1023) or for x [0,4095] (SCALE=4096):166// round(forwardPQTab(inversePQ(x / SCALE)) * SCALE) == x167//168// bfloat16 has enough precision to handle 8-bit sRGB to linear conversions:169// round(linear_to_srgb(bfloat16_to_float(float_to_bfloat16(srgb_to_linear(isRGB/255.0f))))*255.0) is lossless170171const int PQ_APPROX_MIN_EXP = -16, PQ_APPROX_MAX_EXP = 16;172const int PQ_APPROX_EXP_RANGE = (PQ_APPROX_MAX_EXP - PQ_APPROX_MIN_EXP + 1);173174const float SMALLEST_PQ_VAL_IN = 0.000015258829080f;175const float SMALLEST_PQ_VAL = 0.000551903737f; // forwardPQ(SMALLEST_PQ_VAL_IN)176177const float LARGEST_PQ_VAL = 1.251312f;178179float g_pq_approx_tabs[PQ_APPROX_EXP_RANGE][128];180181static void init_pq_tables()182{183for (int exp = PQ_APPROX_MIN_EXP; exp <= PQ_APPROX_MAX_EXP; exp++)184{185for (int mant = 0; mant < 128; mant++)186{187bfloat16 b = bfloat16_init(1, exp, mant);188float bf = bfloat16_to_float(b);189190float pq = forwardPQ(bf);191192g_pq_approx_tabs[exp - PQ_APPROX_MIN_EXP][mant] = pq;193}194}195196//fmt_printf("{.15} {.15}\n", g_pq_approx_tabs[0][0], inversePQ(g_pq_approx_tabs[0][0]));197//fmt_printf("{.15}\n", forwardPQ(SMALLEST_PQ_VAL_IN));198}199200static inline float forwardPQTab(float v)201{202assert(g_pq_approx_tabs[0][0]);203204assert(v >= 0.0f);205if (v == 0.0f)206return 0.0f;207208bfloat16 bf = float_to_bfloat16(v, false);209assert(v >= bfloat16_to_float(bf));210211int exp = bfloat16_get_exp(bf);212213if (exp < PQ_APPROX_MIN_EXP)214{215// not accurate but should be good enough for our uses216return lerp(0.0f, SMALLEST_PQ_VAL, minimum(1.0f, v / SMALLEST_PQ_VAL_IN));217}218else if (exp > PQ_APPROX_MAX_EXP)219return LARGEST_PQ_VAL;220221int mant = bfloat16_get_mantissa(bf);222223float a = g_pq_approx_tabs[exp - PQ_APPROX_MIN_EXP][mant];224float bf_f32 = bfloat16_to_float(bf);225226int next_mant = mant + 1;227int next_exp = exp;228if (next_mant == 128)229{230next_mant = 0;231next_exp++;232if (next_exp > PQ_APPROX_MAX_EXP)233return a;234}235236float b = g_pq_approx_tabs[next_exp - PQ_APPROX_MIN_EXP][next_mant];237238bfloat16 next_bf = bfloat16_init(1, next_exp, next_mant);239float next_bf_f32 = bfloat16_to_float(next_bf);240assert(v <= next_bf_f32);241242float lerp_factor = (v - bf_f32) / (next_bf_f32 - bf_f32);243assert((lerp_factor >= 0) && (lerp_factor <= 1.0f));244245return lerp(a, b, lerp_factor);246}247248// 100 nits = ~.5 i249// This converts absolute linear RGB light in either REC 709 or REC2020/BT2100 color gamut to ICtCp, a coding space where Ct is scaled by 2.250// To convert to perceptual ITP for error/distance calculations, multiply the result Ct by .5 (or set itp_flag to true).251// Assumes REC 709 input, or REC 2020/BT.2100 RGB input if rec2020_bt2100_color_gamut is true.252//253// ITP info:254// https://www.portrait.com/resource-center/ictcp-color-difference-metric/255// https://professional.dolby.com/siteassets/pdfs/measuringperceptualcolorvolume_v07.253.pdf (see scale to JND's)256// This also converts from a ICtCp coding space to threshold or perceptually uniform space ITP.257//258// Linear REC709 to REC2020/BT.2100 gamut conversion:259// rgb_2100[0] = rgb_in[0] * 0.6274f + rgb_in[1] * 0.3293f + rgb_in[2] * 0.0433f;260// rgb_2100[1] = rgb_in[0] * 0.0691f + rgb_in[1] * 0.9195f + rgb_in[2] * 0.0114f;261// rgb_2100[2] = rgb_in[0] * 0.0164f + rgb_in[1] * 0.0880f + rgb_in[2] * 0.8956f;262// const float S = 1.0f / 4096.0f;263// l = (1688.0f * S) * rgb_2100[0] + (2146.0f * S) * rgb_2100[1] + (262.0f * S) * rgb_2100[2];264// m = (683.0f * S) * rgb_2100[0] + (2951.0f * S) * rgb_2100[1] + (462.0f * S) * rgb_2100[2];265// s = (99.0f * S) * rgb_2100[0] + (309.0f * S) * rgb_2100[1] + (3688.0f * S) * rgb_2100[2];266static void linear_rgb_to_ictcp(const vec3F& rgb_in, vec3F& ictcp, bool itp_flag = false, bool rec2020_bt2100_color_gamut = false)267{268vec3F rgb_2100(rgb_in);269270float l, m, s;271if (!rec2020_bt2100_color_gamut)272{273// Assume REC 709 input color gamut274// (REC2020_to_LMS * REC709_to_2020) * input_color275l = rgb_2100[0] * 0.2958097f + rgb_2100[1] * 0.6230863f + rgb_2100[2] * 0.0811040f;276m = rgb_2100[0] * 0.1562512f + rgb_2100[1] * 0.7272980f + rgb_2100[2] * 0.1164508f;277s = rgb_2100[0] * 0.0351435f + rgb_2100[1] * 0.1565601f + rgb_2100[2] * 0.8082964f;278}279else280{281// Assumes REC2020/BT.2100 input color gamut (this is from the spec)282l = 0.412109375f * rgb_2100[0] + 0.52392578125f * rgb_2100[1] + 0.06396484375f * rgb_2100[2];283m = 0.166748046875f * rgb_2100[0] + 0.720458984375f * rgb_2100[1] + 0.11279296875f * rgb_2100[2];284s = 0.024169921875f * rgb_2100[0] + 0.075439453125f * rgb_2100[1] + 0.900390625f * rgb_2100[2];285}286287float ld = forwardPQTab(l);288float md = forwardPQTab(m);289float sd = forwardPQTab(s);290291ictcp[0] = .5f * ld + .5f * md;292293// if ITP scale Ct by .5 (the ICtCp spec scaled Ct to better exploit the full scaled output, which is not perceptually linear)294if (itp_flag)295ictcp[1] = 0.806884765625f * ld + -1.6617431640625f * md + 0.8548583984375f * sd;296else297ictcp[1] = 1.61376953125f * ld + -3.323486328125f * md + 1.709716796875f * sd;298299ictcp[2] = 4.378173828125f * ld + -4.24560546875f * md + -0.132568359375f * sd;300}301302static inline void linear_rgb_to_itp(const vec3F& rgb_in, vec3F& itp, const astc_hdr_6x6_global_config &cfg)303{304linear_rgb_to_ictcp(rgb_in, itp, true, cfg.m_rec2020_bt2100_color_gamut);305}306307#if 0308// Outputs rec2020/bt2100 color gamut (i.e. this doesn't convert back to REC709 gamut).309static void ictcp_to_linear_rgb(const vec3F& ictcp, vec3F& rgb, bool itp_flag = false)310{311float ct = ictcp[1];312313if (itp_flag)314ct *= 2.0f;315316float ld = ictcp[0] + ct * 0.008609037037932726f + ictcp[2] * 0.11102962500302596f;317float md = ictcp[0] + ct * -0.008609037037932726f + ictcp[2] * -0.11102962500302596f;318float sd = ictcp[0] + ct * 0.5600313357106792f + ictcp[2] * -0.32062717498731885f;319320float l = inversePQ(ld);321float m = inversePQ(md);322float s = inversePQ(sd);323324rgb[0] = l * 3.436606694333079f + m * -2.5064521186562705f + s * 0.06984542432319149f;325rgb[1] = l * -0.7913295555989289f + m * 1.983600451792291f + s * -0.192270896193362f;326rgb[2] = l * -0.025949899690592672f + m * -0.09891371471172646f + s * 1.1248636144023192f;327}328#endif329330struct half_vec3331{332basist::half_float m_vals[3];333334inline half_vec3() { }335336inline half_vec3(basist::half_float x, basist::half_float y, basist::half_float z)337{338m_vals[0] = x;339m_vals[1] = y;340m_vals[2] = z;341}342343inline half_vec3(const half_vec3& other)344{345*this = other;346}347348inline half_vec3& operator= (const half_vec3& rhs)349{350m_vals[0] = rhs.m_vals[0];351m_vals[1] = rhs.m_vals[1];352m_vals[2] = rhs.m_vals[2];353return *this;354}355356inline void clear()357{358clear_obj(m_vals);359}360361inline half_vec3 &set(basist::half_float x, basist::half_float y, basist::half_float z)362{363m_vals[0] = x;364m_vals[1] = y;365m_vals[2] = z;366return *this;367}368369inline half_vec3& set(float x, float y, float z)370{371m_vals[0] = basist::float_to_half(x);372m_vals[1] = basist::float_to_half(y);373m_vals[2] = basist::float_to_half(z);374return *this;375}376377template<typename T>378inline half_vec3& set_vec(const T& vec)379{380m_vals[0] = basist::float_to_half(vec[0]);381m_vals[1] = basist::float_to_half(vec[1]);382m_vals[2] = basist::float_to_half(vec[2]);383return *this;384}385386template<typename T>387inline T get_vec() const388{389return T(basist::half_to_float(m_vals[0]), basist::half_to_float(m_vals[1]), basist::half_to_float(m_vals[2]));390}391392inline basist::half_float operator[] (uint32_t c) const { assert(c < 3); return m_vals[c]; }393inline basist::half_float& operator[] (uint32_t c) { assert(c < 3); return m_vals[c]; }394395float get_float_comp(uint32_t c) const396{397assert(c < 3);398return basist::half_to_float(m_vals[c]);399}400401half_vec3& set_float_comp(uint32_t c, float v)402{403assert(c < 3);404m_vals[c] = basist::float_to_half(v);405return *this;406}407};408409struct half_vec4410{411basist::half_float m_vals[4];412413inline half_vec4() { }414415inline half_vec4(basist::half_float x, basist::half_float y, basist::half_float z, basist::half_float w)416{417m_vals[0] = x;418m_vals[1] = y;419m_vals[2] = z;420m_vals[3] = w;421}422423inline half_vec4(const half_vec4& other)424{425*this = other;426}427428inline half_vec4& operator= (const half_vec4& rhs)429{430m_vals[0] = rhs.m_vals[0];431m_vals[1] = rhs.m_vals[1];432m_vals[2] = rhs.m_vals[2];433m_vals[3] = rhs.m_vals[3];434return *this;435}436437inline void clear()438{439clear_obj(m_vals);440}441442inline half_vec4& set(basist::half_float x, basist::half_float y, basist::half_float z, basist::half_float w)443{444m_vals[0] = x;445m_vals[1] = y;446m_vals[2] = z;447m_vals[3] = w;448return *this;449}450451inline half_vec4& set(float x, float y, float z, float w)452{453m_vals[0] = basist::float_to_half(x);454m_vals[1] = basist::float_to_half(y);455m_vals[2] = basist::float_to_half(z);456m_vals[3] = basist::float_to_half(w);457return *this;458}459460template<typename T>461inline half_vec4& set_vec(const T& vec)462{463m_vals[0] = basist::float_to_half(vec[0]);464m_vals[1] = basist::float_to_half(vec[1]);465m_vals[2] = basist::float_to_half(vec[2]);466m_vals[3] = basist::float_to_half(vec[3]);467return *this;468}469470template<typename T>471inline T get_vec() const472{473return T(basist::half_to_float(m_vals[0]), basist::half_to_float(m_vals[1]), basist::half_to_float(m_vals[2]), basist::half_to_float(m_vals[3]));474}475476inline basist::half_float operator[] (uint32_t c) const { assert(c < 4); return m_vals[c]; }477inline basist::half_float &operator[] (uint32_t c) { assert(c < 4); return m_vals[c]; }478479float get_float_comp(uint32_t c) const480{481assert(c < 4);482return basist::half_to_float(m_vals[c]);483}484485half_vec4& set_float_comp(uint32_t c, float v)486{487assert(c < 4);488m_vals[c] = basist::float_to_half(v);489return *this;490}491};492493const uint32_t MAX_BLOCK_W = 6, MAX_BLOCK_H = 6;494495struct trial_result496{497astc_helpers::log_astc_block m_log_blk;498double m_err;499bool m_valid;500};501502//----------------------------------------------------------503504const uint32_t NUM_PART3_MAPPINGS = 6;505static uint8_t g_part3_mapping[NUM_PART3_MAPPINGS][3] =506{507{ 0, 1, 2 },508{ 1, 2, 0 },509{ 2, 0, 1 },510{ 0, 2, 1 },511{ 1, 0, 2 },512{ 2, 1, 0 }513};514515struct partition_pattern_vec516{517uint8_t m_parts[6 * 6];518519partition_pattern_vec()520{521clear();522}523524partition_pattern_vec(const partition_pattern_vec& other)525{526*this = other;527}528529void clear()530{531memset(m_parts, 0, sizeof(m_parts));532}533534partition_pattern_vec& operator= (const partition_pattern_vec& rhs)535{536if (this == &rhs)537return *this;538memcpy(m_parts, rhs.m_parts, 36);539return *this;540}541542uint8_t operator[] (uint32_t i) const { assert(i < 36); return m_parts[i]; }543uint8_t& operator[] (uint32_t i) { assert(i < 36); return m_parts[i]; }544545uint8_t operator() (uint32_t x, uint32_t y) const { assert((x < 6) && (y < 6)); return m_parts[x + y * 6]; }546uint8_t& operator() (uint32_t x, uint32_t y) { assert((x < 6) && (y < 6)); return m_parts[x + y * 6]; }547548int get_squared_distance(const partition_pattern_vec& other) const549{550int total_dist = 0;551for (uint32_t i = 0; i < 36; i++)552total_dist += iabs((int)m_parts[i] - (int)other.m_parts[i]);553return total_dist;554}555556float get_distance(const partition_pattern_vec& other) const557{558return sqrtf((float)get_squared_distance(other));559}560561partition_pattern_vec get_permuted2(uint32_t permute_index) const562{563assert(permute_index <= 1);564565partition_pattern_vec res;566for (uint32_t i = 0; i < 36; i++)567{568assert(m_parts[i] <= 1);569res.m_parts[i] = (uint8_t)(m_parts[i] ^ permute_index);570}571572return res;573}574575partition_pattern_vec get_permuted3(uint32_t permute_index) const576{577assert(permute_index <= 5);578579partition_pattern_vec res;580for (uint32_t i = 0; i < 36; i++)581{582assert(m_parts[i] <= 2);583res.m_parts[i] = g_part3_mapping[permute_index][m_parts[i]];584}585586return res;587}588589partition_pattern_vec get_canonicalized() const590{591partition_pattern_vec res;592593int new_labels[3] = { -1, -1, -1 };594uint32_t next_index = 0;595for (uint32_t i = 0; i < 36; i++)596{597uint32_t p = m_parts[i];598if (new_labels[p] == -1)599new_labels[p] = next_index++;600601res.m_parts[i] = (uint8_t)new_labels[p];602}603604return res;605}606607bool operator== (const partition_pattern_vec& rhs) const608{609return memcmp(m_parts, rhs.m_parts, sizeof(m_parts)) == 0;610}611612operator size_t() const613{614return basisu::hash_hsieh(m_parts, sizeof(m_parts));615}616};617618struct vp_tree_node619{620partition_pattern_vec m_vantage_point;621uint32_t m_point_index;622float m_dist;623624int m_inner_node, m_outer_node;625};626627#define BRUTE_FORCE_PART_SEARCH (0)628629class vp_tree630{631public:632vp_tree()633{634}635636void clear()637{638m_nodes.clear();639}640641// This requires no redundant patterns, i.e. all must be unique.642bool init(uint32_t n, const partition_pattern_vec* pUnique_pats)643{644clear();645646uint_vec pat_indices(n);647for (uint32_t i = 0; i < n; i++)648pat_indices[i] = i;649650std::pair<int, float> root_idx = find_best_vantage_point(n, pUnique_pats, pat_indices);651652if (root_idx.first == -1)653return false;654655m_nodes.resize(1);656m_nodes[0].m_vantage_point = pUnique_pats[root_idx.first];657m_nodes[0].m_point_index = root_idx.first;658m_nodes[0].m_dist = root_idx.second;659m_nodes[0].m_inner_node = -1;660m_nodes[0].m_outer_node = -1;661662uint_vec inner_list, outer_list;663664inner_list.reserve(n / 2);665outer_list.reserve(n / 2);666667for (uint32_t pat_index = 0; pat_index < n; pat_index++)668{669if ((int)pat_index == root_idx.first)670continue;671672const float dist = m_nodes[0].m_vantage_point.get_distance(pUnique_pats[pat_index]);673674if (dist <= root_idx.second)675inner_list.push_back(pat_index);676else677outer_list.push_back(pat_index);678}679680if (inner_list.size())681{682m_nodes[0].m_inner_node = create_node(n, pUnique_pats, inner_list);683if (m_nodes[0].m_inner_node < 0)684return false;685}686687if (outer_list.size())688{689m_nodes[0].m_outer_node = create_node(n, pUnique_pats, outer_list);690if (m_nodes[0].m_outer_node < 0)691return false;692}693694return true;695}696697struct result698{699uint32_t m_pat_index;700uint32_t m_mapping_index;701float m_dist;702703bool operator< (const result& rhs) const { return m_dist < rhs.m_dist; }704bool operator> (const result& rhs) const { return m_dist > rhs.m_dist; }705};706707class result_queue708{709enum { MaxSupportedSize = 256 + 1 };710711public:712result_queue() :713m_cur_size(0)714{715}716717size_t get_size() const718{719return m_cur_size;720}721722bool empty() const723{724return !m_cur_size;725}726727typedef std::array<result, MaxSupportedSize + 1> result_array_type;728729const result_array_type& get_elements() const { return m_elements; }730result_array_type& get_elements() { return m_elements; }731732void clear()733{734m_cur_size = 0;735}736737void reserve(uint32_t n)738{739BASISU_NOTE_UNUSED(n);740}741742const result& top() const743{744assert(m_cur_size);745return m_elements[1];746}747748bool insert(const result& val, uint32_t max_size)749{750assert(max_size < MaxSupportedSize);751752if (m_cur_size >= MaxSupportedSize)753return false;754755m_elements[++m_cur_size] = val;756up_heap(m_cur_size);757758if (m_cur_size > max_size)759pop();760761return true;762}763764bool pop()765{766if (m_cur_size == 0)767return false;768769m_elements[1] = m_elements[m_cur_size--];770down_heap(1);771return true;772}773774float get_highest_dist() const775{776if (!m_cur_size)777return 0.0f;778779return top().m_dist;780}781782private:783result_array_type m_elements;784size_t m_cur_size;785786void up_heap(size_t index)787{788while ((index > 1) && (m_elements[index] > m_elements[index >> 1]))789{790std::swap(m_elements[index], m_elements[index >> 1]);791index >>= 1;792}793}794795void down_heap(size_t index)796{797for ( ; ; )798{799size_t largest = index, left_child = 2 * index, right_child = 2 * index + 1;800801if ((left_child <= m_cur_size) && (m_elements[left_child] > m_elements[largest]))802largest = left_child;803804if ((right_child <= m_cur_size) && (m_elements[right_child] > m_elements[largest]))805largest = right_child;806807if (largest == index)808break;809810std::swap(m_elements[index], m_elements[largest]);811index = largest;812}813}814};815816void find_nearest(uint32_t num_subsets, const partition_pattern_vec& desired_pat, result_queue& results, uint32_t max_results)817{818assert((num_subsets >= 2) && (num_subsets <= 3));819820results.clear();821822if (!m_nodes.size())823return;824825uint32_t num_desired_pats;826partition_pattern_vec desired_pats[NUM_PART3_MAPPINGS];827828if (num_subsets == 2)829{830num_desired_pats = 2;831for (uint32_t i = 0; i < 2; i++)832desired_pats[i] = desired_pat.get_permuted2(i);833}834else835{836num_desired_pats = NUM_PART3_MAPPINGS;837for (uint32_t i = 0; i < NUM_PART3_MAPPINGS; i++)838desired_pats[i] = desired_pat.get_permuted3(i);839}840841#if 0842find_nearest_at_node(0, num_desired_pats, desired_pats, results, max_results);843#else844find_nearest_at_node_non_recursive(0, num_desired_pats, desired_pats, results, max_results);845#endif846}847848private:849basisu::vector<vp_tree_node> m_nodes;850851void find_nearest_at_node(int node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results)852{853float best_dist_to_vantage = BIG_FLOAT_VAL;854uint32_t best_mapping = 0;855for (uint32_t i = 0; i < num_desired_pats; i++)856{857float dist = pDesired_pats[i].get_distance(m_nodes[node_index].m_vantage_point);858if (dist < best_dist_to_vantage)859{860best_dist_to_vantage = dist;861best_mapping = i;862}863}864865result r;866r.m_dist = best_dist_to_vantage;867r.m_mapping_index = best_mapping;868r.m_pat_index = m_nodes[node_index].m_point_index;869870results.insert(r, max_results);871872if (best_dist_to_vantage <= m_nodes[node_index].m_dist)873{874// inner first875if (m_nodes[node_index].m_inner_node >= 0)876find_nearest_at_node(m_nodes[node_index].m_inner_node, num_desired_pats, pDesired_pats, results, max_results);877878if (m_nodes[node_index].m_outer_node >= 0)879{880if ( (results.get_size() < max_results) ||881((m_nodes[node_index].m_dist - best_dist_to_vantage) <= results.get_highest_dist())882)883{884find_nearest_at_node(m_nodes[node_index].m_outer_node, num_desired_pats, pDesired_pats, results, max_results);885}886}887}888else889{890// outer first891if (m_nodes[node_index].m_outer_node >= 0)892find_nearest_at_node(m_nodes[node_index].m_outer_node, num_desired_pats, pDesired_pats, results, max_results);893894if (m_nodes[node_index].m_inner_node >= 0)895{896if ( (results.get_size() < max_results) ||897((best_dist_to_vantage - m_nodes[node_index].m_dist) <= results.get_highest_dist())898)899{900find_nearest_at_node(m_nodes[node_index].m_inner_node, num_desired_pats, pDesired_pats, results, max_results);901}902}903}904}905906void find_nearest_at_node_non_recursive(int init_node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results)907{908uint_vec node_stack;909node_stack.reserve(16);910node_stack.push_back(init_node_index);911912do913{914const uint32_t node_index = node_stack.back();915node_stack.pop_back();916917float best_dist_to_vantage = BIG_FLOAT_VAL;918uint32_t best_mapping = 0;919for (uint32_t i = 0; i < num_desired_pats; i++)920{921float dist = pDesired_pats[i].get_distance(m_nodes[node_index].m_vantage_point);922if (dist < best_dist_to_vantage)923{924best_dist_to_vantage = dist;925best_mapping = i;926}927}928929result r;930r.m_dist = best_dist_to_vantage;931r.m_mapping_index = best_mapping;932r.m_pat_index = m_nodes[node_index].m_point_index;933934results.insert(r, max_results);935936if (best_dist_to_vantage <= m_nodes[node_index].m_dist)937{938if (m_nodes[node_index].m_outer_node >= 0)939{940if ((results.get_size() < max_results) ||941((m_nodes[node_index].m_dist - best_dist_to_vantage) <= results.get_highest_dist())942)943{944node_stack.push_back(m_nodes[node_index].m_outer_node);945}946}947948// inner first949if (m_nodes[node_index].m_inner_node >= 0)950{951node_stack.push_back(m_nodes[node_index].m_inner_node);952}953}954else955{956if (m_nodes[node_index].m_inner_node >= 0)957{958if ((results.get_size() < max_results) ||959((best_dist_to_vantage - m_nodes[node_index].m_dist) <= results.get_highest_dist())960)961{962node_stack.push_back(m_nodes[node_index].m_inner_node);963}964}965966// outer first967if (m_nodes[node_index].m_outer_node >= 0)968{969node_stack.push_back(m_nodes[node_index].m_outer_node);970}971}972973} while (!node_stack.empty());974}975976// returns the index of the new node, or -1 on error977int create_node(uint32_t n, const partition_pattern_vec* pUnique_pats, const uint_vec& pat_indices)978{979std::pair<int, float> root_idx = find_best_vantage_point(n, pUnique_pats, pat_indices);980981if (root_idx.first < 0)982return -1;983984m_nodes.resize(m_nodes.size() + 1);985const uint32_t new_node_index = m_nodes.size_u32() - 1;986987m_nodes[new_node_index].m_vantage_point = pUnique_pats[root_idx.first];988m_nodes[new_node_index].m_point_index = root_idx.first;989m_nodes[new_node_index].m_dist = root_idx.second;990m_nodes[new_node_index].m_inner_node = -1;991m_nodes[new_node_index].m_outer_node = -1;992993uint_vec inner_list, outer_list;994995inner_list.reserve(pat_indices.size_u32() / 2);996outer_list.reserve(pat_indices.size_u32() / 2);997998for (uint32_t pat_indices_iter = 0; pat_indices_iter < pat_indices.size(); pat_indices_iter++)999{1000const uint32_t pat_index = pat_indices[pat_indices_iter];10011002if ((int)pat_index == root_idx.first)1003continue;10041005const float dist = m_nodes[new_node_index].m_vantage_point.get_distance(pUnique_pats[pat_index]);10061007if (dist <= root_idx.second)1008inner_list.push_back(pat_index);1009else1010outer_list.push_back(pat_index);1011}10121013if (inner_list.size())1014m_nodes[new_node_index].m_inner_node = create_node(n, pUnique_pats, inner_list);10151016if (outer_list.size())1017m_nodes[new_node_index].m_outer_node = create_node(n, pUnique_pats, outer_list);10181019return new_node_index;1020}10211022// returns the pattern index of the vantage point (-1 on error), and the optimal split distance1023std::pair<int, float> find_best_vantage_point(uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, const uint_vec &pat_indices)1024{1025BASISU_NOTE_UNUSED(num_unique_pats);10261027const uint32_t n = pat_indices.size_u32();10281029assert(n);1030if (n == 1)1031return std::pair(pat_indices[0], 0.0f);10321033float best_split_metric = -1.0f;1034int best_split_pat = -1;1035float best_split_dist = 0.0f;1036float best_split_var = 0.0f;10371038basisu::vector< std::pair<float, uint32_t> > dists;1039dists.reserve(n);10401041float_vec float_dists;1042float_dists.reserve(n);10431044for (uint32_t pat_indices_iter = 0; pat_indices_iter < n; pat_indices_iter++)1045{1046const uint32_t split_pat_index = pat_indices[pat_indices_iter];1047assert(split_pat_index < num_unique_pats);10481049const partition_pattern_vec& trial_vantage = pUnique_pats[split_pat_index];10501051dists.resize(0);1052float_dists.resize(0);10531054for (uint32_t j = 0; j < n; j++)1055{1056const uint32_t pat_index = pat_indices[j];1057assert(pat_index < num_unique_pats);10581059if (pat_index == split_pat_index)1060continue;10611062float dist = trial_vantage.get_distance(pUnique_pats[pat_index]);1063dists.emplace_back(std::pair(dist, pat_index));10641065float_dists.push_back(dist);1066}10671068stats<double> s;1069s.calc(float_dists.size_u32(), float_dists.data());10701071std::sort(dists.begin(), dists.end(), [](const auto &a, const auto &b) {1072return a.first < b.first;1073});10741075const uint32_t num_dists = dists.size_u32();1076float split_dist = dists[num_dists / 2].first;1077if ((num_dists & 1) == 0)1078split_dist = (split_dist + dists[(num_dists / 2) - 1].first) * .5f;10791080uint32_t total_inner = 0, total_outer = 0;10811082for (uint32_t j = 0; j < n; j++)1083{1084const uint32_t pat_index = pat_indices[j];1085if (pat_index == split_pat_index)1086continue;10871088float dist = trial_vantage.get_distance(pUnique_pats[pat_index]);10891090if (dist <= split_dist)1091total_inner++;1092else1093total_outer++;1094}10951096float split_metric = (float)minimum(total_inner, total_outer) / (float)maximum(total_inner, total_outer);10971098if ( (split_metric > best_split_metric) ||1099((split_metric == best_split_metric) && (s.m_var > best_split_var)) )1100{1101best_split_metric = split_metric;1102best_split_dist = split_dist;1103best_split_pat = split_pat_index;1104best_split_var = (float)s.m_var;1105}1106}11071108return std::pair(best_split_pat, best_split_dist);1109}1110};11111112struct partition1113{1114uint64_t m_p;11151116inline partition() :1117m_p(0)1118{1119}11201121inline partition(uint64_t p) :1122m_p(p)1123{1124assert(p < (1ULL << 36));1125}11261127inline partition& operator=(uint64_t p)1128{1129assert(p < (1ULL << 36));1130m_p = p;1131return *this;1132}11331134inline bool operator< (const partition& p) const1135{1136return m_p < p.m_p;1137}11381139inline bool operator== (const partition& p) const1140{1141return m_p == p.m_p;1142}11431144inline operator size_t() const1145{1146return hash_hsieh((const uint8_t *)&m_p, sizeof(m_p));1147}1148};11491150partition_pattern_vec g_partitions2[NUM_UNIQUE_PARTITIONS2];1151int g_part2_seed_to_unique_index[1024];1152vp_tree g_part2_vp_tree;11531154static inline vec3F vec3F_norm_approx(vec3F axis)1155{1156float l = axis.norm();1157axis = (fabs(l) >= SMALL_FLOAT_VAL) ? (axis * bu_math::inv_sqrt(l)) : vec3F(0.577350269f);1158return axis;1159}11601161static void init_partitions2_6x6()1162{1163#if 01164// makes pattern bits to the 10-bit ASTC seed index1165typedef basisu::hash_map<uint64_t, uint32_t> partition2_hash_map;1166partition2_hash_map phash;1167phash.reserve(1024);11681169for (uint32_t i = 0; i < 1024; i++)1170{1171uint64_t p_bits = 0;1172uint64_t p_bits_inv = 0;11731174for (uint32_t y = 0; y < 6; y++)1175{1176for (uint32_t x = 0; x < 6; x++)1177{1178uint64_t p = astc_helpers::compute_texel_partition(i, x, y, 0, 2, false);1179assert(p < 2);11801181p_bits |= (p << (x + y * 6));1182p_bits_inv |= ((1 - p) << (x + y * 6));1183}1184}11851186if (!p_bits)1187continue;1188if (p_bits == ((1ULL << 36) - 1))1189continue;11901191assert(p_bits < (1ULL << 36));1192assert(p_bits_inv < (1ULL << 36));11931194if (phash.contains(p_bits))1195{1196}1197else if (phash.contains(p_bits_inv))1198{1199}1200else1201{1202auto res = phash.insert(p_bits, i);1203assert(res.second);1204BASISU_NOTE_UNUSED(res);1205}1206}12071208uint32_t num_unique_partitions2 = 0;12091210for (const auto& r : phash)1211{1212assert(r.second < 1024);12131214const uint32_t unique_index = num_unique_partitions2;1215assert(unique_index < NUM_UNIQUE_PARTITIONS2);12161217partition_pattern_vec pat_vec;1218for (uint32_t i = 0; i < 36; i++)1219pat_vec[i] = (uint8_t)((r.first >> i) & 1);12201221g_partitions2[unique_index] = pat_vec;12221223assert(g_part2_unique_index_to_seed[unique_index] == r.second);1224g_part2_seed_to_unique_index[r.second] = unique_index;12251226num_unique_partitions2++;1227}1228assert(num_unique_partitions2 == NUM_UNIQUE_PARTITIONS2);1229#else1230for (uint32_t unique_index = 0; unique_index < NUM_UNIQUE_PARTITIONS2; unique_index++)1231{1232const uint32_t seed_index = g_part2_unique_index_to_seed[unique_index];1233assert(seed_index < 1024);12341235assert(g_part2_seed_to_unique_index[seed_index] == 0);1236g_part2_seed_to_unique_index[seed_index] = unique_index;12371238partition_pattern_vec& pat_vec = g_partitions2[unique_index];12391240for (uint32_t y = 0; y < 6; y++)1241{1242for (uint32_t x = 0; x < 6; x++)1243{1244uint8_t p = (uint8_t)astc_helpers::compute_texel_partition(seed_index, x, y, 0, 2, false);1245assert(p < 2);12461247pat_vec[x + y * 6] = p;1248}1249}1250}1251#endif12521253g_part2_vp_tree.init(NUM_UNIQUE_PARTITIONS2, g_partitions2);1254}12551256static bool estimate_partition2_6x6(1257const basist::half_float pBlock_pixels_half[][3],1258int* pBest_parts, uint32_t num_best_parts)1259{1260const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = BLOCK_W * BLOCK_H;12611262vec3F training_vecs[BLOCK_T], mean(0.0f);12631264for (uint32_t i = 0; i < BLOCK_T; i++)1265{1266vec3F& v = training_vecs[i];12671268v[0] = (float)pBlock_pixels_half[i][0];1269v[1] = (float)pBlock_pixels_half[i][1];1270v[2] = (float)pBlock_pixels_half[i][2];12711272mean += v;1273}1274mean *= (1.0f / (float)BLOCK_T);12751276vec3F max_vals(-BIG_FLOAT_VAL);12771278for (uint32_t i = 0; i < BLOCK_T; i++)1279{1280vec3F& v = training_vecs[i];1281max_vals = vec3F::component_max(max_vals, v);1282}12831284// Initialize principle axis approximation1285vec3F axis(max_vals - mean);12861287// Incremental approx. PCA - only viable if we have a reasonably fast approximation for 1.0/sqrt(x).1288for (uint32_t i = 0; i < BLOCK_T; i++)1289{1290axis = vec3F_norm_approx(axis);12911292vec3F color(training_vecs[i] - mean);12931294float d = color.dot(axis);12951296axis += color * d;1297}12981299if (axis.norm() < SMALL_FLOAT_VAL)1300axis.set(0.57735027f);1301else1302axis.normalize_in_place();13031304#if BRUTE_FORCE_PART_SEARCH1305int desired_parts[BLOCK_H][BLOCK_W]; // [y][x]1306for (uint32_t i = 0; i < BLOCK_T; i++)1307{1308float proj = (training_vecs[i] - mean).dot(axis);13091310desired_parts[i / BLOCK_W][i % BLOCK_W] = proj < 0.0f;1311}1312#else1313partition_pattern_vec desired_part;13141315for (uint32_t i = 0; i < BLOCK_T; i++)1316{1317float proj = (training_vecs[i] - mean).dot(axis);13181319desired_part.m_parts[i] = proj < 0.0f;1320}1321#endif13221323//interval_timer tm;1324//tm.start();13251326#if BRUTE_FORCE_PART_SEARCH1327uint32_t part_similarity[NUM_UNIQUE_PARTITIONS2];13281329for (uint32_t part_index = 0; part_index < NUM_UNIQUE_PARTITIONS2; part_index++)1330{1331const partition_pattern_vec &pat_vec = g_partitions2[part_index];13321333int total_sim_non_inv = 0;1334int total_sim_inv = 0;13351336for (uint32_t y = 0; y < BLOCK_H; y++)1337{1338for (uint32_t x = 0; x < BLOCK_W; x++)1339{1340int part = pat_vec[x + y * 6];13411342if (part == desired_parts[y][x])1343total_sim_non_inv++;13441345if ((part ^ 1) == desired_parts[y][x])1346total_sim_inv++;1347}1348}13491350int total_sim = maximum(total_sim_non_inv, total_sim_inv);13511352part_similarity[part_index] = (total_sim << 16) | part_index;13531354} // part_index;13551356std::sort(part_similarity, part_similarity + NUM_UNIQUE_PARTITIONS2);13571358for (uint32_t i = 0; i < num_best_parts; i++)1359pBest_parts[i] = part_similarity[(NUM_UNIQUE_PARTITIONS2 - 1) - i] & 0xFFFF;1360#else1361vp_tree::result_queue results;1362results.reserve(num_best_parts);1363g_part2_vp_tree.find_nearest(2, desired_part, results, num_best_parts);13641365assert(results.get_size() == num_best_parts);13661367const auto& elements = results.get_elements();13681369for (uint32_t i = 0; i < results.get_size(); i++)1370pBest_parts[i] = elements[1 + i].m_pat_index;1371#endif13721373//fmt_printf("{} ", tm.get_elapsed_ms());13741375return true;1376}13771378const uint32_t MIN_REFINE_LEVEL = 0;13791380static bool encode_block_2_subsets(1381trial_result res[2],1382uint32_t grid_w, uint32_t grid_h,1383uint32_t cem,1384uint32_t weights_ise_range, uint32_t endpoints_ise_range,1385const half_vec3* pBlock_pixels_half, const vec4F* pBlock_pixels_q16,1386astc_hdr_codec_base_options& coptions,1387bool uber_mode_flag,1388int unique_pat_index,1389uint32_t comp_level,1390opt_mode_t mode11_opt_mode,1391bool refine_endpoints_flag)1392{1393const uint32_t num_endpoint_vals = (cem == 11) ? basist::NUM_MODE11_ENDPOINTS : basist::NUM_MODE7_ENDPOINTS;13941395res[0].m_valid = false;1396res[1].m_valid = false;13971398const uint32_t BLOCK_W = 6, BLOCK_H = 6;13991400astc_helpers::log_astc_block best_log_blk;1401clear_obj(best_log_blk);14021403best_log_blk.m_num_partitions = 2;1404best_log_blk.m_color_endpoint_modes[0] = (uint8_t)cem;1405best_log_blk.m_color_endpoint_modes[1] = (uint8_t)cem;1406best_log_blk.m_grid_width = (uint8_t)grid_w;1407best_log_blk.m_grid_height = (uint8_t)grid_h;14081409best_log_blk.m_weight_ise_range = (uint8_t)weights_ise_range;1410best_log_blk.m_endpoint_ise_range = (uint8_t)endpoints_ise_range;14111412partition_pattern_vec* pPat = &g_partitions2[unique_pat_index];1413const uint32_t p_seed = g_part2_unique_index_to_seed[unique_pat_index];14141415vec4F part_pixels_q16[2][64];1416half_vec3 part_half_pixels[2][64];1417uint8_t part_pixel_index[2][64];1418uint32_t part_total_pixels[2] = { 0 };14191420for (uint32_t y = 0; y < BLOCK_H; y++)1421{1422for (uint32_t x = 0; x < BLOCK_W; x++)1423{1424uint32_t part_index = (*pPat)[x + y * BLOCK_W];14251426uint32_t l = part_total_pixels[part_index];14271428part_pixels_q16[part_index][l] = pBlock_pixels_q16[x + y * BLOCK_W];1429part_half_pixels[part_index][l] = pBlock_pixels_half[x + y * BLOCK_W];1430part_pixel_index[part_index][l] = (uint8_t)(x + y * BLOCK_W);14311432part_total_pixels[part_index] = l + 1;1433} // x1434} // y14351436uint8_t blk_endpoints[2][basist::NUM_MODE11_ENDPOINTS];1437uint8_t blk_weights[2][BLOCK_W * BLOCK_H];1438uint32_t best_submode[2];14391440for (uint32_t part_iter = 0; part_iter < 2; part_iter++)1441{1442assert(part_total_pixels[part_iter]);14431444double e;1445if (cem == 7)1446{1447e = encode_astc_hdr_block_mode_7(1448part_total_pixels[part_iter],1449(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],1450best_log_blk.m_weight_ise_range,1451best_submode[part_iter],1452BIG_FLOAT_VAL,1453blk_endpoints[part_iter],1454blk_weights[part_iter],1455coptions,1456best_log_blk.m_endpoint_ise_range);1457}1458else1459{1460assert(cem == 11);14611462e = encode_astc_hdr_block_mode_11(1463part_total_pixels[part_iter],1464(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],1465best_log_blk.m_weight_ise_range,1466best_submode[part_iter],1467BIG_FLOAT_VAL,1468blk_endpoints[part_iter],1469blk_weights[part_iter],1470coptions,1471false,1472best_log_blk.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false,1473mode11_opt_mode);1474}14751476if (e == BIG_FLOAT_VAL)1477return false;14781479} // part_iter14801481uint8_t ise_weights[BLOCK_W * BLOCK_H];14821483uint32_t src_pixel_index[2] = { 0, 0 };1484for (uint32_t y = 0; y < BLOCK_H; y++)1485{1486for (uint32_t x = 0; x < BLOCK_W; x++)1487{1488uint32_t part_index = (*pPat)[x + y * BLOCK_W];1489ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];1490src_pixel_index[part_index]++;1491} // x1492} // y14931494if ((grid_w == BLOCK_W) && (grid_h == BLOCK_H))1495{1496best_log_blk.m_partition_id = (uint16_t)p_seed;14971498memcpy(best_log_blk.m_endpoints, blk_endpoints[0], num_endpoint_vals);1499memcpy(best_log_blk.m_endpoints + num_endpoint_vals, blk_endpoints[1], num_endpoint_vals);1500memcpy(best_log_blk.m_weights, ise_weights, BLOCK_W * BLOCK_H);15011502res[0].m_valid = true;1503res[0].m_log_blk = best_log_blk;1504}1505else1506{1507uint8_t desired_weights[BLOCK_H * BLOCK_W];15081509const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_ISE_to_val;15101511for (uint32_t by = 0; by < BLOCK_H; by++)1512for (uint32_t bx = 0; bx < BLOCK_W; bx++)1513desired_weights[bx + by * BLOCK_W] = dequant_tab[ise_weights[bx + by * BLOCK_W]];15141515uint8_t downsampled_weights[BLOCK_H * BLOCK_W];15161517const float* pDownsample_matrix = get_6x6_downsample_matrix(grid_w, grid_h);1518if (!pDownsample_matrix)1519{1520assert(0);1521return false;1522}15231524downsample_weight_grid(1525pDownsample_matrix,1526BLOCK_W, BLOCK_H, // source/from dimension (block size)1527grid_w, grid_h, // dest/to dimension (grid size)1528desired_weights, // these are dequantized weights, NOT ISE symbols, [by][bx]1529downsampled_weights); // [wy][wx]15301531best_log_blk.m_partition_id = (uint16_t)p_seed;1532memcpy(best_log_blk.m_endpoints, blk_endpoints[0], num_endpoint_vals);1533memcpy(best_log_blk.m_endpoints + num_endpoint_vals, blk_endpoints[1], num_endpoint_vals);15341535const auto& weight_to_ise = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_val_to_ise;15361537for (uint32_t gy = 0; gy < grid_h; gy++)1538for (uint32_t gx = 0; gx < grid_w; gx++)1539best_log_blk.m_weights[gx + gy * grid_w] = weight_to_ise[downsampled_weights[gx + gy * grid_w]];15401541res[0].m_valid = true;1542res[0].m_log_blk = best_log_blk;15431544if ((refine_endpoints_flag) && (comp_level >= MIN_REFINE_LEVEL) && ((grid_w < 6) || (grid_h < 6)))1545{1546bool any_refined = false;15471548for (uint32_t part_iter = 0; part_iter < 2; part_iter++)1549{1550bool refine_status = refine_endpoints(1551cem,1552endpoints_ise_range,1553best_log_blk.m_endpoints + part_iter * num_endpoint_vals, // the endpoints to optimize1554BLOCK_W, BLOCK_H, // block dimensions1555grid_w, grid_h, best_log_blk.m_weights, weights_ise_range, // weight grid1556part_total_pixels[part_iter], (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],1557&part_pixel_index[part_iter][0], // maps this subset's pixels to block offsets1558coptions, mode11_opt_mode);15591560if (refine_status)1561any_refined = true;1562}15631564if (any_refined)1565{1566res[1].m_valid = true;1567res[1].m_log_blk = best_log_blk;1568}1569}1570}15711572return true;1573}15741575typedef basisu::hash_map<partition_pattern_vec, std::pair<uint32_t, uint32_t > > partition3_hash_map;15761577partition_pattern_vec g_partitions3[NUM_UNIQUE_PARTITIONS3];1578int g_part3_seed_to_unique_index[1024];1579vp_tree g_part3_vp_tree;15801581static void init_partitions3_6x6()1582{1583uint32_t t = 0;15841585for (uint32_t i = 0; i < 1024; i++)1586g_part3_seed_to_unique_index[i] = -1;15871588partition3_hash_map part3_hash;1589part3_hash.reserve(512);15901591for (uint32_t seed_index = 0; seed_index < 1024; seed_index++)1592{1593partition_pattern_vec p3;1594uint32_t part_hist[3] = { 0 };15951596for (uint32_t y = 0; y < 6; y++)1597{1598for (uint32_t x = 0; x < 6; x++)1599{1600uint64_t p = astc_helpers::compute_texel_partition(seed_index, x, y, 0, 3, false);1601assert(p < 3);16021603p3.m_parts[x + y * 6] = (uint8_t)p;1604part_hist[p]++;1605}1606}16071608if (!part_hist[0] || !part_hist[1] || !part_hist[2])1609continue;16101611uint32_t j;1612for (j = 0; j < NUM_PART3_MAPPINGS; j++)1613{1614partition_pattern_vec temp_part3(p3.get_permuted3(j));16151616if (part3_hash.contains(temp_part3))1617break;1618}1619if (j < NUM_PART3_MAPPINGS)1620continue;16211622part3_hash.insert(p3, std::make_pair(seed_index, t) );16231624assert(g_part3_unique_index_to_seed[t] == seed_index);1625g_part3_seed_to_unique_index[seed_index] = t;1626g_partitions3[t] = p3;16271628t++;1629}16301631g_part3_vp_tree.init(NUM_UNIQUE_PARTITIONS3, g_partitions3);1632}16331634static bool estimate_partition3_6x6(1635const basist::half_float pBlock_pixels_half[][3],1636int* pBest_parts, uint32_t num_best_parts)1637{1638const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = BLOCK_W * BLOCK_H, NUM_SUBSETS = 3;16391640assert(num_best_parts && (num_best_parts <= NUM_UNIQUE_PARTITIONS3));16411642vec3F training_vecs[BLOCK_T], mean(0.0f);16431644float brightest_inten = 0.0f, darkest_inten = BIG_FLOAT_VAL;1645vec3F cluster_centroids[NUM_SUBSETS];16461647for (uint32_t i = 0; i < BLOCK_T; i++)1648{1649vec3F& v = training_vecs[i];16501651v.set((float)pBlock_pixels_half[i][0], (float)pBlock_pixels_half[i][1], (float)pBlock_pixels_half[i][2]);16521653float inten = v.dot(vec3F(1.0f));1654if (inten < darkest_inten)1655{1656darkest_inten = inten;1657cluster_centroids[0] = v;1658}16591660if (inten > brightest_inten)1661{1662brightest_inten = inten;1663cluster_centroids[1] = v;1664}1665}16661667if (cluster_centroids[0] == cluster_centroids[1])1668return false;16691670float furthest_dist2 = 0.0f;1671for (uint32_t i = 0; i < BLOCK_T; i++)1672{1673vec3F& v = training_vecs[i];16741675float dist_a = v.squared_distance(cluster_centroids[0]);1676if (dist_a == 0.0f)1677continue;16781679float dist_b = v.squared_distance(cluster_centroids[1]);1680if (dist_b == 0.0f)1681continue;16821683float dist2 = dist_a + dist_b;1684if (dist2 > furthest_dist2)1685{1686furthest_dist2 = dist2;1687cluster_centroids[2] = v;1688}1689}16901691if ((cluster_centroids[0] == cluster_centroids[2]) || (cluster_centroids[1] == cluster_centroids[2]))1692return false;16931694uint32_t cluster_pixels[NUM_SUBSETS][BLOCK_T];1695uint32_t num_cluster_pixels[NUM_SUBSETS];1696vec3F new_cluster_means[NUM_SUBSETS];16971698const uint32_t NUM_ITERS = 4;16991700for (uint32_t s = 0; s < NUM_ITERS; s++)1701{1702memset(num_cluster_pixels, 0, sizeof(num_cluster_pixels));1703memset(new_cluster_means, 0, sizeof(new_cluster_means));17041705for (uint32_t i = 0; i < BLOCK_T; i++)1706{1707float d[NUM_SUBSETS] = {1708training_vecs[i].squared_distance(cluster_centroids[0]),1709training_vecs[i].squared_distance(cluster_centroids[1]),1710training_vecs[i].squared_distance(cluster_centroids[2]) };17111712float min_d = d[0];1713uint32_t min_idx = 0;1714for (uint32_t j = 1; j < NUM_SUBSETS; j++)1715{1716if (d[j] < min_d)1717{1718min_d = d[j];1719min_idx = j;1720}1721}17221723cluster_pixels[min_idx][num_cluster_pixels[min_idx]] = i;1724new_cluster_means[min_idx] += training_vecs[i];1725num_cluster_pixels[min_idx]++;1726} // i17271728for (uint32_t j = 0; j < NUM_SUBSETS; j++)1729{1730if (!num_cluster_pixels[j])1731return false;17321733cluster_centroids[j] = new_cluster_means[j] / (float)num_cluster_pixels[j];1734}1735} // s17361737partition_pattern_vec desired_part;1738for (uint32_t p = 0; p < NUM_SUBSETS; p++)1739{1740for (uint32_t i = 0; i < num_cluster_pixels[p]; i++)1741{1742const uint32_t pix_index = cluster_pixels[p][i];1743desired_part[pix_index] = (uint8_t)p;1744}1745}17461747#if BRUTE_FORCE_PART_SEARCH1748partition_pattern_vec desired_parts[NUM_PART3_MAPPINGS];1749for (uint32_t j = 0; j < NUM_PART3_MAPPINGS; j++)1750desired_parts[j] = desired_part.get_permuted3(j);17511752uint32_t part_similarity[NUM_UNIQUE_PARTITIONS3];17531754for (uint32_t part_index = 0; part_index < NUM_UNIQUE_PARTITIONS3; part_index++)1755{1756const partition_pattern_vec& pat = g_partitions3[part_index];17571758uint32_t lowest_pat_dist = UINT32_MAX;1759for (uint32_t p = 0; p < NUM_PART3_MAPPINGS; p++)1760{1761uint32_t dist = pat.get_squared_distance(desired_parts[p]);1762if (dist < lowest_pat_dist)1763lowest_pat_dist = dist;1764}17651766part_similarity[part_index] = (lowest_pat_dist << 16) | part_index;17671768} // part_index;17691770std::sort(part_similarity, part_similarity + NUM_UNIQUE_PARTITIONS3);17711772for (uint32_t i = 0; i < num_best_parts; i++)1773pBest_parts[i] = part_similarity[i] & 0xFFFF;1774#else1775vp_tree::result_queue results;1776results.reserve(num_best_parts);1777g_part3_vp_tree.find_nearest(3, desired_part, results, num_best_parts);17781779assert(results.get_size() == num_best_parts);17801781const auto& elements = results.get_elements();17821783for (uint32_t i = 0; i < results.get_size(); i++)1784pBest_parts[i] = elements[1 + i].m_pat_index;1785#endif17861787return true;1788}17891790static bool encode_block_3_subsets(1791trial_result& res,1792uint32_t cem,1793uint32_t grid_w, uint32_t grid_h,1794uint32_t weights_ise_range, uint32_t endpoints_ise_range,1795const half_vec3* pBlock_pixels_half, const vec4F* pBlock_pixels_q16,1796astc_hdr_codec_base_options& coptions,1797bool uber_mode_flag,1798const int* pEst_patterns, int num_est_patterns,1799uint32_t comp_level,1800opt_mode_t mode11_opt_mode)1801{1802BASISU_NOTE_UNUSED(uber_mode_flag);1803const uint32_t BLOCK_W = 6, BLOCK_H = 6, NUM_SUBSETS = 3;1804const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem);18051806res.m_valid = false;18071808double best_e = BIG_FLOAT_VAL;18091810astc_helpers::log_astc_block best_log_blk;1811clear_obj(best_log_blk);18121813best_log_blk.m_num_partitions = NUM_SUBSETS;1814best_log_blk.m_color_endpoint_modes[0] = (uint8_t)cem;1815best_log_blk.m_color_endpoint_modes[1] = (uint8_t)cem;1816best_log_blk.m_color_endpoint_modes[2] = (uint8_t)cem;1817best_log_blk.m_grid_width = (uint8_t)grid_w;1818best_log_blk.m_grid_height = (uint8_t)grid_h;18191820best_log_blk.m_weight_ise_range = (uint8_t)weights_ise_range;1821best_log_blk.m_endpoint_ise_range = (uint8_t)endpoints_ise_range;18221823const uint32_t n = num_est_patterns ? num_est_patterns : NUM_UNIQUE_PARTITIONS3;18241825for (uint32_t unique_p_iter = 0; unique_p_iter < n; unique_p_iter++)1826{1827const uint32_t unique_part_index = num_est_patterns ? pEst_patterns[unique_p_iter] : unique_p_iter;1828assert(unique_part_index < NUM_UNIQUE_PARTITIONS3);1829const partition_pattern_vec*pPart = &g_partitions3[unique_part_index];18301831vec4F part_pixels_q16[NUM_SUBSETS][64];1832half_vec3 part_half_pixels[NUM_SUBSETS][64];1833uint8_t part_pixel_index[NUM_SUBSETS][64];1834uint32_t part_total_pixels[NUM_SUBSETS] = { 0 };18351836for (uint32_t y = 0; y < BLOCK_H; y++)1837{1838for (uint32_t x = 0; x < BLOCK_W; x++)1839{1840const uint32_t part_index = pPart->m_parts[x + y * BLOCK_W];18411842uint32_t l = part_total_pixels[part_index];18431844part_pixels_q16[part_index][l] = pBlock_pixels_q16[x + y * BLOCK_W];1845part_half_pixels[part_index][l] = pBlock_pixels_half[x + y * BLOCK_W];1846part_pixel_index[part_index][l] = (uint8_t)(x + y * BLOCK_W);18471848part_total_pixels[part_index] = l + 1;1849} // x1850} // y18511852uint8_t blk_endpoints[NUM_SUBSETS][basist::NUM_MODE11_ENDPOINTS];1853uint8_t blk_weights[NUM_SUBSETS][BLOCK_W * BLOCK_H];1854uint32_t best_submode[NUM_SUBSETS];18551856double e = 0.0f;1857for (uint32_t part_iter = 0; part_iter < NUM_SUBSETS; part_iter++)1858{1859assert(part_total_pixels[part_iter]);18601861if (cem == 7)1862{1863e += encode_astc_hdr_block_mode_7(1864part_total_pixels[part_iter],1865(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],1866best_log_blk.m_weight_ise_range,1867best_submode[part_iter],1868BIG_FLOAT_VAL,1869blk_endpoints[part_iter],1870blk_weights[part_iter],1871coptions,1872best_log_blk.m_endpoint_ise_range);1873}1874else1875{1876assert(cem == 11);18771878e += encode_astc_hdr_block_mode_11(1879part_total_pixels[part_iter],1880(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],1881best_log_blk.m_weight_ise_range,1882best_submode[part_iter],1883BIG_FLOAT_VAL,1884blk_endpoints[part_iter],1885blk_weights[part_iter],1886coptions,1887false, best_log_blk.m_endpoint_ise_range, uber_mode_flag, false,1888FIRST_MODE11_SUBMODE_INDEX, MAX_MODE11_SUBMODE_INDEX, false, mode11_opt_mode);1889}18901891} // part_iter18921893uint8_t ise_weights[BLOCK_W * BLOCK_H];18941895uint32_t src_pixel_index[NUM_SUBSETS] = { 0 };1896for (uint32_t y = 0; y < BLOCK_H; y++)1897{1898for (uint32_t x = 0; x < BLOCK_W; x++)1899{1900const uint32_t part_index = pPart->m_parts[x + y * BLOCK_W];19011902ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];1903src_pixel_index[part_index]++;1904} // x1905} // y19061907if ((grid_w == BLOCK_W) && (grid_h == BLOCK_H))1908{1909if (e < best_e)1910{1911best_e = e;1912best_log_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_part_index];19131914for (uint32_t p = 0; p < NUM_SUBSETS; p++)1915memcpy(best_log_blk.m_endpoints + num_endpoint_vals * p, blk_endpoints[p], num_endpoint_vals);19161917memcpy(best_log_blk.m_weights, ise_weights, BLOCK_W * BLOCK_H);1918}1919}1920else1921{1922uint8_t desired_weights[BLOCK_H * BLOCK_W];19231924const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_ISE_to_val;19251926for (uint32_t by = 0; by < BLOCK_H; by++)1927for (uint32_t bx = 0; bx < BLOCK_W; bx++)1928desired_weights[bx + by * BLOCK_W] = dequant_tab[ise_weights[bx + by * BLOCK_W]];19291930uint8_t downsampled_weights[BLOCK_H * BLOCK_W];19311932const float* pDownsample_matrix = get_6x6_downsample_matrix(grid_w, grid_h);1933if (!pDownsample_matrix)1934{1935assert(0);1936return false;1937}19381939downsample_weight_grid(1940pDownsample_matrix,1941BLOCK_W, BLOCK_H, // source/from dimension (block size)1942grid_w, grid_h, // dest/to dimension (grid size)1943desired_weights, // these are dequantized weights, NOT ISE symbols, [by][bx]1944downsampled_weights); // [wy][wx]19451946astc_helpers::log_astc_block trial_blk(best_log_blk);19471948trial_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_part_index];19491950for (uint32_t p = 0; p < NUM_SUBSETS; p++)1951memcpy(trial_blk.m_endpoints + num_endpoint_vals * p, blk_endpoints[p], num_endpoint_vals);19521953const auto& weight_to_ise = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_val_to_ise;19541955for (uint32_t gy = 0; gy < grid_h; gy++)1956for (uint32_t gx = 0; gx < grid_w; gx++)1957trial_blk.m_weights[gx + gy * grid_w] = weight_to_ise[downsampled_weights[gx + gy * grid_w]];19581959if ((comp_level >= MIN_REFINE_LEVEL) && ((grid_w < 6) || (grid_h < 6)))1960{1961for (uint32_t part_iter = 0; part_iter < NUM_SUBSETS; part_iter++)1962{1963bool refine_status = refine_endpoints(1964cem,1965endpoints_ise_range,1966trial_blk.m_endpoints + part_iter * num_endpoint_vals, // the endpoints to optimize1967BLOCK_W, BLOCK_H, // block dimensions1968grid_w, grid_h, trial_blk.m_weights, weights_ise_range, // weight grid1969part_total_pixels[part_iter], (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],1970&part_pixel_index[part_iter][0], // maps this subset's pixels to block offsets1971coptions, mode11_opt_mode);19721973BASISU_NOTE_UNUSED(refine_status);1974}1975}19761977half_vec4 decoded_pixels_half4[BLOCK_H][BLOCK_W]; // [y][x]1978bool status = astc_helpers::decode_block(trial_blk, decoded_pixels_half4, BLOCK_W, BLOCK_H, astc_helpers::cDecodeModeHDR16);1979assert(status);1980if (!status)1981return false;19821983half_vec3 decoded_pixels_half3[BLOCK_H][BLOCK_W];1984for (uint32_t y = 0; y < BLOCK_H; y++)1985for (uint32_t x = 0; x < BLOCK_W; x++)1986decoded_pixels_half3[y][x].set(decoded_pixels_half4[y][x][0], decoded_pixels_half4[y][x][1], decoded_pixels_half4[y][x][2]);19871988double trial_err = compute_block_error(BLOCK_W * BLOCK_H, (const basist::half_float*)pBlock_pixels_half, (const basist::half_float*)decoded_pixels_half3, coptions);1989if (trial_err < best_e)1990{1991best_e = trial_err;1992best_log_blk = trial_blk;1993}1994}19951996} // unique_p_iter19971998if (best_e < BIG_FLOAT_VAL)1999{2000res.m_log_blk = best_log_blk;2001res.m_valid = true;2002res.m_err = best_e;2003}2004else2005{2006res.m_valid = false;2007}20082009return res.m_valid;2010}20112012static uint32_t encode_values(bitwise_coder &coder, uint32_t total_values, const uint8_t *pVals, uint32_t endpoint_range)2013{2014const uint32_t MAX_VALS = 64;2015uint32_t bit_values[MAX_VALS], tq_values[(MAX_VALS + 2) / 3];2016uint32_t total_tq_values = 0, tq_accum = 0, tq_mul = 1;20172018assert((total_values) && (total_values <= MAX_VALS));20192020const uint32_t ep_bits = astc_helpers::g_ise_range_table[endpoint_range][0];2021const uint32_t ep_trits = astc_helpers::g_ise_range_table[endpoint_range][1];2022const uint32_t ep_quints = astc_helpers::g_ise_range_table[endpoint_range][2];20232024for (uint32_t i = 0; i < total_values; i++)2025{2026uint32_t val = pVals[i];20272028uint32_t bits = val & ((1 << ep_bits) - 1);2029uint32_t tq = val >> ep_bits;20302031bit_values[i] = bits;20322033if (ep_trits)2034{2035assert(tq < 3);2036tq_accum += tq * tq_mul;2037tq_mul *= 3;2038if (tq_mul == 243)2039{2040assert(total_tq_values < BASISU_ARRAY_SIZE(tq_values));2041tq_values[total_tq_values++] = tq_accum;2042tq_accum = 0;2043tq_mul = 1;2044}2045}2046else if (ep_quints)2047{2048assert(tq < 5);2049tq_accum += tq * tq_mul;2050tq_mul *= 5;2051if (tq_mul == 125)2052{2053assert(total_tq_values < BASISU_ARRAY_SIZE(tq_values));2054tq_values[total_tq_values++] = tq_accum;2055tq_accum = 0;2056tq_mul = 1;2057}2058}2059}20602061uint32_t total_bits_output = 0;20622063for (uint32_t i = 0; i < total_tq_values; i++)2064{2065const uint32_t num_bits = ep_trits ? 8 : 7;2066coder.put_bits(tq_values[i], num_bits);2067total_bits_output += num_bits;2068}20692070if (tq_mul > 1)2071{2072uint32_t num_bits;2073if (ep_trits)2074{2075if (tq_mul == 3)2076num_bits = 2;2077else if (tq_mul == 9)2078num_bits = 4;2079else if (tq_mul == 27)2080num_bits = 5;2081else //if (tq_mul == 81)2082num_bits = 7;2083}2084else2085{2086if (tq_mul == 5)2087num_bits = 3;2088else //if (tq_mul == 25)2089num_bits = 5;2090}2091coder.put_bits(tq_accum, num_bits);2092total_bits_output += num_bits;2093}20942095for (uint32_t i = 0; i < total_values; i++)2096{2097coder.put_bits(bit_values[i], ep_bits);2098total_bits_output += ep_bits;2099}21002101return total_bits_output;2102}21032104static inline uint32_t get_num_endpoint_vals(uint32_t cem)2105{2106assert((cem == 7) || (cem == 11));2107return (cem == 11) ? basist::NUM_MODE11_ENDPOINTS : basist::NUM_MODE7_ENDPOINTS;2108}21092110static void code_block(bitwise_coder& coder,2111const astc_helpers::log_astc_block& log_blk,2112block_mode block_mode_index,2113endpoint_mode em, const uint8_t *pEP_deltas)2114{2115coder.put_truncated_binary((uint32_t)block_mode_index, (uint32_t)block_mode::cBMTotalModes);2116coder.put_truncated_binary((uint32_t)em, (uint32_t)endpoint_mode::cTotal);21172118const uint32_t num_endpoint_vals = get_num_endpoint_vals(log_blk.m_color_endpoint_modes[0]);21192120if ((em == endpoint_mode::cUseLeftDelta) || (em == endpoint_mode::cUseUpperDelta))2121{2122assert(log_blk.m_num_partitions == 1);21232124for (uint32_t i = 0; i < num_endpoint_vals; i++)2125coder.put_bits(pEP_deltas[i], NUM_ENDPOINT_DELTA_BITS);2126}2127else if (em == endpoint_mode::cRaw)2128{2129if (log_blk.m_num_partitions == 2)2130{2131const int unique_partition_index = g_part2_seed_to_unique_index[log_blk.m_partition_id];2132assert(unique_partition_index != -1);21332134coder.put_truncated_binary(unique_partition_index, NUM_UNIQUE_PARTITIONS2);2135}2136else if (log_blk.m_num_partitions == 3)2137{2138const int unique_partition_index = g_part3_seed_to_unique_index[log_blk.m_partition_id];2139assert(unique_partition_index != -1);21402141coder.put_truncated_binary(unique_partition_index, NUM_UNIQUE_PARTITIONS3);2142}21432144encode_values(coder, num_endpoint_vals * log_blk.m_num_partitions, log_blk.m_endpoints, log_blk.m_endpoint_ise_range);2145}21462147encode_values(coder, log_blk.m_grid_width * log_blk.m_grid_height * (log_blk.m_dual_plane ? 2 : 1), log_blk.m_weights, log_blk.m_weight_ise_range);2148}21492150struct smooth_map_params2151{2152bool m_no_mse_scaling;21532154float m_max_smooth_std_dev;2155float m_smooth_max_mse_scale;21562157float m_max_med_smooth_std_dev;2158float m_med_smooth_max_mse_scale;21592160float m_max_ultra_smooth_std_dev;2161float m_ultra_smooth_max_mse_scale;21622163bool m_debug_images;21642165smooth_map_params()2166{2167clear();2168}21692170void clear()2171{2172m_no_mse_scaling = false;21732174// 3x3 region2175m_max_smooth_std_dev = 100.0f;2176m_smooth_max_mse_scale = 13000.0f;21772178// 7x7 region2179m_max_med_smooth_std_dev = 9.0f;2180m_med_smooth_max_mse_scale = 15000.0f;21812182// 11x11 region2183m_max_ultra_smooth_std_dev = 4.0f;2184//m_ultra_smooth_max_mse_scale = 4500.0f;2185//m_ultra_smooth_max_mse_scale = 10000.0f;2186//m_ultra_smooth_max_mse_scale = 50000.0f;2187//m_ultra_smooth_max_mse_scale = 100000.0f;2188//m_ultra_smooth_max_mse_scale = 400000.0f;2189//m_ultra_smooth_max_mse_scale = 800000.0f;2190m_ultra_smooth_max_mse_scale = 2000000.0f;21912192m_debug_images = true;2193}2194};21952196Resampler::Contrib_List* g_contrib_lists[7]; // 1-621972198static void init_contrib_lists()2199{2200for (uint32_t dst_width = 1; dst_width <= 6; dst_width++)2201//g_contrib_lists[dst_width] = Resampler::make_clist(6, 6, basisu::Resampler::BOUNDARY_CLAMP, gaussian_filter, BASISU_GAUSSIAN_FILTER_SUPPORT, 6.0f / (float)dst_width, 0.0f);2202g_contrib_lists[dst_width] = Resampler::make_clist(6, 6, basisu::Resampler::BOUNDARY_CLAMP, gaussian_filter, BASISU_BELL_FILTER_SUPPORT, 6.0f / (float)dst_width, 0.0f);2203}22042205#if 02206static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec3F* pSrc_block, half_vec3 *pDst_block_half3, vec4F *pDst_block_q16)2207{2208vec3F temp_block[6][6]; // [y][x]22092210// first filter rows to temp_block2211if (grid_x == 6)2212{2213memcpy(temp_block, pSrc_block, sizeof(vec3F) * 6 * 6);2214}2215else2216{2217Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x];22182219for (uint32_t y = 0; y < 6; y++)2220{2221for (uint32_t x = 0; x < 6; x++)2222{2223vec3F p(0.0f);22242225for (uint32_t i = 0; i < pRow_lists[x].n; i++)2226p += pSrc_block[y * 6 + pRow_lists[x].p[i].pixel] * pRow_lists[x].p[i].weight;22272228p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);22292230temp_block[y][x] = p;2231} // x2232} // y2233}22342235// filter columns2236if (grid_y == 6)2237{2238for (uint32_t y = 0; y < 6; y++)2239{2240for (uint32_t x = 0; x < 6; x++)2241{2242for (uint32_t c = 0; c < 3; c++)2243{2244const basist::half_float h = basist::float_to_half(temp_block[y][x][c]);22452246pDst_block_half3[x + y * 6][c] = h;2247pDst_block_q16[x + y * 6][c] = (float)half_to_qlog16(h);2248}22492250pDst_block_q16[x + y * 6][3] = 0.0f;2251} // x2252} // y2253}2254else2255{2256Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y];22572258for (uint32_t x = 0; x < 6; x++)2259{2260for (uint32_t y = 0; y < 6; y++)2261{2262vec3F p(0.0f);22632264for (uint32_t i = 0; i < pCol_lists[y].n; i++)2265p += temp_block[pCol_lists[y].p[i].pixel][x] * pCol_lists[y].p[i].weight;22662267p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);22682269for (uint32_t c = 0; c < 3; c++)2270{2271const basist::half_float h = basist::float_to_half(p[c]);22722273pDst_block_half3[x + y * 6][c] = h;2274pDst_block_q16[x + y * 6][c] = (float)half_to_qlog16(h);2275}22762277pDst_block_q16[x + y * 6][3] = 0.0f;22782279} // x2280} // y2281}2282}2283#endif22842285static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec4F* pSrc_block, vec4F* pDst_block)2286{2287vec4F temp_block[6][6]; // [y][x]22882289// first filter rows to temp_block2290if (grid_x == 6)2291{2292memcpy(temp_block, pSrc_block, sizeof(vec4F) * 6 * 6);2293}2294else2295{2296Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x];22972298for (uint32_t y = 0; y < 6; y++)2299{2300for (uint32_t x = 0; x < 6; x++)2301{2302vec3F p(0.0f);23032304for (uint32_t i = 0; i < pRow_lists[x].n; i++)2305p += vec3F(pSrc_block[y * 6 + pRow_lists[x].p[i].pixel]) * pRow_lists[x].p[i].weight;23062307p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);23082309temp_block[y][x] = p;2310} // x2311} // y2312}23132314// filter columns2315if (grid_y == 6)2316{2317for (uint32_t y = 0; y < 6; y++)2318{2319for (uint32_t x = 0; x < 6; x++)2320{2321for (uint32_t c = 0; c < 3; c++)2322pDst_block[x + y * 6][c] = temp_block[y][x][c];2323} // x2324} // y2325}2326else2327{2328Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y];23292330for (uint32_t x = 0; x < 6; x++)2331{2332for (uint32_t y = 0; y < 6; y++)2333{2334vec3F p(0.0f);23352336for (uint32_t i = 0; i < pCol_lists[y].n; i++)2337p += temp_block[pCol_lists[y].p[i].pixel][x] * pCol_lists[y].p[i].weight;23382339p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);23402341pDst_block[x + y * 6] = p;23422343} // x2344} // y2345}2346}23472348static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec3F* pSrc_block, vec3F* pDst_block)2349{2350vec3F temp_block[6][6]; // [y][x]23512352// first filter rows to temp_block2353if (grid_x == 6)2354{2355memcpy(temp_block, pSrc_block, sizeof(vec3F) * 6 * 6);2356}2357else2358{2359Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x];23602361for (uint32_t y = 0; y < 6; y++)2362{2363for (uint32_t x = 0; x < 6; x++)2364{2365vec3F p(0.0f);23662367for (uint32_t i = 0; i < pRow_lists[x].n; i++)2368p += vec3F(pSrc_block[y * 6 + pRow_lists[x].p[i].pixel]) * pRow_lists[x].p[i].weight;23692370temp_block[y][x] = p;2371} // x2372} // y2373}23742375// filter columns2376if (grid_y == 6)2377{2378memcpy(pDst_block, temp_block, sizeof(vec3F) * 6 * 6);2379}2380else2381{2382Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y];23832384for (uint32_t x = 0; x < 6; x++)2385{2386for (uint32_t y = 0; y < 6; y++)2387{2388vec3F& p = pDst_block[x + y * 6];2389p.set(0.0f);23902391for (uint32_t i = 0; i < pCol_lists[y].n; i++)2392p += temp_block[pCol_lists[y].p[i].pixel][x] * pCol_lists[y].p[i].weight;2393} // x2394} // y2395}2396}23972398static float diff_blocks(const vec4F* pA, const vec4F* pB)2399{2400const uint32_t BLOCK_T = 36;24012402float diff = 0.0f;2403for (uint32_t i = 0; i < BLOCK_T; i++)2404diff += square(pA[i][0] - pB[i][0]) + square(pA[i][1] - pB[i][1]) + square(pA[i][2] - pB[i][2]);24052406return diff * (1.0f / (float)BLOCK_T);2407}24082409static float sub_and_compute_std_dev(const vec3F* pA, const vec3F* pB)2410{2411const uint32_t BLOCK_T = 36;24122413vec3F mean(0.0f);24142415for (uint32_t i = 0; i < BLOCK_T; i++)2416{2417vec3F diff(pA[i] - pB[i]);2418mean += diff;2419}24202421mean *= (1.0f / (float)BLOCK_T);24222423vec3F diff_sum(0.0f);2424for (uint32_t i = 0; i < BLOCK_T; i++)2425{2426vec3F diff(pA[i] - pB[i]);2427diff -= mean;2428diff_sum += vec3F::component_mul(diff, diff);2429}24302431vec3F var(diff_sum * (1.0f / (float)BLOCK_T));24322433vec3F std_dev(sqrtf(var[0]), sqrtf(var[1]), sqrtf(var[2]));24342435return maximum(std_dev[0], std_dev[1], std_dev[2]);2436}24372438static void create_smooth_maps2(2439vector2D<float>& smooth_block_mse_scales,2440const image& orig_img,2441smooth_map_params& params, image* pUltra_smooth_img = nullptr)2442{2443const uint32_t width = orig_img.get_width();2444const uint32_t height = orig_img.get_height();2445//const uint32_t total_pixels = orig_img.get_total_pixels();2446const uint32_t num_comps = 3;24472448if (params.m_no_mse_scaling)2449{2450smooth_block_mse_scales.set_all(1.0f);2451return;2452}24532454smooth_block_mse_scales.resize(width, height);24552456image smooth_vis, med_smooth_vis, ultra_smooth_vis;24572458if (params.m_debug_images)2459{2460smooth_vis.resize(width, height);2461med_smooth_vis.resize(width, height);2462ultra_smooth_vis.resize(width, height);2463}24642465for (uint32_t y = 0; y < height; y++)2466{2467for (uint32_t x = 0; x < width; x++)2468{2469{2470tracked_stat_dbl comp_stats[4];2471for (int yd = -1; yd <= 1; yd++)2472{2473for (int xd = -1; xd <= 1; xd++)2474{2475const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd);24762477comp_stats[0].update((float)p[0]);2478comp_stats[1].update((float)p[1]);2479comp_stats[2].update((float)p[2]);2480}2481}24822483float max_std_dev = 0.0f;2484for (uint32_t i = 0; i < num_comps; i++)2485max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev());24862487float yl = clampf(max_std_dev / params.m_max_smooth_std_dev, 0.0f, 1.0f);2488//yl = powf(yl, 2.0f);2489yl = powf(yl, 1.0f / 2.0f); // substantially less bits24902491smooth_block_mse_scales(x, y) = lerp(params.m_smooth_max_mse_scale, 1.0f, yl);24922493if (params.m_debug_images)2494{2495//smooth_vis(x, y).set(clamp((int)((smooth_block_mse_scales(x, y) - 1.0f) / (params.m_smooth_max_mse_scale - 1.0f) * 255.0f + .5f), 0, 255));2496// white=high local activity (edges/detail)2497// black=low local activity (smooth - error is amplified)2498smooth_vis(x, y).set(clamp((int)((yl * 255.0f) + .5f), 0, 255));2499}2500}25012502{2503tracked_stat_dbl comp_stats[4];25042505const int S = 3;2506for (int yd = -S; yd < S; yd++)2507{2508for (int xd = -S; xd < S; xd++)2509{2510const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd);25112512comp_stats[0].update((float)p[0]);2513comp_stats[1].update((float)p[1]);2514comp_stats[2].update((float)p[2]);2515}2516}25172518float max_std_dev = 0.0f;2519for (uint32_t i = 0; i < num_comps; i++)2520max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev());25212522float yl = clampf(max_std_dev / params.m_max_med_smooth_std_dev, 0.0f, 1.0f);2523//yl = powf(yl, 2.0f);25242525smooth_block_mse_scales(x, y) = lerp(params.m_med_smooth_max_mse_scale, smooth_block_mse_scales(x, y), yl);25262527if (params.m_debug_images)2528med_smooth_vis(x, y).set((int)std::round(yl * 255.0f));2529}25302531{2532tracked_stat_dbl comp_stats[4];25332534const int S = 5;2535for (int yd = -S; yd < S; yd++)2536{2537for (int xd = -S; xd < S; xd++)2538{2539const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd);25402541comp_stats[0].update((float)p[0]);2542comp_stats[1].update((float)p[1]);2543comp_stats[2].update((float)p[2]);2544}2545}25462547float max_std_dev = 0.0f;2548for (uint32_t i = 0; i < num_comps; i++)2549max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev());25502551float yl = clampf(max_std_dev / params.m_max_ultra_smooth_std_dev, 0.0f, 1.0f);2552yl = powf(yl, 2.0f);25532554smooth_block_mse_scales(x, y) = lerp(params.m_ultra_smooth_max_mse_scale, smooth_block_mse_scales(x, y), yl);25552556if (params.m_debug_images)2557ultra_smooth_vis(x, y).set((int)std::round(yl * 255.0f));2558}25592560}2561}25622563if (params.m_debug_images)2564{2565save_png("dbg_smooth_vis.png", smooth_vis);2566save_png("dbg_med_smooth_vis.png", med_smooth_vis);2567save_png("dbg_ultra_smooth_vis.png", ultra_smooth_vis);25682569image vis_img(width, height);25702571float max_scale = 0.0f;2572for (uint32_t y = 0; y < height; y++)2573for (uint32_t x = 0; x < width; x++)2574max_scale = basisu::maximumf(max_scale, smooth_block_mse_scales(x, y));25752576for (uint32_t y = 0; y < height; y++)2577for (uint32_t x = 0; x < width; x++)2578vis_img(x, y).set((int)std::round(smooth_block_mse_scales(x, y) * 255.0f / max_scale));25792580save_png("scale_vis.png", vis_img);2581}25822583if (pUltra_smooth_img)2584*pUltra_smooth_img = ultra_smooth_vis;2585}25862587const float REALLY_DARK_I_THRESHOLD = 0.0625f;2588const float REALLY_DARK_MSE_ERR_SCALE = 128.0f;2589const float REALLY_DARK_DELTA_ITP_JND_SCALE = 5.0f;25902591static float compute_pixel_mse_itp(const vec3F& orig_pixel_itp, const vec3F& comp_pixel_itp, bool delta_itp_dark_adjustment)2592{2593float delta_i = orig_pixel_itp[0] - comp_pixel_itp[0];2594float delta_t = orig_pixel_itp[1] - comp_pixel_itp[1];2595float delta_p = orig_pixel_itp[2] - comp_pixel_itp[2];25962597float err = (delta_i * delta_i) + (delta_t * delta_t) + (delta_p * delta_p);25982599if (delta_itp_dark_adjustment)2600{2601// We have to process a large range of inputs, including extremely dark inputs.2602// Artifically amplify MSE on very dark pixels - otherwise they'll be overly compressed at higher lambdas.2603// This is to better handle very dark signals which could be explictly overexposed.2604float s = bu_math::smoothstep(0.0f, REALLY_DARK_I_THRESHOLD, orig_pixel_itp[0]);2605s = lerp(REALLY_DARK_MSE_ERR_SCALE, 1.0f, s);2606err *= s;2607}26082609return err;2610}26112612static float compute_block_mse_itp(uint32_t block_w, uint32_t block_h, const vec3F* pOrig_pixels_itp, const vec3F* pComp_pixels_itp, bool delta_itp_dark_adjustment)2613{2614float total_mse = 0.0f;26152616for (uint32_t y = 0; y < block_h; y++)2617{2618for (uint32_t x = 0; x < block_w; x++)2619{2620total_mse += compute_pixel_mse_itp(pOrig_pixels_itp[x + y * block_w], pComp_pixels_itp[x + y * block_w], delta_itp_dark_adjustment);2621} // x2622} // y26232624return total_mse * (1.0f / (float)(block_w * block_h));2625}26262627static float compute_block_ssim_itp(uint32_t block_w, uint32_t block_h, const vec3F* pOrig_pixels_itp, const vec3F* pComp_pixels_itp)2628{2629const uint32_t n = block_w * block_h;2630assert(n <= 36);26312632stats<float> x_stats[3], y_stats[3];2633comparative_stats<float> xy_cov[3];26342635for (uint32_t c = 0; c < 3; c++)2636{2637x_stats[c].calc_simplified(n, &pOrig_pixels_itp[0][c], 3);2638y_stats[c].calc_simplified(n, &pComp_pixels_itp[0][c], 3);2639}26402641for (uint32_t c = 0; c < 3; c++)2642xy_cov[c].calc_cov(n, &pOrig_pixels_itp[0][c], &pComp_pixels_itp[0][c], 3, 3, &x_stats[c], &y_stats[c]);26432644float ssim[3];2645const double d = 1.0f, k1 = .01f, k2 = .03f;26462647// weight mean error more highly to reduce blocking2648float ap = 1.5f, bp = 1.0f, cp = 1.0f;26492650const double s_c1 = square(k1 * d), s_c2 = square(k2 * d);2651const double s_c3(s_c2 * .5f);26522653for (uint32_t c = 0; c < 3; c++)2654{2655float lum = (float)((2.0f * x_stats[c].m_avg * y_stats[c].m_avg + s_c1) / (square(x_stats[c].m_avg) + square(y_stats[c].m_avg) + s_c1));2656lum = saturate(lum);26572658float con = (float)((2.0f * x_stats[c].m_std_dev * y_stats[c].m_std_dev + s_c2) / (x_stats[c].m_var + y_stats[c].m_var + s_c2));2659con = saturate(con);26602661float str = (float)((xy_cov[c].m_cov + s_c3) / (x_stats[c].m_std_dev * y_stats[c].m_std_dev + s_c3));2662str = saturate(str);26632664ssim[c] = powf(lum, ap) * powf(con, bp) * powf(str, cp);2665}26662667#if 02668float final_ssim = (ssim[0] * .4f + ssim[1] * .3f + ssim[2] * .3f);2669#elif 12670float final_ssim = ssim[0] * ssim[1] * ssim[2];2671#else2672const float LP = .75f;2673float final_ssim = ssim[0] * powf((ssim[1] + ssim[2]) * .5f, LP);2674#endif26752676return final_ssim;2677}26782679// delta ITP, 1.0 is JND (Rec. ITU-R BT.2124), modified for higher error at low light2680static float compute_pixel_delta_itp(const vec3F& a, const vec3F& b, const vec3F& orig, bool delta_itp_dark_adjustment)2681{2682float delta_i = a[0] - b[0];2683float delta_t = a[1] - b[1];2684float delta_p = a[2] - b[2];26852686float err = 720.0f * sqrtf((delta_i * delta_i) + (delta_t * delta_t) + (delta_p * delta_p));26872688float s = bu_math::smoothstep(0.0f, REALLY_DARK_I_THRESHOLD, orig[0]);26892690if (delta_itp_dark_adjustment)2691{2692// This is to better handle very dark signals which could be explictly overexposed.2693s = lerp(REALLY_DARK_DELTA_ITP_JND_SCALE, 1.0f, s);2694err *= s;2695}26962697return err;2698}26992700struct candidate_encoding2701{2702encoding_type m_encoding_type;27032704basist::half_float m_solid_color[3];27052706uint32_t m_run_len;27072708vec3F m_comp_pixels[MAX_BLOCK_H][MAX_BLOCK_W]; // [y][x]2709vec3F m_comp_pixels_itp[MAX_BLOCK_H][MAX_BLOCK_W]; // [y][x]27102711endpoint_mode m_endpoint_mode;2712block_mode m_block_mode;27132714bitwise_coder m_coder;27152716// The block to code, which may not be valid ASTC. This may have to be transcoded (by requantizing the weights/endpoints) before it's valid ASTC.2717// Note the endpoints may be coded endpoints OR transcoded endpoints, depending on the encoding type.2718astc_helpers::log_astc_block m_coded_log_blk;27192720// The block the decoder outputs.2721astc_helpers::log_astc_block m_decomp_log_blk;27222723int m_reuse_delta_index;27242725float m_t, m_d, m_bits;27262727candidate_encoding()2728{2729clear();2730}27312732candidate_encoding(const candidate_encoding &other)2733{2734*this = other;2735}27362737candidate_encoding(candidate_encoding&& other)2738{2739*this = std::move(other);2740}27412742candidate_encoding& operator=(const candidate_encoding& rhs)2743{2744if (this == &rhs)2745return *this;27462747m_encoding_type = rhs.m_encoding_type;2748memcpy(m_solid_color, rhs.m_solid_color, sizeof(m_solid_color));2749m_run_len = rhs.m_run_len;2750memcpy(m_comp_pixels, rhs.m_comp_pixels, sizeof(m_comp_pixels));2751m_endpoint_mode = rhs.m_endpoint_mode;2752m_block_mode = rhs.m_block_mode;2753m_coder = rhs.m_coder;2754m_coded_log_blk = rhs.m_coded_log_blk;2755m_decomp_log_blk = rhs.m_decomp_log_blk;2756m_reuse_delta_index = rhs.m_reuse_delta_index;27572758return *this;2759}27602761candidate_encoding& operator=(candidate_encoding&& rhs)2762{2763if (this == &rhs)2764return *this;27652766m_encoding_type = rhs.m_encoding_type;2767memcpy(m_solid_color, rhs.m_solid_color, sizeof(m_solid_color));2768m_run_len = rhs.m_run_len;2769memcpy(m_comp_pixels, rhs.m_comp_pixels, sizeof(m_comp_pixels));2770m_endpoint_mode = rhs.m_endpoint_mode;2771m_block_mode = rhs.m_block_mode;2772m_coder = std::move(rhs.m_coder);2773m_coded_log_blk = rhs.m_coded_log_blk;2774m_decomp_log_blk = rhs.m_decomp_log_blk;2775m_reuse_delta_index = rhs.m_reuse_delta_index;27762777return *this;2778}27792780void clear()2781{2782m_encoding_type = encoding_type::cInvalid;27832784clear_obj(m_solid_color);27852786m_run_len = 0;27872788clear_obj(m_comp_pixels);27892790m_endpoint_mode = endpoint_mode::cInvalid;2791m_block_mode = block_mode::cInvalid;27922793m_coder.restart();27942795m_coded_log_blk.clear();2796m_decomp_log_blk.clear();27972798m_t = 0;2799m_d = 0;2800m_bits = 0;28012802m_reuse_delta_index = 0;2803}2804};28052806bool decode_astc_block(uint32_t block_w, uint32_t block_h, astc_helpers::log_astc_block &log_blk, vec3F *pPixels)2807{2808assert((block_w <= 6) && (block_h <= 6));28092810half_vec4 decoded_pixels_half4[6 * 6]; // [y][x]2811bool status = astc_helpers::decode_block(log_blk, decoded_pixels_half4, block_w, block_h, astc_helpers::cDecodeModeHDR16);2812assert(status);28132814if (!status)2815return false;28162817for (uint32_t y = 0; y < block_h; y++)2818{2819for (uint32_t x = 0; x < block_w; x++)2820{2821pPixels[x + y * block_w].set(2822basist::half_to_float(decoded_pixels_half4[x + y * block_w][0]),2823basist::half_to_float(decoded_pixels_half4[x + y * block_w][1]),2824basist::half_to_float(decoded_pixels_half4[x + y * block_w][2]));2825} // x2826} //y28272828return true;2829}28302831static inline bool validate_log_blk(const astc_helpers::log_astc_block &decomp_blk)2832{2833astc_helpers::astc_block phys_blk;2834return astc_helpers::pack_astc_block(phys_blk, decomp_blk);2835}28362837#define SYNC_MARKERS (0)28382839static bool decode_file(const uint8_vec& comp_data, vector2D<astc_helpers::astc_block>& decoded_blocks, uint32_t &width, uint32_t &height)2840{2841interval_timer tm;2842tm.start();28432844const uint32_t BLOCK_W = 6, BLOCK_H = 6;28452846width = 0;2847height = 0;28482849if (comp_data.size() <= 2*3)2850return false;28512852basist::bitwise_decoder decoder;2853if (!decoder.init(comp_data.data(), comp_data.size_u32()))2854return false;28552856if (decoder.get_bits(16) != 0xABCD)2857return false;28582859width = decoder.get_bits(16);2860height = decoder.get_bits(16);28612862if (!width || !height || (width > MAX_ASTC_HDR_6X6_DIM) || (height > MAX_ASTC_HDR_6X6_DIM))2863return false;28642865const uint32_t num_blocks_x = (width + BLOCK_W - 1) / BLOCK_W;2866const uint32_t num_blocks_y = (height + BLOCK_H - 1) / BLOCK_H;2867const uint32_t total_blocks = num_blocks_x * num_blocks_y;28682869decoded_blocks.resize(num_blocks_x, num_blocks_y);2870//memset(decoded_blocks.get_ptr(), 0, decoded_blocks.size_in_bytes());28712872vector2D<astc_helpers::log_astc_block> decoded_log_blocks(num_blocks_x, num_blocks_y);2873//memset(decoded_log_blocks.get_ptr(), 0, decoded_log_blocks.size_in_bytes());28742875uint32_t cur_bx = 0, cur_by = 0;2876uint32_t step_counter = 0;2877BASISU_NOTE_UNUSED(step_counter);28782879while (cur_by < num_blocks_y)2880{2881step_counter++;28822883//if ((cur_bx == 9) && (cur_by == 13))2884// printf("!");28852886#if SYNC_MARKERS2887uint32_t mk = decoder.get_bits(16);2888if (mk != 0xDEAD)2889{2890printf("!");2891assert(0);2892return false;2893}2894#endif2895if (decoder.get_bits_remaining() < 1)2896return false;28972898encoding_type et = encoding_type::cBlock;28992900uint32_t b0 = decoder.get_bits(1);2901if (!b0)2902{2903uint32_t b1 = decoder.get_bits(1);2904if (b1)2905et = encoding_type::cReuse;2906else2907{2908uint32_t b2 = decoder.get_bits(1);2909if (b2)2910et = encoding_type::cSolid;2911else2912et = encoding_type::cRun;2913}2914}29152916switch (et)2917{2918case encoding_type::cRun:2919{2920if (!cur_bx && !cur_by)2921return false;29222923const uint32_t run_len = decoder.decode_vlc(5) + 1;29242925uint32_t num_blocks_remaining = total_blocks - (cur_bx + cur_by * num_blocks_x);2926if (run_len > num_blocks_remaining)2927return false;29282929uint32_t prev_bx = cur_bx, prev_by = cur_by;29302931if (cur_bx)2932prev_bx--;2933else2934{2935prev_bx = num_blocks_x - 1;2936prev_by--;2937}29382939const astc_helpers::log_astc_block& prev_log_blk = decoded_log_blocks(prev_bx, prev_by);2940const astc_helpers::astc_block& prev_phys_blk = decoded_blocks(prev_bx, prev_by);29412942for (uint32_t i = 0; i < run_len; i++)2943{2944decoded_log_blocks(cur_bx, cur_by) = prev_log_blk;2945decoded_blocks(cur_bx, cur_by) = prev_phys_blk;29462947cur_bx++;2948if (cur_bx == num_blocks_x)2949{2950cur_bx = 0;2951cur_by++;2952}2953}29542955break;2956}2957case encoding_type::cSolid:2958{2959const basist::half_float rh = (basist::half_float)decoder.get_bits(15);2960const basist::half_float gh = (basist::half_float)decoder.get_bits(15);2961const basist::half_float bh = (basist::half_float)decoder.get_bits(15);29622963astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);29642965log_blk.clear();2966log_blk.m_solid_color_flag_hdr = true;2967log_blk.m_solid_color[0] = rh;2968log_blk.m_solid_color[1] = gh;2969log_blk.m_solid_color[2] = bh;2970log_blk.m_solid_color[3] = basist::float_to_half(1.0f);29712972bool status = astc_helpers::pack_astc_block(decoded_blocks(cur_bx, cur_by), log_blk);2973if (!status)2974return false;29752976cur_bx++;2977if (cur_bx == num_blocks_x)2978{2979cur_bx = 0;2980cur_by++;2981}29822983break;2984}2985case encoding_type::cReuse:2986{2987if (!cur_bx && !cur_by)2988return false;29892990const uint32_t reuse_delta_index = decoder.get_bits(REUSE_XY_DELTA_BITS);29912992const int reuse_delta_x = g_reuse_xy_deltas[reuse_delta_index].m_x;2993const int reuse_delta_y = g_reuse_xy_deltas[reuse_delta_index].m_y;29942995const int prev_bx = cur_bx + reuse_delta_x, prev_by = cur_by + reuse_delta_y;2996if ((prev_bx < 0) || (prev_bx >= (int)num_blocks_x))2997return false;2998if (prev_by < 0)2999return false;30003001const astc_helpers::log_astc_block& prev_log_blk = decoded_log_blocks(prev_bx, prev_by);3002const astc_helpers::astc_block& prev_phys_blk = decoded_blocks(prev_bx, prev_by);30033004if (prev_log_blk.m_solid_color_flag_hdr)3005return false;30063007astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);3008astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);30093010log_blk = prev_log_blk;30113012const uint32_t total_grid_weights = log_blk.m_grid_width * log_blk.m_grid_height * (log_blk.m_dual_plane ? 2 : 1);30133014bool status = basist::astc_6x6_hdr::decode_values(decoder, total_grid_weights, log_blk.m_weight_ise_range, log_blk.m_weights);3015if (!status)3016return false;30173018astc_helpers::log_astc_block decomp_blk;3019status = astc_helpers::unpack_block(&prev_phys_blk, decomp_blk, BLOCK_W, BLOCK_H);3020if (!status)3021return false;30223023uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];3024basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, log_blk.m_weight_ise_range, transcode_weights, decomp_blk.m_weight_ise_range);30253026copy_weight_grid(log_blk.m_dual_plane, log_blk.m_grid_width, log_blk.m_grid_height, transcode_weights, decomp_blk);30273028status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);3029if (!status)3030return false;30313032cur_bx++;3033if (cur_bx == num_blocks_x)3034{3035cur_bx = 0;3036cur_by++;3037}30383039break;3040}3041case encoding_type::cBlock:3042{3043const block_mode bm = (block_mode)decoder.decode_truncated_binary((uint32_t)block_mode::cBMTotalModes);3044const endpoint_mode em = (endpoint_mode)decoder.decode_truncated_binary((uint32_t)endpoint_mode::cTotal);30453046switch (em)3047{3048case endpoint_mode::cUseLeft:3049case endpoint_mode::cUseUpper:3050{3051int neighbor_bx = cur_bx, neighbor_by = cur_by;30523053if (em == endpoint_mode::cUseLeft)3054neighbor_bx--;3055else3056neighbor_by--;30573058if ((neighbor_bx < 0) || (neighbor_by < 0))3059return false;30603061const astc_helpers::log_astc_block& neighbor_blk = decoded_log_blocks(neighbor_bx, neighbor_by);3062if (!neighbor_blk.m_color_endpoint_modes[0])3063return false;30643065const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm];3066const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem);30673068if (bmd.m_cem != neighbor_blk.m_color_endpoint_modes[0])3069return false;30703071astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);3072astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);30733074log_blk.clear();3075log_blk.m_num_partitions = 1;3076log_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;3077log_blk.m_endpoint_ise_range = neighbor_blk.m_endpoint_ise_range;3078log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range;3079log_blk.m_grid_width = (uint8_t)bmd.m_grid_x;3080log_blk.m_grid_height = (uint8_t)bmd.m_grid_y;3081log_blk.m_dual_plane = (uint8_t)bmd.m_dp;3082log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;30833084memcpy(log_blk.m_endpoints, neighbor_blk.m_endpoints, num_endpoint_values);30853086const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1);30873088bool status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights);3089if (!status)3090return false;30913092astc_helpers::log_astc_block decomp_blk;3093decomp_blk.clear();30943095decomp_blk.m_num_partitions = 1;3096decomp_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;3097decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range;3098decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range;3099decomp_blk.m_dual_plane = bmd.m_dp;3100decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;31013102basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, log_blk.m_endpoint_ise_range, log_blk.m_endpoints, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints);31033104uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];3105basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range);31063107copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk);31083109status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);3110if (!status)3111return false;31123113cur_bx++;3114if (cur_bx == num_blocks_x)3115{3116cur_bx = 0;3117cur_by++;3118}31193120break;3121}3122case endpoint_mode::cUseLeftDelta:3123case endpoint_mode::cUseUpperDelta:3124{3125int neighbor_bx = cur_bx, neighbor_by = cur_by;31263127if (em == endpoint_mode::cUseLeftDelta)3128neighbor_bx--;3129else3130neighbor_by--;31313132if ((neighbor_bx < 0) || (neighbor_by < 0))3133return false;31343135const astc_helpers::log_astc_block& neighbor_blk = decoded_log_blocks(neighbor_bx, neighbor_by);3136if (!neighbor_blk.m_color_endpoint_modes[0])3137return false;31383139const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm];3140const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem);31413142if (bmd.m_cem != neighbor_blk.m_color_endpoint_modes[0])3143return false;31443145astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);3146astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);31473148log_blk.clear();3149log_blk.m_num_partitions = 1;3150log_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;3151log_blk.m_dual_plane = bmd.m_dp;3152log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;31533154log_blk.m_endpoint_ise_range = (uint8_t)bmd.m_endpoint_ise_range;3155basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, neighbor_blk.m_endpoint_ise_range, neighbor_blk.m_endpoints, bmd.m_endpoint_ise_range, log_blk.m_endpoints);31563157const int total_endpoint_delta_vals = 1 << NUM_ENDPOINT_DELTA_BITS;3158const int low_delta_limit = -(total_endpoint_delta_vals / 2); // high_delta_limit = (total_endpoint_delta_vals / 2) - 1;31593160const auto& ise_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range).m_ISE_to_rank;3161const auto& rank_to_ise = astc_helpers::g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range).m_rank_to_ISE;3162const int total_endpoint_levels = astc_helpers::get_ise_levels(log_blk.m_endpoint_ise_range);31633164for (uint32_t i = 0; i < num_endpoint_values; i++)3165{3166int cur_val = ise_to_rank[log_blk.m_endpoints[i]];31673168int delta = (int)decoder.get_bits(NUM_ENDPOINT_DELTA_BITS) + low_delta_limit;31693170cur_val += delta;3171if ((cur_val < 0) || (cur_val >= total_endpoint_levels))3172return false;31733174log_blk.m_endpoints[i] = rank_to_ise[cur_val];3175}31763177log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range;3178log_blk.m_grid_width = (uint8_t)bmd.m_grid_x;3179log_blk.m_grid_height = (uint8_t)bmd.m_grid_y;31803181const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1);31823183bool status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights);3184if (!status)3185return false;31863187astc_helpers::log_astc_block decomp_blk;3188decomp_blk.clear();31893190decomp_blk.m_num_partitions = 1;3191decomp_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;3192decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range;3193decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range;3194decomp_blk.m_dual_plane = (uint8_t)bmd.m_dp;3195decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;31963197basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, log_blk.m_endpoint_ise_range, log_blk.m_endpoints, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints);31983199uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];3200basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range);32013202copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk);32033204status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);3205if (!status)3206return false;32073208cur_bx++;3209if (cur_bx == num_blocks_x)3210{3211cur_bx = 0;3212cur_by++;3213}32143215break;3216}3217case endpoint_mode::cRaw:3218{3219const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm];32203221const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem);32223223astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);3224astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);32253226log_blk.clear();3227log_blk.m_num_partitions = (uint8_t)bmd.m_num_partitions;32283229for (uint32_t p = 0; p < bmd.m_num_partitions; p++)3230log_blk.m_color_endpoint_modes[p] = (uint8_t)bmd.m_cem;32313232log_blk.m_endpoint_ise_range = (uint8_t)bmd.m_endpoint_ise_range;3233log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range;32343235log_blk.m_grid_width = (uint8_t)bmd.m_grid_x;3236log_blk.m_grid_height = (uint8_t)bmd.m_grid_y;3237log_blk.m_dual_plane = (uint8_t)bmd.m_dp;3238log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;32393240if (bmd.m_num_partitions == 2)3241{3242const uint32_t unique_partition_index = decoder.decode_truncated_binary(NUM_UNIQUE_PARTITIONS2);3243log_blk.m_partition_id = (uint16_t)g_part2_unique_index_to_seed[unique_partition_index];3244}3245else if (bmd.m_num_partitions == 3)3246{3247const uint32_t unique_partition_index = decoder.decode_truncated_binary(NUM_UNIQUE_PARTITIONS3);3248log_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_partition_index];3249}32503251bool status = decode_values(decoder, num_endpoint_values * bmd.m_num_partitions, bmd.m_endpoint_ise_range, log_blk.m_endpoints);3252if (!status)3253return false;32543255const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1);32563257status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights);3258if (!status)3259return false;32603261astc_helpers::log_astc_block decomp_blk;3262decomp_blk.clear();32633264decomp_blk.m_dual_plane = bmd.m_dp;3265decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;3266decomp_blk.m_partition_id = log_blk.m_partition_id;32673268decomp_blk.m_num_partitions = (uint8_t)bmd.m_num_partitions;32693270for (uint32_t p = 0; p < bmd.m_num_partitions; p++)3271decomp_blk.m_color_endpoint_modes[p] = (uint8_t)bmd.m_cem;32723273decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range;3274decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range;32753276for (uint32_t p = 0; p < bmd.m_num_partitions; p++)3277basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, bmd.m_endpoint_ise_range, log_blk.m_endpoints + num_endpoint_values * p, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints + num_endpoint_values * p);32783279uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];3280basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range);32813282copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk);32833284status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);3285if (!status)3286return false;32873288cur_bx++;3289if (cur_bx == num_blocks_x)3290{3291cur_bx = 0;3292cur_by++;3293}32943295break;3296}3297default:3298{3299assert(0);3300return false;3301}3302}33033304break;3305}3306default:3307{3308assert(0);3309return false;3310}3311}3312}33133314if (decoder.get_bits(16) != 0xA742)3315{3316fmt_error_printf("End marker not found!\n");3317return false;3318}33193320//fmt_printf("Total decode_file() time: {} secs\n", tm.get_elapsed_secs());33213322return true;3323}33243325static bool unpack_physical_astc_block(const void* pBlock, uint32_t block_width, uint32_t block_height, vec4F* pPixels)3326{3327astc_helpers::log_astc_block log_blk;3328if (!astc_helpers::unpack_block(pBlock, log_blk, block_width, block_height))3329return false;33303331basist::half_float half_block[MAX_BLOCK_W * MAX_BLOCK_H][4];3332if (!astc_helpers::decode_block(log_blk, half_block, block_width, block_height, astc_helpers::cDecodeModeHDR16))3333return false;33343335const uint32_t total_block_pixels = block_width * block_height;3336for (uint32_t p = 0; p < total_block_pixels; p++)3337{3338pPixels[p][0] = basist::half_to_float(half_block[p][0]);3339pPixels[p][1] = basist::half_to_float(half_block[p][1]);3340pPixels[p][2] = basist::half_to_float(half_block[p][2]);3341pPixels[p][3] = basist::half_to_float(half_block[p][3]);3342}33433344return true;3345}33463347static bool unpack_physical_astc_block_google(const void* pBlock, uint32_t block_width, uint32_t block_height, vec4F* pPixels)3348{3349return basisu_astc::astc::decompress_hdr((float *)pPixels, (uint8_t*)pBlock, block_width, block_height);3350}33513352static bool pack_bc6h_image(const imagef &src_img, vector2D<basist::bc6h_block> &bc6h_blocks, imagef *pPacked_bc6h_img, const fast_bc6h_params &enc_params)3353{3354const uint32_t width = src_img.get_width();3355const uint32_t height = src_img.get_height();33563357if (pPacked_bc6h_img)3358pPacked_bc6h_img->resize(width, height);33593360interval_timer tm;3361double total_enc_time = 0.0f;33623363const uint32_t num_blocks_x = src_img.get_block_width(4);3364const uint32_t num_blocks_y = src_img.get_block_height(4);33653366bc6h_blocks.resize(num_blocks_x, num_blocks_y);33673368for (uint32_t by = 0; by < num_blocks_y; by++)3369{3370for (uint32_t bx = 0; bx < num_blocks_x; bx++)3371{3372// Extract source image block3373vec4F block_pixels[4][4]; // [y][x]3374src_img.extract_block_clamped(&block_pixels[0][0], bx * 4, by * 4, 4, 4);33753376basist::half_float half_pixels[16 * 3]; // [y][x]33773378for (uint32_t y = 0; y < 4; y++)3379{3380for (uint32_t x = 0; x < 4; x++)3381{3382for (uint32_t c = 0; c < 3; c++)3383{3384float v = block_pixels[y][x][c];33853386basist::half_float h = basist::float_to_half(v);33873388half_pixels[(x + y * 4) * 3 + c] = h;33893390} // c33913392} // x3393} // y33943395basist::bc6h_block& bc6h_blk = bc6h_blocks(bx, by);33963397tm.start();33983399basist::astc_6x6_hdr::fast_encode_bc6h(half_pixels, &bc6h_blk, enc_params);34003401total_enc_time += tm.get_elapsed_secs();34023403if (pPacked_bc6h_img)3404{3405basist::half_float unpacked_blk[16 * 3];3406bool status = unpack_bc6h(&bc6h_blk, unpacked_blk, false);3407assert(status);3408if (!status)3409{3410fmt_error_printf("unpack_bc6h() failed\n");3411return false;3412}34133414for (uint32_t y = 0; y < 4; y++)3415{3416for (uint32_t x = 0; x < 4; x++)3417{3418vec4F p;34193420for (uint32_t c = 0; c < 3; c++)3421{3422float v = basist::half_to_float(unpacked_blk[(x + y * 4) * 3 + c]);3423p[c] = v;34243425} // c34263427p[3] = 1.0f;34283429pPacked_bc6h_img->set_clipped(bx * 4 + x, by * 4 + y, p);3430} // x3431} // y3432}34333434} // bx3435} // by34363437//fmt_printf("Total BC6H encode time: {}\n", total_enc_time);34383439return true;3440}34413442static float dist_to_line_squared(const vec3F& p, const vec3F &line_org, const vec3F &line_dir)3443{3444vec3F q(p - line_org);3445vec3F v(q - q.dot(line_dir) * line_dir);3446return v.dot(v);3447}34483449static void estimate_partitions_mode7_and_11(3450uint32_t num_parts, // 2 or 3 partitions3451uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, // list of all unique, canonicalized patterns3452uint32_t num_pats_to_examine, const uint32_t* pUnique_pat_indices_to_examine, // indices of pats to examine3453const vec3F *pHalf_pixels_as_floats, // block's half pixel values casted to floats3454const astc_hdr_codec_base_options& coptions, // options3455uint32_t num_desired_pats,3456int *pDesired_pat_indices_mode11, int *pDesired_pat_indices_mode7) // output indices3457{3458BASISU_NOTE_UNUSED(coptions);3459BASISU_NOTE_UNUSED(num_unique_pats);34603461const uint32_t BLOCK_W = 6, BLOCK_H = 6, MAX_PARTS = 3; // BLOCK_T = 6 * 63462assert(num_parts <= MAX_PARTS);34633464struct candidate_res3465{3466float m_total_sq_dist;3467uint32_t m_index;3468bool operator< (const candidate_res& rhs) const { return m_total_sq_dist < rhs.m_total_sq_dist; }3469};34703471const uint32_t MAX_CANDIDATES = 1024;3472assert(num_desired_pats && (num_desired_pats <= MAX_CANDIDATES));34733474candidate_res mode11_candidates[MAX_CANDIDATES];3475candidate_res mode7_candidates[MAX_CANDIDATES];34763477const vec3F grayscale_axis(0.5773502691f);34783479for (uint32_t examine_iter = 0; examine_iter < num_pats_to_examine; examine_iter++)3480{3481const uint32_t unique_part_index = pUnique_pat_indices_to_examine[examine_iter];3482assert(unique_part_index < num_unique_pats);34833484const partition_pattern_vec* pPat = &pUnique_pats[unique_part_index];34853486vec3F part_means[MAX_PARTS];3487uint32_t part_total_texels[MAX_PARTS] = { 0 };34883489for (uint32_t i = 0; i < num_parts; i++)3490part_means[i].clear();34913492for (uint32_t y = 0; y < BLOCK_H; y++)3493{3494for (uint32_t x = 0; x < BLOCK_W; x++)3495{3496const uint32_t part_index = (*pPat)(x, y);3497assert(part_index < num_parts);34983499part_means[part_index] += pHalf_pixels_as_floats[x + y * BLOCK_W];3500part_total_texels[part_index]++;35013502} // x3503} // y35043505for (uint32_t i = 0; i < num_parts; i++)3506{3507assert(part_total_texels[i]);3508part_means[i] /= (float)part_total_texels[i];3509}35103511float part_cov[MAX_PARTS][6];3512memset(part_cov, 0, sizeof(part_cov));35133514for (uint32_t y = 0; y < BLOCK_H; y++)3515{3516for (uint32_t x = 0; x < BLOCK_W; x++)3517{3518const uint32_t part_index = (*pPat)(x, y);3519assert(part_index < num_parts);35203521const vec3F p(pHalf_pixels_as_floats[x + y * BLOCK_W] - part_means[part_index]);35223523const float r = p[0], g = p[1], b = p[2];35243525part_cov[part_index][0] += r * r;3526part_cov[part_index][1] += r * g;3527part_cov[part_index][2] += r * b;3528part_cov[part_index][3] += g * g;3529part_cov[part_index][4] += g * b;3530part_cov[part_index][5] += b * b;35313532} // x3533} // y35343535// For each partition compute the total variance of all channels.3536float total_variance[MAX_PARTS];3537for (uint32_t part_index = 0; part_index < num_parts; part_index++)3538total_variance[part_index] = part_cov[part_index][0] + part_cov[part_index][3] + part_cov[part_index][5];35393540vec3F part_axis[MAX_PARTS];3541float mode11_eigenvalue_est[MAX_PARTS]; // For each partition, compute the variance along the principle axis3542float mode7_eigenvalue_est[MAX_PARTS]; // For each partition, compute the variance along the principle axis35433544for (uint32_t part_index = 0; part_index < num_parts; part_index++)3545{3546float* pCov = &part_cov[part_index][0];35473548float xr = .9f, xg = 1.0f, xb = .7f;35493550const uint32_t NUM_POWER_ITERS = 4;3551for (uint32_t iter = 0; iter < NUM_POWER_ITERS; iter++)3552{3553float r = xr * pCov[0] + xg * pCov[1] + xb * pCov[2];3554float g = xr * pCov[1] + xg * pCov[3] + xb * pCov[4];3555float b = xr * pCov[2] + xg * pCov[4] + xb * pCov[5];35563557float m = maximumf(maximumf(fabsf(r), fabsf(g)), fabsf(b));35583559if (m >= 1e-10f)3560{3561m = 1.0f / m;35623563r *= m;3564g *= m;3565b *= m;3566}35673568xr = r;3569xg = g;3570xb = b;3571}35723573float len_sq = xr * xr + xg * xg + xb * xb;35743575if (len_sq < 1e-10f)3576{3577xr = grayscale_axis[0];3578xg = grayscale_axis[0];3579xb = grayscale_axis[0];3580}3581else3582{3583len_sq = 1.0f / sqrtf(len_sq);35843585xr *= len_sq;3586xg *= len_sq;3587xb *= len_sq;3588}35893590{3591// Transform the principle axis by the covariance matrix, which will scale the vector by its eigenvalue (the variance of the dataset projected onto the principle axis).3592float r = xr * pCov[0] + xg * pCov[1] + xb * pCov[2];3593float g = xr * pCov[1] + xg * pCov[3] + xb * pCov[4];3594float b = xr * pCov[2] + xg * pCov[4] + xb * pCov[5];35953596// Estimate the principle eigenvalue by computing the magnitude of the transformed vector.3597// The result is the variance along the principle axis.3598//float z1 = sqrtf(r * r + g * g + b * b); // this works with the principle axis3599//float z2 = r * xr + g * xg + b * xb; // compute length projected along xr,xg,xb36003601mode11_eigenvalue_est[part_index] = r * xr + g * xg + b * xb;3602}36033604{3605const float yrgb = grayscale_axis[0];36063607// Transform the grayscale axis by the covariance matrix, which will scale the vector by the eigenvalue (which is the variance of the dataset projected onto this vector).3608float r = yrgb * pCov[0] + yrgb * pCov[1] + yrgb * pCov[2];3609float g = yrgb * pCov[1] + yrgb * pCov[3] + yrgb * pCov[4];3610float b = yrgb * pCov[2] + yrgb * pCov[4] + yrgb * pCov[5];36113612mode7_eigenvalue_est[part_index] = r * yrgb + g * yrgb + b * yrgb;3613}36143615} // part_index36163617// Compute the total variance (squared error) of the other 2 axes by subtracting the total variance of all channels by the variance of the principle axis.3618// TODO: Could also compute the ratio of the principle axis's variance vs. the total variance.3619float mode11_total_sq_dist_to_line_alt = 0.0f;3620for (uint32_t part_index = 0; part_index < num_parts; part_index++)3621{3622float d = maximum(0.0f, total_variance[part_index] - mode11_eigenvalue_est[part_index]);3623mode11_total_sq_dist_to_line_alt += d;3624}36253626{3627#if 03628// TODO: This total distance can be computed rapidly. First compute the total variance of each channel (sum the diag entries of the covar matrix),3629// then compute the principle eigenvalue, and subtract. The result is the variance of the projection distances.3630float total_sq_dist_to_line = 0.0f;3631for (uint32_t i = 0; i < BLOCK_T; i++)3632{3633const uint32_t part_index = (*pPat)[i];3634assert(part_index < num_parts);36353636total_sq_dist_to_line += dist_to_line_squared(pHalf_pixels_as_floats[i], part_means[part_index], part_axis[part_index]);3637}36383639mode11_candidates[examine_iter].m_total_sq_dist = total_sq_dist_to_line;3640#else3641mode11_candidates[examine_iter].m_total_sq_dist = mode11_total_sq_dist_to_line_alt;3642#endif3643mode11_candidates[examine_iter].m_index = unique_part_index;3644}36453646{3647float mode7_total_sq_dist_to_line_alt = 0.0f;3648for (uint32_t part_index = 0; part_index < num_parts; part_index++)3649{3650float d = maximum(0.0f, total_variance[part_index] - mode7_eigenvalue_est[part_index]);3651mode7_total_sq_dist_to_line_alt += d;3652}36533654mode7_candidates[examine_iter].m_total_sq_dist = mode7_total_sq_dist_to_line_alt;3655mode7_candidates[examine_iter].m_index = unique_part_index;3656}36573658} // examine_iter36593660std::sort(&mode11_candidates[0], &mode11_candidates[num_pats_to_examine]);3661std::sort(&mode7_candidates[0], &mode7_candidates[num_pats_to_examine]);36623663for (uint32_t i = 0; i < num_desired_pats; i++)3664pDesired_pat_indices_mode11[i] = mode11_candidates[i].m_index;36653666for (uint32_t i = 0; i < num_desired_pats; i++)3667pDesired_pat_indices_mode7[i] = mode7_candidates[i].m_index;3668}36693670static void estimate_partitions_mode7(3671uint32_t num_parts, // 2 or 3 partitions3672uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, // list of all unique, canonicalized patterns3673uint32_t num_pats_to_examine, const uint32_t* pUnique_pat_indices_to_examine, // indices of pats to examine3674const vec3F* pHalf_pixels_as_floats, // block's half pixel values casted to floats3675const astc_hdr_codec_base_options& coptions, // options3676uint32_t num_desired_pats, uint32_t* pDesired_pat_indices) // output indices3677{3678BASISU_NOTE_UNUSED(coptions);3679BASISU_NOTE_UNUSED(num_unique_pats);36803681const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = 6 * 6, MAX_PARTS = 3;3682assert(num_parts <= MAX_PARTS);36833684struct candidate_res3685{3686float m_total_sq_dist;3687uint32_t m_index;3688bool operator< (const candidate_res& rhs) const { return m_total_sq_dist < rhs.m_total_sq_dist; }3689};36903691const uint32_t MAX_CANDIDATES = 1024;3692assert(num_desired_pats && (num_desired_pats <= MAX_CANDIDATES));36933694candidate_res candidates[MAX_CANDIDATES];36953696for (uint32_t examine_iter = 0; examine_iter < num_pats_to_examine; examine_iter++)3697{3698const uint32_t unique_part_index = pUnique_pat_indices_to_examine[examine_iter];3699assert(unique_part_index < num_unique_pats);37003701const partition_pattern_vec* pPat = &pUnique_pats[unique_part_index];37023703vec3F part_means[MAX_PARTS];3704uint32_t part_total_texels[MAX_PARTS] = { 0 };37053706for (uint32_t i = 0; i < num_parts; i++)3707part_means[i].clear();37083709for (uint32_t y = 0; y < BLOCK_H; y++)3710{3711for (uint32_t x = 0; x < BLOCK_W; x++)3712{3713const uint32_t part_index = (*pPat)(x, y);3714assert(part_index < num_parts);37153716part_means[part_index] += pHalf_pixels_as_floats[x + y * BLOCK_W];3717part_total_texels[part_index]++;37183719} // x3720} // y37213722for (uint32_t i = 0; i < num_parts; i++)3723{3724assert(part_total_texels[i]);3725part_means[i] /= (float)part_total_texels[i];3726}37273728vec3F part_axis(0.5773502691f);37293730// TODO: This total distance can be computed rapidly. First compute the total variance of each channel (sum the diag entries of the covar matrix),3731// then compute the principle eigenvalue, and subtract. The result is the variance of the projection distances.3732float total_sq_dist_to_line = 0.0f;3733for (uint32_t i = 0; i < BLOCK_T; i++)3734{3735const uint32_t part_index = (*pPat)[i];3736assert(part_index < num_parts);37373738total_sq_dist_to_line += dist_to_line_squared(pHalf_pixels_as_floats[i], part_means[part_index], part_axis);3739}37403741candidates[examine_iter].m_total_sq_dist = total_sq_dist_to_line;37423743candidates[examine_iter].m_index = unique_part_index;37443745} // examine_iter37463747std::sort(&candidates[0], &candidates[num_pats_to_examine]);37483749for (uint32_t i = 0; i < num_desired_pats; i++)3750pDesired_pat_indices[i] = candidates[i].m_index;3751}37523753static float calc_deblocking_penalty_itp(3754uint32_t bx, uint32_t by, uint32_t width, uint32_t height,3755const imagef& pass_src_img_itp, const candidate_encoding& candidate)3756{3757float total_deblock_penalty = 0.0f;37583759float total_orig_mse = 0.0f, total_comp_mse = 0.0f;3760uint32_t total_c = 0;37613762for (uint32_t b = 0; b < 4; b++)3763{3764for (uint32_t i = 0; i < 6; i++)3765{3766int ox = 0, oy = 0, qx = 0, qy = 0;37673768switch (b)3769{3770case 0:3771ox = bx * 6 + i; oy = (by - 1) * 6 + 5;3772qx = bx * 6 + i; qy = by * 6;3773break;3774case 1:3775ox = bx * 6 + i; oy = (by + 1) * 6;3776qx = bx * 6 + i; qy = by * 6 + 5;3777break;3778case 2:3779ox = (bx - 1) * 6 + 5; oy = by * 6 + i;3780qx = bx * 6; qy = by * 6 + i;3781break;3782case 3:3783ox = (bx + 1) * 6; oy = by * 6 + i;3784qx = bx * 6 + 5; qy = by * 6 + i;3785break;3786}37873788if ((ox < 0) || (oy < 0) || (ox >= (int)width) || (oy >= (int)height))3789continue;37903791const vec3F& o_pixel_itp = pass_src_img_itp(ox, oy);3792const vec3F& q_pixel_itp = pass_src_img_itp(qx, qy);37933794const vec3F &d_pixel_itp = candidate.m_comp_pixels_itp[qy - by * 6][qx - bx * 6]; // compressed block37953796vec3F orig_delta_v(o_pixel_itp - q_pixel_itp);3797total_orig_mse += square(orig_delta_v[0]) + square(orig_delta_v[1]) + square(orig_delta_v[2]);37983799vec3F d_delta_v(o_pixel_itp - d_pixel_itp);3800total_comp_mse += square(d_delta_v[0]) + square(d_delta_v[1]) + square(d_delta_v[2]);38013802total_c++;3803}3804}38053806if (total_c)3807{3808total_orig_mse /= (float)total_c;3809total_comp_mse /= (float)total_c;38103811if (total_orig_mse)3812{3813total_deblock_penalty = fabsf((total_comp_mse - total_orig_mse) / total_orig_mse);3814}3815}38163817return total_deblock_penalty;3818}38193820static bool calc_strip_size(3821float lambda,3822uint32_t num_blocks_y, uint32_t total_threads, bool force_one_strip,3823uint32_t& res_total_strips, uint32_t& res_rows_per_strip, astc_hdr_6x6_global_config &global_cfg)3824{3825uint32_t total_strips = 1;38263827if (lambda == 0.0f)3828{3829if (!force_one_strip)3830{3831total_strips = total_threads;3832}3833}3834else3835{3836const uint32_t MIN_DESIRED_STRIPS = 8;3837const uint32_t MAX_TARGET_STRIPS = 32;3838const uint32_t TARGET_ASTC_6X6_ROWS_PER_STRIP = 12;38393840if (!force_one_strip)3841{3842total_strips = maximum<uint32_t>(1, num_blocks_y / TARGET_ASTC_6X6_ROWS_PER_STRIP);38433844if (num_blocks_y >= MIN_DESIRED_STRIPS * 2)3845total_strips = maximum(total_strips, MIN_DESIRED_STRIPS);3846}38473848total_strips = minimum(total_strips, MAX_TARGET_STRIPS);3849}38503851uint32_t rows_per_strip = 0;3852if (total_strips <= 1)3853{3854rows_per_strip = num_blocks_y;3855}3856else3857{3858rows_per_strip = (num_blocks_y / total_strips) & ~1;38593860if (rows_per_strip < 2)3861rows_per_strip = 2;// num_blocks_y;3862}38633864assert((rows_per_strip == num_blocks_y) || ((rows_per_strip & 1) == 0));38653866total_strips = (num_blocks_y + rows_per_strip - 1) / rows_per_strip;38673868if (global_cfg.m_debug_output)3869{3870fmt_printf("num_blocks_y: {}, total_threads : {}, Total strips : {}\n", num_blocks_y, total_threads, total_strips);3871fmt_printf("ASTC 6x6 block rows per strip: {}\n", rows_per_strip);3872fmt_printf("ASTC 6x6 block rows on final strip: {}\n", num_blocks_y - (total_strips - 1) * rows_per_strip);3873}38743875uint32_t total_rows = 0;3876for (uint32_t strip_index = 0; strip_index < total_strips; strip_index++)3877{3878uint32_t strip_first_by = strip_index * rows_per_strip;3879uint32_t strip_last_by = minimum<uint32_t>(strip_first_by + rows_per_strip - 1, num_blocks_y);38803881if (strip_index == (total_strips - 1))3882strip_last_by = num_blocks_y - 1;38833884uint32_t num_strip_block_rows = (strip_last_by - strip_first_by) + 1;3885total_rows += num_strip_block_rows;38863887if (global_cfg.m_debug_output)3888fmt_printf("Strip row: {}, total block rows: {}\n", strip_index, num_strip_block_rows);3889}38903891if (total_rows != num_blocks_y)3892{3893fmt_error_printf("Strip calc failed\n");3894return false;3895}38963897res_total_strips = total_strips;3898res_rows_per_strip = rows_per_strip;38993900return true;3901}39023903static void convet_rgb_image_to_itp(const imagef &src_img, imagef &dst_img, const astc_hdr_6x6_global_config& cfg)3904{3905const uint32_t width = src_img.get_width(), height = src_img.get_height();39063907dst_img.resize(width, height);39083909for (uint32_t y = 0; y < height; y++)3910{3911for (uint32_t x = 0; x < width; x++)3912{3913vec3F src_rgb(src_img(x, y));39143915vec3F src_itp;3916linear_rgb_to_itp(src_rgb, src_itp, cfg);39173918dst_img(x, y) = src_itp;3919}3920}3921}39223923const uint32_t BLOCK_W = 6, BLOCK_H = 6;3924const uint32_t NUM_BLOCK_PIXELS = BLOCK_W * BLOCK_H;39253926const float SOLID_PENALTY = 4.0f;3927const float REUSE_PENALTY = 1.0f;3928const float RUN_PENALTY = 10.0f;39293930const float MSE_WEIGHT = 300000.0f;3931const float SSIM_WEIGHT = 200.0f;3932const float TWO_LEVEL_PENALTY = 1.425f;3933const float SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM = .04f;3934const float SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM = .04f;3935const float COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY = 1.5f;3936const float COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY = 1.25f;3937const float COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY = 1.15f;39383939struct uastc_hdr_6x6_debug_state3940{3941uint32_t m_encoding_type_hist[(uint32_t)encoding_type::cTotal] = { 0 };3942uint32_t m_endpoint_mode_hist[(uint32_t)endpoint_mode::cTotal] = { 0 };3943uint32_t m_block_mode_hist[(uint32_t)block_mode::cBMTotalModes] = { 0 };3944uint64_t m_block_mode_total_bits[(uint32_t)block_mode::cBMTotalModes] = { 0 };39453946basisu::vector< basisu::stats<float> > m_block_mode_comp_stats[(uint32_t)block_mode::cBMTotalModes][3];3947basisu::vector< basisu::comparative_stats<float> > m_block_mode_comparative_stats[(uint32_t)block_mode::cBMTotalModes][3];39483949std::atomic<uint32_t> m_total_gaussian1_blocks;3950std::atomic<uint32_t> m_total_gaussian2_blocks;3951std::atomic<uint32_t> m_total_filter_horizontal;3952std::atomic<uint32_t> m_detail_stats[5];3953std::atomic<uint32_t> m_total_mode7_skips;39543955std::atomic<uint32_t> m_total_blocks_compressed;39563957std::atomic<uint32_t> m_total_candidates_considered;3958std::atomic<uint32_t> m_max_candidates_considered;39593960std::atomic<uint32_t> m_total_part2_stats[4];3961std::atomic<uint32_t> m_dp_stats[5];39623963std::atomic<uint32_t> m_reuse_num_parts[4];3964std::atomic<uint32_t> m_reuse_total_dp;39653966imagef m_stat_vis;3967std::mutex m_stat_vis_mutex;39683969image m_part_vis;3970image m_mode_vis;3971image m_mode_vis2;3972image m_grid_vis;3973image m_enc_vis;3974std::mutex m_vis_image_mutex;39753976std::atomic<uint32_t> m_comp_level_hist[ASTC_HDR_6X6_MAX_COMP_LEVEL + 1];39773978std::atomic<uint32_t> m_total_jnd_replacements;39793980std::mutex m_stats_mutex;39813982uastc_hdr_6x6_debug_state()3983{3984for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++)3985{3986for (uint32_t j = 0; j < 3; j++)3987{3988m_block_mode_comp_stats[i][j].reserve(512);3989m_block_mode_comparative_stats[i][j].reserve(512);3990}3991}3992}39933994void init(uint32_t width, uint32_t height)3995{3996m_stat_vis.resize(width, height);3997m_part_vis.resize(width, height);3998m_mode_vis.resize(width, height);3999m_mode_vis2.resize(width, height);4000m_grid_vis.resize(width, height);4001m_enc_vis.resize(width, height);40024003basisu::clear_obj(m_encoding_type_hist);4004basisu::clear_obj(m_endpoint_mode_hist);4005basisu::clear_obj(m_block_mode_hist);4006basisu::clear_obj(m_block_mode_total_bits);40074008for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++)4009{4010for (uint32_t j = 0; j < 3; j++)4011{4012m_block_mode_comp_stats[i][j].clear();4013m_block_mode_comparative_stats[i][j].clear();4014}4015}40164017m_total_gaussian1_blocks.store(0);4018m_total_gaussian2_blocks.store(0);4019m_total_filter_horizontal.store(0);4020for (uint32_t i = 0; i < std::size(m_detail_stats); i++)4021m_detail_stats[i].store(0);4022m_total_mode7_skips.store(0);40234024for (uint32_t i = 0; i < std::size(m_comp_level_hist); i++)4025m_comp_level_hist[i].store(0);40264027m_total_blocks_compressed.store(0);40284029m_total_candidates_considered.store(0);4030m_max_candidates_considered.store(0);40314032for (uint32_t i = 0; i < std::size(m_total_part2_stats); i++)4033m_total_part2_stats[i].store(0);40344035for (uint32_t i = 0; i < std::size(m_dp_stats); i++)4036m_dp_stats[i].store(0);40374038for (uint32_t i = 0; i < std::size(m_reuse_num_parts); i++)4039m_reuse_num_parts[i] .store(0);40404041m_reuse_total_dp.store(0);40424043m_total_jnd_replacements.store(0);4044}40454046void print(uint32_t total_blocks) const4047{4048fmt_printf("Total blocks: {}\n", total_blocks);4049fmt_printf("Total JND replacements: {} {3.2}%\n", m_total_jnd_replacements, (float)m_total_jnd_replacements * 100.0f / (float)total_blocks);4050fmt_printf("Comp level histogram: {} {} {} {} {}\n", m_comp_level_hist[0], m_comp_level_hist[1], m_comp_level_hist[2], m_comp_level_hist[3], m_comp_level_hist[4]);4051fmt_printf("Total gaussian 1 blocks: {} {3.2}%\n", m_total_gaussian1_blocks, (float)m_total_gaussian1_blocks * 100.0f / (float)total_blocks);4052fmt_printf("Total gaussian 2 blocks: {} {3.2}%\n", m_total_gaussian2_blocks, (float)m_total_gaussian2_blocks * 100.0f / (float)total_blocks);4053fmt_printf("Total filter horizontal: {} {3.2}%\n", m_total_filter_horizontal, (float)m_total_filter_horizontal * 100.0f / (float)total_blocks);4054fmt_printf("Detail stats: Detailed block low grid skip: {}, Blurry block skip: {}, Very blurry block skip: {}, NH:{} H:{}\n", m_detail_stats[0], m_detail_stats[1], m_detail_stats[2], m_detail_stats[3], m_detail_stats[4]);4055fmt_printf("Total mode7 skips: {}\n", m_total_mode7_skips);40564057fmt_printf("Total candidates: {}, {} avg per block\n", m_total_candidates_considered, (float)m_total_candidates_considered / (float)total_blocks);4058fmt_printf("Max ever candidates: {}\n", m_max_candidates_considered);40594060fmt_printf("Part2/3 stats: {} {} {} {}\n", m_total_part2_stats[0], m_total_part2_stats[1], m_total_part2_stats[2], m_total_part2_stats[3]);4061fmt_printf("Dual plane stats: {} {} {} {} {}\n", m_dp_stats[0], m_dp_stats[1], m_dp_stats[2], m_dp_stats[3], m_dp_stats[4]);4062fmt_printf("Reuse total dual plane: {}\n", m_reuse_total_dp);4063fmt_printf("Reuse part stats: {} {} {}\n", m_reuse_num_parts[1], m_reuse_num_parts[2], m_reuse_num_parts[3]);40644065fmt_printf("\nEncoding type histogram:\n");4066for (uint32_t i = 0; i < std::size(m_encoding_type_hist); i++)4067fmt_printf("{}: {}\n", i, m_encoding_type_hist[i]);40684069fmt_printf("\nEndpoint mode histogram:\n");4070for (uint32_t i = 0; i < std::size(m_endpoint_mode_hist); i++)4071fmt_printf("{}: {}\n", i, m_endpoint_mode_hist[i]);40724073fmt_printf("\nBlock mode histogram:\n");40744075uint32_t total_dp = 0, total_sp = 0;4076uint32_t total_mode11 = 0, total_mode7 = 0;4077uint32_t part_hist[3] = { 0 };4078uint32_t part2_mode7_total = 0, part2_mode11_total = 0;4079uint32_t total_used_modes = 0;4080for (uint32_t i = 0; i < std::size(m_block_mode_hist); i++)4081{4082const auto& bm_desc = g_block_mode_descs[i];40834084const uint32_t total_uses = m_block_mode_hist[i];40854086if (bm_desc.m_dp)4087total_dp += total_uses;4088else4089total_sp += total_uses;40904091if (bm_desc.m_cem == 7)4092total_mode7 += total_uses;4093else4094total_mode11 += total_uses;40954096part_hist[bm_desc.m_num_partitions - 1] += total_uses;40974098if (bm_desc.m_num_partitions == 2)4099{4100if (bm_desc.m_cem == 7)4101part2_mode7_total += total_uses;4102else4103{4104assert(bm_desc.m_cem == 11);4105part2_mode11_total += total_uses;4106}4107}41084109float avg_std_dev = 0.0f;4110float avg_cross_correlations[3] = { 0 };41114112if (m_block_mode_comp_stats[i][0].size())4113{4114const uint32_t num_uses = m_block_mode_comp_stats[i][0].size_u32();41154116for (uint32_t j = 0; j < num_uses; j++)4117avg_std_dev += (float)maximum(m_block_mode_comp_stats[i][0][j].m_std_dev, m_block_mode_comp_stats[i][1][j].m_std_dev, m_block_mode_comp_stats[i][2][j].m_std_dev);4118avg_std_dev /= (float)num_uses;41194120for (uint32_t j = 0; j < num_uses; j++)4121{4122avg_cross_correlations[0] += fabsf((float)m_block_mode_comparative_stats[i][0][j].m_pearson);4123avg_cross_correlations[1] += fabsf((float)m_block_mode_comparative_stats[i][1][j].m_pearson);4124avg_cross_correlations[2] += fabsf((float)m_block_mode_comparative_stats[i][2][j].m_pearson);4125}41264127avg_cross_correlations[0] /= (float)num_uses;4128avg_cross_correlations[1] /= (float)num_uses;4129avg_cross_correlations[2] /= (float)num_uses;4130}41314132fmt_printf("{ 2}: uses: { 6}, cem: {}, dp: {} chan: {}, parts: {}, grid: {}x{}, endpoint levels: {}, weight levels: {}, Avg bits: {}, Avg Max Std Dev: {}, RG: {} RB: {} GB: {}\n", i, total_uses,4133bm_desc.m_cem,4134bm_desc.m_dp, bm_desc.m_dp_channel,4135bm_desc.m_num_partitions,4136bm_desc.m_grid_x, bm_desc.m_grid_y,4137astc_helpers::get_ise_levels(bm_desc.m_endpoint_ise_range),4138astc_helpers::get_ise_levels(bm_desc.m_weight_ise_range),4139total_uses ? ((double)m_block_mode_total_bits[i] / total_uses) : 0.0f,4140avg_std_dev, avg_cross_correlations[0], avg_cross_correlations[1], avg_cross_correlations[2]);41414142if (total_uses)4143total_used_modes++;4144}41454146fmt_printf("Total used modes: {}\n", total_used_modes);41474148fmt_printf("Total single plane: {}, total dual plane: {}\n", total_sp, total_dp);4149fmt_printf("Total mode 11: {}, mode 7: {}\n", total_mode11, total_mode7);4150fmt_printf("Partition histogram: {} {} {}\n", part_hist[0], part_hist[1], part_hist[2]);4151fmt_printf("2 subset mode 7 uses: {}, mode 11 uses: {}\n", part2_mode7_total, part2_mode11_total);4152}4153};41544155struct uastc_hdr_6x6_encode_state4156{4157astc_hdr_codec_base_options master_coptions;41584159imagef src_img;41604161imagef src_img_filtered1;4162imagef src_img_filtered2;41634164imagef src_img_itp;4165imagef src_img_filtered1_itp;4166imagef src_img_filtered2_itp;41674168vector2D<float> smooth_block_mse_scales;41694170imagef packed_img;41714172basisu::vector<bitwise_coder> strip_bits;41734174basisu::vector2D<astc_helpers::astc_block> final_astc_blocks;41754176vector2D<candidate_encoding> coded_blocks;4177};41784179static bool compress_strip_task(4180uint32_t strip_index, uint32_t total_strips, uint32_t strip_first_by, uint32_t strip_last_by,4181uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t total_blocks, uint32_t width, uint32_t height,4182astc_hdr_6x6_global_config &global_cfg, uastc_hdr_6x6_debug_state &debug_state, uastc_hdr_6x6_encode_state &enc_state)4183{4184BASISU_NOTE_UNUSED(num_blocks_y);4185BASISU_NOTE_UNUSED(total_strips);41864187vec3F prev_comp_pixels[BLOCK_H][BLOCK_W]; // [y][x]4188basisu::clear_obj(prev_comp_pixels);41894190uint32_t prev_run_len = 0;41914192bitwise_coder prev_encoding;4193candidate_encoding prev_candidate_encoding; // the previous candidate written, which may have been a run extension4194candidate_encoding prev_non_run_candidate_encoding; // the previous *non-run* candidate written41954196bitwise_coder& strip_coded_bits = enc_state.strip_bits[strip_index];41974198const uint32_t CANDIDATES_TO_RESERVE = 1536;41994200basisu::vector<candidate_encoding> candidates;4201candidates.reserve(CANDIDATES_TO_RESERVE);42024203for (uint32_t by = strip_first_by; by <= strip_last_by; by++)4204{4205const bool has_upper_neighbor = by > strip_first_by;42064207for (uint32_t bx = 0; bx < num_blocks_x; bx++)4208{4209//if ((bx == 1) && (by == 2))4210// basisu::fmt_printf("!");42114212for (uint32_t outer_pass = 0; outer_pass < 3; outer_pass++)4213{4214const bool has_left_neighbor = bx > 0;4215//const bool has_prev = has_left_neighbor || has_upper_neighbor;42164217// Select either the original source image, or the Gaussian filtered version.4218// From here the encoder *must* use these 2 sources.4219const imagef& pass_src_img = (outer_pass == 2) ? enc_state.src_img_filtered2 :4220((outer_pass == 1) ? enc_state.src_img_filtered1 : enc_state.src_img);42214222const imagef& pass_src_img_itp = (outer_pass == 2) ? enc_state.src_img_filtered2_itp :4223((outer_pass == 1) ? enc_state.src_img_filtered1_itp : enc_state.src_img_itp);42244225// Extract source image block4226vec4F block_pixels[BLOCK_H][BLOCK_W]; // [y][x]4227pass_src_img.extract_block_clamped(&block_pixels[0][0], bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H);42284229vec4F block_pixels_itp[BLOCK_H][BLOCK_W]; // [y][x]4230pass_src_img_itp.extract_block_clamped(&block_pixels_itp[0][0], bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H);42314232half_vec3 half_pixels[BLOCK_H][BLOCK_W]; // [y][x] half-float values4233vec3F half_pixels_as_floats[BLOCK_H][BLOCK_W]; // [y][x] half float values, integer bits as floats4234vec4F block_pixels_q16[BLOCK_H][BLOCK_W]; // [y][x], q16 space for low-level ASTC encoding4235vec3F block_pixels_as_itp[BLOCK_H][BLOCK_W]; // [y][x] input converted to itp space, for faster error calculations42364237bool is_grayscale = true;42384239candidates.resize(0);42404241float block_ly = BIG_FLOAT_VAL, block_hy = 0.0f, block_avg_y = 0.0f;42424243for (uint32_t y = 0; y < BLOCK_H; y++)4244{4245for (uint32_t x = 0; x < BLOCK_W; x++)4246{4247vec3F rgb_input;42484249for (uint32_t c = 0; c < 3; c++)4250{4251float v = block_pixels[y][x][c];42524253rgb_input[c] = v;42544255const basist::half_float h = basisu::fast_float_to_half_no_clamp_neg_nan_or_inf(v);4256assert(h == basist::float_to_half(v));42574258half_pixels[y][x][c] = h;42594260block_pixels_q16[y][x][c] = (float)half_to_qlog16(h);42614262half_pixels_as_floats[y][x][c] = (float)h;42634264} // c42654266float py = rgb_input.dot(vec3F(REC_709_R, REC_709_G, REC_709_B));4267if (py < block_ly)4268block_ly = py;4269if (py > block_hy)4270block_hy = py;4271block_avg_y += py;42724273//linear_rgb_to_itp(rgb_input, block_pixels_as_itp[y][x]);42744275block_pixels_as_itp[y][x] = block_pixels_itp[y][x];42764277block_pixels_q16[y][x][3] = 0.0f;42784279if ((half_pixels[y][x][0] != half_pixels[y][x][1]) || (half_pixels[y][x][0] != half_pixels[y][x][2]))4280is_grayscale = false;42814282} // x4283} // y42844285block_avg_y *= (1.0f / (float)NUM_BLOCK_PIXELS);42864287encode_astc_block_stats enc_block_stats;4288enc_block_stats.init(NUM_BLOCK_PIXELS, &block_pixels_q16[0][0]);42894290vec4F x_filtered[6][6], y_filtered[6][6];42914292filter_block(3, 6, (vec4F*)block_pixels, (vec4F*)x_filtered); // filter rows (horizontal)4293filter_block(6, 3, (vec4F*)block_pixels, (vec4F*)y_filtered); // filter cols (vertically)42944295const float filtered_x_err = diff_blocks((vec4F*)block_pixels, (vec4F*)x_filtered);4296const float filtered_y_err = diff_blocks((vec4F*)block_pixels, (vec4F*)y_filtered);4297const bool filter_horizontally = filtered_x_err < filtered_y_err;42984299//const float block_mag_gradient_mag = block_max_gradient_mag(bx, by);43004301if (filter_horizontally)4302debug_state.m_total_filter_horizontal.fetch_add(1, std::memory_order_relaxed);43034304vec3F lowpass_filtered[6][6];4305filter_block(3, 3, &half_pixels_as_floats[0][0], &lowpass_filtered[0][0]);4306float lowpass_std_dev = sub_and_compute_std_dev(&lowpass_filtered[0][0], &half_pixels_as_floats[0][0]);43074308const bool very_detailed_block = lowpass_std_dev > 350.0f;4309const bool very_blurry_block = lowpass_std_dev < 30.0f;4310const bool super_blurry_block = lowpass_std_dev < 15.0f;43114312basisu::stats<float> half_comp_stats[3];4313for (uint32_t c = 0; c < 3; c++)4314half_comp_stats[c].calc(NUM_BLOCK_PIXELS, &half_pixels_as_floats[0][0][c], 3);43154316const float SINGLE_PART_HALF_THRESH = 256.0f;4317const float COMPLEX_HALF_THRESH = 1024.0f;4318// HACK HACK4319const float VERY_COMPLEX_HALF_THRESH = 1400.0f; // 1536.0f;43204321const float max_std_dev = (float)maximum(half_comp_stats[0].m_std_dev, half_comp_stats[1].m_std_dev, half_comp_stats[2].m_std_dev);43224323const bool very_simple_block = (max_std_dev < SINGLE_PART_HALF_THRESH);4324const bool complex_block = (max_std_dev > COMPLEX_HALF_THRESH);4325const bool very_complex_block = (max_std_dev > VERY_COMPLEX_HALF_THRESH);43264327// Dynamically choose a comp_level for this block.4328astc_hdr_codec_base_options coptions(enc_state.master_coptions);4329uint32_t comp_level = global_cfg.m_master_comp_level;43304331if (very_complex_block)4332comp_level = global_cfg.m_highest_comp_level;4333else if (complex_block)4334comp_level = (global_cfg.m_master_comp_level + global_cfg.m_highest_comp_level + 1) / 2;43354336debug_state.m_comp_level_hist[comp_level].fetch_add(1, std::memory_order_relaxed);43374338bool any_2subset_enabled = false, any_2subset_mode11_enabled = false, any_2subset_mode7_enabled = false, any_3subset_enabled = false;4339BASISU_NOTE_UNUSED(any_2subset_mode11_enabled);43404341for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++)4342{4343if (comp_level == 0)4344{4345if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL0) == 0)4346continue;4347}4348else if (comp_level == 1)4349{4350if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL1) == 0)4351continue;4352}4353else if (comp_level == 2)4354{4355if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL2) == 0)4356continue;4357}43584359if (g_block_mode_descs[i].m_num_partitions == 2)4360{4361any_2subset_enabled = true;43624363if (g_block_mode_descs[i].m_cem == 7)4364{4365any_2subset_mode7_enabled = true;4366}4367else4368{4369assert(g_block_mode_descs[i].m_cem == 11);4370any_2subset_mode11_enabled = true;4371}4372}4373else if (g_block_mode_descs[i].m_num_partitions == 3)4374any_3subset_enabled = true;4375}43764377coptions.m_mode7_full_s_optimization = (comp_level >= 2);43784379const bool uber_mode_flag = (comp_level >= 3);4380coptions.m_allow_uber_mode = uber_mode_flag;43814382coptions.m_ultra_quant = (comp_level >= 4);43834384coptions.m_take_first_non_clamping_mode11_submode = (comp_level <= 2);4385coptions.m_take_first_non_clamping_mode7_submode = (comp_level <= 2);43864387coptions.m_disable_weight_plane_optimization = (comp_level >= 2);43884389// -------------------43904391uint32_t total_used_block_chans = 0;4392for (uint32_t i = 0; i < 3; i++)4393total_used_block_chans += (half_comp_stats[i].m_range > 0.0f);43944395const bool is_solid_block = (total_used_block_chans == 0);43964397basisu::comparative_stats<float> half_cross_chan_stats[3];43984399// R vs. G4400half_cross_chan_stats[0].calc_pearson(NUM_BLOCK_PIXELS,4401&half_pixels_as_floats[0][0][0], &half_pixels_as_floats[0][0][1],44023, 3,4403&half_comp_stats[0], &half_comp_stats[1]);44044405// R vs. B4406half_cross_chan_stats[1].calc_pearson(NUM_BLOCK_PIXELS,4407&half_pixels_as_floats[0][0][0], &half_pixels_as_floats[0][0][2],44083, 3,4409&half_comp_stats[0], &half_comp_stats[2]);44104411// G vs. B4412half_cross_chan_stats[2].calc_pearson(NUM_BLOCK_PIXELS,4413&half_pixels_as_floats[0][0][1], &half_pixels_as_floats[0][0][2],44143, 3,4415&half_comp_stats[1], &half_comp_stats[2]);44164417const float rg_corr = fabsf((float)half_cross_chan_stats[0].m_pearson);4418const float rb_corr = fabsf((float)half_cross_chan_stats[1].m_pearson);4419const float gb_corr = fabsf((float)half_cross_chan_stats[2].m_pearson);44204421float min_corr = BIG_FLOAT_VAL, max_corr = -BIG_FLOAT_VAL;4422for (uint32_t i = 0; i < 3; i++)4423{4424if (half_comp_stats[i].m_range > 0.0f)4425{4426const float c = fabsf((float)half_cross_chan_stats[i].m_pearson);4427min_corr = minimum(min_corr, c);4428max_corr = maximum(max_corr, c);4429}4430}44314432bool use_single_subset_mode7 = true;4433if (comp_level <= 1)4434{4435// TODO: could also compute angle between principle axis and the grayscale axis.4436// TODO: Transform grayscale axis by covar matrix, compute variance vs. total variance4437const float MODE7_MIN_CHAN_CORR = .5f;4438const float MODE7_PCA_ANGLE_THRESH = .9f;4439use_single_subset_mode7 = is_grayscale || is_solid_block || (min_corr >= MODE7_MIN_CHAN_CORR);44404441if (use_single_subset_mode7)4442{4443float cos_ang = fabsf(enc_block_stats.m_axis_q16.dot(vec3F(0.5773502691f)));4444if (cos_ang < MODE7_PCA_ANGLE_THRESH)4445use_single_subset_mode7 = false;4446}4447}44484449const float STRONG_CORR_THRESH = (comp_level <= 1) ? .5f : ((comp_level <= 3) ? .75f : .9f);44504451int desired_dp_chan = -1;4452if (total_used_block_chans <= 1)4453{4454// no need for dual plane (except possibly 2x2 weight grids for RDO)4455}4456else4457{4458if (min_corr >= STRONG_CORR_THRESH)4459{4460// all channel pairs strongly correlated, no need for dual plane4461debug_state.m_dp_stats[0].fetch_add(1, std::memory_order_relaxed);4462}4463else4464{4465if (total_used_block_chans == 2)4466{4467if (half_comp_stats[0].m_range == 0.0f)4468{4469// r unused, check for strong gb correlation4470if (gb_corr < STRONG_CORR_THRESH)4471desired_dp_chan = 1;4472}4473else if (half_comp_stats[1].m_range == 0.0f)4474{4475// g unused, check for strong rb correlation4476if (rb_corr < STRONG_CORR_THRESH)4477desired_dp_chan = 0;4478}4479else4480{4481// b unused, check for strong rg correlation4482if (rg_corr < STRONG_CORR_THRESH)4483desired_dp_chan = 0;4484}4485}4486else4487{4488assert(total_used_block_chans == 3);44894490// see if rg/rb is weakly correlated vs. gb4491if ((rg_corr < gb_corr) && (rb_corr < gb_corr))4492desired_dp_chan = 0;4493// see if gr/gb is weakly correlated vs. rb4494else if ((rg_corr < rb_corr) && (gb_corr < rb_corr))4495desired_dp_chan = 1;4496// assume b is weakest4497else4498desired_dp_chan = 2;4499}45004501if (desired_dp_chan == -1)4502debug_state.m_dp_stats[1].fetch_add(1, std::memory_order_relaxed);4503else4504debug_state.m_dp_stats[2 + desired_dp_chan].fetch_add(1, std::memory_order_relaxed);4505}4506}45074508// 2x2 is special for RDO at higher lambdas - always pick a preferred channel.4509int desired_dp_chan_2x2 = 0;4510if (total_used_block_chans == 2)4511{4512if (half_comp_stats[0].m_range == 0.0f)4513desired_dp_chan_2x2 = 1;4514}4515else if (total_used_block_chans == 3)4516{4517// see if rg/rb is weakly correlated vs. gb4518if ((rg_corr < gb_corr) && (rb_corr < gb_corr))4519desired_dp_chan_2x2 = 0;4520// see if gr/gb is weakly correlated vs. rb4521else if ((rg_corr < rb_corr) && (gb_corr < rb_corr))4522desired_dp_chan_2x2 = 1;4523// assume b is weakest4524else4525desired_dp_chan_2x2 = 2;4526}45274528// Gather all candidate encodings4529bool status = false;45304531// ---- Run candidate4532if ((global_cfg.m_use_runs) && (has_left_neighbor || has_upper_neighbor))4533{4534candidate_encoding candidate;4535candidate.m_coder.reserve(24);45364537candidate.m_encoding_type = encoding_type::cRun;45384539candidate.m_decomp_log_blk = prev_non_run_candidate_encoding.m_decomp_log_blk;4540candidate.m_coded_log_blk = prev_non_run_candidate_encoding.m_coded_log_blk;45414542memcpy(candidate.m_comp_pixels, prev_comp_pixels, sizeof(prev_comp_pixels));45434544if (!prev_run_len)4545{4546candidate.m_coder.put_bits(RUN_CODE, RUN_CODE_LEN);4547candidate.m_coder.put_vlc(0, 5);4548}4549else4550{4551// extend current run - compute the # of new bits needed for the extension.45524553uint32_t prev_run_bits = prev_encoding.get_total_bits_u32();4554assert(prev_run_bits > 0);45554556// We're not actually going to code this, because the previously emitted run code will be extended.4557bitwise_coder temp_coder;4558temp_coder.put_bits(RUN_CODE, RUN_CODE_LEN);4559temp_coder.put_vlc((prev_run_len + 1) - 1, 5);45604561uint32_t cur_run_bits = temp_coder.get_total_bits_u32();4562assert(cur_run_bits >= prev_run_bits);45634564uint32_t total_new_bits = cur_run_bits - prev_run_bits;4565if (total_new_bits > 0)4566candidate.m_coder.put_bits(0, total_new_bits); // dummy bits4567}45684569candidate.m_run_len = prev_run_len + 1;45704571candidates.emplace_back(std::move(candidate));4572}45734574// ---- Reuse candidate4575if ((!is_solid_block) && (global_cfg.m_lambda > 0.0f))4576{4577for (uint32_t reuse_delta_index = 0; reuse_delta_index < global_cfg.m_num_reuse_xy_deltas; reuse_delta_index++)4578{4579const int reuse_delta_x = g_reuse_xy_deltas[reuse_delta_index].m_x;4580const int reuse_delta_y = g_reuse_xy_deltas[reuse_delta_index].m_y;45814582const int reuse_bx = bx + reuse_delta_x, reuse_by = by + reuse_delta_y;4583if ((reuse_bx < 0) || (reuse_bx >= (int)num_blocks_x))4584continue;4585if (reuse_by < (int)strip_first_by)4586break;45874588const candidate_encoding& prev_candidate = enc_state.coded_blocks(reuse_bx, reuse_by);45894590// TODO - support this.4591if (prev_candidate.m_encoding_type == encoding_type::cSolid)4592continue;4593assert((prev_candidate.m_encoding_type == encoding_type::cBlock) || (prev_candidate.m_encoding_type == encoding_type::cReuse));45944595candidate_encoding candidate;4596candidate.m_coder.reserve(24);4597astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;4598astc_helpers::log_astc_block& decomp_log_blk = candidate.m_decomp_log_blk;45994600const astc_helpers::log_astc_block& prev_coded_log_blk = prev_candidate.m_coded_log_blk;46014602const uint32_t grid_x = prev_coded_log_blk.m_grid_width, grid_y = prev_coded_log_blk.m_grid_height;4603const bool dual_plane = prev_candidate.m_coded_log_blk.m_dual_plane;4604const uint32_t num_grid_samples = grid_x * grid_y;4605const uint32_t num_endpoint_vals = get_num_endpoint_vals(prev_coded_log_blk.m_color_endpoint_modes[0]);46064607coded_log_blk = prev_candidate.m_coded_log_blk;4608decomp_log_blk = prev_candidate.m_decomp_log_blk;46094610if (prev_coded_log_blk.m_num_partitions == 1)4611{4612// Now encode the block using the transcoded endpoints4613basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];46144615if (prev_coded_log_blk.m_color_endpoint_modes[0] == 7)4616{4617status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints, &decoded_half[0][0], nullptr,4618astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);4619}4620else4621{4622status = get_astc_hdr_mode_11_block_colors(coded_log_blk.m_endpoints, &decoded_half[0][0], nullptr,4623astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);4624}4625assert(status);46264627uint8_t trial_weights0[BLOCK_W * BLOCK_H], trial_weights1[BLOCK_W * BLOCK_H];4628uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];46294630if (dual_plane)4631{4632eval_selectors_dual_plane(prev_candidate.m_coded_log_blk.m_color_component_selector,4633BLOCK_W * BLOCK_H, trial_weights0, trial_weights1, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);46344635downsample_ise_weights_dual_plane(4636coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,4637BLOCK_W, BLOCK_H,4638grid_x, grid_y,4639trial_weights0, trial_weights1, coded_log_blk.m_weights);46404641basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * 2, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);4642}4643else4644{4645eval_selectors(BLOCK_W * BLOCK_H, trial_weights0, coded_log_blk.m_weight_ise_range, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);46464647downsample_ise_weights(4648coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,4649BLOCK_W, BLOCK_H,4650grid_x, grid_y,4651trial_weights0, coded_log_blk.m_weights);46524653basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);4654}46554656// Create the block the decoder would transcode into.4657copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk);4658}4659else if (prev_coded_log_blk.m_num_partitions == 2)4660{4661assert(!dual_plane);46624663const int unique_pat_index = g_part2_seed_to_unique_index[coded_log_blk.m_partition_id];4664assert((unique_pat_index >= 0) && (unique_pat_index < (int)NUM_UNIQUE_PARTITIONS2));46654666const partition_pattern_vec& pat_vec = g_partitions2[unique_pat_index];46674668vec4F part_pixels_q16[2][64];4669half_vec3 part_half_pixels[2][64];4670uint32_t part_total_pixels[2] = { 0 };46714672for (uint32_t y = 0; y < BLOCK_H; y++)4673{4674for (uint32_t x = 0; x < BLOCK_W; x++)4675{4676const uint32_t part_index = pat_vec[x + y * 6];46774678uint32_t l = part_total_pixels[part_index];46794680part_pixels_q16[part_index][l] = block_pixels_q16[y][x];4681part_half_pixels[part_index][l] = half_pixels[y][x];46824683part_total_pixels[part_index] = l + 1;4684} // x4685} // y46864687uint8_t blk_weights[2][BLOCK_W * BLOCK_H];46884689for (uint32_t part_index = 0; part_index < 2; part_index++)4690{4691basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];46924693if (prev_coded_log_blk.m_color_endpoint_modes[0] == 7)4694{4695status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr,4696astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);4697}4698else4699{4700status = get_astc_hdr_mode_11_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr,4701astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);4702}4703assert(status);47044705eval_selectors(part_total_pixels[part_index], blk_weights[part_index], coded_log_blk.m_weight_ise_range,4706(basist::half_float*)&part_half_pixels[part_index][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);47074708} // part_index47094710uint8_t ise_weights[BLOCK_W * BLOCK_H];47114712uint32_t src_pixel_index[2] = { 0, 0 };4713for (uint32_t y = 0; y < BLOCK_H; y++)4714{4715for (uint32_t x = 0; x < BLOCK_W; x++)4716{4717const uint32_t part_index = pat_vec[x + y * 6];47184719ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];4720src_pixel_index[part_index]++;4721} // x4722} // y47234724downsample_ise_weights(4725coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,4726BLOCK_W, BLOCK_H,4727grid_x, grid_y,4728ise_weights, coded_log_blk.m_weights);47294730// Transcode these codable weights to ASTC weights.4731uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H];4732basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);47334734// Create the block the decoder would transcode into.4735copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk);4736}4737else if (prev_coded_log_blk.m_num_partitions == 3)4738{4739assert(!dual_plane);47404741const int unique_pat_index = g_part3_seed_to_unique_index[coded_log_blk.m_partition_id];4742assert((unique_pat_index >= 0) && (unique_pat_index < (int)NUM_UNIQUE_PARTITIONS3));47434744const partition_pattern_vec& pat = g_partitions3[unique_pat_index];47454746vec4F part_pixels_q16[3][64];4747half_vec3 part_half_pixels[3][64];4748uint32_t part_total_pixels[3] = { 0 };47494750for (uint32_t y = 0; y < BLOCK_H; y++)4751{4752for (uint32_t x = 0; x < BLOCK_W; x++)4753{4754const uint32_t part_index = pat.m_parts[x + y * BLOCK_W];47554756uint32_t l = part_total_pixels[part_index];47574758part_pixels_q16[part_index][l] = block_pixels_q16[y][x];4759part_half_pixels[part_index][l] = half_pixels[y][x];47604761part_total_pixels[part_index] = l + 1;4762} // x4763} // y47644765uint8_t blk_weights[3][BLOCK_W * BLOCK_H];47664767for (uint32_t part_index = 0; part_index < 3; part_index++)4768{4769basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];47704771status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr,4772astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);4773assert(status);47744775eval_selectors(part_total_pixels[part_index], blk_weights[part_index], coded_log_blk.m_weight_ise_range,4776(basist::half_float*)&part_half_pixels[part_index][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);47774778} // part_index47794780uint8_t ise_weights[BLOCK_W * BLOCK_H];47814782uint32_t src_pixel_index[3] = { 0 };4783for (uint32_t y = 0; y < BLOCK_H; y++)4784{4785for (uint32_t x = 0; x < BLOCK_W; x++)4786{4787const uint32_t part_index = pat.m_parts[x + y * BLOCK_W];47884789ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];4790src_pixel_index[part_index]++;4791} // x4792} // y47934794downsample_ise_weights(4795coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,4796BLOCK_W, BLOCK_H,4797grid_x, grid_y,4798ise_weights, coded_log_blk.m_weights);47994800// Transcode these codable weights to ASTC weights.4801uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H];4802basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);48034804// Create the block the decoder would transcode into.4805copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk);4806}48074808if (!validate_log_blk(decomp_log_blk))4809{4810fmt_error_printf("pack_astc_block() failed\n");4811return false;4812}48134814status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_log_blk, &candidate.m_comp_pixels[0][0]);4815if (!status)4816{4817fmt_error_printf("decode_astc_block() failed\n");4818return false;4819}48204821candidate.m_coder.put_bits(REUSE_CODE, REUSE_CODE_LEN);4822candidate.m_coder.put_bits(reuse_delta_index, REUSE_XY_DELTA_BITS);4823encode_values(candidate.m_coder, num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range);48244825candidate.m_encoding_type = encoding_type::cReuse;4826candidate.m_block_mode = prev_candidate.m_block_mode;4827candidate.m_endpoint_mode = prev_candidate.m_endpoint_mode;4828candidate.m_reuse_delta_index = reuse_delta_index;48294830candidates.emplace_back(std::move(candidate));48314832} // reuse_delta_index4833}48344835// ---- Solid candidate4836if (global_cfg.m_use_solid_blocks)4837{4838candidate_encoding candidate;4839candidate.m_coder.reserve(24);48404841// solid4842candidate.m_encoding_type = encoding_type::cSolid;48434844float r = 0.0f, g = 0.0f, b = 0.0f;4845const float LOG_BIAS = .125f;4846bool solid_block = true;4847for (uint32_t y = 0; y < BLOCK_H; y++)4848{4849for (uint32_t x = 0; x < BLOCK_W; x++)4850{4851if ((block_pixels[0][0][0] != block_pixels[y][x][0]) ||4852(block_pixels[0][0][1] != block_pixels[y][x][1]) ||4853(block_pixels[0][0][2] != block_pixels[y][x][2]))4854{4855solid_block = false;4856}48574858r += log2f(block_pixels[y][x][0] + LOG_BIAS);4859g += log2f(block_pixels[y][x][1] + LOG_BIAS);4860b += log2f(block_pixels[y][x][2] + LOG_BIAS);4861}4862}48634864if (solid_block)4865{4866r = block_pixels[0][0][0];4867g = block_pixels[0][0][1];4868b = block_pixels[0][0][2];4869}4870else4871{4872r = maximum<float>(0.0f, powf(2.0f, r * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS);4873g = maximum<float>(0.0f, powf(2.0f, g * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS);4874b = maximum<float>(0.0f, powf(2.0f, b * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS);48754876r = minimum<float>(r, basist::MAX_HALF_FLOAT);4877g = minimum<float>(g, basist::MAX_HALF_FLOAT);4878b = minimum<float>(b, basist::MAX_HALF_FLOAT);4879}48804881basist::half_float rh = float_to_half_non_neg_no_nan_inf(r), gh = float_to_half_non_neg_no_nan_inf(g), bh = float_to_half_non_neg_no_nan_inf(b);48824883candidate.m_solid_color[0] = rh;4884candidate.m_solid_color[1] = gh;4885candidate.m_solid_color[2] = bh;48864887candidate.m_coder.put_bits(SOLID_CODE, SOLID_CODE_LEN);48884889candidate.m_coder.put_bits(rh, 15);4890candidate.m_coder.put_bits(gh, 15);4891candidate.m_coder.put_bits(bh, 15);48924893vec3F cp(basist::half_to_float(rh), basist::half_to_float(gh), basist::half_to_float(bh));48944895for (uint32_t y = 0; y < BLOCK_H; y++)4896for (uint32_t x = 0; x < BLOCK_W; x++)4897candidate.m_comp_pixels[y][x] = cp;48984899astc_helpers::log_astc_block& log_blk = candidate.m_coded_log_blk;49004901log_blk.clear();4902log_blk.m_solid_color_flag_hdr = true;4903log_blk.m_solid_color[0] = rh;4904log_blk.m_solid_color[1] = gh;4905log_blk.m_solid_color[2] = bh;4906log_blk.m_solid_color[3] = basist::float_to_half(1.0f);49074908candidate.m_decomp_log_blk = log_blk;49094910candidates.emplace_back(std::move(candidate));4911}49124913if ((!is_solid_block) || (!global_cfg.m_use_solid_blocks))4914{4915static uint8_t s_parts2_normal[5] = { 0, 2, 4, 6, 8 };4916static uint8_t s_parts3_normal[5] = { 0, 0, 4, 6, 8 };49174918static uint8_t s_parts2_complex[5] = { 0, 4, 8, 10, 16 };4919static uint8_t s_parts3_complex[5] = { 0, 0, 8, 10, 16 };49204921static uint8_t s_parts2_very_complex[5] = { 0, 8, 12, 14, 20 };4922static uint8_t s_parts3_very_complex[5] = { 0, 0, 12, 14, 20 };49234924uint32_t total_parts2 = 0, total_parts3 = 0;49254926assert(comp_level < 5);4927if ((very_simple_block) && (comp_level <= 3))4928{4929// Block's std dev is so low that 2-3 subsets are unlikely to help much4930total_parts2 = 0;4931total_parts3 = 0;49324933debug_state.m_total_part2_stats[0].fetch_add(1, std::memory_order_relaxed);4934}4935else if (very_complex_block)4936{4937total_parts2 = s_parts2_very_complex[comp_level];4938total_parts3 = s_parts3_very_complex[comp_level];49394940if (global_cfg.m_extra_patterns_flag)4941{4942total_parts2 += (comp_level == 4) ? 30 : 20;4943total_parts3 += (comp_level == 4) ? 30 : 20;4944}49454946debug_state.m_total_part2_stats[2].fetch_add(1, std::memory_order_relaxed);4947}4948else if (complex_block)4949{4950total_parts2 = s_parts2_complex[comp_level];4951total_parts3 = s_parts3_complex[comp_level];49524953if (global_cfg.m_extra_patterns_flag)4954{4955total_parts2 += (comp_level == 4) ? 15 : 10;4956total_parts3 += (comp_level == 4) ? 15 : 10;4957}49584959debug_state.m_total_part2_stats[3].fetch_add(1, std::memory_order_relaxed);4960}4961else4962{4963// moderate complexity - use defaults4964total_parts2 = s_parts2_normal[comp_level];4965total_parts3 = s_parts3_normal[comp_level];49664967if (global_cfg.m_extra_patterns_flag)4968{4969total_parts2 += 5;4970total_parts3 += 5;4971}49724973debug_state.m_total_part2_stats[1].fetch_add(1, std::memory_order_relaxed);4974}49754976if (!any_2subset_enabled)4977total_parts2 = 0;49784979if (!any_3subset_enabled)4980total_parts3 = 0;49814982int best_parts2_mode11[NUM_UNIQUE_PARTITIONS2], best_parts2_mode7[NUM_UNIQUE_PARTITIONS2];4983bool has_estimated_parts2 = false;49844985if (total_parts2)4986{4987if (global_cfg.m_brute_force_partition_matching)4988{4989int candidate_pats2[NUM_UNIQUE_PARTITIONS2];4990for (uint32_t i = 0; i < NUM_UNIQUE_PARTITIONS2; i++)4991candidate_pats2[i] = i;49924993if (any_2subset_enabled)4994{4995estimate_partitions_mode7_and_11(49962,4997NUM_UNIQUE_PARTITIONS2, g_partitions2,4998NUM_UNIQUE_PARTITIONS2, (uint32_t*)candidate_pats2,4999&half_pixels_as_floats[0][0],5000coptions,5001total_parts2, best_parts2_mode11, best_parts2_mode7);5002}50035004has_estimated_parts2 = true;5005}5006else5007{5008if (comp_level >= 1)5009{5010const uint32_t MAX_CANDIDATES2 = 48;5011int candidate_pats2[MAX_CANDIDATES2 * 2];50125013uint32_t num_candidate_pats2 = maximum((total_parts2 * 3) / 2, very_complex_block ? MAX_CANDIDATES2 : (MAX_CANDIDATES2 / 2));5014num_candidate_pats2 = minimum<uint32_t>(num_candidate_pats2, (uint32_t)std::size(candidate_pats2));50155016has_estimated_parts2 = estimate_partition2_6x6((basist::half_float(*)[3])half_pixels, candidate_pats2, num_candidate_pats2);50175018if (has_estimated_parts2)5019{5020estimate_partitions_mode7_and_11(50212,5022NUM_UNIQUE_PARTITIONS2, g_partitions2,5023num_candidate_pats2, (uint32_t*)candidate_pats2,5024&half_pixels_as_floats[0][0],5025coptions,5026total_parts2, best_parts2_mode11, best_parts2_mode7);5027}5028}5029else5030{5031has_estimated_parts2 = estimate_partition2_6x6((basist::half_float(*)[3])half_pixels, best_parts2_mode11, total_parts2);50325033if ((has_estimated_parts2) && (any_2subset_mode7_enabled))5034memcpy(best_parts2_mode7, best_parts2_mode11, total_parts2 * sizeof(best_parts2_mode7[0]));5035}5036}5037}50385039int best_parts3[NUM_UNIQUE_PARTITIONS3];5040bool has_estimated_parts3 = false;50415042if (total_parts3)5043{5044#if 05045has_estimated_parts3 = estimate_partition3_6x6((basist::half_float(*)[3])half_pixels, best_parts3, total_parts3);5046#elif 15047if (global_cfg.m_brute_force_partition_matching)5048{5049int candidate_pats3[NUM_UNIQUE_PARTITIONS3];5050for (uint32_t i = 0; i < NUM_UNIQUE_PARTITIONS3; i++)5051candidate_pats3[i] = i;50525053estimate_partitions_mode7(50543,5055NUM_UNIQUE_PARTITIONS3, g_partitions3,5056NUM_UNIQUE_PARTITIONS3, (uint32_t*)candidate_pats3,5057&half_pixels_as_floats[0][0],5058coptions,5059total_parts3, (uint32_t*)best_parts3);50605061has_estimated_parts3 = true;5062}5063else5064{5065const uint32_t MAX_CANDIDATES3 = 48;5066int candidate_pats3[MAX_CANDIDATES3 * 2];50675068uint32_t num_candidate_pats3 = maximum((total_parts3 * 3) / 2, very_complex_block ? MAX_CANDIDATES3 : (MAX_CANDIDATES3 / 2));5069num_candidate_pats3 = minimum<uint32_t>(num_candidate_pats3, (uint32_t)std::size(candidate_pats3));50705071has_estimated_parts3 = estimate_partition3_6x6((basist::half_float(*)[3])half_pixels, candidate_pats3, num_candidate_pats3);50725073if (has_estimated_parts3)5074{5075estimate_partitions_mode7(50763,5077NUM_UNIQUE_PARTITIONS3, g_partitions3,5078num_candidate_pats3, (uint32_t*)candidate_pats3,5079&half_pixels_as_floats[0][0],5080coptions,5081total_parts3, (uint32_t*)best_parts3);5082}5083}5084#endif5085}50865087const opt_mode_t mode11_opt_mode = complex_block ? cWeightedLeastSquares : cOrdinaryLeastSquares;50885089// ---- Encoded block candidate5090for (uint32_t block_mode_iter = 0; block_mode_iter < (uint32_t)block_mode::cBMTotalModes; block_mode_iter++)5091{5092const block_mode bm = (block_mode)block_mode_iter;50935094if (comp_level == 0)5095{5096if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL0) == 0)5097continue;5098}5099else if (comp_level == 1)5100{5101if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL1) == 0)5102continue;5103}5104else if (comp_level == 2)5105{5106if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL2) == 0)5107continue;5108}51095110if (global_cfg.m_block_stat_optimizations_flag)5111{5112if ((comp_level <= 3) && (g_block_mode_descs[block_mode_iter].m_dp))5113{5114if ((global_cfg.m_lambda > 0.0f) && (!complex_block) && (g_block_mode_descs[block_mode_iter].m_grid_x == 2) && (g_block_mode_descs[block_mode_iter].m_grid_y == 2))5115{5116if (g_block_mode_descs[block_mode_iter].m_dp_channel != desired_dp_chan_2x2)5117continue;5118}5119else5120{5121if (g_block_mode_descs[block_mode_iter].m_dp_channel != desired_dp_chan)5122continue;5123}5124}51255126if (comp_level <= 3)5127{5128const uint32_t grid_x = g_block_mode_descs[block_mode_iter].m_grid_x;5129const uint32_t grid_y = g_block_mode_descs[block_mode_iter].m_grid_y;51305131if (!g_block_mode_descs[block_mode_iter].m_dp)5132{5133// Minor gain (.5-1% less canidates)5134if (very_detailed_block)5135{5136if (grid_x * grid_y <= 12)5137{5138debug_state.m_detail_stats[0].fetch_add(1, std::memory_order_relaxed);5139continue;5140}5141}51425143// Major gains (10-25% less candidates)5144if (very_blurry_block)5145{5146if ((grid_x > 4) || (grid_y > 4) || (g_block_mode_descs[block_mode_iter].m_num_partitions > 1))5147{5148debug_state.m_detail_stats[1].fetch_add(1, std::memory_order_relaxed);5149continue;5150}5151}5152if (super_blurry_block)5153{5154if ((grid_x > 3) || (grid_y > 3) || (g_block_mode_descs[block_mode_iter].m_num_partitions > 1))5155{5156debug_state.m_detail_stats[2].fetch_add(1, std::memory_order_relaxed);5157continue;5158}5159}5160}51615162if (grid_x != grid_y)5163{5164if (grid_x < grid_y)5165{5166if (!filter_horizontally)5167{5168debug_state.m_detail_stats[3].fetch_add(1, std::memory_order_relaxed);5169continue;5170}5171}5172else5173{5174if (filter_horizontally)5175{5176debug_state.m_detail_stats[4].fetch_add(1, std::memory_order_relaxed);5177continue;5178}5179}5180}5181}51825183if (global_cfg.m_lambda == 0.0f)5184{5185// Rarely useful if lambda=05186if ((g_block_mode_descs[block_mode_iter].m_grid_x == 2) && (g_block_mode_descs[block_mode_iter].m_grid_y == 2))5187continue;5188}5189} // block_stat_optimizations_flag51905191if ((!use_single_subset_mode7) &&5192(g_block_mode_descs[block_mode_iter].m_cem == 7) &&5193(g_block_mode_descs[block_mode_iter].m_num_partitions == 1))5194{5195debug_state.m_total_mode7_skips.fetch_add(1, std::memory_order_relaxed);5196continue;5197}51985199for (uint32_t endpoint_mode_iter = 0; endpoint_mode_iter < (uint32_t)endpoint_mode::cTotal; endpoint_mode_iter++)5200{5201if (global_cfg.m_lambda == 0.0f)5202{5203// No use trying anything else5204if (endpoint_mode_iter != (uint32_t)endpoint_mode::cRaw)5205continue;5206}52075208if (global_cfg.m_disable_delta_endpoint_usage)5209{5210if ((endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpperDelta) || (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseLeftDelta))5211continue;5212}52135214if (!global_cfg.m_favor_higher_compression)5215{5216if (comp_level == 0)5217{5218if (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpperDelta)5219continue;5220}52215222if (comp_level <= 1)5223{5224if ((endpoint_mode_iter == (uint32_t)endpoint_mode::cUseLeft) || (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpper))5225continue;5226}5227}52285229const endpoint_mode em = (endpoint_mode)endpoint_mode_iter;52305231switch (em)5232{5233case endpoint_mode::cUseLeft:5234case endpoint_mode::cUseUpper:5235{5236const block_mode_desc& local_md = g_block_mode_descs[block_mode_iter];5237const uint32_t cem = local_md.m_cem;52385239if (local_md.m_num_partitions > 1)5240break;52415242if ((em == endpoint_mode::cUseLeft) && (!has_left_neighbor))5243break;5244else if ((em == endpoint_mode::cUseUpper) && (!has_upper_neighbor))5245break;52465247candidate_encoding candidate;5248candidate.m_coder.reserve(24);5249astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;52505251int nx = bx, ny = by;5252if (em == endpoint_mode::cUseLeft)5253nx--;5254else5255ny--;52565257const candidate_encoding& neighbor_blk = enc_state.coded_blocks(nx, ny);5258if (neighbor_blk.m_encoding_type == encoding_type::cSolid)5259break;5260assert((neighbor_blk.m_encoding_type == encoding_type::cBlock) || (neighbor_blk.m_encoding_type == encoding_type::cReuse));52615262const block_mode_desc& neighbor_md = g_block_mode_descs[(uint32_t)neighbor_blk.m_block_mode];52635264if (neighbor_md.m_cem != cem)5265break;52665267assert(neighbor_blk.m_coded_log_blk.m_color_endpoint_modes[0] == cem);52685269const uint32_t grid_x = local_md.m_grid_x, grid_y = local_md.m_grid_y;5270const bool dual_plane = local_md.m_dp;5271const uint32_t num_grid_samples = grid_x * grid_y;5272const uint32_t num_endpoint_vals = get_num_endpoint_vals(local_md.m_cem);52735274coded_log_blk.m_grid_width = (uint8_t)grid_x;5275coded_log_blk.m_grid_height = (uint8_t)grid_y;5276coded_log_blk.m_dual_plane = (uint8_t)dual_plane;5277coded_log_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;5278coded_log_blk.m_num_partitions = 1;5279coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)neighbor_md.m_cem;5280coded_log_blk.m_weight_ise_range = (uint8_t)local_md.m_weight_ise_range;52815282// We're not explictly writing any endpoints, just reusing existing ones. So copy the neighbor's endpoints unchanged (so no loss).5283coded_log_blk.m_endpoint_ise_range = neighbor_blk.m_coded_log_blk.m_endpoint_ise_range;5284memcpy(coded_log_blk.m_endpoints, neighbor_blk.m_coded_log_blk.m_endpoints, num_endpoint_vals);52855286uint8_t transcode_endpoints[basist::NUM_MODE11_ENDPOINTS];52875288// Requantize the neighbor's endpoints to whatever we'll have to transcode into to make a valid ASTC encoding.5289basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem,5290neighbor_blk.m_coded_log_blk.m_endpoint_ise_range, neighbor_blk.m_coded_log_blk.m_endpoints,5291local_md.m_transcode_endpoint_ise_range, transcode_endpoints);52925293// Now encode the block using the transcoded endpoints5294basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];52955296if (cem == 7)5297{5298status = get_astc_hdr_mode_7_block_colors(transcode_endpoints, &decoded_half[0][0], nullptr,5299astc_helpers::get_ise_levels(local_md.m_weight_ise_range), local_md.m_weight_ise_range, local_md.m_transcode_endpoint_ise_range);5300}5301else5302{5303status = get_astc_hdr_mode_11_block_colors(transcode_endpoints, &decoded_half[0][0], nullptr,5304astc_helpers::get_ise_levels(local_md.m_weight_ise_range), local_md.m_weight_ise_range, local_md.m_transcode_endpoint_ise_range);5305}5306if (!status)5307break;53085309uint8_t trial_weights0[BLOCK_W * BLOCK_H], trial_weights1[BLOCK_W * BLOCK_H];5310if (dual_plane)5311{5312eval_selectors_dual_plane(local_md.m_dp_channel, BLOCK_W * BLOCK_H, trial_weights0, trial_weights1, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(local_md.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);53135314downsample_ise_weights_dual_plane(5315local_md.m_weight_ise_range, local_md.m_weight_ise_range,5316BLOCK_W, BLOCK_H,5317grid_x, grid_y,5318trial_weights0, trial_weights1, coded_log_blk.m_weights);5319}5320else5321{5322eval_selectors(BLOCK_W * BLOCK_H, trial_weights0, local_md.m_weight_ise_range, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(local_md.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);53235324downsample_ise_weights(5325local_md.m_weight_ise_range, local_md.m_weight_ise_range,5326BLOCK_W, BLOCK_H,5327grid_x, grid_y,5328trial_weights0, coded_log_blk.m_weights);5329}53305331// Transcode these codable weights to ASTC weights.5332uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];5333basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, local_md.m_weight_ise_range, transcode_weights, local_md.m_transcode_weight_ise_range);53345335// Create the block the decoder would transcode into.5336astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;5337decomp_blk.clear();53385339decomp_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem;5340decomp_blk.m_dual_plane = local_md.m_dp;5341decomp_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;5342decomp_blk.m_num_partitions = 1;5343decomp_blk.m_endpoint_ise_range = (uint8_t)local_md.m_transcode_endpoint_ise_range;5344decomp_blk.m_weight_ise_range = (uint8_t)local_md.m_transcode_weight_ise_range;53455346memcpy(decomp_blk.m_endpoints, transcode_endpoints, num_endpoint_vals);53475348copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk);53495350if (!validate_log_blk(decomp_blk))5351{5352fmt_error_printf("pack_astc_block() failed\n");5353return false;5354}53555356status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);5357if (!status)5358{5359fmt_error_printf("decode_astc_block() failed\n");5360return false;5361}53625363candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);5364code_block(candidate.m_coder, candidate.m_coded_log_blk, (block_mode)block_mode_iter, em, nullptr);53655366candidate.m_encoding_type = encoding_type::cBlock;5367candidate.m_endpoint_mode = em;5368candidate.m_block_mode = bm;53695370candidates.emplace_back(std::move(candidate));53715372break;5373}5374case endpoint_mode::cUseLeftDelta:5375case endpoint_mode::cUseUpperDelta:5376{5377const block_mode_desc& local_md = g_block_mode_descs[block_mode_iter];5378const uint32_t cem = local_md.m_cem;53795380if (local_md.m_num_partitions > 1)5381break;53825383if ((em == endpoint_mode::cUseLeftDelta) && (!has_left_neighbor))5384break;5385else if ((em == endpoint_mode::cUseUpperDelta) && (!has_upper_neighbor))5386break;53875388candidate_encoding candidate;5389candidate.m_coder.reserve(24);5390astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;53915392int nx = bx, ny = by;5393if (em == endpoint_mode::cUseLeftDelta)5394nx--;5395else5396ny--;53975398const candidate_encoding& neighbor_blk = enc_state.coded_blocks(nx, ny);5399if (neighbor_blk.m_encoding_type == encoding_type::cSolid)5400break;5401assert((neighbor_blk.m_encoding_type == encoding_type::cBlock) || (neighbor_blk.m_encoding_type == encoding_type::cReuse));54025403const block_mode_desc& neighbor_md = g_block_mode_descs[(uint32_t)neighbor_blk.m_block_mode];54045405if (neighbor_md.m_cem != cem)5406break;54075408assert(neighbor_md.m_cem == local_md.m_cem);54095410const uint32_t grid_x = local_md.m_grid_x, grid_y = local_md.m_grid_y;5411const bool dual_plane = local_md.m_dp;5412const uint32_t num_grid_samples = grid_x * grid_y;5413const uint32_t num_endpoint_vals = get_num_endpoint_vals(local_md.m_cem);54145415// Dequantize neighbor's endpoints to ISE 205416uint8_t neighbor_endpoints_ise20[basist::NUM_MODE11_ENDPOINTS];5417basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem,5418neighbor_blk.m_coded_log_blk.m_endpoint_ise_range, neighbor_blk.m_coded_log_blk.m_endpoints,5419astc_helpers::BISE_256_LEVELS, neighbor_endpoints_ise20);54205421// Requantize neighbor's endpoints to our local desired coding ISE range5422uint8_t neighbor_endpoints_coding_ise_local[basist::NUM_MODE11_ENDPOINTS];5423basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem, astc_helpers::BISE_256_LEVELS, neighbor_endpoints_ise20, local_md.m_endpoint_ise_range, neighbor_endpoints_coding_ise_local);54245425uint8_t blk_endpoints[basist::NUM_MODE11_ENDPOINTS];5426uint8_t blk_weights0[NUM_BLOCK_PIXELS], blk_weights1[NUM_BLOCK_PIXELS];54275428// Now try to encode the current block using the neighbor's endpoints submode.5429double err = 0.0f;5430uint32_t best_submode = 0;54315432if (cem == 7)5433{5434int maj_index, submode_index;5435decode_cem_7_config(neighbor_endpoints_ise20, submode_index, maj_index);54365437int first_submode = submode_index, last_submode = submode_index;54385439err = encode_astc_hdr_block_mode_7(5440NUM_BLOCK_PIXELS,5441(basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16,5442local_md.m_weight_ise_range,5443best_submode,5444BIG_FLOAT_VAL,5445blk_endpoints, blk_weights0,5446coptions,5447local_md.m_endpoint_ise_range,5448first_submode, last_submode,5449&enc_block_stats);5450}5451else5452{5453int maj_index, submode_index;5454decode_cem_11_config(neighbor_endpoints_ise20, submode_index, maj_index);54555456int first_submode = -1, last_submode = -1;5457if (maj_index == 3)5458{5459// direct5460}5461else5462{5463first_submode = submode_index;5464last_submode = submode_index;5465}54665467if (dual_plane)5468{5469err = encode_astc_hdr_block_mode_11_dual_plane(5470NUM_BLOCK_PIXELS,5471(basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16,5472local_md.m_dp_channel,5473local_md.m_weight_ise_range,5474best_submode,5475BIG_FLOAT_VAL,5476blk_endpoints, blk_weights0, blk_weights1,5477coptions,5478false,5479local_md.m_endpoint_ise_range,5480false, //uber_mode_flag,5481false,5482first_submode, last_submode, true);5483}5484else5485{5486err = encode_astc_hdr_block_mode_11(5487NUM_BLOCK_PIXELS,5488(basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16,5489local_md.m_weight_ise_range,5490best_submode,5491BIG_FLOAT_VAL,5492blk_endpoints, blk_weights0,5493coptions,5494false,5495local_md.m_endpoint_ise_range,5496false, //uber_mode_flag,5497false,5498first_submode, last_submode, true,5499mode11_opt_mode,5500&enc_block_stats);5501}5502}55035504if (err == BIG_FLOAT_VAL)5505break;55065507uint8_t endpoint_deltas[basist::NUM_MODE11_ENDPOINTS];55085509// TODO: For now, just try 5 bits for each endpoint. Can tune later.5510// This isn't right, it's computing the deltas in ISE space.5511//const uint32_t NUM_ENDPOINT_DELTA_BITS = 5;5512const int total_endpoint_delta_vals = 1 << NUM_ENDPOINT_DELTA_BITS;5513const int low_delta_limit = -(total_endpoint_delta_vals / 2), high_delta_limit = (total_endpoint_delta_vals / 2) - 1;55145515const auto& ise_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(local_md.m_endpoint_ise_range).m_ISE_to_rank;55165517bool all_deltas_in_limits = true;5518for (uint32_t i = 0; i < num_endpoint_vals; i++)5519{5520int endpoint_delta = (int)ise_to_rank[blk_endpoints[i]] - (int)ise_to_rank[neighbor_endpoints_coding_ise_local[i]];55215522if ((endpoint_delta < low_delta_limit) || (endpoint_delta > high_delta_limit))5523all_deltas_in_limits = false;55245525endpoint_deltas[i] = (uint8_t)(endpoint_delta + -low_delta_limit);5526}55275528if (all_deltas_in_limits)5529{5530coded_log_blk.m_grid_width = (uint8_t)grid_x;5531coded_log_blk.m_grid_height = (uint8_t)grid_y;5532coded_log_blk.m_dual_plane = (uint8_t)dual_plane;5533coded_log_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;5534coded_log_blk.m_num_partitions = 1;5535coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem;5536coded_log_blk.m_weight_ise_range = (uint8_t)local_md.m_weight_ise_range;5537coded_log_blk.m_endpoint_ise_range = (uint8_t)local_md.m_endpoint_ise_range;55385539memcpy(coded_log_blk.m_endpoints, blk_endpoints, num_endpoint_vals);55405541uint8_t transcode_endpoints[basist::NUM_MODE11_ENDPOINTS];5542uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];55435544basist::astc_6x6_hdr::requantize_ise_endpoints(local_md.m_cem, local_md.m_endpoint_ise_range, blk_endpoints, local_md.m_transcode_endpoint_ise_range, transcode_endpoints);55455546if (dual_plane)5547{5548downsample_ise_weights_dual_plane(5549local_md.m_weight_ise_range, local_md.m_weight_ise_range,5550BLOCK_W, BLOCK_H,5551grid_x, grid_y,5552blk_weights0, blk_weights1,5553coded_log_blk.m_weights);5554}5555else5556{5557downsample_ise_weights(5558local_md.m_weight_ise_range, local_md.m_weight_ise_range,5559BLOCK_W, BLOCK_H,5560grid_x, grid_y,5561blk_weights0, coded_log_blk.m_weights);5562}55635564basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, local_md.m_weight_ise_range, transcode_weights, local_md.m_transcode_weight_ise_range);55655566// Create the block the decoder would transcode into.55675568astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;5569decomp_blk.clear();55705571decomp_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem;5572decomp_blk.m_dual_plane = local_md.m_dp;5573decomp_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;5574decomp_blk.m_num_partitions = 1;5575decomp_blk.m_endpoint_ise_range = (uint8_t)local_md.m_transcode_endpoint_ise_range;5576decomp_blk.m_weight_ise_range = (uint8_t)local_md.m_transcode_weight_ise_range;55775578memcpy(decomp_blk.m_endpoints, transcode_endpoints, num_endpoint_vals);55795580copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk);55815582if (!validate_log_blk(decomp_blk))5583{5584fmt_error_printf("pack_astc_block() failed\n");5585return false;5586}55875588status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);5589if (!status)5590{5591fmt_error_printf("decode_astc_block() failed\n");5592return false;5593}55945595candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);5596code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, endpoint_deltas);55975598candidate.m_encoding_type = encoding_type::cBlock;5599candidate.m_endpoint_mode = em;5600candidate.m_block_mode = bm;56015602candidates.emplace_back(std::move(candidate));5603}56045605break;5606}5607case endpoint_mode::cRaw:5608{5609//if (candidates.size() == 339)5610// fmt_printf("!");56115612const auto& mode_desc = g_block_mode_descs[(uint32_t)bm];5613const uint32_t cem = mode_desc.m_cem;5614//const uint32_t num_endpoint_vals = get_num_endpoint_vals(cem);5615const bool dual_plane = mode_desc.m_dp;56165617if ((global_cfg.m_disable_twothree_subsets) && (mode_desc.m_num_partitions >= 2))5618break;56195620if (mode_desc.m_num_partitions == 3)5621{5622assert(!dual_plane);56235624if (!has_estimated_parts3)5625break;56265627assert(mode_desc.m_weight_ise_range == mode_desc.m_transcode_weight_ise_range);5628assert(mode_desc.m_endpoint_ise_range == mode_desc.m_transcode_endpoint_ise_range);56295630trial_result res;56315632status = encode_block_3_subsets(5633res,5634cem,5635mode_desc.m_grid_x, mode_desc.m_grid_y,5636mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range,5637&half_pixels[0][0], (vec4F*)block_pixels_q16,5638coptions,5639uber_mode_flag,5640best_parts3, total_parts3, comp_level, mode11_opt_mode);56415642if (!status)5643break;56445645assert(res.m_valid);56465647candidate_encoding candidate;5648candidate.m_coder.reserve(24);5649astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;56505651coded_log_blk = res.m_log_blk;56525653astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;5654decomp_blk = res.m_log_blk;56555656if (!validate_log_blk(decomp_blk))5657{5658fmt_error_printf("pack_astc_block() failed\n");5659return false;5660}56615662status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);5663if (!status)5664{5665fmt_error_printf("decode_astc_block() failed\n");5666return false;5667}56685669candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);5670code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr);56715672candidate.m_encoding_type = encoding_type::cBlock;5673candidate.m_endpoint_mode = em;5674candidate.m_block_mode = bm;56755676candidates.emplace_back(std::move(candidate));5677}5678else if (mode_desc.m_num_partitions == 2)5679{5680assert(!dual_plane);56815682if (!has_estimated_parts2)5683break;56845685assert(mode_desc.m_weight_ise_range == mode_desc.m_transcode_weight_ise_range);5686assert(mode_desc.m_endpoint_ise_range == mode_desc.m_transcode_endpoint_ise_range);56875688for (uint32_t est_part_iter = 0; est_part_iter < total_parts2; est_part_iter++)5689{5690trial_result results[2];56915692assert(((cem == 11) && any_2subset_mode11_enabled) || ((cem == 7) && any_2subset_mode7_enabled));56935694status = encode_block_2_subsets(5695results,5696mode_desc.m_grid_x, mode_desc.m_grid_y,5697mode_desc.m_cem,5698mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range,5699&half_pixels[0][0], (vec4F*)block_pixels_q16,5700coptions,5701uber_mode_flag,5702(cem == 11) ? best_parts2_mode11[est_part_iter] : best_parts2_mode7[est_part_iter],5703comp_level,5704mode11_opt_mode,5705true);57065707if (!status)5708continue;57095710for (uint32_t r_iter = 0; r_iter < 2; r_iter++)5711{5712const trial_result& res = results[r_iter];57135714if (!res.m_valid)5715continue;57165717candidate_encoding candidate;5718candidate.m_coder.reserve(24);5719astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;57205721coded_log_blk = res.m_log_blk;57225723astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;5724decomp_blk = res.m_log_blk;57255726if (!validate_log_blk(decomp_blk))5727{5728fmt_error_printf("pack_astc_block() failed\n");5729return false;5730}57315732status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);5733if (!status)5734{5735fmt_error_printf("decode_astc_block() failed\n");5736return false;5737}57385739candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);5740code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr);57415742candidate.m_encoding_type = encoding_type::cBlock;5743candidate.m_endpoint_mode = em;5744candidate.m_block_mode = bm;57455746candidates.emplace_back(std::move(candidate));57475748} // r_iter5749}5750}5751else5752{5753// 1 subset5754uint8_t blk_weights0[BLOCK_W * BLOCK_H], blk_weights1[BLOCK_W * BLOCK_H];5755uint32_t best_submode = 0;57565757candidate_encoding candidate;5758candidate.m_coder.reserve(24);5759astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;57605761const uint32_t grid_x = mode_desc.m_grid_x, grid_y = mode_desc.m_grid_y;5762const uint32_t num_grid_samples = grid_x * grid_y;57635764const half_vec3* pBlock_pixels_half = &half_pixels[0][0];5765const vec4F* pBlock_pixels_q16 = &block_pixels_q16[0][0];57665767const uint32_t num_grid_samples_dp = num_grid_samples * (dual_plane ? 2 : 1);57685769uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];57705771coded_log_blk.m_grid_width = (uint8_t)grid_x;5772coded_log_blk.m_grid_height = (uint8_t)grid_y;5773coded_log_blk.m_dual_plane = (uint8_t)dual_plane;5774coded_log_blk.m_color_component_selector = (uint8_t)mode_desc.m_dp_channel;5775coded_log_blk.m_num_partitions = 1;5776coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)mode_desc.m_cem;5777coded_log_blk.m_weight_ise_range = (uint8_t)mode_desc.m_weight_ise_range;5778coded_log_blk.m_endpoint_ise_range = (uint8_t)mode_desc.m_endpoint_ise_range;57795780if ((cem == 11) && (!dual_plane) && ((grid_x < BLOCK_W) || (grid_y < BLOCK_H)))5781{5782double e = encode_astc_hdr_block_downsampled_mode_11(5783BLOCK_W, BLOCK_H, grid_x, grid_y,5784mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range,5785NUM_BLOCK_PIXELS, (basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,5786BIG_FLOAT_VAL,5787FIRST_MODE11_SUBMODE_INDEX, MAX_MODE11_SUBMODE_INDEX, false, mode11_opt_mode,5788coded_log_blk.m_endpoints, coded_log_blk.m_weights, best_submode,5789coptions,5790&enc_block_stats);57915792if (e == BIG_FLOAT_VAL)5793break;5794}5795else5796{5797if (cem == 7)5798{5799assert(!dual_plane);58005801double e = encode_astc_hdr_block_mode_7(5802NUM_BLOCK_PIXELS,5803(basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,5804mode_desc.m_weight_ise_range,5805best_submode,5806BIG_FLOAT_VAL,5807coded_log_blk.m_endpoints,5808blk_weights0,5809coptions,5810mode_desc.m_endpoint_ise_range,58110, MAX_MODE7_SUBMODE_INDEX,5812&enc_block_stats);5813BASISU_NOTE_UNUSED(e);5814}5815else5816{5817double e;58185819if (dual_plane)5820{5821e = encode_astc_hdr_block_mode_11_dual_plane(5822NUM_BLOCK_PIXELS,5823(basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,5824mode_desc.m_dp_channel,5825mode_desc.m_weight_ise_range,5826best_submode,5827BIG_FLOAT_VAL,5828coded_log_blk.m_endpoints,5829blk_weights0, blk_weights1,5830coptions,5831false,5832mode_desc.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false);5833}5834else5835{5836e = encode_astc_hdr_block_mode_11(5837NUM_BLOCK_PIXELS,5838(basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,5839mode_desc.m_weight_ise_range,5840best_submode,5841BIG_FLOAT_VAL,5842coded_log_blk.m_endpoints,5843blk_weights0,5844coptions,5845false,5846mode_desc.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false,5847mode11_opt_mode,5848&enc_block_stats);5849}58505851if (e == BIG_FLOAT_VAL)5852break;5853}58545855if (dual_plane)5856{5857downsample_ise_weights_dual_plane(5858mode_desc.m_weight_ise_range, mode_desc.m_weight_ise_range,5859BLOCK_W, BLOCK_H,5860grid_x, grid_y,5861blk_weights0, blk_weights1,5862coded_log_blk.m_weights);5863}5864else5865{5866downsample_ise_weights(5867mode_desc.m_weight_ise_range, mode_desc.m_weight_ise_range,5868BLOCK_W, BLOCK_H,5869grid_x, grid_y,5870blk_weights0, coded_log_blk.m_weights);58715872if ((comp_level >= MIN_REFINE_LEVEL) && ((grid_x < BLOCK_W) || (grid_y < BLOCK_H)))5873{5874bool refine_status = refine_endpoints(cem,5875mode_desc.m_endpoint_ise_range, coded_log_blk.m_endpoints,58766, 6, mode_desc.m_grid_x, mode_desc.m_grid_y,5877coded_log_blk.m_weights, mode_desc.m_weight_ise_range,5878BLOCK_W * BLOCK_H,5879(basist::half_float(*)[3])pBlock_pixels_half, (vec4F*)pBlock_pixels_q16,5880nullptr,5881coptions, mode11_opt_mode);5882BASISU_NOTE_UNUSED(refine_status);5883}5884}5885}58865887basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples_dp, coded_log_blk.m_weights, mode_desc.m_weight_ise_range, transcode_weights, mode_desc.m_transcode_weight_ise_range);58885889// Create the block the decoder would transcode into.5890astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;5891decomp_blk.clear();58925893decomp_blk.m_color_endpoint_modes[0] = (uint8_t)mode_desc.m_cem;5894decomp_blk.m_dual_plane = mode_desc.m_dp;5895decomp_blk.m_color_component_selector = (uint8_t)mode_desc.m_dp_channel;5896decomp_blk.m_num_partitions = 1;5897decomp_blk.m_endpoint_ise_range = (uint8_t)mode_desc.m_transcode_endpoint_ise_range;5898decomp_blk.m_weight_ise_range = (uint8_t)mode_desc.m_transcode_weight_ise_range;58995900basist::astc_6x6_hdr::requantize_ise_endpoints(mode_desc.m_cem, mode_desc.m_endpoint_ise_range, coded_log_blk.m_endpoints, mode_desc.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints);59015902copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk);59035904if (!validate_log_blk(decomp_blk))5905{5906fmt_error_printf("pack_astc_block() failed\n");5907return false;5908}59095910status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);5911if (!status)5912{5913fmt_error_printf("decode_astc_block() failed\n");5914return false;5915}59165917candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);5918code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr);59195920candidate.m_encoding_type = encoding_type::cBlock;5921candidate.m_endpoint_mode = em;5922candidate.m_block_mode = bm;59235924candidates.emplace_back(std::move(candidate));5925}59265927break;5928}5929default:5930assert(0);5931fmt_debug_printf("Invalid endpoint mode\n");5932return false;59335934} // switch (em)59355936} // endpoint_mode_iter59375938} // block_mode_iter59395940} // is_solid_block59415942//------------------------------------------------59435944debug_state.m_total_candidates_considered.fetch_add(candidates.size_u32(), std::memory_order_relaxed);5945atomic_max(debug_state.m_max_candidates_considered, candidates.size_u32());59465947for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)5948{5949auto& candidate = candidates[candidate_iter];59505951for (uint32_t y = 0; y < BLOCK_H; y++)5952for (uint32_t x = 0; x < BLOCK_W; x++)5953linear_rgb_to_itp(candidate.m_comp_pixels[y][x], candidate.m_comp_pixels_itp[y][x], global_cfg);5954}59555956// Find best overall candidate5957double best_t = BIG_FLOAT_VAL;5958int best_candidate_index = -1;59595960float best_d_ssim = BIG_FLOAT_VAL;59615962if (global_cfg.m_lambda == 0.0f)5963{5964for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)5965{5966const auto& candidate = candidates[candidate_iter];59675968float candidate_d_ssim = 1.0f - compute_block_ssim_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0]);59695970if (candidate_d_ssim < best_d_ssim)5971best_d_ssim = candidate_d_ssim;59725973candidate_d_ssim *= SSIM_WEIGHT;59745975float candidate_mse = MSE_WEIGHT * compute_block_mse_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0], global_cfg.m_delta_itp_dark_adjustment);59765977candidate_mse += candidate_d_ssim;59785979float total_deblock_penalty = 0.0f;5980if (global_cfg.m_deblocking_flag)5981{5982total_deblock_penalty = calc_deblocking_penalty_itp(bx, by, width, height, pass_src_img_itp, candidate) * global_cfg.m_deblock_penalty_weight;5983}5984candidate_mse += total_deblock_penalty * SSIM_WEIGHT;59855986if ((candidate.m_encoding_type == encoding_type::cBlock) || (candidate.m_encoding_type == encoding_type::cReuse))5987{5988// Bias the encoder away from 2 level blocks on complex blocks5989// TODO: Perhaps only do this on large or non-interpolated grids5990if (complex_block)5991{5992if (candidate.m_coded_log_blk.m_weight_ise_range == astc_helpers::BISE_2_LEVELS)5993{5994candidate_mse *= TWO_LEVEL_PENALTY;5995}5996}59975998// Bias the encoder away from smaller weight grids if the block is very complex5999// TODO: Use the DCT to compute an approximation of the block energy/variance retained vs. lost by downsampling.6000if (complex_block)6001{6002if ((candidate.m_coded_log_blk.m_grid_width == 2) && (candidate.m_coded_log_blk.m_grid_height == 2))6003candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY;6004else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 3)6005candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY;6006else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 4)6007candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY;6008}6009}60106011float candidate_t = candidate_mse;60126013if (candidate_t < best_t)6014{6015best_t = candidate_t;6016best_candidate_index = candidate_iter;6017}60186019} // candidate_iter60206021if (global_cfg.m_gaussian1_fallback && (outer_pass == 0) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM))6022{6023debug_state.m_total_gaussian1_blocks.fetch_add(1, std::memory_order_relaxed);6024continue;6025}60266027const float block_y_contrast_ratio = block_hy / (block_ly + .00000125f);60286029if (global_cfg.m_gaussian2_fallback && (comp_level >= 1) && (outer_pass == 1) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM) &&6030(block_hy >= 18.0f) && (block_y_contrast_ratio > 150.0f) &&6031(block_avg_y >= 1.5f))6032{6033debug_state.m_total_gaussian2_blocks.fetch_add(1, std::memory_order_relaxed);6034continue;6035}6036}6037else6038{6039assert(enc_state.smooth_block_mse_scales.get_width() > 0);60406041// Compute block's perceptual weighting6042float perceptual_scale = 0.0f;6043for (uint32_t y = 0; y < BLOCK_H; y++)6044for (uint32_t x = 0; x < BLOCK_W; x++)6045perceptual_scale = basisu::maximumf(perceptual_scale, enc_state.smooth_block_mse_scales.at_clamped(bx * BLOCK_W + x, by * BLOCK_H + y));60466047// Very roughly normalize the computed distortion vs. bits.6048perceptual_scale *= 10.0f;60496050for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)6051{6052auto& candidate = candidates[candidate_iter];60536054float d_ssim = 1.0f - compute_block_ssim_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0]);60556056if (d_ssim < best_d_ssim)6057best_d_ssim = (float)d_ssim;60586059d_ssim *= SSIM_WEIGHT;60606061float candidate_mse = MSE_WEIGHT * compute_block_mse_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0], global_cfg.m_delta_itp_dark_adjustment);60626063candidate_mse += d_ssim;60646065float total_deblock_penalty = 0.0f;6066if (global_cfg.m_deblocking_flag)6067{6068total_deblock_penalty = calc_deblocking_penalty_itp(bx, by, width, height, pass_src_img_itp, candidate) * global_cfg.m_deblock_penalty_weight;6069}6070candidate_mse += total_deblock_penalty * SSIM_WEIGHT;60716072if ((candidate.m_encoding_type == encoding_type::cBlock) || (candidate.m_encoding_type == encoding_type::cReuse))6073{6074// Bias the encoder away from 2 level blocks on complex blocks6075if (complex_block)6076{6077if (candidate.m_coded_log_blk.m_weight_ise_range == astc_helpers::BISE_2_LEVELS)6078{6079candidate_mse *= TWO_LEVEL_PENALTY;6080}6081}60826083// Bias the encoder away from smaller weight grids if the block is very complex6084if (complex_block)6085{6086if ((candidate.m_coded_log_blk.m_grid_width == 2) && (candidate.m_coded_log_blk.m_grid_height == 2))6087candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY;6088else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 3)6089candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY;6090else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 4)6091candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY;6092}6093}60946095float mode_penalty = 1.0f;6096if (candidate.m_encoding_type == encoding_type::cSolid)6097mode_penalty *= SOLID_PENALTY;6098else if (candidate.m_encoding_type == encoding_type::cReuse)6099mode_penalty *= REUSE_PENALTY;6100else if (candidate.m_encoding_type == encoding_type::cRun)6101mode_penalty *= (complex_block ? RUN_PENALTY * 2.0f : RUN_PENALTY);61026103float candidate_bits = (float)candidate.m_coder.get_total_bits();6104float candidate_d = candidate_mse * mode_penalty;61056106const float D_POWER = 2.0f;6107float candidate_t = perceptual_scale * powf(candidate_d, D_POWER) + candidate_bits * (global_cfg.m_lambda * 1000.0f);61086109candidate.m_t = candidate_t;6110candidate.m_d = candidate_d;6111candidate.m_bits = candidate_bits;61126113if (candidate_t < best_t)6114{6115best_t = candidate_t;6116best_candidate_index = candidate_iter;6117}61186119} // candidate_iter61206121if (global_cfg.m_gaussian1_fallback && (outer_pass == 0) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM))6122{6123debug_state.m_total_gaussian1_blocks.fetch_add(1, std::memory_order_relaxed);6124continue;6125}61266127const float block_y_contrast_ratio = block_hy / (block_ly + .00000125f);61286129if (global_cfg.m_gaussian2_fallback && (comp_level >= 1) && (outer_pass == 1) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM) &&6130(block_hy >= 18.0f) && (block_y_contrast_ratio > 150.0f) &&6131(block_avg_y >= 1.5f))6132{6133debug_state.m_total_gaussian2_blocks.fetch_add(1, std::memory_order_relaxed);6134continue;6135}61366137if (global_cfg.m_rdo_candidate_diversity_boost)6138{6139// candidate diversity boosting - consider candidates along/near the Pareto front6140const candidate_encoding& comp_candidate = candidates[best_candidate_index];61416142float best_d = BIG_FLOAT_VAL;61436144for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)6145{6146const auto& candidate = candidates[candidate_iter];61476148if (candidate.m_bits <= comp_candidate.m_bits * global_cfg.m_rdo_candidate_diversity_boost_bit_window_weight)6149{6150if (candidate.m_d < best_d)6151{6152best_d = candidate.m_d;6153best_candidate_index = candidate_iter;6154}6155}6156}6157}61586159// candidate JND optimization - if there's a cheaper to code candidate that is nearly equivalent visually to the best candidate chose, choose that6160if (global_cfg.m_jnd_optimization)6161{6162const candidate_encoding& cur_comp_candidate = candidates[best_candidate_index];61636164float new_best_candidate_bits = BIG_FLOAT_VAL;6165int new_best_candidate_index = -1;61666167for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)6168{6169if ((int)candidate_iter == best_candidate_index)6170continue;61716172const auto& candidate = candidates[candidate_iter];61736174if (candidate.m_bits >= cur_comp_candidate.m_bits)6175continue;61766177float max_delta_itp = 0.0f;6178for (uint32_t y = 0; y < BLOCK_H; y++)6179{6180for (uint32_t x = 0; x < BLOCK_W; x++)6181{6182float delta_itp = compute_pixel_delta_itp(cur_comp_candidate.m_comp_pixels_itp[y][x], candidate.m_comp_pixels_itp[y][x], block_pixels_as_itp[y][x], global_cfg.m_delta_itp_dark_adjustment);6183max_delta_itp = maximum(max_delta_itp, delta_itp);61846185if (max_delta_itp >= global_cfg.m_jnd_delta_itp_thresh)6186goto skip;6187}6188}61896190skip:6191if (max_delta_itp >= global_cfg.m_jnd_delta_itp_thresh)6192continue;61936194if (candidate.m_bits < new_best_candidate_bits)6195{6196new_best_candidate_bits = candidate.m_bits;6197new_best_candidate_index = candidate_iter;6198}6199}62006201if (new_best_candidate_index != -1)6202{6203best_candidate_index = new_best_candidate_index;6204debug_state.m_total_jnd_replacements.fetch_add(1, std::memory_order_relaxed);6205}6206}62076208} // if (lambda == 0.0f)62096210if (global_cfg.m_debug_images)6211{6212std::lock_guard<std::mutex> lck(debug_state.m_stat_vis_mutex);6213debug_state.m_stat_vis.fill_box(bx * 6, by * 6, 6, 6, vec4F(best_d_ssim, max_std_dev, lowpass_std_dev, 1.0f));6214}62156216if (best_candidate_index < 0)6217{6218assert(best_candidate_index >= 0);6219fmt_error_printf("No candidates!\n");6220return false;6221}62226223const auto& best_candidate = candidates[best_candidate_index];62246225assert(best_candidate.m_encoding_type != encoding_type::cInvalid);62266227if (best_candidate.m_encoding_type == encoding_type::cRun)6228{6229if (!prev_run_len)6230{6231if (prev_encoding.get_total_bits())6232{6233#if SYNC_MARKERS6234strip_coded_bits.put_bits(0xDEAD, 16);6235#endif62366237strip_coded_bits.append(prev_encoding);6238}62396240assert(best_candidate.m_coder.get_total_bits());62416242prev_encoding = best_candidate.m_coder;62436244prev_run_len = 1;6245}6246else6247{6248prev_run_len++;62496250const uint32_t prev_run_bits = prev_encoding.get_total_bits_u32();6251assert(prev_run_bits);6252BASISU_NOTE_UNUSED(prev_run_bits);62536254const uint32_t num_dummy_bits = best_candidate.m_coder.get_total_bits_u32();6255BASISU_NOTE_UNUSED(num_dummy_bits);62566257// Rewrite the previous encoding to extend the run length.6258prev_encoding.restart();6259prev_encoding.put_bits(RUN_CODE, RUN_CODE_LEN);6260prev_encoding.put_vlc(prev_run_len - 1, 5);62616262assert(prev_encoding.get_total_bits() == prev_run_bits + num_dummy_bits);6263}6264}6265else6266{6267if (prev_encoding.get_total_bits())6268{6269#if SYNC_MARKERS6270strip_coded_bits.put_bits(0xDEAD, 16);6271#endif62726273strip_coded_bits.append(prev_encoding);6274}62756276prev_encoding = best_candidate.m_coder;6277prev_run_len = 0;6278}62796280memcpy(prev_comp_pixels, best_candidate.m_comp_pixels, sizeof(vec3F) * BLOCK_W * BLOCK_H);62816282prev_candidate_encoding = best_candidate;62836284if (best_candidate.m_encoding_type != encoding_type::cRun)6285prev_non_run_candidate_encoding = best_candidate;62866287{6288std::lock_guard<std::mutex> lck(debug_state.m_stats_mutex);62896290debug_state.m_encoding_type_hist[(uint32_t)best_candidate.m_encoding_type]++;62916292if (best_candidate.m_encoding_type == encoding_type::cBlock)6293{6294debug_state.m_endpoint_mode_hist[(uint32_t)best_candidate.m_endpoint_mode]++;6295}62966297if ((best_candidate.m_encoding_type == encoding_type::cReuse) || (best_candidate.m_encoding_type == encoding_type::cBlock))6298{6299const uint32_t bm_index = (uint32_t)best_candidate.m_block_mode;6300assert(bm_index < (uint32_t)block_mode::cBMTotalModes);63016302debug_state.m_block_mode_hist[bm_index]++;6303debug_state.m_block_mode_total_bits[bm_index] += best_candidate.m_coder.get_total_bits();63046305for (uint32_t i = 0; i < 3; i++)6306{6307debug_state.m_block_mode_comp_stats[bm_index][i].push_back(half_comp_stats[i]);6308debug_state.m_block_mode_comparative_stats[bm_index][i].push_back(half_cross_chan_stats[i]);6309}6310}63116312if (best_candidate.m_encoding_type == encoding_type::cReuse)6313{6314debug_state.m_reuse_num_parts[best_candidate.m_coded_log_blk.m_num_partitions].fetch_add(1, std::memory_order_relaxed);63156316if (best_candidate.m_coded_log_blk.m_dual_plane)6317debug_state.m_reuse_total_dp.fetch_add(1, std::memory_order_relaxed);6318}6319}63206321enc_state.coded_blocks(bx, by) = prev_non_run_candidate_encoding;63226323// Update decoded image6324vec4F decoded_float_pixels[BLOCK_H][BLOCK_W];6325for (uint32_t y = 0; y < BLOCK_H; y++)6326for (uint32_t x = 0; x < BLOCK_W; x++)6327decoded_float_pixels[y][x] = best_candidate.m_comp_pixels[y][x];63286329enc_state.packed_img.set_block_clipped((vec4F*)decoded_float_pixels, bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H);63306331status = astc_helpers::pack_astc_block(enc_state.final_astc_blocks(bx, by), best_candidate.m_decomp_log_blk, nullptr, nullptr);6332if (!status)6333{6334fmt_error_printf("Failed packing block\n");6335return false;6336}63376338const uint32_t r = debug_state.m_total_blocks_compressed.fetch_add(1, std::memory_order_relaxed);6339if ((r & 2047) == 2047)6340{6341if (global_cfg.m_status_output)6342{6343basisu::fmt_printf("{} of {} total blocks compressed, {3.2}%\n", r, total_blocks, (r * 100.0f) / total_blocks);6344}6345}63466347if ((global_cfg.m_debug_images) &&6348((best_candidate.m_encoding_type != encoding_type::cRun) && (best_candidate.m_encoding_type != encoding_type::cSolid)))6349{6350std::lock_guard<std::mutex> lck(debug_state.m_vis_image_mutex);63516352if (best_candidate.m_decomp_log_blk.m_num_partitions == 2)6353{6354const int part2_unique_index = g_part2_seed_to_unique_index[best_candidate.m_decomp_log_blk.m_partition_id];6355assert((part2_unique_index >= 0) && (part2_unique_index < (int)NUM_UNIQUE_PARTITIONS2));63566357const partition_pattern_vec& pat = g_partitions2[part2_unique_index];63586359for (uint32_t y = 0; y < 6; y++)6360{6361for (uint32_t x = 0; x < 6; x++)6362{6363const uint32_t p = pat[x + y * 6];6364debug_state.m_part_vis.set_clipped(bx * 6 + x, by * 6 + y, color_rgba(p ? 100 : 0, 128, p ? 100 : 0, 255));6365} // x6366} // y6367}6368else if (best_candidate.m_decomp_log_blk.m_num_partitions == 3)6369{6370//part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(0, 0, 255, 255));63716372const int part3_unique_index = g_part3_seed_to_unique_index[best_candidate.m_decomp_log_blk.m_partition_id];6373assert((part3_unique_index >= 0) && (part3_unique_index < (int)NUM_UNIQUE_PARTITIONS3));63746375const partition_pattern_vec& pat = g_partitions3[part3_unique_index];63766377for (uint32_t y = 0; y < 6; y++)6378{6379for (uint32_t x = 0; x < 6; x++)6380{6381const uint32_t p = pat[x + y * 6];6382color_rgba c(0, 0, 150, 255);6383if (p == 1)6384c.set(100, 0, 150, 255);6385else if (p == 2)6386c.set(0, 100, 150, 255);6387debug_state.m_part_vis.set_clipped(bx * 6 + x, by * 6 + y, c);6388} // x6389} // y6390}6391else if (best_candidate.m_decomp_log_blk.m_dual_plane)6392{6393debug_state.m_part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(255, 0, 255, 255));6394}6395else6396{6397debug_state.m_part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(255, 0, 0, 255));6398}63996400color_rgba c;6401c.set((best_candidate.m_coded_log_blk.m_grid_width * best_candidate.m_coded_log_blk.m_grid_height * 255 + 18) / 36);6402debug_state.m_grid_vis.fill_box(bx * 6, by * 6, 6, 6, c);64036404c.set(0, 0, 0, 255);6405if (complex_block)6406c[0] = 255;64076408if (very_complex_block)6409c[1] = 255;64106411if (outer_pass == 2)6412c[2] = 255;6413else if (outer_pass == 1)6414c[2] = 128;64156416debug_state.m_mode_vis.fill_box(bx * 6, by * 6, 6, 6, c);64176418c.set(0, 255, 0, 255);6419if (best_candidate.m_coded_log_blk.m_color_endpoint_modes[0] == 7)6420c.set(255, 0, 0, 255);6421debug_state.m_mode_vis2.fill_box(bx * 6, by * 6, 6, 6, c);64226423switch (best_candidate.m_encoding_type)6424{6425case encoding_type::cRun:6426c.set(0, 0, 0, 255);6427break;6428case encoding_type::cSolid:6429c.set(128, 128, 128, 255); // dark grey6430break;6431case encoding_type::cReuse:6432c.set(255, 255, 0, 255); // yellow6433break;6434case encoding_type::cBlock:6435{6436switch (best_candidate.m_endpoint_mode)6437{6438case endpoint_mode::cRaw:6439c.set(255, 0, 0, 255); // red6440break;6441case endpoint_mode::cUseLeft:6442c.set(0, 0, 255, 255); // blue6443break;6444case endpoint_mode::cUseUpper:6445c.set(0, 0, 192, 255); // darker blue6446break;6447case endpoint_mode::cUseLeftDelta:6448c.set(0, 255, 0, 255); // green6449break;6450case endpoint_mode::cUseUpperDelta:6451c.set(0, 192, 0, 255); // darker green6452break;6453default:6454break;6455}64566457break;6458}6459default:6460break;6461}64626463if (filtered_x_err < filtered_y_err)6464c[3] = 0;6465else6466c[3] = 255;64676468debug_state.m_enc_vis.fill_box(bx * 6, by * 6, 6, 6, c);6469}64706471break;64726473} // outer_pass64746475} // bx64766477} // by64786479if (prev_encoding.get_total_bits())6480{6481#if SYNC_MARKERS6482strip_coded_bits.put_bits(0xDEAD, 16);6483#endif64846485strip_coded_bits.append(prev_encoding);6486}64876488return true;6489}64906491bool g_initialized = false;64926493void global_init()6494{6495if (g_initialized)6496return;64976498interval_timer tm;6499tm.start();65006501init_pq_tables();65026503init_partitions2_6x6();6504init_partitions3_6x6();65056506init_contrib_lists();65076508g_initialized = true;65096510//fmt_printf("astc_6x6_hdr::global_init() total time: {}\n", tm.get_elapsed_secs());6511}65126513bool compress_photo(const basisu::imagef &orig_src_img, const astc_hdr_6x6_global_config &orig_global_cfg, job_pool *pJob_pool,6514basisu::uint8_vec& intermediate_tex_data, basisu::uint8_vec& astc_tex_data, result_metrics& metrics)6515{6516assert(g_initialized);6517if (!g_initialized)6518return false;65196520assert(pJob_pool);65216522if (orig_global_cfg.m_debug_output)6523{6524fmt_debug_printf("------ astc_6x6_hdr::compress_photo:\n");6525fmt_debug_printf("Source image dimensions: {}x{}\n", orig_src_img.get_width(), orig_src_img.get_height());6526fmt_debug_printf("Job pool total threads: {}\n", (uint64_t)pJob_pool->get_total_threads());6527orig_global_cfg.print();6528}65296530if (!orig_src_img.get_width() || !orig_src_img.get_height())6531{6532assert(false);6533fmt_error_printf("compress_photo: Invalid source image\n");6534return false;6535}65366537astc_hdr_6x6_global_config global_cfg(orig_global_cfg);65386539uastc_hdr_6x6_encode_state enc_state;6540enc_state.master_coptions.m_q_log_bias = Q_LOG_BIAS_6x6;6541enc_state.src_img = orig_src_img;65426543//src_img.crop(256, 256);65446545const uint32_t width = enc_state.src_img.get_width();6546const uint32_t height = enc_state.src_img.get_height();6547const uint32_t num_blocks_x = enc_state.src_img.get_block_width(BLOCK_W);6548const uint32_t num_blocks_y = enc_state.src_img.get_block_height(BLOCK_H);6549const uint32_t total_blocks = num_blocks_x * num_blocks_y;65506551for (uint32_t y = 0; y < height; y++)6552{6553for (uint32_t x = 0; x < width; x++)6554{6555for (uint32_t c = 0; c < 3; c++)6556{6557float f = enc_state.src_img(x, y)[c];65586559if (std::isinf(f) || std::isnan(f) || (f < 0.0f))6560f = 0;6561else if (f > basist::ASTC_HDR_MAX_VAL)6562f = basist::ASTC_HDR_MAX_VAL;65636564enc_state.src_img(x, y)[c] = f;65656566} // c65676568} // x6569} // y65706571if (global_cfg.m_debug_images)6572{6573write_exr((global_cfg.m_debug_image_prefix + "orig.exr").c_str(), enc_state.src_img, 3, 0);6574}65756576image src_img_compressed;6577tonemap_image_compressive2(src_img_compressed, enc_state.src_img);65786579if (global_cfg.m_debug_images)6580{6581save_png(global_cfg.m_debug_image_prefix + "compressive_tone_map.png", src_img_compressed);6582}65836584smooth_map_params rp;6585rp.m_debug_images = global_cfg.m_debug_images;65866587if (global_cfg.m_lambda != 0.0f)6588{6589if (global_cfg.m_status_output)6590fmt_printf("Creating RDO perceptual weighting maps\n");65916592create_smooth_maps2(enc_state.smooth_block_mse_scales, src_img_compressed, rp);6593}65946595if (global_cfg.m_status_output)6596fmt_printf("Blurring image\n");65976598enc_state.src_img_filtered1.resize(width, height);6599image_resample(enc_state.src_img, enc_state.src_img_filtered1, "gaussian", global_cfg.m_gaussian1_strength); //1.45f);66006601enc_state.src_img_filtered2.resize(width, height);6602image_resample(enc_state.src_img, enc_state.src_img_filtered2, "gaussian", global_cfg.m_gaussian2_strength); //1.83f);66036604if (global_cfg.m_debug_images)6605{6606write_exr((global_cfg.m_debug_image_prefix + "blurred1.exr").c_str(), enc_state.src_img_filtered1, 3, 0);6607write_exr((global_cfg.m_debug_image_prefix + "blurred2.exr").c_str(), enc_state.src_img_filtered2, 3, 0);6608}66096610if (global_cfg.m_status_output)6611fmt_printf("Transforming to ITP\n");66126613enc_state.src_img_itp.resize(width, height);6614convet_rgb_image_to_itp(enc_state.src_img, enc_state.src_img_itp, global_cfg);66156616enc_state.src_img_filtered1_itp.resize(width, height);6617convet_rgb_image_to_itp(enc_state.src_img_filtered1, enc_state.src_img_filtered1_itp, global_cfg);66186619enc_state.src_img_filtered2_itp.resize(width, height);6620convet_rgb_image_to_itp(enc_state.src_img_filtered2, enc_state.src_img_filtered2_itp, global_cfg);66216622if (global_cfg.m_lambda == 0.0f)6623global_cfg.m_favor_higher_compression = false;66246625uint32_t total_strips = 0, rows_per_strip = 0;6626if (!calc_strip_size(global_cfg.m_lambda, num_blocks_y, (uint32_t)pJob_pool->get_total_threads(), global_cfg.m_force_one_strip, total_strips, rows_per_strip, global_cfg))6627{6628fmt_error_printf("compress_photo: Failed computing strip sizes\n");6629return false;6630}66316632if (global_cfg.m_debug_output)6633fmt_printf("lambda: {}, comp_level: {}, highest_comp_level: {}, extra patterns: {}\n", global_cfg.m_lambda, global_cfg.m_master_comp_level, global_cfg.m_highest_comp_level, global_cfg.m_extra_patterns_flag);66346635enc_state.coded_blocks.resize(num_blocks_x, num_blocks_y);66366637bitwise_coder coded_bits;66386639coded_bits.put_bits(0xABCD, 16);6640coded_bits.put_bits(width, 16);6641coded_bits.put_bits(height, 16);66426643enc_state.packed_img.resize(width, height);66446645enc_state.strip_bits.resize(total_strips);66466647enc_state.final_astc_blocks.resize(num_blocks_x, num_blocks_y);66486649uastc_hdr_6x6_debug_state debug_state;66506651if (global_cfg.m_debug_images)6652debug_state.init(width, height);6653else6654debug_state.init(0, 0);66556656interval_timer tm;6657tm.start();66586659std::atomic_bool any_failed_flag;6660any_failed_flag.store(false);66616662for (uint32_t strip_index = 0; strip_index < total_strips; strip_index++)6663{6664const uint32_t strip_first_by = strip_index * rows_per_strip;66656666uint32_t strip_last_by = minimum<uint32_t>(strip_first_by + rows_per_strip - 1, num_blocks_y);6667if (strip_index == (total_strips - 1))6668strip_last_by = num_blocks_y - 1;66696670pJob_pool->add_job([&any_failed_flag, &global_cfg, &debug_state, &enc_state,6671strip_index, total_strips, strip_first_by, strip_last_by,6672num_blocks_x, num_blocks_y, total_blocks, width, height]6673{6674if (!any_failed_flag)6675{6676bool status = compress_strip_task(6677strip_index, total_strips, strip_first_by, strip_last_by,6678num_blocks_x, num_blocks_y, total_blocks, width, height,6679global_cfg, debug_state, enc_state);66806681if (!status)6682{6683fmt_error_printf("compress_photo: compress_strip_task() failed\n");6684any_failed_flag.store(true, std::memory_order_relaxed);6685}6686}6687} );66886689if (any_failed_flag)6690break;66916692} // strip_index66936694pJob_pool->wait_for_all();66956696if (any_failed_flag)6697{6698fmt_error_printf("One or more strips failed during compression\n");6699return false;6700}67016702if (global_cfg.m_debug_output)6703fmt_printf("Encoding time: {} secs\n", tm.get_elapsed_secs());67046705if (global_cfg.m_debug_output)6706debug_state.print(total_blocks);67076708if (global_cfg.m_debug_images)6709{6710save_png(global_cfg.m_debug_image_prefix + "part_vis.png", debug_state.m_part_vis);6711save_png(global_cfg.m_debug_image_prefix + "grid_vis.png", debug_state.m_grid_vis);6712save_png(global_cfg.m_debug_image_prefix + "mode_vis.png", debug_state.m_mode_vis);6713save_png(global_cfg.m_debug_image_prefix + "mode_vis2.png", debug_state.m_mode_vis2);6714save_png(global_cfg.m_debug_image_prefix + "enc_vis.png", debug_state.m_enc_vis);6715write_exr((global_cfg.m_debug_image_prefix + "stat_vis.exr").c_str(), debug_state.m_stat_vis, 3, 0);6716}67176718for (uint32_t i = 0; i < total_strips; i++)6719coded_bits.append(enc_state.strip_bits[i]);67206721coded_bits.put_bits(0xA742, 16);67226723coded_bits.flush();67246725if (global_cfg.m_output_images)6726{6727write_exr((global_cfg.m_output_image_prefix + "comp.exr").c_str(), enc_state.packed_img, 3, 0);6728}67296730if (global_cfg.m_debug_output)6731fmt_printf("\nTotal intermediate output bits/pixel: {3.4}\n", (float)coded_bits.get_total_bits() / (float)(width * height));67326733vector2D<astc_helpers::astc_block> decoded_blocks1;6734vector2D<astc_helpers::astc_block> decoded_blocks2;67356736if (global_cfg.m_debug_output)6737fmt_printf("decode_file\n");67386739uint32_t unpacked_width = 0, unpacked_height = 0;6740bool status = decode_file(coded_bits.get_bytes(), decoded_blocks1, unpacked_width, unpacked_height);6741if (!status)6742{6743fmt_error_printf("decode_file() failed\n");6744return false;6745}67466747if (global_cfg.m_debug_output)6748fmt_printf("decode_6x6_hdr\n");67496750status = decode_6x6_hdr(coded_bits.get_bytes().get_ptr(), coded_bits.get_bytes().size_in_bytes_u32(), decoded_blocks2, unpacked_width, unpacked_height);6751if (!status)6752{6753fmt_error_printf("decode_6x6_hdr_file() failed\n");6754return false;6755}67566757if ((enc_state.final_astc_blocks.get_width() != decoded_blocks1.get_width()) ||6758(enc_state.final_astc_blocks.get_height() != decoded_blocks1.get_height()))6759{6760fmt_error_printf("Decode size mismatch with decode_file\n");6761return false;6762}67636764if ((enc_state.final_astc_blocks.get_width() != decoded_blocks2.get_width()) ||6765(enc_state.final_astc_blocks.get_height() != decoded_blocks2.get_height()))6766{6767fmt_error_printf("Decode size mismatch with decode_6x6_hdr_file\n");6768return false;6769}67706771if (memcmp(decoded_blocks1.get_ptr(), enc_state.final_astc_blocks.get_ptr(), decoded_blocks1.size_in_bytes()) != 0)6772{6773fmt_error_printf("Decoded ASTC blocks verification failed\n");6774return false;6775}67766777if (memcmp(decoded_blocks2.get_ptr(), enc_state.final_astc_blocks.get_ptr(), decoded_blocks2.size_in_bytes()) != 0)6778{6779fmt_error_printf("Decoded ASTC blocks verification failed\n");6780return false;6781}67826783if (global_cfg.m_debug_output)6784basisu::fmt_printf("Decoded ASTC verification checks succeeded\n");67856786if (global_cfg.m_output_images)6787{6788if (write_astc_file((global_cfg.m_output_image_prefix + "decoded.astc").c_str(), decoded_blocks1.get_ptr(), BLOCK_W, BLOCK_H, width, height))6789{6790basisu::platform_sleep(20);67916792uint8_vec astc_file_data;6793if (read_file_to_vec((global_cfg.m_output_image_prefix + "decoded.astc").c_str(), astc_file_data))6794{6795if (astc_file_data.size() > 16)6796{6797astc_file_data.erase(0, 16);67986799size_t comp_size = 0;6800void* pComp_data = tdefl_compress_mem_to_heap(&astc_file_data[0], astc_file_data.size(), &comp_size, TDEFL_MAX_PROBES_MASK);6801mz_free(pComp_data);68026803if (global_cfg.m_debug_output)6804{6805fmt_printf(".ASTC file size (less header): {}, bits/pixel: {}, Deflate bits/pixel: {}\n",6806(uint64_t)astc_file_data.size(),6807(float)astc_file_data.size() * 8.0f / (float)(width * height),6808(float)comp_size * 8.0f / (float)(width * height));6809}6810}6811}6812}6813}68146815// Must decode all the blocks (even padded rows/cols) to match what the transcoder does.6816imagef unpacked_astc_img(num_blocks_x * 6, num_blocks_y * 6);6817imagef unpacked_astc_google_img(num_blocks_x * 6, num_blocks_y * 6);68186819for (uint32_t y = 0; y < decoded_blocks1.get_height(); y++)6820{6821for (uint32_t x = 0; x < decoded_blocks1.get_width(); x++)6822{6823const auto& phys_blk = decoded_blocks1(x, y);68246825vec4F pixels[MAX_BLOCK_W * MAX_BLOCK_H];6826status = unpack_physical_astc_block(&phys_blk, BLOCK_W, BLOCK_H, pixels);6827if (!status)6828{6829fmt_error_printf("unpack_physical_astc_block() failed\n");6830return false;6831}68326833unpacked_astc_img.set_block_clipped(pixels, x * BLOCK_W, y * BLOCK_H, BLOCK_W, BLOCK_H);68346835vec4F pixels_google[MAX_BLOCK_W * MAX_BLOCK_H];6836status = unpack_physical_astc_block_google(&phys_blk, BLOCK_W, BLOCK_H, pixels_google);6837if (!status)6838{6839fmt_error_printf("unpack_physical_astc_block_google() failed\n");6840return false;6841}68426843unpacked_astc_google_img.set_block_clipped(pixels_google, x * BLOCK_W, y * BLOCK_H, BLOCK_W, BLOCK_H);68446845for (uint32_t i = 0; i < 36; i++)6846{6847if (pixels[i] != pixels_google[i])6848{6849fmt_error_printf("pixel unpack mismatch\n");6850return false;6851}6852}6853}6854}68556856if (global_cfg.m_debug_output)6857fmt_printf("\nUnpack succeeded\n");68586859imagef unpacked_bc6h_img;68606861{6862vector2D<basist::bc6h_block> bc6h_blocks;68636864fast_bc6h_params enc_params;68656866bool pack_status = pack_bc6h_image(unpacked_astc_img, bc6h_blocks, &unpacked_bc6h_img, enc_params);6867if (!pack_status)6868{6869fmt_error_printf("pack_bc6h_image() failed!");6870return false;6871}68726873unpacked_bc6h_img.crop(width, height);68746875if (global_cfg.m_output_images)6876{6877write_exr((global_cfg.m_output_image_prefix + "unpacked_bc6h.exr").c_str(), unpacked_bc6h_img, 3, 0);6878}6879}68806881unpacked_astc_img.crop(width, height);6882unpacked_astc_google_img.crop(width, height);68836884if (global_cfg.m_output_images)6885{6886write_exr((global_cfg.m_output_image_prefix + "unpacked_astc.exr").c_str(), unpacked_astc_img, 3, 0);6887write_exr((global_cfg.m_output_image_prefix + "unpacked_google_astc.exr").c_str(), unpacked_astc_google_img, 3, 0);6888}68896890// ASTC metrics6891if (global_cfg.m_image_stats)6892{6893image_metrics im;68946895if (global_cfg.m_debug_output)6896printf("\nASTC log2 float error metrics:\n");68976898for (uint32_t i = 0; i < 3; i++)6899{6900im.calc(enc_state.src_img, unpacked_astc_img, i, 1, true, true);69016902if (global_cfg.m_debug_output)6903{6904printf("%c: ", "RGBA"[i]);6905im.print_hp();6906}6907}69086909metrics.m_im_astc_log2.calc(enc_state.src_img, unpacked_astc_img, 0, 3, true, true);69106911if (global_cfg.m_debug_output)6912{6913printf("RGB: ");6914metrics.m_im_astc_log2.print_hp();69156916printf("\n");6917}6918}69196920if (global_cfg.m_image_stats)6921{6922image_metrics im;69236924if (global_cfg.m_debug_output)6925printf("ASTC half float space error metrics (a piecewise linear approximation of log2 error):\n");69266927for (uint32_t i = 0; i < 3; i++)6928{6929im.calc_half(enc_state.src_img, unpacked_astc_img, i, 1, true);69306931if (global_cfg.m_debug_output)6932{6933printf("%c: ", "RGBA"[i]);6934im.print_hp();6935}6936}69376938metrics.m_im_astc_half.calc_half(enc_state.src_img, unpacked_astc_img, 0, 3, true);69396940if (global_cfg.m_debug_output)6941{6942printf("RGB: ");6943metrics.m_im_astc_half.print_hp();6944}6945}69466947// BC6H metrics6948if (global_cfg.m_image_stats)6949{6950image_metrics im;69516952if (global_cfg.m_debug_output)6953printf("\nBC6H log2 float error metrics:\n");69546955for (uint32_t i = 0; i < 3; i++)6956{6957im.calc(enc_state.src_img, unpacked_bc6h_img, i, 1, true, true);69586959if (global_cfg.m_debug_output)6960{6961printf("%c: ", "RGBA"[i]);6962im.print_hp();6963}6964}69656966metrics.m_im_bc6h_log2.calc(enc_state.src_img, unpacked_bc6h_img, 0, 3, true, true);69676968if (global_cfg.m_debug_output)6969{6970printf("RGB: ");6971metrics.m_im_bc6h_log2.print_hp();69726973printf("\n");6974}6975}69766977if (global_cfg.m_image_stats)6978{6979image_metrics im;69806981if (global_cfg.m_debug_output)6982printf("BC6H half float space error metrics (a piecewise linear approximation of log2 error):\n");69836984for (uint32_t i = 0; i < 3; i++)6985{6986im.calc_half(enc_state.src_img, unpacked_bc6h_img, i, 1, true);69876988if (global_cfg.m_debug_output)6989{6990printf("%c: ", "RGBA"[i]);6991im.print_hp();6992}6993}69946995metrics.m_im_bc6h_half.calc_half(enc_state.src_img, unpacked_bc6h_img, 0, 3, true);69966997if (global_cfg.m_debug_output)6998{6999printf("RGB: ");7000metrics.m_im_bc6h_half.print_hp();70017002printf("\n");7003}7004}70057006intermediate_tex_data.swap(coded_bits.get_bytes());70077008astc_tex_data.resize(decoded_blocks1.size_in_bytes());7009memcpy(astc_tex_data.data(), decoded_blocks1.get_ptr(), decoded_blocks1.size_in_bytes());70107011return true;7012}70137014} // namespace astc_6x6_hdr701570167017