// Path: blob/master/thirdparty/astcenc/astcenc_averages_and_directions.cpp
// 9902 views
// SPDX-License-Identifier: Apache-2.01// ----------------------------------------------------------------------------2// Copyright 2011-2025 Arm Limited3//4// Licensed under the Apache License, Version 2.0 (the "License"); you may not5// use this file except in compliance with the License. You may obtain a copy6// of the License at:7//8// http://www.apache.org/licenses/LICENSE-2.09//10// Unless required by applicable law or agreed to in writing, software11// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT12// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the13// License for the specific language governing permissions and limitations14// under the License.15// ----------------------------------------------------------------------------1617/**18* @brief Functions for finding dominant direction of a set of colors.19*/20#if !defined(ASTCENC_DECOMPRESS_ONLY)2122#include "astcenc_internal.h"2324#include <cassert>2526/**27* @brief Compute the average RGB color of each partition.28*29* The algorithm here uses a vectorized sequential scan and per-partition30* color accumulators, using select() to mask texel lanes in other partitions.31*32* We only accumulate sums for N-1 partitions during the scan; the value for33* the last partition can be computed given that we know the block-wide average34* already.35*36* Because of this we could reduce the loop iteration count so it "just" spans37* the max texel index needed for the N-1 partitions, which could need fewer38* iterations than the full block texel count. However, this makes the loop39* count erratic and causes more branch mispredictions so is a net loss.40*41* @param pi The partitioning to use.42* @param blk The block data to process.43* @param[out] averages The output averages. 
Unused partition indices will44* not be initialized, and lane<3> will be zero.45*/46static void compute_partition_averages_rgb(47const partition_info& pi,48const image_block& blk,49vfloat4 averages[BLOCK_MAX_PARTITIONS]50) {51unsigned int partition_count = pi.partition_count;52size_t texel_count = blk.texel_count;53promise(texel_count > 0);5455// For 1 partition just use the precomputed mean56if (partition_count == 1)57{58averages[0] = blk.data_mean.swz<0, 1, 2>();59}60// For 2 partitions scan results for partition 0, compute partition 161else if (partition_count == 2)62{63vfloatacc pp_avg_rgb[3] {};6465vint lane_id = vint::lane_id();66for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)67{68vint texel_partition(pi.partition_of_texel + i);6970vmask lane_mask = lane_id < vint_from_size(texel_count);71lane_id += vint(ASTCENC_SIMD_WIDTH);7273vmask p0_mask = lane_mask & (texel_partition == vint(0));7475vfloat data_r = loada(blk.data_r + i);76haccumulate(pp_avg_rgb[0], data_r, p0_mask);7778vfloat data_g = loada(blk.data_g + i);79haccumulate(pp_avg_rgb[1], data_g, p0_mask);8081vfloat data_b = loada(blk.data_b + i);82haccumulate(pp_avg_rgb[2], data_b, p0_mask);83}8485vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);8687vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0]),88hadd_s(pp_avg_rgb[1]),89hadd_s(pp_avg_rgb[2]));9091vfloat4 p1_total = block_total - p0_total;9293averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);94averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);95}96// For 3 partitions scan results for partition 0/1, compute partition 297else if (partition_count == 3)98{99vfloatacc pp_avg_rgb[2][3] {};100101vint lane_id = vint::lane_id();102for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)103{104vint texel_partition(pi.partition_of_texel + i);105106vmask lane_mask = lane_id < vint_from_size(texel_count);107lane_id += vint(ASTCENC_SIMD_WIDTH);108109vmask p0_mask = 
lane_mask & (texel_partition == vint(0));110vmask p1_mask = lane_mask & (texel_partition == vint(1));111112vfloat data_r = loada(blk.data_r + i);113haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);114haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);115116vfloat data_g = loada(blk.data_g + i);117haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);118haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);119120vfloat data_b = loada(blk.data_b + i);121haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);122haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);123}124125vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);126127vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),128hadd_s(pp_avg_rgb[0][1]),129hadd_s(pp_avg_rgb[0][2]));130131vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),132hadd_s(pp_avg_rgb[1][1]),133hadd_s(pp_avg_rgb[1][2]));134135vfloat4 p2_total = block_total - p0_total - p1_total;136137averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);138averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);139averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);140}141else142{143// For 4 partitions scan results for partition 0/1/2, compute partition 3144vfloatacc pp_avg_rgb[3][3] {};145146vint lane_id = vint::lane_id();147for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)148{149vint texel_partition(pi.partition_of_texel + i);150151vmask lane_mask = lane_id < vint_from_size(texel_count);152lane_id += vint(ASTCENC_SIMD_WIDTH);153154vmask p0_mask = lane_mask & (texel_partition == vint(0));155vmask p1_mask = lane_mask & (texel_partition == vint(1));156vmask p2_mask = lane_mask & (texel_partition == vint(2));157158vfloat data_r = loada(blk.data_r + i);159haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);160haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);161haccumulate(pp_avg_rgb[2][0], data_r, p2_mask);162163vfloat data_g = loada(blk.data_g + i);164haccumulate(pp_avg_rgb[0][1], data_g, 
p0_mask);165haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);166haccumulate(pp_avg_rgb[2][1], data_g, p2_mask);167168vfloat data_b = loada(blk.data_b + i);169haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);170haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);171haccumulate(pp_avg_rgb[2][2], data_b, p2_mask);172}173174vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);175176vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),177hadd_s(pp_avg_rgb[0][1]),178hadd_s(pp_avg_rgb[0][2]));179180vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),181hadd_s(pp_avg_rgb[1][1]),182hadd_s(pp_avg_rgb[1][2]));183184vfloat4 p2_total = vfloat3(hadd_s(pp_avg_rgb[2][0]),185hadd_s(pp_avg_rgb[2][1]),186hadd_s(pp_avg_rgb[2][2]));187188vfloat4 p3_total = block_total - p0_total - p1_total- p2_total;189190averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);191averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);192averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);193averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);194}195}196197/**198* @brief Compute the average RGBA color of each partition.199*200* The algorithm here uses a vectorized sequential scan and per-partition201* color accumulators, using select() to mask texel lanes in other partitions.202*203* We only accumulate sums for N-1 partitions during the scan; the value for204* the last partition can be computed given that we know the block-wide average205* already.206*207* Because of this we could reduce the loop iteration count so it "just" spans208* the max texel index needed for the N-1 partitions, which could need fewer209* iterations than the full block texel count. However, this makes the loop210* count erratic and causes more branch mispredictions so is a net loss.211*212* @param pi The partitioning to use.213* @param blk The block data to process.214* @param[out] averages The output averages. 
Unused partition indices will
 *                        not be initialized.
 */
static void compute_partition_averages_rgba(
    const partition_info& pi,
    const image_block& blk,
    vfloat4 averages[BLOCK_MAX_PARTITIONS]
) {
    unsigned int partition_count = pi.partition_count;
    size_t texel_count = blk.texel_count;
    promise(texel_count > 0);

    // For 1 partition just use the precomputed mean
    if (partition_count == 1)
    {
        averages[0] = blk.data_mean;
    }
    // For 2 partitions scan results for partition 0, compute partition 1
    else if (partition_count == 2)
    {
        vfloat4 pp_avg_rgba[4] {};

        vint lane_id = vint::lane_id();
        for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
        {
            vint texel_partition(pi.partition_of_texel + i);

            // Disable lanes past the end of the block on the final iteration
            vmask lane_mask = lane_id < vint_from_size(texel_count);
            lane_id += vint(ASTCENC_SIMD_WIDTH);

            vmask p0_mask = lane_mask & (texel_partition == vint(0));

            vfloat data_r = loada(blk.data_r + i);
            haccumulate(pp_avg_rgba[0], data_r, p0_mask);

            vfloat data_g = loada(blk.data_g + i);
            haccumulate(pp_avg_rgba[1], data_g, p0_mask);

            vfloat data_b = loada(blk.data_b + i);
            haccumulate(pp_avg_rgba[2], data_b, p0_mask);

            vfloat data_a = loada(blk.data_a + i);
            haccumulate(pp_avg_rgba[3], data_a, p0_mask);
        }

        // Recover the block-wide sum from the precomputed mean
        vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);

        vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0]),
                                   hadd_s(pp_avg_rgba[1]),
                                   hadd_s(pp_avg_rgba[2]),
                                   hadd_s(pp_avg_rgba[3]));

        // The last partition is whatever remains of the block total
        vfloat4 p1_total = block_total - p0_total;

        averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
        averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
    }
    // For 3 partitions scan results for partition 0/1, compute partition 2
    else if (partition_count == 3)
    {
        vfloat4 pp_avg_rgba[2][4] {};

        vint lane_id = vint::lane_id();
        for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
        {
            vint texel_partition(pi.partition_of_texel + i);

            // Disable lanes past the end of the block on the final iteration
            vmask lane_mask = lane_id < vint_from_size(texel_count);
            lane_id += vint(ASTCENC_SIMD_WIDTH);

            vmask p0_mask = lane_mask & (texel_partition == vint(0));
            vmask p1_mask = lane_mask & (texel_partition == vint(1));

            vfloat data_r = loada(blk.data_r + i);
            haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
            haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);

            vfloat data_g = loada(blk.data_g + i);
            haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
            haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);

            vfloat data_b = loada(blk.data_b + i);
            haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
            haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);

            vfloat data_a = loada(blk.data_a + i);
            haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
            haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
        }

        // Recover the block-wide sum from the precomputed mean
        vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);

        vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
                                   hadd_s(pp_avg_rgba[0][1]),
                                   hadd_s(pp_avg_rgba[0][2]),
                                   hadd_s(pp_avg_rgba[0][3]));

        vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
                                   hadd_s(pp_avg_rgba[1][1]),
                                   hadd_s(pp_avg_rgba[1][2]),
                                   hadd_s(pp_avg_rgba[1][3]));

        // The last partition is whatever remains of the block total
        vfloat4 p2_total = block_total - p0_total - p1_total;

        averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
        averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
        averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
    }
    else
    {
        // For 4 partitions scan results for partition 0/1/2, compute partition 3
        vfloat4 pp_avg_rgba[3][4] {};

        vint lane_id = vint::lane_id();
        for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
        {
            vint texel_partition(pi.partition_of_texel + i);

            // Disable lanes past the end of the block on the final iteration
            vmask lane_mask = lane_id < vint_from_size(texel_count);
            lane_id += vint(ASTCENC_SIMD_WIDTH);

            vmask p0_mask = lane_mask & (texel_partition == vint(0));
            vmask p1_mask = lane_mask & (texel_partition == vint(1));
            vmask p2_mask = lane_mask & (texel_partition == vint(2));

            vfloat data_r = loada(blk.data_r + i);
            haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
            haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);
            haccumulate(pp_avg_rgba[2][0], data_r, p2_mask);

            vfloat data_g = loada(blk.data_g + i);
            haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
            haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);
            haccumulate(pp_avg_rgba[2][1], data_g, p2_mask);

            vfloat data_b = loada(blk.data_b + i);
            haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
            haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);
            haccumulate(pp_avg_rgba[2][2], data_b, p2_mask);

            vfloat data_a = loada(blk.data_a + i);
            haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
            haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
            haccumulate(pp_avg_rgba[2][3], data_a, p2_mask);
        }

        // Recover the block-wide sum from the precomputed mean
        vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);

        vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
                                   hadd_s(pp_avg_rgba[0][1]),
                                   hadd_s(pp_avg_rgba[0][2]),
                                   hadd_s(pp_avg_rgba[0][3]));

        vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
                                   hadd_s(pp_avg_rgba[1][1]),
                                   hadd_s(pp_avg_rgba[1][2]),
                                   hadd_s(pp_avg_rgba[1][3]));

        vfloat4 p2_total = vfloat4(hadd_s(pp_avg_rgba[2][0]),
                                   hadd_s(pp_avg_rgba[2][1]),
                                   hadd_s(pp_avg_rgba[2][2]),
                                   hadd_s(pp_avg_rgba[2][3]));

        // The last partition is whatever remains of the block total
        vfloat4 p3_total = block_total - p0_total - p1_total - p2_total;

        averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
        averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
        averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
        averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
    }
}

/* See header for documentation. 
*/387void compute_avgs_and_dirs_4_comp(388const partition_info& pi,389const image_block& blk,390partition_metrics pm[BLOCK_MAX_PARTITIONS]391) {392size_t partition_count = pi.partition_count;393promise(partition_count > 0);394395// Pre-compute partition_averages396vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];397compute_partition_averages_rgba(pi, blk, partition_averages);398399for (size_t partition = 0; partition < partition_count; partition++)400{401const uint8_t *texel_indexes = pi.texels_of_partition[partition];402size_t texel_count = pi.partition_texel_count[partition];403promise(texel_count > 0);404405vfloat4 average = partition_averages[partition];406pm[partition].avg = average;407408vfloat4 sum_xp = vfloat4::zero();409vfloat4 sum_yp = vfloat4::zero();410vfloat4 sum_zp = vfloat4::zero();411vfloat4 sum_wp = vfloat4::zero();412413for (size_t i = 0; i < texel_count; i++)414{415unsigned int iwt = texel_indexes[i];416vfloat4 texel_datum = blk.texel(iwt);417texel_datum = texel_datum - average;418419vfloat4 zero = vfloat4::zero();420421vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;422sum_xp += select(zero, texel_datum, tdm0);423424vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;425sum_yp += select(zero, texel_datum, tdm1);426427vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;428sum_zp += select(zero, texel_datum, tdm2);429430vmask4 tdm3 = texel_datum.swz<3,3,3,3>() > zero;431sum_wp += select(zero, texel_datum, tdm3);432}433434vfloat4 prod_xp = dot(sum_xp, sum_xp);435vfloat4 prod_yp = dot(sum_yp, sum_yp);436vfloat4 prod_zp = dot(sum_zp, sum_zp);437vfloat4 prod_wp = dot(sum_wp, sum_wp);438439vfloat4 best_vector = sum_xp;440vfloat4 best_sum = prod_xp;441442vmask4 mask = prod_yp > best_sum;443best_vector = select(best_vector, sum_yp, mask);444best_sum = select(best_sum, prod_yp, mask);445446mask = prod_zp > best_sum;447best_vector = select(best_vector, sum_zp, mask);448best_sum = select(best_sum, prod_zp, mask);449450mask = prod_wp > best_sum;451best_vector = 
select(best_vector, sum_wp, mask);452453pm[partition].dir = best_vector;454}455}456457/* See header for documentation. */458void compute_avgs_and_dirs_3_comp(459const partition_info& pi,460const image_block& blk,461unsigned int omitted_component,462partition_metrics pm[BLOCK_MAX_PARTITIONS]463) {464// Pre-compute partition_averages465vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];466compute_partition_averages_rgba(pi, blk, partition_averages);467468const float* data_vr = blk.data_r;469const float* data_vg = blk.data_g;470const float* data_vb = blk.data_b;471472// TODO: Data-driven permute would be useful to avoid this ...473if (omitted_component == 0)474{475partition_averages[0] = partition_averages[0].swz<1, 2, 3>();476partition_averages[1] = partition_averages[1].swz<1, 2, 3>();477partition_averages[2] = partition_averages[2].swz<1, 2, 3>();478partition_averages[3] = partition_averages[3].swz<1, 2, 3>();479480data_vr = blk.data_g;481data_vg = blk.data_b;482data_vb = blk.data_a;483}484else if (omitted_component == 1)485{486partition_averages[0] = partition_averages[0].swz<0, 2, 3>();487partition_averages[1] = partition_averages[1].swz<0, 2, 3>();488partition_averages[2] = partition_averages[2].swz<0, 2, 3>();489partition_averages[3] = partition_averages[3].swz<0, 2, 3>();490491data_vg = blk.data_b;492data_vb = blk.data_a;493}494else if (omitted_component == 2)495{496partition_averages[0] = partition_averages[0].swz<0, 1, 3>();497partition_averages[1] = partition_averages[1].swz<0, 1, 3>();498partition_averages[2] = partition_averages[2].swz<0, 1, 3>();499partition_averages[3] = partition_averages[3].swz<0, 1, 3>();500501data_vb = blk.data_a;502}503else504{505partition_averages[0] = partition_averages[0].swz<0, 1, 2>();506partition_averages[1] = partition_averages[1].swz<0, 1, 2>();507partition_averages[2] = partition_averages[2].swz<0, 1, 2>();508partition_averages[3] = partition_averages[3].swz<0, 1, 2>();509}510511size_t partition_count = 
pi.partition_count;512promise(partition_count > 0);513514for (size_t partition = 0; partition < partition_count; partition++)515{516const uint8_t *texel_indexes = pi.texels_of_partition[partition];517size_t texel_count = pi.partition_texel_count[partition];518promise(texel_count > 0);519520vfloat4 average = partition_averages[partition];521pm[partition].avg = average;522523vfloat4 sum_xp = vfloat4::zero();524vfloat4 sum_yp = vfloat4::zero();525vfloat4 sum_zp = vfloat4::zero();526527for (size_t i = 0; i < texel_count; i++)528{529unsigned int iwt = texel_indexes[i];530531vfloat4 texel_datum = vfloat3(data_vr[iwt],532data_vg[iwt],533data_vb[iwt]);534texel_datum = texel_datum - average;535536vfloat4 zero = vfloat4::zero();537538vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;539sum_xp += select(zero, texel_datum, tdm0);540541vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;542sum_yp += select(zero, texel_datum, tdm1);543544vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;545sum_zp += select(zero, texel_datum, tdm2);546}547548vfloat4 prod_xp = dot(sum_xp, sum_xp);549vfloat4 prod_yp = dot(sum_yp, sum_yp);550vfloat4 prod_zp = dot(sum_zp, sum_zp);551552vfloat4 best_vector = sum_xp;553vfloat4 best_sum = prod_xp;554555vmask4 mask = prod_yp > best_sum;556best_vector = select(best_vector, sum_yp, mask);557best_sum = select(best_sum, prod_yp, mask);558559mask = prod_zp > best_sum;560best_vector = select(best_vector, sum_zp, mask);561562pm[partition].dir = best_vector;563}564}565566/* See header for documentation. 
*/567void compute_avgs_and_dirs_3_comp_rgb(568const partition_info& pi,569const image_block& blk,570partition_metrics pm[BLOCK_MAX_PARTITIONS]571) {572size_t partition_count = pi.partition_count;573promise(partition_count > 0);574575// Pre-compute partition_averages576vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];577compute_partition_averages_rgb(pi, blk, partition_averages);578579for (size_t partition = 0; partition < partition_count; partition++)580{581const uint8_t *texel_indexes = pi.texels_of_partition[partition];582size_t texel_count = pi.partition_texel_count[partition];583promise(texel_count > 0);584585vfloat4 average = partition_averages[partition];586pm[partition].avg = average;587588vfloat4 sum_xp = vfloat4::zero();589vfloat4 sum_yp = vfloat4::zero();590vfloat4 sum_zp = vfloat4::zero();591592for (size_t i = 0; i < texel_count; i++)593{594unsigned int iwt = texel_indexes[i];595596vfloat4 texel_datum = blk.texel3(iwt);597texel_datum = texel_datum - average;598599vfloat4 zero = vfloat4::zero();600601vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;602sum_xp += select(zero, texel_datum, tdm0);603604vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;605sum_yp += select(zero, texel_datum, tdm1);606607vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;608sum_zp += select(zero, texel_datum, tdm2);609}610611vfloat4 prod_xp = dot(sum_xp, sum_xp);612vfloat4 prod_yp = dot(sum_yp, sum_yp);613vfloat4 prod_zp = dot(sum_zp, sum_zp);614615vfloat4 best_vector = sum_xp;616vfloat4 best_sum = prod_xp;617618vmask4 mask = prod_yp > best_sum;619best_vector = select(best_vector, sum_yp, mask);620best_sum = select(best_sum, prod_yp, mask);621622mask = prod_zp > best_sum;623best_vector = select(best_vector, sum_zp, mask);624625pm[partition].dir = best_vector;626}627}628629/* See header for documentation. 
*/630void compute_avgs_and_dirs_2_comp(631const partition_info& pt,632const image_block& blk,633unsigned int component1,634unsigned int component2,635partition_metrics pm[BLOCK_MAX_PARTITIONS]636) {637vfloat4 average;638639const float* data_vr = nullptr;640const float* data_vg = nullptr;641642if (component1 == 0 && component2 == 1)643{644average = blk.data_mean.swz<0, 1>();645646data_vr = blk.data_r;647data_vg = blk.data_g;648}649else if (component1 == 0 && component2 == 2)650{651average = blk.data_mean.swz<0, 2>();652653data_vr = blk.data_r;654data_vg = blk.data_b;655}656else // (component1 == 1 && component2 == 2)657{658assert(component1 == 1 && component2 == 2);659660average = blk.data_mean.swz<1, 2>();661662data_vr = blk.data_g;663data_vg = blk.data_b;664}665666size_t partition_count = pt.partition_count;667promise(partition_count > 0);668669for (size_t partition = 0; partition < partition_count; partition++)670{671const uint8_t *texel_indexes = pt.texels_of_partition[partition];672size_t texel_count = pt.partition_texel_count[partition];673promise(texel_count > 0);674675// Only compute a partition mean if more than one partition676if (partition_count > 1)677{678average = vfloat4::zero();679for (size_t i = 0; i < texel_count; i++)680{681unsigned int iwt = texel_indexes[i];682average += vfloat2(data_vr[iwt], data_vg[iwt]);683}684685average = average / static_cast<float>(texel_count);686}687688pm[partition].avg = average;689690vfloat4 sum_xp = vfloat4::zero();691vfloat4 sum_yp = vfloat4::zero();692693for (size_t i = 0; i < texel_count; i++)694{695unsigned int iwt = texel_indexes[i];696vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]);697texel_datum = texel_datum - average;698699vfloat4 zero = vfloat4::zero();700701vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;702sum_xp += select(zero, texel_datum, tdm0);703704vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;705sum_yp += select(zero, texel_datum, tdm1);706}707708vfloat4 prod_xp = dot(sum_xp, 
sum_xp);709vfloat4 prod_yp = dot(sum_yp, sum_yp);710711vfloat4 best_vector = sum_xp;712vfloat4 best_sum = prod_xp;713714vmask4 mask = prod_yp > best_sum;715best_vector = select(best_vector, sum_yp, mask);716717pm[partition].dir = best_vector;718}719}720721/* See header for documentation. */722void compute_error_squared_rgba(723const partition_info& pi,724const image_block& blk,725const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],726const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],727float line_lengths[BLOCK_MAX_PARTITIONS],728float& uncor_error,729float& samec_error730) {731size_t partition_count = pi.partition_count;732promise(partition_count > 0);733734vfloatacc uncor_errorsumv = vfloatacc::zero();735vfloatacc samec_errorsumv = vfloatacc::zero();736737for (size_t partition = 0; partition < partition_count; partition++)738{739const uint8_t *texel_indexes = pi.texels_of_partition[partition];740741processed_line4 l_uncor = uncor_plines[partition];742processed_line4 l_samec = samec_plines[partition];743744size_t texel_count = pi.partition_texel_count[partition];745promise(texel_count > 0);746747// Vectorize some useful scalar inputs748vfloat l_uncor_bs0(l_uncor.bs.lane<0>());749vfloat l_uncor_bs1(l_uncor.bs.lane<1>());750vfloat l_uncor_bs2(l_uncor.bs.lane<2>());751vfloat l_uncor_bs3(l_uncor.bs.lane<3>());752753vfloat l_uncor_amod0(l_uncor.amod.lane<0>());754vfloat l_uncor_amod1(l_uncor.amod.lane<1>());755vfloat l_uncor_amod2(l_uncor.amod.lane<2>());756vfloat l_uncor_amod3(l_uncor.amod.lane<3>());757758vfloat l_samec_bs0(l_samec.bs.lane<0>());759vfloat l_samec_bs1(l_samec.bs.lane<1>());760vfloat l_samec_bs2(l_samec.bs.lane<2>());761vfloat l_samec_bs3(l_samec.bs.lane<3>());762763assert(all(l_samec.amod == vfloat4(0.0f)));764765vfloat uncor_loparamv(1e10f);766vfloat uncor_hiparamv(-1e10f);767768vfloat ew_r(blk.channel_weight.lane<0>());769vfloat ew_g(blk.channel_weight.lane<1>());770vfloat ew_b(blk.channel_weight.lane<2>());771vfloat 
ew_a(blk.channel_weight.lane<3>());772773// This implementation over-shoots, but this is safe as we initialize the texel_indexes774// array to extend the last value. This means min/max are not impacted, but we need to mask775// out the dummy values when we compute the line weighting.776vint lane_ids = vint::lane_id();777for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)778{779vmask mask = lane_ids < vint_from_size(texel_count);780const uint8_t* texel_idxs = texel_indexes + i;781782vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);783vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);784vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);785vfloat data_a = gatherf_byte_inds<vfloat>(blk.data_a, texel_idxs);786787vfloat uncor_param = (data_r * l_uncor_bs0)788+ (data_g * l_uncor_bs1)789+ (data_b * l_uncor_bs2)790+ (data_a * l_uncor_bs3);791792uncor_loparamv = min(uncor_param, uncor_loparamv);793uncor_hiparamv = max(uncor_param, uncor_hiparamv);794795vfloat uncor_dist0 = (l_uncor_amod0 - data_r)796+ (uncor_param * l_uncor_bs0);797vfloat uncor_dist1 = (l_uncor_amod1 - data_g)798+ (uncor_param * l_uncor_bs1);799vfloat uncor_dist2 = (l_uncor_amod2 - data_b)800+ (uncor_param * l_uncor_bs2);801vfloat uncor_dist3 = (l_uncor_amod3 - data_a)802+ (uncor_param * l_uncor_bs3);803804vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)805+ (ew_g * uncor_dist1 * uncor_dist1)806+ (ew_b * uncor_dist2 * uncor_dist2)807+ (ew_a * uncor_dist3 * uncor_dist3);808809haccumulate(uncor_errorsumv, uncor_err, mask);810811// Process samechroma data812vfloat samec_param = (data_r * l_samec_bs0)813+ (data_g * l_samec_bs1)814+ (data_b * l_samec_bs2)815+ (data_a * l_samec_bs3);816817vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;818vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;819vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;820vfloat samec_dist3 = samec_param * l_samec_bs3 - data_a;821822vfloat samec_err = (ew_r * 
samec_dist0 * samec_dist0)823+ (ew_g * samec_dist1 * samec_dist1)824+ (ew_b * samec_dist2 * samec_dist2)825+ (ew_a * samec_dist3 * samec_dist3);826827haccumulate(samec_errorsumv, samec_err, mask);828829lane_ids += vint(ASTCENC_SIMD_WIDTH);830}831832// Turn very small numbers and NaNs into a small number833float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);834line_lengths[partition] = astc::max(uncor_linelen, 1e-7f);835}836837uncor_error = hadd_s(uncor_errorsumv);838samec_error = hadd_s(samec_errorsumv);839}840841/* See header for documentation. */842void compute_error_squared_rgb(843const partition_info& pi,844const image_block& blk,845partition_lines3 plines[BLOCK_MAX_PARTITIONS],846float& uncor_error,847float& samec_error848) {849size_t partition_count = pi.partition_count;850promise(partition_count > 0);851852vfloatacc uncor_errorsumv = vfloatacc::zero();853vfloatacc samec_errorsumv = vfloatacc::zero();854855for (size_t partition = 0; partition < partition_count; partition++)856{857partition_lines3& pl = plines[partition];858const uint8_t *texel_indexes = pi.texels_of_partition[partition];859size_t texel_count = pi.partition_texel_count[partition];860promise(texel_count > 0);861862processed_line3 l_uncor = pl.uncor_pline;863processed_line3 l_samec = pl.samec_pline;864865// Vectorize some useful scalar inputs866vfloat l_uncor_bs0(l_uncor.bs.lane<0>());867vfloat l_uncor_bs1(l_uncor.bs.lane<1>());868vfloat l_uncor_bs2(l_uncor.bs.lane<2>());869870vfloat l_uncor_amod0(l_uncor.amod.lane<0>());871vfloat l_uncor_amod1(l_uncor.amod.lane<1>());872vfloat l_uncor_amod2(l_uncor.amod.lane<2>());873874vfloat l_samec_bs0(l_samec.bs.lane<0>());875vfloat l_samec_bs1(l_samec.bs.lane<1>());876vfloat l_samec_bs2(l_samec.bs.lane<2>());877878assert(all(l_samec.amod == vfloat4(0.0f)));879880vfloat uncor_loparamv(1e10f);881vfloat uncor_hiparamv(-1e10f);882883vfloat ew_r(blk.channel_weight.lane<0>());884vfloat ew_g(blk.channel_weight.lane<1>());885vfloat 
ew_b(blk.channel_weight.lane<2>());886887// This implementation over-shoots, but this is safe as we initialize the weights array888// to extend the last value. This means min/max are not impacted, but we need to mask889// out the dummy values when we compute the line weighting.890vint lane_ids = vint::lane_id();891for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)892{893vmask mask = lane_ids < vint_from_size(texel_count);894const uint8_t* texel_idxs = texel_indexes + i;895896vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);897vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);898vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);899900vfloat uncor_param = (data_r * l_uncor_bs0)901+ (data_g * l_uncor_bs1)902+ (data_b * l_uncor_bs2);903904uncor_loparamv = min(uncor_param, uncor_loparamv);905uncor_hiparamv = max(uncor_param, uncor_hiparamv);906907vfloat uncor_dist0 = (l_uncor_amod0 - data_r)908+ (uncor_param * l_uncor_bs0);909vfloat uncor_dist1 = (l_uncor_amod1 - data_g)910+ (uncor_param * l_uncor_bs1);911vfloat uncor_dist2 = (l_uncor_amod2 - data_b)912+ (uncor_param * l_uncor_bs2);913914vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)915+ (ew_g * uncor_dist1 * uncor_dist1)916+ (ew_b * uncor_dist2 * uncor_dist2);917918haccumulate(uncor_errorsumv, uncor_err, mask);919920// Process samechroma data921vfloat samec_param = (data_r * l_samec_bs0)922+ (data_g * l_samec_bs1)923+ (data_b * l_samec_bs2);924925vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;926vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;927vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;928929vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)930+ (ew_g * samec_dist1 * samec_dist1)931+ (ew_b * samec_dist2 * samec_dist2);932933haccumulate(samec_errorsumv, samec_err, mask);934935lane_ids += vint(ASTCENC_SIMD_WIDTH);936}937938// Turn very small numbers and NaNs into a small number939float uncor_linelen = 
hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);940pl.line_length = astc::max(uncor_linelen, 1e-7f);941}942943uncor_error = hadd_s(uncor_errorsumv);944samec_error = hadd_s(samec_errorsumv);945}946947#endif948949950