// Path: blob/master/thirdparty/astcenc/astcenc_image.cpp
// 9902 views
// SPDX-License-Identifier: Apache-2.01// ----------------------------------------------------------------------------2// Copyright 2011-2024 Arm Limited3//4// Licensed under the Apache License, Version 2.0 (the "License"); you may not5// use this file except in compliance with the License. You may obtain a copy6// of the License at:7//8// http://www.apache.org/licenses/LICENSE-2.09//10// Unless required by applicable law or agreed to in writing, software11// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT12// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the13// License for the specific language governing permissions and limitations14// under the License.15// ----------------------------------------------------------------------------1617/**18* @brief Functions for creating in-memory ASTC image structures.19*/2021#include <cassert>22#include <cstring>2324#include "astcenc_internal.h"2526/**27* @brief Loader pipeline function type for data fetch from memory.28*/29using pixel_loader = vfloat4(*)(const void*, int);3031/**32* @brief Loader pipeline function type for swizzling data in a vector.33*/34using pixel_swizzler = vfloat4(*)(vfloat4, const astcenc_swizzle&);3536/**37* @brief Loader pipeline function type for converting data in a vector to LNS.38*/39using pixel_converter = vfloat4(*)(vfloat4, vmask4);4041/**42* @brief Load a 8-bit UNORM texel from a data array.43*44* @param data The data pointer.45* @param base_offset The index offset to the start of the pixel.46*/47static vfloat4 load_texel_u8(48const void* data,49int base_offset50) {51const uint8_t* data8 = static_cast<const uint8_t*>(data);52return int_to_float(vint4(data8 + base_offset)) / 255.0f;53}5455/**56* @brief Load a 16-bit fp16 texel from a data array.57*58* @param data The data pointer.59* @param base_offset The index offset to the start of the pixel.60*/61static vfloat4 load_texel_f16(62const void* data,63int base_offset64) {65const uint16_t* data16 
= static_cast<const uint16_t*>(data);66int r = data16[base_offset ];67int g = data16[base_offset + 1];68int b = data16[base_offset + 2];69int a = data16[base_offset + 3];70return float16_to_float(vint4(r, g, b, a));71}7273/**74* @brief Load a 32-bit float texel from a data array.75*76* @param data The data pointer.77* @param base_offset The index offset to the start of the pixel.78*/79static vfloat4 load_texel_f32(80const void* data,81int base_offset82) {83const float* data32 = static_cast<const float*>(data);84return vfloat4(data32 + base_offset);85}8687/**88* @brief Dummy no-op swizzle function.89*90* @param data The source RGBA vector to swizzle.91* @param swz The swizzle to use.92*/93static vfloat4 swz_texel_skip(94vfloat4 data,95const astcenc_swizzle& swz96) {97(void)swz;98return data;99}100101/**102* @brief Swizzle a texel into a new arrangement.103*104* @param data The source RGBA vector to swizzle.105* @param swz The swizzle to use.106*/107static vfloat4 swz_texel(108vfloat4 data,109const astcenc_swizzle& swz110) {111ASTCENC_ALIGNAS float datas[6];112113storea(data, datas);114datas[ASTCENC_SWZ_0] = 0.0f;115datas[ASTCENC_SWZ_1] = 1.0f;116117return vfloat4(datas[swz.r], datas[swz.g], datas[swz.b], datas[swz.a]);118}119120/**121* @brief Encode a texel that is entirely LDR linear.122*123* @param data The RGBA data to encode.124* @param lns_mask The mask for the HDR channels than need LNS encoding.125*/126static vfloat4 encode_texel_unorm(127vfloat4 data,128vmask4 lns_mask129) {130(void)lns_mask;131return data * 65535.0f;132}133134/**135* @brief Encode a texel that includes at least some HDR LNS texels.136*137* @param data The RGBA data to encode.138* @param lns_mask The mask for the HDR channels than need LNS encoding.139*/140static vfloat4 encode_texel_lns(141vfloat4 data,142vmask4 lns_mask143) {144vfloat4 datav_unorm = data * 65535.0f;145vfloat4 datav_lns = float_to_lns(data);146return select(datav_unorm, datav_lns, lns_mask);147}148149/* See header for 
documentation. */150void load_image_block(151astcenc_profile decode_mode,152const astcenc_image& img,153image_block& blk,154const block_size_descriptor& bsd,155unsigned int xpos,156unsigned int ypos,157unsigned int zpos,158const astcenc_swizzle& swz159) {160unsigned int xsize = img.dim_x;161unsigned int ysize = img.dim_y;162unsigned int zsize = img.dim_z;163164blk.xpos = xpos;165blk.ypos = ypos;166blk.zpos = zpos;167168// True if any non-identity swizzle169bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||170(swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);171172int idx = 0;173174vfloat4 data_min(1e38f);175vfloat4 data_mean(0.0f);176vfloat4 data_mean_scale(1.0f / static_cast<float>(bsd.texel_count));177vfloat4 data_max(-1e38f);178vmask4 grayscalev(true);179180// This works because we impose the same choice everywhere during encode181uint8_t rgb_lns = (decode_mode == ASTCENC_PRF_HDR) ||182(decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A) ? 1 : 0;183uint8_t a_lns = decode_mode == ASTCENC_PRF_HDR ? 
1 : 0;184vint4 use_lns(rgb_lns, rgb_lns, rgb_lns, a_lns);185vmask4 lns_mask = use_lns != vint4::zero();186187// Set up the function pointers for loading pipeline as needed188pixel_loader loader = load_texel_u8;189if (img.data_type == ASTCENC_TYPE_F16)190{191loader = load_texel_f16;192}193else if (img.data_type == ASTCENC_TYPE_F32)194{195loader = load_texel_f32;196}197198pixel_swizzler swizzler = swz_texel_skip;199if (needs_swz)200{201swizzler = swz_texel;202}203204pixel_converter converter = encode_texel_unorm;205if (any(lns_mask))206{207converter = encode_texel_lns;208}209210for (unsigned int z = 0; z < bsd.zdim; z++)211{212unsigned int zi = astc::min(zpos + z, zsize - 1);213void* plane = img.data[zi];214215for (unsigned int y = 0; y < bsd.ydim; y++)216{217unsigned int yi = astc::min(ypos + y, ysize - 1);218219for (unsigned int x = 0; x < bsd.xdim; x++)220{221unsigned int xi = astc::min(xpos + x, xsize - 1);222223vfloat4 datav = loader(plane, (4 * xsize * yi) + (4 * xi));224datav = swizzler(datav, swz);225datav = converter(datav, lns_mask);226227// Compute block metadata228data_min = min(data_min, datav);229data_mean += datav * data_mean_scale;230data_max = max(data_max, datav);231232grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());233234blk.data_r[idx] = datav.lane<0>();235blk.data_g[idx] = datav.lane<1>();236blk.data_b[idx] = datav.lane<2>();237blk.data_a[idx] = datav.lane<3>();238239blk.rgb_lns[idx] = rgb_lns;240blk.alpha_lns[idx] = a_lns;241242idx++;243}244}245}246247// Reverse the encoding so we store origin block in the original format248vfloat4 data_enc = blk.texel(0);249vfloat4 data_enc_unorm = data_enc / 65535.0f;250vfloat4 data_enc_lns = vfloat4::zero();251252if (rgb_lns || a_lns)253{254data_enc_lns = float16_to_float(lns_to_sf16(float_to_int(data_enc)));255}256257blk.origin_texel = select(data_enc_unorm, data_enc_lns, lns_mask);258259// Store block metadata260blk.data_min = data_min;261blk.data_mean = data_mean;262blk.data_max = 
data_max;263blk.grayscale = all(grayscalev);264}265266/* See header for documentation. */267void load_image_block_fast_ldr(268astcenc_profile decode_mode,269const astcenc_image& img,270image_block& blk,271const block_size_descriptor& bsd,272unsigned int xpos,273unsigned int ypos,274unsigned int zpos,275const astcenc_swizzle& swz276) {277(void)swz;278(void)decode_mode;279280unsigned int xsize = img.dim_x;281unsigned int ysize = img.dim_y;282283blk.xpos = xpos;284blk.ypos = ypos;285blk.zpos = zpos;286287vfloat4 data_min(1e38f);288vfloat4 data_mean = vfloat4::zero();289vfloat4 data_max(-1e38f);290vmask4 grayscalev(true);291int idx = 0;292293const uint8_t* plane = static_cast<const uint8_t*>(img.data[0]);294for (unsigned int y = ypos; y < ypos + bsd.ydim; y++)295{296unsigned int yi = astc::min(y, ysize - 1);297298for (unsigned int x = xpos; x < xpos + bsd.xdim; x++)299{300unsigned int xi = astc::min(x, xsize - 1);301302vint4 datavi = vint4(plane + (4 * xsize * yi) + (4 * xi));303vfloat4 datav = int_to_float(datavi) * (65535.0f / 255.0f);304305// Compute block metadata306data_min = min(data_min, datav);307data_mean += datav;308data_max = max(data_max, datav);309310grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());311312blk.data_r[idx] = datav.lane<0>();313blk.data_g[idx] = datav.lane<1>();314blk.data_b[idx] = datav.lane<2>();315blk.data_a[idx] = datav.lane<3>();316317idx++;318}319}320321// Reverse the encoding so we store origin block in the original format322blk.origin_texel = blk.texel(0) / 65535.0f;323324// Store block metadata325blk.rgb_lns[0] = 0;326blk.alpha_lns[0] = 0;327blk.data_min = data_min;328blk.data_mean = data_mean / static_cast<float>(bsd.texel_count);329blk.data_max = data_max;330blk.grayscale = all(grayscalev);331}332333/* See header for documentation. 
*/
void store_image_block(
    astcenc_image& img,
    const image_block& blk,
    const block_size_descriptor& bsd,
    unsigned int xpos,
    unsigned int ypos,
    unsigned int zpos,
    const astcenc_swizzle& swz
) {
    // Clip the block footprint against the image on each axis. The "nudge"
    // values are the number of block texels skipped at the end of each row,
    // and at the end of each slice, because they fall outside the image.
    const unsigned int x_size = img.dim_x;
    const unsigned int x_start = xpos;
    const unsigned int x_end = astc::min(x_size, xpos + bsd.xdim);
    const unsigned int x_count = x_end - x_start;
    const unsigned int x_nudge = bsd.xdim - x_count;

    const unsigned int y_size = img.dim_y;
    const unsigned int y_start = ypos;
    const unsigned int y_end = astc::min(y_size, ypos + bsd.ydim);
    const unsigned int y_count = y_end - y_start;
    const unsigned int y_nudge = (bsd.ydim - y_count) * bsd.xdim;

    const unsigned int z_size = img.dim_z;
    const unsigned int z_start = zpos;
    const unsigned int z_end = astc::min(z_size, zpos + bsd.zdim);

    // True if any non-identity swizzle
    const bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
                           (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);

    // True if any swizzle uses Z reconstruct
    const bool needs_z = (swz.r == ASTCENC_SWZ_Z) || (swz.g == ASTCENC_SWZ_Z) ||
                         (swz.b == ASTCENC_SWZ_Z) || (swz.a == ASTCENC_SWZ_Z);

    // Scalar swizzle shared by the F16 and F32 paths. NaNs are handled
    // inline - they simply propagate through the arithmetic.
    auto swizzle_texel = [&swz, needs_z](vfloat4 texel) -> vfloat4
    {
        float lut[7];
        lut[ASTCENC_SWZ_0] = 0.0f;
        lut[ASTCENC_SWZ_1] = 1.0f;
        lut[ASTCENC_SWZ_R] = texel.lane<0>();
        lut[ASTCENC_SWZ_G] = texel.lane<1>();
        lut[ASTCENC_SWZ_B] = texel.lane<2>();
        lut[ASTCENC_SWZ_A] = texel.lane<3>();

        if (needs_z)
        {
            // Rebuild Z from the values in table slots 0 and 3, remapped from
            // [0, 1] to [-1, 1], then map the result back to [0, 1]
            float xN = (lut[0] * 2.0f) - 1.0f;
            float yN = (lut[3] * 2.0f) - 1.0f;
            float zN = 1.0f - xN * xN - yN * yN;
            if (zN < 0.0f)
            {
                zN = 0.0f;
            }
            lut[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
        }

        return vfloat4(lut[swz.r], lut[swz.g], lut[swz.b], lut[swz.a]);
    };

    int idx = 0;
    if (img.data_type == ASTCENC_TYPE_U8)
    {
        // Clamp from above to 1.0 and quantize onto the 0-255 range
        auto quantize_u8 = [](vfloat value) -> vint
        {
            return float_to_int_rtn(min(value, 1.0f) * 255.0f);
        };

        for (unsigned int z = z_start; z < z_end; z++)
        {
            // Fetch the image plane
            uint8_t* plane8 = static_cast<uint8_t*>(img.data[z]);

            for (unsigned int y = y_start; y < y_end; y++)
            {
                uint8_t* out_row = plane8 + (4 * x_size * y) + (4 * x_start);

                for (unsigned int x = 0; x < x_count; x += ASTCENC_SIMD_WIDTH)
                {
                    unsigned int max_texels = ASTCENC_SIMD_WIDTH;
                    unsigned int used_texels = astc::min(x_count - x, max_texels);

                    // Unaligned load as rows are not always SIMD_WIDTH long
                    vfloat src_r(blk.data_r + idx);
                    vfloat src_g(blk.data_g + idx);
                    vfloat src_b(blk.data_b + idx);
                    vfloat src_a(blk.data_a + idx);

                    vint out_r = quantize_u8(src_r);
                    vint out_g = quantize_u8(src_g);
                    vint out_b = quantize_u8(src_b);
                    vint out_a = quantize_u8(src_a);

                    if (needs_swz)
                    {
                        vint swizzle_table[7];
                        swizzle_table[ASTCENC_SWZ_0] = vint(0);
                        swizzle_table[ASTCENC_SWZ_1] = vint(255);
                        swizzle_table[ASTCENC_SWZ_R] = out_r;
                        swizzle_table[ASTCENC_SWZ_G] = out_g;
                        swizzle_table[ASTCENC_SWZ_B] = out_b;
                        swizzle_table[ASTCENC_SWZ_A] = out_a;

                        if (needs_z)
                        {
                            // Rebuild Z from the R and A channels, remapped
                            // from [0, 1] to [-1, 1]
                            vfloat nx = (src_r * vfloat(2.0f)) - vfloat(1.0f);
                            vfloat ny = (src_a * vfloat(2.0f)) - vfloat(1.0f);
                            vfloat nz = vfloat(1.0f) - (nx * nx) - (ny * ny);
                            nz = max(nz, 0.0f);
                            nz = (sqrt(nz) * vfloat(0.5f)) + vfloat(0.5f);

                            swizzle_table[ASTCENC_SWZ_Z] = quantize_u8(nz);
                        }

                        out_r = swizzle_table[swz.r];
                        out_g = swizzle_table[swz.g];
                        out_b = swizzle_table[swz.b];
                        out_a = swizzle_table[swz.a];
                    }

                    // Errors are NaN encoded - convert to magenta error color
                    // Branch is OK here - it is almost never true so predicts well
                    vmask nan_mask = src_r != src_r;
                    if (any(nan_mask))
                    {
                        out_r = select(out_r, vint(0xFF), nan_mask);
                        out_g = select(out_g, vint(0x00), nan_mask);
                        out_b = select(out_b, vint(0xFF), nan_mask);
                        out_a = select(out_a, vint(0xFF), nan_mask);
                    }

                    // Only the lanes that map to texels inside the row are written
                    vint packed = interleave_rgba8(out_r, out_g, out_b, out_a);
                    vmask store_mask = vint::lane_id() < vint(used_texels);
                    store_lanes_masked(out_row, packed, store_mask);

                    out_row += ASTCENC_SIMD_WIDTH * 4;
                    idx += used_texels;
                }
                idx += x_nudge;
            }
            idx += y_nudge;
        }
    }
    else if (img.data_type == ASTCENC_TYPE_F16)
    {
        for (unsigned int z = z_start; z < z_end; z++)
        {
            // Fetch the image plane
            uint16_t* plane16 = static_cast<uint16_t*>(img.data[z]);

            for (unsigned int y = y_start; y < y_end; y++)
            {
                uint16_t* out_row = plane16 + (4 * x_size * y) + (4 * x_start);

                for (unsigned int x = 0; x < x_count; x++)
                {
                    vfloat4 colorf = blk.texel(idx);
                    if (needs_swz)
                    {
                        colorf = swizzle_texel(colorf);
                    }

                    vint4 color = float_to_float16(colorf);

                    // TODO: Vectorize with store N shorts?
                    out_row[0] = static_cast<uint16_t>(color.lane<0>());
                    out_row[1] = static_cast<uint16_t>(color.lane<1>());
                    out_row[2] = static_cast<uint16_t>(color.lane<2>());
                    out_row[3] = static_cast<uint16_t>(color.lane<3>());
                    out_row += 4;
                    idx++;
                }
                idx += x_nudge;
            }
            idx += y_nudge;
        }
    }
    else // if (img.data_type == ASTCENC_TYPE_F32)
    {
        assert(img.data_type == ASTCENC_TYPE_F32);

        for (unsigned int z = z_start; z < z_end; z++)
        {
            // Fetch the image plane
            float* plane32 = static_cast<float*>(img.data[z]);

            for (unsigned int y = y_start; y < y_end; y++)
            {
                float* out_row = plane32 + (4 * x_size * y) + (4 * x_start);

                for (unsigned int x = 0; x < x_count; x++)
                {
                    vfloat4 color = blk.texel(idx);
                    if (needs_swz)
                    {
                        color = swizzle_texel(color);
                    }

                    store(color, out_row);
                    out_row += 4;
                    idx++;
                }
                idx += x_nudge;
            }
            idx += y_nudge;
        }
    }
}