// Source: thirdparty/astcenc/astcenc_compute_variance.cpp
// SPDX-License-Identifier: Apache-2.01// ----------------------------------------------------------------------------2// Copyright 2011-2022 Arm Limited3//4// Licensed under the Apache License, Version 2.0 (the "License"); you may not5// use this file except in compliance with the License. You may obtain a copy6// of the License at:7//8// http://www.apache.org/licenses/LICENSE-2.09//10// Unless required by applicable law or agreed to in writing, software11// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT12// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the13// License for the specific language governing permissions and limitations14// under the License.15// ----------------------------------------------------------------------------1617#if !defined(ASTCENC_DECOMPRESS_ONLY)1819/**20* @brief Functions to calculate variance per component in a NxN footprint.21*22* We need N to be parametric, so the routine below uses summed area tables in order to execute in23* O(1) time independent of how big N is.24*25* The addition uses a Brent-Kung-based parallel prefix adder. This uses the prefix tree to first26* perform a binary reduction, and then distributes the results. This method means that there is no27* serial dependency between a given element and the next one, and also significantly improves28* numerical stability allowing us to use floats rather than doubles.29*/3031#include "astcenc_internal.h"3233#include <cassert>3435/**36* @brief Generate a prefix-sum array using the Brent-Kung algorithm.37*38* This will take an input array of the form:39* v0, v1, v2, ...40* ... and modify in-place to turn it into a prefix-sum array of the form:41* v0, v0+v1, v0+v1+v2, ...42*43* @param d The array to prefix-sum.44* @param items The number of items in the array.45* @param stride The item spacing in the array; i.e. 
dense arrays should use 1.46*/47static void brent_kung_prefix_sum(48vfloat4* d,49size_t items,50int stride51) {52if (items < 2)53return;5455size_t lc_stride = 2;56size_t log2_stride = 1;5758// The reduction-tree loop59do {60size_t step = lc_stride >> 1;61size_t start = lc_stride - 1;62size_t iters = items >> log2_stride;6364vfloat4 *da = d + (start * stride);65ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);66size_t ofs_stride = stride << log2_stride;6768while (iters)69{70*da = *da + da[ofs];71da += ofs_stride;72iters--;73}7475log2_stride += 1;76lc_stride <<= 1;77} while (lc_stride <= items);7879// The expansion-tree loop80do {81log2_stride -= 1;82lc_stride >>= 1;8384size_t step = lc_stride >> 1;85size_t start = step + lc_stride - 1;86size_t iters = (items - step) >> log2_stride;8788vfloat4 *da = d + (start * stride);89ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);90size_t ofs_stride = stride << log2_stride;9192while (iters)93{94*da = *da + da[ofs];95da += ofs_stride;96iters--;97}98} while (lc_stride > 2);99}100101/* See header for documentation. */102void compute_pixel_region_variance(103astcenc_contexti& ctx,104const pixel_region_args& arg105) {106// Unpack the memory structure into local variables107const astcenc_image* img = arg.img;108astcenc_swizzle swz = arg.swz;109bool have_z = arg.have_z;110111int size_x = arg.size_x;112int size_y = arg.size_y;113int size_z = arg.size_z;114115int offset_x = arg.offset_x;116int offset_y = arg.offset_y;117int offset_z = arg.offset_z;118119int alpha_kernel_radius = arg.alpha_kernel_radius;120121float* input_alpha_averages = ctx.input_alpha_averages;122vfloat4* work_memory = arg.work_memory;123124// Compute memory sizes and dimensions that we need125int kernel_radius = alpha_kernel_radius;126int kerneldim = 2 * kernel_radius + 1;127int kernel_radius_xy = kernel_radius;128int kernel_radius_z = have_z ? 
kernel_radius : 0;129130int padsize_x = size_x + kerneldim;131int padsize_y = size_y + kerneldim;132int padsize_z = size_z + (have_z ? kerneldim : 0);133int sizeprod = padsize_x * padsize_y * padsize_z;134135int zd_start = have_z ? 1 : 0;136137vfloat4 *varbuf1 = work_memory;138vfloat4 *varbuf2 = work_memory + sizeprod;139140// Scaling factors to apply to Y and Z for accesses into the work buffers141int yst = padsize_x;142int zst = padsize_x * padsize_y;143144// Scaling factors to apply to Y and Z for accesses into result buffers145int ydt = img->dim_x;146int zdt = img->dim_x * img->dim_y;147148// Macros to act as accessor functions for the work-memory149#define VARBUF1(z, y, x) varbuf1[z * zst + y * yst + x]150#define VARBUF2(z, y, x) varbuf2[z * zst + y * yst + x]151152// Load N and N^2 values into the work buffers153if (img->data_type == ASTCENC_TYPE_U8)154{155// Swizzle data structure 4 = ZERO, 5 = ONE156uint8_t data[6];157data[ASTCENC_SWZ_0] = 0;158data[ASTCENC_SWZ_1] = 255;159160for (int z = zd_start; z < padsize_z; z++)161{162int z_src = (z - zd_start) + offset_z - kernel_radius_z;163z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));164uint8_t* data8 = static_cast<uint8_t*>(img->data[z_src]);165166for (int y = 1; y < padsize_y; y++)167{168int y_src = (y - 1) + offset_y - kernel_radius_xy;169y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));170171for (int x = 1; x < padsize_x; x++)172{173int x_src = (x - 1) + offset_x - kernel_radius_xy;174x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));175176data[0] = data8[(4 * img->dim_x * y_src) + (4 * x_src )];177data[1] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 1)];178data[2] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 2)];179data[3] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 3)];180181uint8_t r = data[swz.r];182uint8_t g = data[swz.g];183uint8_t b = data[swz.b];184uint8_t a = data[swz.a];185186vfloat4 d = vfloat4 (r * (1.0f / 255.0f),187g * (1.0f / 255.0f),188b 
* (1.0f / 255.0f),189a * (1.0f / 255.0f));190191VARBUF1(z, y, x) = d;192VARBUF2(z, y, x) = d * d;193}194}195}196}197else if (img->data_type == ASTCENC_TYPE_F16)198{199// Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)200uint16_t data[6];201data[ASTCENC_SWZ_0] = 0;202data[ASTCENC_SWZ_1] = 0x3C00;203204for (int z = zd_start; z < padsize_z; z++)205{206int z_src = (z - zd_start) + offset_z - kernel_radius_z;207z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));208uint16_t* data16 = static_cast<uint16_t*>(img->data[z_src]);209210for (int y = 1; y < padsize_y; y++)211{212int y_src = (y - 1) + offset_y - kernel_radius_xy;213y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));214215for (int x = 1; x < padsize_x; x++)216{217int x_src = (x - 1) + offset_x - kernel_radius_xy;218x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));219220data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src )];221data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)];222data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)];223data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)];224225vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);226vfloat4 d = float16_to_float(di);227228VARBUF1(z, y, x) = d;229VARBUF2(z, y, x) = d * d;230}231}232}233}234else // if (img->data_type == ASTCENC_TYPE_F32)235{236assert(img->data_type == ASTCENC_TYPE_F32);237238// Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)239float data[6];240data[ASTCENC_SWZ_0] = 0.0f;241data[ASTCENC_SWZ_1] = 1.0f;242243for (int z = zd_start; z < padsize_z; z++)244{245int z_src = (z - zd_start) + offset_z - kernel_radius_z;246z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));247float* data32 = static_cast<float*>(img->data[z_src]);248249for (int y = 1; y < padsize_y; y++)250{251int y_src = (y - 1) + offset_y - kernel_radius_xy;252y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));253254for (int x = 1; x < padsize_x; x++)255{256int x_src = (x 
- 1) + offset_x - kernel_radius_xy;257x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));258259data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src )];260data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)];261data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)];262data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)];263264float r = data[swz.r];265float g = data[swz.g];266float b = data[swz.b];267float a = data[swz.a];268269vfloat4 d(r, g, b, a);270271VARBUF1(z, y, x) = d;272VARBUF2(z, y, x) = d * d;273}274}275}276}277278// Pad with an extra layer of 0s; this forms the edge of the SAT tables279vfloat4 vbz = vfloat4::zero();280for (int z = 0; z < padsize_z; z++)281{282for (int y = 0; y < padsize_y; y++)283{284VARBUF1(z, y, 0) = vbz;285VARBUF2(z, y, 0) = vbz;286}287288for (int x = 0; x < padsize_x; x++)289{290VARBUF1(z, 0, x) = vbz;291VARBUF2(z, 0, x) = vbz;292}293}294295if (have_z)296{297for (int y = 0; y < padsize_y; y++)298{299for (int x = 0; x < padsize_x; x++)300{301VARBUF1(0, y, x) = vbz;302VARBUF2(0, y, x) = vbz;303}304}305}306307// Generate summed-area tables for N and N^2; this is done in-place, using308// a Brent-Kung parallel-prefix based algorithm to minimize precision loss309for (int z = zd_start; z < padsize_z; z++)310{311for (int y = 1; y < padsize_y; y++)312{313brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1);314brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1);315}316}317318for (int z = zd_start; z < padsize_z; z++)319{320for (int x = 1; x < padsize_x; x++)321{322brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst);323brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst);324}325}326327if (have_z)328{329for (int y = 1; y < padsize_y; y++)330{331for (int x = 1; x < padsize_x; x++)332{333brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst);334brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst);335}336}337}338339// Compute a few constants used in the 
variance-calculation.340float alpha_kdim = static_cast<float>(2 * alpha_kernel_radius + 1);341float alpha_rsamples;342343if (have_z)344{345alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim * alpha_kdim);346}347else348{349alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim);350}351352// Use the summed-area tables to compute variance for each neighborhood353if (have_z)354{355for (int z = 0; z < size_z; z++)356{357int z_src = z + kernel_radius_z;358int z_dst = z + offset_z;359int z_low = z_src - alpha_kernel_radius;360int z_high = z_src + alpha_kernel_radius + 1;361362for (int y = 0; y < size_y; y++)363{364int y_src = y + kernel_radius_xy;365int y_dst = y + offset_y;366int y_low = y_src - alpha_kernel_radius;367int y_high = y_src + alpha_kernel_radius + 1;368369for (int x = 0; x < size_x; x++)370{371int x_src = x + kernel_radius_xy;372int x_dst = x + offset_x;373int x_low = x_src - alpha_kernel_radius;374int x_high = x_src + alpha_kernel_radius + 1;375376// Summed-area table lookups for alpha average377float vasum = ( VARBUF1(z_high, y_low, x_low).lane<3>()378- VARBUF1(z_high, y_low, x_high).lane<3>()379- VARBUF1(z_high, y_high, x_low).lane<3>()380+ VARBUF1(z_high, y_high, x_high).lane<3>()) -381( VARBUF1(z_low, y_low, x_low).lane<3>()382- VARBUF1(z_low, y_low, x_high).lane<3>()383- VARBUF1(z_low, y_high, x_low).lane<3>()384+ VARBUF1(z_low, y_high, x_high).lane<3>());385386int out_index = z_dst * zdt + y_dst * ydt + x_dst;387input_alpha_averages[out_index] = (vasum * alpha_rsamples);388}389}390}391}392else393{394for (int y = 0; y < size_y; y++)395{396int y_src = y + kernel_radius_xy;397int y_dst = y + offset_y;398int y_low = y_src - alpha_kernel_radius;399int y_high = y_src + alpha_kernel_radius + 1;400401for (int x = 0; x < size_x; x++)402{403int x_src = x + kernel_radius_xy;404int x_dst = x + offset_x;405int x_low = x_src - alpha_kernel_radius;406int x_high = x_src + alpha_kernel_radius + 1;407408// Summed-area table lookups for alpha average409float vasum = VARBUF1(0, 
y_low, x_low).lane<3>()410- VARBUF1(0, y_low, x_high).lane<3>()411- VARBUF1(0, y_high, x_low).lane<3>()412+ VARBUF1(0, y_high, x_high).lane<3>();413414int out_index = y_dst * ydt + x_dst;415input_alpha_averages[out_index] = (vasum * alpha_rsamples);416}417}418}419}420421/* See header for documentation. */422unsigned int init_compute_averages(423const astcenc_image& img,424unsigned int alpha_kernel_radius,425const astcenc_swizzle& swz,426avg_args& ag427) {428unsigned int size_x = img.dim_x;429unsigned int size_y = img.dim_y;430unsigned int size_z = img.dim_z;431432// Compute maximum block size and from that the working memory buffer size433unsigned int kernel_radius = alpha_kernel_radius;434unsigned int kerneldim = 2 * kernel_radius + 1;435436bool have_z = (size_z > 1);437unsigned int max_blk_size_xy = have_z ? 16 : 32;438unsigned int max_blk_size_z = astc::min(size_z, have_z ? 16u : 1u);439440unsigned int max_padsize_xy = max_blk_size_xy + kerneldim;441unsigned int max_padsize_z = max_blk_size_z + (have_z ? kerneldim : 0);442443// Perform block-wise averages calculations across the image444// Initialize fields which are not populated until later445ag.arg.size_x = 0;446ag.arg.size_y = 0;447ag.arg.size_z = 0;448ag.arg.offset_x = 0;449ag.arg.offset_y = 0;450ag.arg.offset_z = 0;451ag.arg.work_memory = nullptr;452453ag.arg.img = &img;454ag.arg.swz = swz;455ag.arg.have_z = have_z;456ag.arg.alpha_kernel_radius = alpha_kernel_radius;457458ag.img_size_x = size_x;459ag.img_size_y = size_y;460ag.img_size_z = size_z;461ag.blk_size_xy = max_blk_size_xy;462ag.blk_size_z = max_blk_size_z;463ag.work_memory_size = 2 * max_padsize_xy * max_padsize_xy * max_padsize_z;464465// The parallel task count466unsigned int z_tasks = (size_z + max_blk_size_z - 1) / max_blk_size_z;467unsigned int y_tasks = (size_y + max_blk_size_xy - 1) / max_blk_size_xy;468return z_tasks * y_tasks;469}470471#endif472473474