GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/astcenc/astcenc_compute_variance.cpp
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

#if !defined(ASTCENC_DECOMPRESS_ONLY)

/**
 * @brief Functions to calculate variance per component in an NxN footprint.
 *
 * We need N to be parametric, so the routine below uses summed-area tables in order to execute in
 * O(1) time independent of how big N is.
 *
 * The addition uses a Brent-Kung-based parallel prefix adder. This uses the prefix tree to first
 * perform a binary reduction, and then distributes the results. This method means that there is no
 * serial dependency between a given element and the next one, and it also significantly improves
 * numerical stability, allowing us to use floats rather than doubles.
 */
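
/* Background on the summed-area table (SAT) approach: the table stores, at
 * each coordinate, the inclusive sum of all input values at or below that
 * coordinate on every axis. Any axis-aligned window sum then costs a fixed
 * number of lookups, independent of the window size. In 2D, for the
 * half-open window (x0, x1] x (y0, y1]:
 *
 *     sum = S(x1, y1) - S(x0, y1) - S(x1, y0) + S(x0, y0)
 *
 * The zero border written further down makes this identity hold even when
 * x0 or y0 sits on the edge of the padded region.
 */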

#include "astcenc_internal.h"

#include <cassert>

/**
 * @brief Generate a prefix-sum array using the Brent-Kung algorithm.
 *
 * This will take an input array of the form:
 *     v0, v1, v2, ...
 * ... and modify in-place to turn it into a prefix-sum array of the form:
 *     v0, v0+v1, v0+v1+v2, ...
 *
 * @param d        The array to prefix-sum.
 * @param items    The number of items in the array.
 * @param stride   The item spacing in the array; i.e. dense arrays should use 1.
 */
static void brent_kung_prefix_sum(
	vfloat4* d,
	size_t items,
	int stride
) {
	if (items < 2)
		return;

	size_t lc_stride = 2;
	size_t log2_stride = 1;

	// The reduction-tree loop
	do {
		size_t step = lc_stride >> 1;
		size_t start = lc_stride - 1;
		size_t iters = items >> log2_stride;

		vfloat4 *da = d + (start * stride);
		ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
		size_t ofs_stride = stride << log2_stride;

		while (iters)
		{
			*da = *da + da[ofs];
			da += ofs_stride;
			iters--;
		}

		log2_stride += 1;
		lc_stride <<= 1;
	} while (lc_stride <= items);

	// The expansion-tree loop
	do {
		log2_stride -= 1;
		lc_stride >>= 1;

		size_t step = lc_stride >> 1;
		size_t start = step + lc_stride - 1;
		size_t iters = (items - step) >> log2_stride;

		vfloat4 *da = d + (start * stride);
		ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
		size_t ofs_stride = stride << log2_stride;

		while (iters)
		{
			*da = *da + da[ofs];
			da += ofs_stride;
			iters--;
		}
	} while (lc_stride > 2);
}
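
/* Worked example of the two phases above, shown for a scalar array of eight
 * items {1, 2, 3, 4, 5, 6, 7, 8} with stride 1:
 *
 *   Reduction tree (partial sums at power-of-two boundaries):
 *     lc_stride = 2:  { 1, 3, 3,  7, 5, 11, 7, 15 }
 *     lc_stride = 4:  { 1, 3, 3, 10, 5, 11, 7, 26 }
 *     lc_stride = 8:  { 1, 3, 3, 10, 5, 11, 7, 36 }
 *
 *   Expansion tree (fills in the remaining prefix sums; the first pass back
 *   down, at lc_stride = 8, has no elements to process for this size):
 *     lc_stride = 4:  { 1, 3, 3, 10,  5, 21,  7, 36 }
 *     lc_stride = 2:  { 1, 3, 6, 10, 15, 21, 28, 36 }
 *
 * Within each pass every addition reads and writes disjoint elements, so
 * there is no serial dependency between them.
 */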

/* See header for documentation. */
void compute_pixel_region_variance(
	astcenc_contexti& ctx,
	const pixel_region_args& arg
) {
	// Unpack the memory structure into local variables
	const astcenc_image* img = arg.img;
	astcenc_swizzle swz = arg.swz;
	bool have_z = arg.have_z;

	int size_x = arg.size_x;
	int size_y = arg.size_y;
	int size_z = arg.size_z;

	int offset_x = arg.offset_x;
	int offset_y = arg.offset_y;
	int offset_z = arg.offset_z;

	int alpha_kernel_radius = arg.alpha_kernel_radius;

	float* input_alpha_averages = ctx.input_alpha_averages;
	vfloat4* work_memory = arg.work_memory;

	// Compute memory sizes and dimensions that we need
	int kernel_radius = alpha_kernel_radius;
	int kerneldim = 2 * kernel_radius + 1;
	int kernel_radius_xy = kernel_radius;
	int kernel_radius_z = have_z ? kernel_radius : 0;

	int padsize_x = size_x + kerneldim;
	int padsize_y = size_y + kerneldim;
	int padsize_z = size_z + (have_z ? kerneldim : 0);
	int sizeprod = padsize_x * padsize_y * padsize_z;

	int zd_start = have_z ? 1 : 0;

	vfloat4 *varbuf1 = work_memory;
	vfloat4 *varbuf2 = work_memory + sizeprod;

	// Scaling factors to apply to Y and Z for accesses into the work buffers
	int yst = padsize_x;
	int zst = padsize_x * padsize_y;

	// Scaling factors to apply to Y and Z for accesses into result buffers
	int ydt = img->dim_x;
	int zdt = img->dim_x * img->dim_y;

	// Macros to act as accessor functions for the work-memory
#define VARBUF1(z, y, x) varbuf1[(z) * zst + (y) * yst + (x)]
#define VARBUF2(z, y, x) varbuf2[(z) * zst + (y) * yst + (x)]
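
	// varbuf1 holds the per-component values v and varbuf2 holds v * v.
	// Once both are converted into summed-area tables, a window containing
	// N texels yields its mean as sum(v) / N and its variance as
	// sum(v * v) / N - (sum(v) / N)^2. Note that the lookups later in this
	// function only consume the alpha-channel average from varbuf1; the
	// squares table is what supports variance queries.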

	// Load N and N^2 values into the work buffers
	if (img->data_type == ASTCENC_TYPE_U8)
	{
		// Swizzle data structure 4 = ZERO, 5 = ONE
		uint8_t data[6];
		data[ASTCENC_SWZ_0] = 0;
		data[ASTCENC_SWZ_1] = 255;

		for (int z = zd_start; z < padsize_z; z++)
		{
			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
			uint8_t* data8 = static_cast<uint8_t*>(img->data[z_src]);

			for (int y = 1; y < padsize_y; y++)
			{
				int y_src = (y - 1) + offset_y - kernel_radius_xy;
				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

				for (int x = 1; x < padsize_x; x++)
				{
					int x_src = (x - 1) + offset_x - kernel_radius_xy;
					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

					data[0] = data8[(4 * img->dim_x * y_src) + (4 * x_src    )];
					data[1] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
					data[2] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
					data[3] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

					uint8_t r = data[swz.r];
					uint8_t g = data[swz.g];
					uint8_t b = data[swz.b];
					uint8_t a = data[swz.a];

					vfloat4 d = vfloat4 (r * (1.0f / 255.0f),
					                     g * (1.0f / 255.0f),
					                     b * (1.0f / 255.0f),
					                     a * (1.0f / 255.0f));

					VARBUF1(z, y, x) = d;
					VARBUF2(z, y, x) = d * d;
				}
			}
		}
	}
198
else if (img->data_type == ASTCENC_TYPE_F16)
199
{
200
// Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
201
uint16_t data[6];
202
data[ASTCENC_SWZ_0] = 0;
203
data[ASTCENC_SWZ_1] = 0x3C00;
204
205
for (int z = zd_start; z < padsize_z; z++)
206
{
207
int z_src = (z - zd_start) + offset_z - kernel_radius_z;
208
z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
209
uint16_t* data16 = static_cast<uint16_t*>(img->data[z_src]);
210
211
for (int y = 1; y < padsize_y; y++)
212
{
213
int y_src = (y - 1) + offset_y - kernel_radius_xy;
214
y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
215
216
for (int x = 1; x < padsize_x; x++)
217
{
218
int x_src = (x - 1) + offset_x - kernel_radius_xy;
219
x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
220
221
data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src )];
222
data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
223
data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
224
data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
225
226
vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
227
vfloat4 d = float16_to_float(di);
228
229
VARBUF1(z, y, x) = d;
230
VARBUF2(z, y, x) = d * d;
231
}
232
}
233
}
234
}
235
else // if (img->data_type == ASTCENC_TYPE_F32)
236
{
237
assert(img->data_type == ASTCENC_TYPE_F32);
238
239
// Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
240
float data[6];
241
data[ASTCENC_SWZ_0] = 0.0f;
242
data[ASTCENC_SWZ_1] = 1.0f;
243
244
for (int z = zd_start; z < padsize_z; z++)
245
{
246
int z_src = (z - zd_start) + offset_z - kernel_radius_z;
247
z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
248
float* data32 = static_cast<float*>(img->data[z_src]);
249
250
for (int y = 1; y < padsize_y; y++)
251
{
252
int y_src = (y - 1) + offset_y - kernel_radius_xy;
253
y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
254
255
for (int x = 1; x < padsize_x; x++)
256
{
257
int x_src = (x - 1) + offset_x - kernel_radius_xy;
258
x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
259
260
data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src )];
261
data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
262
data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
263
data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
264
265
float r = data[swz.r];
266
float g = data[swz.g];
267
float b = data[swz.b];
268
float a = data[swz.a];
269
270
vfloat4 d(r, g, b, a);
271
272
VARBUF1(z, y, x) = d;
273
VARBUF2(z, y, x) = d * d;
274
}
275
}
276
}
277
}

	// Pad with an extra layer of 0s; this forms the edge of the SAT tables
	vfloat4 vbz = vfloat4::zero();
	for (int z = 0; z < padsize_z; z++)
	{
		for (int y = 0; y < padsize_y; y++)
		{
			VARBUF1(z, y, 0) = vbz;
			VARBUF2(z, y, 0) = vbz;
		}

		for (int x = 0; x < padsize_x; x++)
		{
			VARBUF1(z, 0, x) = vbz;
			VARBUF2(z, 0, x) = vbz;
		}
	}

	if (have_z)
	{
		for (int y = 0; y < padsize_y; y++)
		{
			for (int x = 0; x < padsize_x; x++)
			{
				VARBUF1(0, y, x) = vbz;
				VARBUF2(0, y, x) = vbz;
			}
		}
	}
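
	// Because every low-edge entry of the tables is now zero, the
	// inclusion-exclusion lookups further down remain branch-free even for
	// windows that touch the low edge of the padded region.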

	// Generate summed-area tables for N and N^2; this is done in-place, using
	// a Brent-Kung parallel-prefix based algorithm to minimize precision loss
	for (int z = zd_start; z < padsize_z; z++)
	{
		for (int y = 1; y < padsize_y; y++)
		{
			brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1);
			brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1);
		}
	}

	for (int z = zd_start; z < padsize_z; z++)
	{
		for (int x = 1; x < padsize_x; x++)
		{
			brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst);
			brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst);
		}
	}

	if (have_z)
	{
		for (int y = 1; y < padsize_y; y++)
		{
			for (int x = 1; x < padsize_x; x++)
			{
				brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst);
				brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst);
			}
		}
	}
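
	// Prefix sums are separable: after summing along X, then Y, then Z, each
	// element holds S(z, y, x) = sum of v(k, j, i) over all k <= z, j <= y,
	// i <= x, which is exactly the summed-area table form that the window
	// lookups below rely on.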

	// Compute a few constants used in the variance calculation
	float alpha_kdim = static_cast<float>(2 * alpha_kernel_radius + 1);
	float alpha_rsamples;

	if (have_z)
	{
		alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim * alpha_kdim);
	}
	else
	{
		alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim);
	}
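
	// The SAT lookups below return the sum of alpha over a kernel-sized
	// window; multiplying by alpha_rsamples (1/N^2 in 2D, 1/N^3 in 3D, with
	// N = 2 * alpha_kernel_radius + 1) converts that sum into the mean.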

	// Use the summed-area tables to compute the alpha average for each neighborhood
	if (have_z)
	{
		for (int z = 0; z < size_z; z++)
		{
			int z_src = z + kernel_radius_z;
			int z_dst = z + offset_z;
			int z_low  = z_src - alpha_kernel_radius;
			int z_high = z_src + alpha_kernel_radius + 1;

			for (int y = 0; y < size_y; y++)
			{
				int y_src = y + kernel_radius_xy;
				int y_dst = y + offset_y;
				int y_low  = y_src - alpha_kernel_radius;
				int y_high = y_src + alpha_kernel_radius + 1;

				for (int x = 0; x < size_x; x++)
				{
					int x_src = x + kernel_radius_xy;
					int x_dst = x + offset_x;
					int x_low  = x_src - alpha_kernel_radius;
					int x_high = x_src + alpha_kernel_radius + 1;

					// Summed-area table lookups for the alpha average: the
					// window sum over (x_low, x_high] x (y_low, y_high] x
					// (z_low, z_high] is recovered from eight corner lookups
					// by inclusion-exclusion; corners with an even number of
					// "low" coordinates are added, the rest subtracted
					float vasum = ( VARBUF1(z_high, y_low,  x_low).lane<3>()
					              - VARBUF1(z_high, y_low,  x_high).lane<3>()
					              - VARBUF1(z_high, y_high, x_low).lane<3>()
					              + VARBUF1(z_high, y_high, x_high).lane<3>()) -
					              ( VARBUF1(z_low,  y_low,  x_low).lane<3>()
					              - VARBUF1(z_low,  y_low,  x_high).lane<3>()
					              - VARBUF1(z_low,  y_high, x_low).lane<3>()
					              + VARBUF1(z_low,  y_high, x_high).lane<3>());

					int out_index = z_dst * zdt + y_dst * ydt + x_dst;
					input_alpha_averages[out_index] = (vasum * alpha_rsamples);
				}
			}
		}
	}
	else
	{
		for (int y = 0; y < size_y; y++)
		{
			int y_src = y + kernel_radius_xy;
			int y_dst = y + offset_y;
			int y_low  = y_src - alpha_kernel_radius;
			int y_high = y_src + alpha_kernel_radius + 1;

			for (int x = 0; x < size_x; x++)
			{
				int x_src = x + kernel_radius_xy;
				int x_dst = x + offset_x;
				int x_low  = x_src - alpha_kernel_radius;
				int x_high = x_src + alpha_kernel_radius + 1;

				// Summed-area table lookups for the alpha average: four
				// corner lookups recover the window sum over
				// (x_low, x_high] x (y_low, y_high] by inclusion-exclusion
				float vasum = VARBUF1(0, y_low,  x_low).lane<3>()
				            - VARBUF1(0, y_low,  x_high).lane<3>()
				            - VARBUF1(0, y_high, x_low).lane<3>()
				            + VARBUF1(0, y_high, x_high).lane<3>();

				int out_index = y_dst * ydt + x_dst;
				input_alpha_averages[out_index] = (vasum * alpha_rsamples);
			}
		}
	}
}

/* See header for documentation. */
unsigned int init_compute_averages(
	const astcenc_image& img,
	unsigned int alpha_kernel_radius,
	const astcenc_swizzle& swz,
	avg_args& ag
) {
	unsigned int size_x = img.dim_x;
	unsigned int size_y = img.dim_y;
	unsigned int size_z = img.dim_z;

	// Compute maximum block size and from that the working memory buffer size
	unsigned int kernel_radius = alpha_kernel_radius;
	unsigned int kerneldim = 2 * kernel_radius + 1;

	bool have_z = (size_z > 1);
	unsigned int max_blk_size_xy = have_z ? 16 : 32;
	unsigned int max_blk_size_z = astc::min(size_z, have_z ? 16u : 1u);

	unsigned int max_padsize_xy = max_blk_size_xy + kerneldim;
	unsigned int max_padsize_z = max_blk_size_z + (have_z ? kerneldim : 0);

	// Perform block-wise average calculations across the image
	// Initialize fields which are not populated until later
	ag.arg.size_x = 0;
	ag.arg.size_y = 0;
	ag.arg.size_z = 0;
	ag.arg.offset_x = 0;
	ag.arg.offset_y = 0;
	ag.arg.offset_z = 0;
	ag.arg.work_memory = nullptr;

	ag.arg.img = &img;
	ag.arg.swz = swz;
	ag.arg.have_z = have_z;
	ag.arg.alpha_kernel_radius = alpha_kernel_radius;

	ag.img_size_x = size_x;
	ag.img_size_y = size_y;
	ag.img_size_z = size_z;
	ag.blk_size_xy = max_blk_size_xy;
	ag.blk_size_z = max_blk_size_z;
	// Two buffers are needed: one for values and one for values squared
	ag.work_memory_size = 2 * max_padsize_xy * max_padsize_xy * max_padsize_z;

	// The parallel task count
	unsigned int z_tasks = (size_z + max_blk_size_z - 1) / max_blk_size_z;
	unsigned int y_tasks = (size_y + max_blk_size_xy - 1) / max_blk_size_xy;
	return z_tasks * y_tasks;
}
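
/* Typical call sequence, as a sketch only; the real driver lives elsewhere
 * in the compressor, so the block iteration shown here is illustrative:
 *
 *     avg_args ag;
 *     unsigned int task_count = init_compute_averages(img, radius, swz, ag);
 *
 *     // For each task: fill in ag.arg.size_* and ag.arg.offset_* for one
 *     // image block, point ag.arg.work_memory at a per-thread buffer of
 *     // ag.work_memory_size vfloat4 elements, then run:
 *     compute_pixel_region_variance(ctx, ag.arg);
 */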

#endif