GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/astcenc/astcenc_compute_variance.cpp
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

#if !defined(ASTCENC_DECOMPRESS_ONLY)

/**
 * @brief Functions to calculate variance per component in an NxN footprint.
 *
 * We need N to be parametric, so the routine below uses summed-area tables in order to execute in
 * O(1) time independent of how big N is.
 *
 * The addition uses a Brent-Kung-based parallel prefix adder. This uses the prefix tree to first
 * perform a binary reduction, and then distributes the results. This method means that there is no
 * serial dependency between a given element and the next one, and it also significantly improves
 * numerical stability, allowing us to use floats rather than doubles.
 */
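
/* Background on the summed-area table (SAT) approach: the table stores, at
 * each coordinate, the inclusive sum of all input values at or below that
 * coordinate on every axis. Any axis-aligned window sum then costs a fixed
 * number of lookups, independent of the window size. In 2D, for the
 * half-open window (x0, x1] x (y0, y1]:
 *
 *     sum = S(x1, y1) - S(x0, y1) - S(x1, y0) + S(x0, y0)
 *
 * The zero border written further down makes this identity hold even when
 * x0 or y0 sits on the edge of the padded region.
 */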

#include "astcenc_internal.h"

#include <cassert>

/**
 * @brief Generate a prefix-sum array using the Brent-Kung algorithm.
 *
 * This will take an input array of the form:
 *     v0, v1, v2, ...
 * ... and modify in-place to turn it into a prefix-sum array of the form:
 *     v0, v0+v1, v0+v1+v2, ...
 *
 * @param d        The array to prefix-sum.
 * @param items    The number of items in the array.
 * @param stride   The item spacing in the array; i.e. dense arrays should use 1.
 */
static void brent_kung_prefix_sum(
	vfloat4* d,
	size_t items,
	int stride
) {
	if (items < 2)
		return;

	size_t lc_stride = 2;
	size_t log2_stride = 1;

	// The reduction-tree loop
	do {
		size_t step = lc_stride >> 1;
		size_t start = lc_stride - 1;
		size_t iters = items >> log2_stride;

		vfloat4 *da = d + (start * stride);
		ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
		size_t ofs_stride = stride << log2_stride;

		while (iters)
		{
			*da = *da + da[ofs];
			da += ofs_stride;
			iters--;
		}

		log2_stride += 1;
		lc_stride <<= 1;
	} while (lc_stride <= items);

	// The expansion-tree loop
	do {
		log2_stride -= 1;
		lc_stride >>= 1;

		size_t step = lc_stride >> 1;
		size_t start = step + lc_stride - 1;
		size_t iters = (items - step) >> log2_stride;

		vfloat4 *da = d + (start * stride);
		ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
		size_t ofs_stride = stride << log2_stride;

		while (iters)
		{
			*da = *da + da[ofs];
			da += ofs_stride;
			iters--;
		}
	} while (lc_stride > 2);
}
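
/* Worked example of the two phases above, shown for a scalar array of eight
 * items {1, 2, 3, 4, 5, 6, 7, 8} with stride 1:
 *
 *   Reduction tree (partial sums at power-of-two boundaries):
 *     lc_stride = 2:  { 1, 3, 3,  7, 5, 11, 7, 15 }
 *     lc_stride = 4:  { 1, 3, 3, 10, 5, 11, 7, 26 }
 *     lc_stride = 8:  { 1, 3, 3, 10, 5, 11, 7, 36 }
 *
 *   Expansion tree (fills in the remaining prefix sums; the first pass back
 *   down, at lc_stride = 8, has no elements to process for this size):
 *     lc_stride = 4:  { 1, 3, 3, 10,  5, 21,  7, 36 }
 *     lc_stride = 2:  { 1, 3, 6, 10, 15, 21, 28, 36 }
 *
 * Within each pass every addition reads and writes disjoint elements, so
 * there is no serial dependency between them.
 */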

/* See header for documentation. */
void compute_pixel_region_variance(
	astcenc_contexti& ctx,
	const pixel_region_args& arg
) {
	// Unpack the memory structure into local variables
	const astcenc_image* img = arg.img;
	astcenc_swizzle swz = arg.swz;
	bool have_z = arg.have_z;

	int size_x = arg.size_x;
	int size_y = arg.size_y;
	int size_z = arg.size_z;

	int offset_x = arg.offset_x;
	int offset_y = arg.offset_y;
	int offset_z = arg.offset_z;

	int alpha_kernel_radius = arg.alpha_kernel_radius;

	float* input_alpha_averages = ctx.input_alpha_averages;
	vfloat4* work_memory = arg.work_memory;

	// Compute memory sizes and dimensions that we need
	int kernel_radius = alpha_kernel_radius;
	int kerneldim = 2 * kernel_radius + 1;
	int kernel_radius_xy = kernel_radius;
	int kernel_radius_z = have_z ? kernel_radius : 0;

	int padsize_x = size_x + kerneldim;
	int padsize_y = size_y + kerneldim;
	int padsize_z = size_z + (have_z ? kerneldim : 0);
	int sizeprod = padsize_x * padsize_y * padsize_z;

	int zd_start = have_z ? 1 : 0;

	vfloat4 *varbuf1 = work_memory;
	vfloat4 *varbuf2 = work_memory + sizeprod;

	// Scaling factors to apply to Y and Z for accesses into the work buffers
	int yst = padsize_x;
	int zst = padsize_x * padsize_y;

	// Scaling factors to apply to Y and Z for accesses into result buffers
	int ydt = img->dim_x;
	int zdt = img->dim_x * img->dim_y;

	// Macros to act as accessor functions for the work-memory
#define VARBUF1(z, y, x) varbuf1[(z) * zst + (y) * yst + (x)]
#define VARBUF2(z, y, x) varbuf2[(z) * zst + (y) * yst + (x)]
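
	// varbuf1 holds the per-component values v and varbuf2 holds v * v.
	// Once both are converted into summed-area tables, a window containing
	// N texels yields its mean as sum(v) / N and its variance as
	// sum(v * v) / N - (sum(v) / N)^2. Note that the lookups later in this
	// function only consume the alpha-channel average from varbuf1; the
	// squares table is what supports variance queries.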

	// Load N and N^2 values into the work buffers
	if (img->data_type == ASTCENC_TYPE_U8)
	{
		// Swizzle data structure 4 = ZERO, 5 = ONE
		uint8_t data[6];
		data[ASTCENC_SWZ_0] = 0;
		data[ASTCENC_SWZ_1] = 255;

		for (int z = zd_start; z < padsize_z; z++)
		{
			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
			uint8_t* data8 = static_cast<uint8_t*>(img->data[z_src]);

			for (int y = 1; y < padsize_y; y++)
			{
				int y_src = (y - 1) + offset_y - kernel_radius_xy;
				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

				for (int x = 1; x < padsize_x; x++)
				{
					int x_src = (x - 1) + offset_x - kernel_radius_xy;
					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

					data[0] = data8[(4 * img->dim_x * y_src) + (4 * x_src    )];
					data[1] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
					data[2] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
					data[3] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

					uint8_t r = data[swz.r];
					uint8_t g = data[swz.g];
					uint8_t b = data[swz.b];
					uint8_t a = data[swz.a];

					vfloat4 d = vfloat4 (r * (1.0f / 255.0f),
					                     g * (1.0f / 255.0f),
					                     b * (1.0f / 255.0f),
					                     a * (1.0f / 255.0f));

					VARBUF1(z, y, x) = d;
					VARBUF2(z, y, x) = d * d;
				}
			}
		}
	}
198
else if (img->data_type == ASTCENC_TYPE_F16)
199
{
200
// Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
201
uint16_t data[6];
202
data[ASTCENC_SWZ_0] = 0;
203
data[ASTCENC_SWZ_1] = 0x3C00;
204
205
for (int z = zd_start; z < padsize_z; z++)
206
{
207
int z_src = (z - zd_start) + offset_z - kernel_radius_z;
208
z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
209
uint16_t* data16 = static_cast<uint16_t*>(img->data[z_src]);
210
211
for (int y = 1; y < padsize_y; y++)
212
{
213
int y_src = (y - 1) + offset_y - kernel_radius_xy;
214
y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
215
216
for (int x = 1; x < padsize_x; x++)
217
{
218
int x_src = (x - 1) + offset_x - kernel_radius_xy;
219
x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
220
221
data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src )];
222
data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
223
data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
224
data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
225
226
vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
227
vfloat4 d = float16_to_float(di);
228
229
VARBUF1(z, y, x) = d;
230
VARBUF2(z, y, x) = d * d;
231
}
232
}
233
}
234
}
235
else // if (img->data_type == ASTCENC_TYPE_F32)
236
{
237
assert(img->data_type == ASTCENC_TYPE_F32);
238
239
// Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
240
float data[6];
241
data[ASTCENC_SWZ_0] = 0.0f;
242
data[ASTCENC_SWZ_1] = 1.0f;
243
244
for (int z = zd_start; z < padsize_z; z++)
245
{
246
int z_src = (z - zd_start) + offset_z - kernel_radius_z;
247
z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
248
float* data32 = static_cast<float*>(img->data[z_src]);
249
250
for (int y = 1; y < padsize_y; y++)
251
{
252
int y_src = (y - 1) + offset_y - kernel_radius_xy;
253
y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
254
255
for (int x = 1; x < padsize_x; x++)
256
{
257
int x_src = (x - 1) + offset_x - kernel_radius_xy;
258
x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
259
260
data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src )];
261
data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
262
data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
263
data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
264
265
float r = data[swz.r];
266
float g = data[swz.g];
267
float b = data[swz.b];
268
float a = data[swz.a];
269
270
vfloat4 d(r, g, b, a);
271
272
VARBUF1(z, y, x) = d;
273
VARBUF2(z, y, x) = d * d;
274
}
275
}
276
}
277
}

	// Pad with an extra layer of 0s; this forms the edge of the SAT tables
	vfloat4 vbz = vfloat4::zero();
	for (int z = 0; z < padsize_z; z++)
	{
		for (int y = 0; y < padsize_y; y++)
		{
			VARBUF1(z, y, 0) = vbz;
			VARBUF2(z, y, 0) = vbz;
		}

		for (int x = 0; x < padsize_x; x++)
		{
			VARBUF1(z, 0, x) = vbz;
			VARBUF2(z, 0, x) = vbz;
		}
	}

	if (have_z)
	{
		for (int y = 0; y < padsize_y; y++)
		{
			for (int x = 0; x < padsize_x; x++)
			{
				VARBUF1(0, y, x) = vbz;
				VARBUF2(0, y, x) = vbz;
			}
		}
	}
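
	// Because every low-edge entry of the tables is now zero, the
	// inclusion-exclusion lookups further down remain branch-free even for
	// windows that touch the low edge of the padded region.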

	// Generate summed-area tables for N and N^2; this is done in-place, using
	// a Brent-Kung parallel-prefix based algorithm to minimize precision loss
	for (int z = zd_start; z < padsize_z; z++)
	{
		for (int y = 1; y < padsize_y; y++)
		{
			brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1);
			brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1);
		}
	}

	for (int z = zd_start; z < padsize_z; z++)
	{
		for (int x = 1; x < padsize_x; x++)
		{
			brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst);
			brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst);
		}
	}

	if (have_z)
	{
		for (int y = 1; y < padsize_y; y++)
		{
			for (int x = 1; x < padsize_x; x++)
			{
				brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst);
				brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst);
			}
		}
	}
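
	// Prefix sums are separable: after summing along X, then Y, then Z, each
	// element holds S(z, y, x) = sum of v(k, j, i) over all k <= z, j <= y,
	// i <= x, which is exactly the summed-area table form that the window
	// lookups below rely on.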

	// Compute a few constants used in the variance calculation
	float alpha_kdim = static_cast<float>(2 * alpha_kernel_radius + 1);
	float alpha_rsamples;

	if (have_z)
	{
		alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim * alpha_kdim);
	}
	else
	{
		alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim);
	}
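
	// The SAT lookups below return the sum of alpha over a kernel-sized
	// window; multiplying by alpha_rsamples (1/N^2 in 2D, 1/N^3 in 3D, with
	// N = 2 * alpha_kernel_radius + 1) converts that sum into the mean.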

	// Use the summed-area tables to compute the alpha average for each neighborhood
	if (have_z)
	{
		for (int z = 0; z < size_z; z++)
		{
			int z_src = z + kernel_radius_z;
			int z_dst = z + offset_z;
			int z_low  = z_src - alpha_kernel_radius;
			int z_high = z_src + alpha_kernel_radius + 1;

			for (int y = 0; y < size_y; y++)
			{
				int y_src = y + kernel_radius_xy;
				int y_dst = y + offset_y;
				int y_low  = y_src - alpha_kernel_radius;
				int y_high = y_src + alpha_kernel_radius + 1;

				for (int x = 0; x < size_x; x++)
				{
					int x_src = x + kernel_radius_xy;
					int x_dst = x + offset_x;
					int x_low  = x_src - alpha_kernel_radius;
					int x_high = x_src + alpha_kernel_radius + 1;

					// Summed-area table lookups for the alpha average: the
					// window sum over (x_low, x_high] x (y_low, y_high] x
					// (z_low, z_high] is recovered from eight corner lookups
					// by inclusion-exclusion; corners with an even number of
					// "low" coordinates are added, the rest subtracted
					float vasum = ( VARBUF1(z_high, y_low,  x_low).lane<3>()
					              - VARBUF1(z_high, y_low,  x_high).lane<3>()
					              - VARBUF1(z_high, y_high, x_low).lane<3>()
					              + VARBUF1(z_high, y_high, x_high).lane<3>()) -
					              ( VARBUF1(z_low,  y_low,  x_low).lane<3>()
					              - VARBUF1(z_low,  y_low,  x_high).lane<3>()
					              - VARBUF1(z_low,  y_high, x_low).lane<3>()
					              + VARBUF1(z_low,  y_high, x_high).lane<3>());

					int out_index = z_dst * zdt + y_dst * ydt + x_dst;
					input_alpha_averages[out_index] = (vasum * alpha_rsamples);
				}
			}
		}
	}
	else
	{
		for (int y = 0; y < size_y; y++)
		{
			int y_src = y + kernel_radius_xy;
			int y_dst = y + offset_y;
			int y_low  = y_src - alpha_kernel_radius;
			int y_high = y_src + alpha_kernel_radius + 1;

			for (int x = 0; x < size_x; x++)
			{
				int x_src = x + kernel_radius_xy;
				int x_dst = x + offset_x;
				int x_low  = x_src - alpha_kernel_radius;
				int x_high = x_src + alpha_kernel_radius + 1;

				// Summed-area table lookups for the alpha average: four
				// corner lookups recover the window sum over
				// (x_low, x_high] x (y_low, y_high] by inclusion-exclusion
				float vasum = VARBUF1(0, y_low,  x_low).lane<3>()
				            - VARBUF1(0, y_low,  x_high).lane<3>()
				            - VARBUF1(0, y_high, x_low).lane<3>()
				            + VARBUF1(0, y_high, x_high).lane<3>();

				int out_index = y_dst * ydt + x_dst;
				input_alpha_averages[out_index] = (vasum * alpha_rsamples);
			}
		}
	}
}

/* See header for documentation. */
unsigned int init_compute_averages(
	const astcenc_image& img,
	unsigned int alpha_kernel_radius,
	const astcenc_swizzle& swz,
	avg_args& ag
) {
	unsigned int size_x = img.dim_x;
	unsigned int size_y = img.dim_y;
	unsigned int size_z = img.dim_z;

	// Compute maximum block size and from that the working memory buffer size
	unsigned int kernel_radius = alpha_kernel_radius;
	unsigned int kerneldim = 2 * kernel_radius + 1;

	bool have_z = (size_z > 1);
	unsigned int max_blk_size_xy = have_z ? 16 : 32;
	unsigned int max_blk_size_z = astc::min(size_z, have_z ? 16u : 1u);

	unsigned int max_padsize_xy = max_blk_size_xy + kerneldim;
	unsigned int max_padsize_z = max_blk_size_z + (have_z ? kerneldim : 0);

	// Perform block-wise average calculations across the image
	// Initialize fields which are not populated until later
	ag.arg.size_x = 0;
	ag.arg.size_y = 0;
	ag.arg.size_z = 0;
	ag.arg.offset_x = 0;
	ag.arg.offset_y = 0;
	ag.arg.offset_z = 0;
	ag.arg.work_memory = nullptr;

	ag.arg.img = &img;
	ag.arg.swz = swz;
	ag.arg.have_z = have_z;
	ag.arg.alpha_kernel_radius = alpha_kernel_radius;

	ag.img_size_x = size_x;
	ag.img_size_y = size_y;
	ag.img_size_z = size_z;
	ag.blk_size_xy = max_blk_size_xy;
	ag.blk_size_z = max_blk_size_z;
	// Two buffers are needed: one for values and one for values squared
	ag.work_memory_size = 2 * max_padsize_xy * max_padsize_xy * max_padsize_z;

	// The parallel task count
	unsigned int z_tasks = (size_z + max_blk_size_z - 1) / max_blk_size_z;
	unsigned int y_tasks = (size_y + max_blk_size_xy - 1) / max_blk_size_xy;
	return z_tasks * y_tasks;
}
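
/* Typical call sequence, as a sketch only; the real driver lives elsewhere
 * in the compressor, so the block iteration shown here is illustrative:
 *
 *     avg_args ag;
 *     unsigned int task_count = init_compute_averages(img, radius, swz, ag);
 *
 *     // For each task: fill in ag.arg.size_* and ag.arg.offset_* for one
 *     // image block, point ag.arg.work_memory at a per-thread buffer of
 *     // ag.work_memory_size vfloat4 elements, then run:
 *     compute_pixel_region_variance(ctx, ag.arg);
 */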

#endif