CoCalc -- astcenc_vecmathlib_common

GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/astcenc/astcenc_vecmathlib_common_4.h
9896 views
1
// SPDX-License-Identifier: Apache-2.0
2
// ----------------------------------------------------------------------------
3
// Copyright 2020-2025 Arm Limited
4
//
5
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6
// use this file except in compliance with the License. You may obtain a copy
7
// of the License at:
8
//
9
//     http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14
// License for the specific language governing permissions and limitations
15
// under the License.
16
// ----------------------------------------------------------------------------
17

18
/**
19
 * @brief Generic 4x32-bit vector functions.
20
 *
21
 * This module implements generic 4-wide vector functions that are valid for
22
 * all instruction sets, typically implemented using lower level 4-wide
23
 * operations that are ISA-specific.
24
 */
25

26
#ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
27
#define ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
28

29
#ifndef ASTCENC_SIMD_INLINE
30
	#error "Include astcenc_vecmathlib.h, do not include directly"
31
#endif
32

33
#include <cstdio>
34
#include <limits>
35

36
// ============================================================================
37
// vint4 operators and functions
38
// ============================================================================
39

40
/**
41
 * @brief Overload: vector by scalar addition.
42
 */
43
ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, int b)
44
{
45
	return a + vint4(b);
46
}
47

48
/**
49
 * @brief Overload: vector by vector incremental addition.
50
 */
51
ASTCENC_SIMD_INLINE vint4& operator+=(vint4& a, const vint4& b)
52
{
53
	a = a + b;
54
	return a;
55
}
56

57
/**
58
 * @brief Overload: vector by scalar subtraction.
59
 */
60
ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, int b)
61
{
62
	return a - vint4(b);
63
}
64

65
/**
66
 * @brief Overload: vector by scalar multiplication.
67
 */
68
ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, int b)
69
{
70
	return a * vint4(b);
71
}
72

73
/**
74
 * @brief Overload: vector by scalar bitwise or.
75
 */
76
ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, int b)
77
{
78
	return a | vint4(b);
79
}
80

81
/**
82
 * @brief Overload: vector by scalar bitwise and.
83
 */
84
ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, int b)
85
{
86
	return a & vint4(b);
87
}
88

89
/**
90
 * @brief Overload: vector by scalar bitwise xor.
91
 */
92
ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, int b)
93
{
94
	return a ^ vint4(b);
95
}
96

97
/**
98
 * @brief Return the clamped value between min and max.
99
 */
100
ASTCENC_SIMD_INLINE vint4 clamp(int minv, int maxv, vint4 a)
101
{
102
	return min(max(a, vint4(minv)), vint4(maxv));
103
}
104

105
/**
106
 * @brief Return the horizontal sum of RGB vector lanes as a scalar.
107
 */
108
ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)
109
{
110
	return a.lane<0>() + a.lane<1>() + a.lane<2>();
111
}
112

113
/**
114
 * @brief Return the horizontal minimum of a vector.
115
 */
116
ASTCENC_SIMD_INLINE int hmin_s(vint4 a)
117
{
118
	return hmin(a).lane<0>();
119
}
120

121
/**
122
 * @brief Generate a vint4 from a size_t.
123
 */
124
 ASTCENC_SIMD_INLINE vint4 vint4_from_size(size_t a)
125
 {
126
	assert(a <= std::numeric_limits<int>::max());
127
	return vint4(static_cast<int>(a));
128
 }
129

130
/**
131
 * @brief Return the horizontal maximum of a vector.
132
 */
133
ASTCENC_SIMD_INLINE int hmax_s(vint4 a)
134
{
135
	return hmax(a).lane<0>();
136
}
137

138
// ============================================================================
139
// vfloat4 operators and functions
140
// ============================================================================
141

142
/**
143
 * @brief Overload: vector by vector incremental addition.
144
 */
145
ASTCENC_SIMD_INLINE vfloat4& operator+=(vfloat4& a, const vfloat4& b)
146
{
147
	a = a + b;
148
	return a;
149
}
150

151
/**
152
 * @brief Overload: vector by scalar addition.
153
 */
154
ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, float b)
155
{
156
	return a + vfloat4(b);
157
}
158

159
/**
160
 * @brief Overload: vector by scalar subtraction.
161
 */
162
ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, float b)
163
{
164
	return a - vfloat4(b);
165
}
166

167
/**
168
 * @brief Overload: vector by scalar multiplication.
169
 */
170
ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, float b)
171
{
172
	return a * vfloat4(b);
173
}
174

175
/**
176
 * @brief Overload: scalar by vector multiplication.
177
 */
178
ASTCENC_SIMD_INLINE vfloat4 operator*(float a, vfloat4 b)
179
{
180
	return vfloat4(a) * b;
181
}
182

183
/**
184
 * @brief Overload: vector by scalar division.
185
 */
186
ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, float b)
187
{
188
	return a / vfloat4(b);
189
}
190

191
/**
192
 * @brief Overload: scalar by vector division.
193
 */
194
ASTCENC_SIMD_INLINE vfloat4 operator/(float a, vfloat4 b)
195
{
196
	return vfloat4(a) / b;
197
}
198

199
/**
200
 * @brief Return the min vector of a vector and a scalar.
201
 *
202
 * If either lane value is NaN, @c b will be returned for that lane.
203
 */
204
ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, float b)
205
{
206
	return min(a, vfloat4(b));
207
}
208

209
/**
210
 * @brief Return the max vector of a vector and a scalar.
211
 *
212
 * If either lane value is NaN, @c b will be returned for that lane.
213
 */
214
ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, float b)
215
{
216
	return max(a, vfloat4(b));
217
}
218

219
/**
220
 * @brief Return the clamped value between min and max.
221
 *
222
 * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
223
 * then @c min will be returned for that lane.
224
 */
225
ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)
226
{
227
	// Do not reorder - second operand will return if either is NaN
228
	return min(max(a, minv), maxv);
229
}
230

231
/**
232
 * @brief Return the clamped value between 0.0f and 1.0f.
233
 *
234
 * If @c a is NaN then zero will be returned for that lane.
235
 */
236
ASTCENC_SIMD_INLINE vfloat4 clampzo(vfloat4 a)
237
{
238
	// Do not reorder - second operand will return if either is NaN
239
	return min(max(a, vfloat4::zero()), 1.0f);
240
}
241

242
/**
243
 * @brief Return the horizontal minimum of a vector.
244
 */
245
ASTCENC_SIMD_INLINE float hmin_s(vfloat4 a)
246
{
247
	return hmin(a).lane<0>();
248
}
249

250
/**
251
 * @brief Return the horizontal min of RGB vector lanes as a scalar.
252
 */
253
ASTCENC_SIMD_INLINE float hmin_rgb_s(vfloat4 a)
254
{
255
	a.set_lane<3>(a.lane<0>());
256
	return hmin_s(a);
257
}
258

259
/**
260
 * @brief Return the horizontal maximum of a vector.
261
 */
262
ASTCENC_SIMD_INLINE float hmax_s(vfloat4 a)
263
{
264
	return hmax(a).lane<0>();
265
}
266

267
/**
268
 * @brief Accumulate lane-wise sums for a vector.
269
 */
270
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a)
271
{
272
	accum = accum + a;
273
}
274

275
/**
276
 * @brief Accumulate lane-wise sums for a masked vector.
277
 */
278
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a, vmask4 m)
279
{
280
	a = select(vfloat4::zero(), a, m);
281
	haccumulate(accum, a);
282
}
283

284
/**
285
 * @brief Return the horizontal sum of RGB vector lanes as a scalar.
286
 */
287
ASTCENC_SIMD_INLINE float hadd_rgb_s(vfloat4 a)
288
{
289
	return a.lane<0>() + a.lane<1>() + a.lane<2>();
290
}
291

292
#if !defined(ASTCENC_USE_NATIVE_DOT_PRODUCT)
293

294
/**
295
 * @brief Return the dot product for the full 4 lanes, returning scalar.
296
 */
297
ASTCENC_SIMD_INLINE float dot_s(vfloat4 a, vfloat4 b)
298
{
299
	vfloat4 m = a * b;
300
	return hadd_s(m);
301
}
302

303
/**
304
 * @brief Return the dot product for the full 4 lanes, returning vector.
305
 */
306
ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b)
307
{
308
	vfloat4 m = a * b;
309
	return vfloat4(hadd_s(m));
310
}
311

312
/**
313
 * @brief Return the dot product for the bottom 3 lanes, returning scalar.
314
 */
315
ASTCENC_SIMD_INLINE float dot3_s(vfloat4 a, vfloat4 b)
316
{
317
	vfloat4 m = a * b;
318
	return hadd_rgb_s(m);
319
}
320

321
/**
322
 * @brief Return the dot product for the bottom 3 lanes, returning vector.
323
 */
324
ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b)
325
{
326
	vfloat4 m = a * b;
327
	float d3 = hadd_rgb_s(m);
328
	return vfloat4(d3, d3, d3, 0.0f);
329
}
330

331
#endif
332

333
#if !defined(ASTCENC_USE_NATIVE_POPCOUNT)
334

335
/**
 * @brief Population bit count.
 *
 * Portable bit-parallel implementation: counts are accumulated in fields of
 * 2, 4, and then 8 bits, and the eight per-byte counts are finally summed
 * into the top byte with a single multiply.
 *
 * @param v   The value to population count.
 *
 * @return The number of 1 bits.
 */
static inline int popcount(uint64_t v)
{
	const uint64_t pair_mask = 0x5555555555555555ULL;
	const uint64_t nibble_mask = 0x3333333333333333ULL;
	const uint64_t byte_mask = 0x0F0F0F0F0F0F0F0FULL;

	// Each 2-bit field now holds the count of its own two bits
	v -= (v >> 1) & pair_mask;

	// Each 4-bit field now holds the count of its own four bits
	v = (v & nibble_mask) + ((v >> 2) & nibble_mask);

	// Each byte now holds the count of its own eight bits
	v = (v + (v >> 4)) & byte_mask;

	// The multiply sums all eight byte counts into the most significant byte
	v = (v * 0x0101010101010101ULL) >> 56;

	return static_cast<int>(v);
}
355

356
#endif
357

358
/**
359
 * @brief Apply signed bit transfer.
360
 *
361
 * @param input0   The first encoded endpoint.
362
 * @param input1   The second encoded endpoint.
363
 */
364
static ASTCENC_SIMD_INLINE void bit_transfer_signed(
365
	vint4& input0,
366
	vint4& input1
367
) {
368
	input1 = lsr<1>(input1) | (input0 & 0x80);
369
	input0 = lsr<1>(input0) & 0x3F;
370

371
	vmask4 mask = (input0 & 0x20) != vint4::zero();
372
	input0 = select(input0, input0 - 0x40, mask);
373
}
374

375
/**
376
 * @brief Debug function to print a vector of ints.
377
 */
378
ASTCENC_SIMD_INLINE void print(vint4 a)
379
{
380
	ASTCENC_ALIGNAS int v[4];
381
	storea(a, v);
382
	printf("v4_i32:\n  %8d %8d %8d %8d\n",
383
	       v[0], v[1], v[2], v[3]);
384
}
385

386
/**
387
 * @brief Debug function to print a vector of ints.
388
 */
389
ASTCENC_SIMD_INLINE void printx(vint4 a)
390
{
391
	ASTCENC_ALIGNAS int v[4];
392
	storea(a, v);
393

394
	unsigned int uv[4];
395
	std::memcpy(uv, v, sizeof(int) * 4);
396

397
	printf("v4_i32:\n  %08x %08x %08x %08x\n",
398
		uv[0], uv[1], uv[2], uv[3]);
399
}
400

401
/**
402
 * @brief Debug function to print a vector of floats.
403
 */
404
ASTCENC_SIMD_INLINE void print(vfloat4 a)
405
{
406
	ASTCENC_ALIGNAS float v[4];
407
	storea(a, v);
408
	printf("v4_f32:\n  %0.4f %0.4f %0.4f %0.4f\n",
409
	       static_cast<double>(v[0]), static_cast<double>(v[1]),
410
	       static_cast<double>(v[2]), static_cast<double>(v[3]));
411
}
412

413
/**
414
 * @brief Debug function to print a vector of masks.
415
 */
416
ASTCENC_SIMD_INLINE void print(vmask4 a)
417
{
418
	print(select(vint4(0), vint4(1), a));
419
}
420

421
#endif // #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
422

423
Product

Resources

Company