// Math3D.h
// Source: GitHub repository hrydgard/ppsspp, path blob/master/GPU/Math3D.h
// Copyright (c) 2012- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#pragma once

#include "ppsspp_config.h"

#include <cmath>
#include <cstring>      // memcpy
#include <type_traits>  // std::is_same

#include "Common/Common.h"
#include "Core/Util/AudioFormat.h"  // for clamp_u8
#include "Common/Math/fast/fast_matrix.h"

#if defined(_M_SSE)
#include <emmintrin.h>
#include <smmintrin.h>
#endif

#if PPSSPP_ARCH(ARM_NEON)
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif

// Calling-convention decoration for the helpers below that take SIMD vectors
// by value: __vectorcall on Windows builds with MSVC, clang or the Intel
// compiler, and empty (default convention) everywhere else.
#if PPSSPP_PLATFORM(WINDOWS) && (defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER))
#define MATH3D_CALL __vectorcall
#else
#define MATH3D_CALL
#endif

// There's probably a better place to define these macros.
#if PPSSPP_ARCH(X86)
// On 32-bit x86, MSVC does not guarantee alignment for
// SSE arguments passed on stack (Compiler Error C2719), see e.g.:
//   https://stackoverflow.com/questions/10484422/msvc-cannot-send-function-parameters-of-16byte-alignment-on-x86
//   https://stackoverflow.com/questions/28488986/formal-parameter-with-declspecalign16-wont-be-aligned
// So, as a workaround, "dangerous" cases are loaded via loadu* on 32-bit x86.
// Compilers are decently ok at eliminating these extra loads, at least
// in trivial cases.
// NOTE: not to be outdone, GCC has its own flavor of broken, see e.g.:
//   http://www.peterstock.co.uk/games/mingw_sse/
//   https://github.com/nothings/stb/issues/81
// which is probably worse since it breaks alignment of locals and/or
// spills, but that, hopefully, does not affect PPSSPP (modern GCC+Linux
// is 16-byte aligned on x86, and MinGW is not a supported PPSSPP target).
// NOTE: weird double-casts add a bit of type-safety.
// SAFE_M128(v) / SAFE_M128I(v) take an lvalue of type __m128 / __m128i and
// yield its value through an unaligned load.
#define SAFE_M128(v)  _mm_loadu_ps   (reinterpret_cast<const float*>  (static_cast<const __m128*> (&(v))))
#define SAFE_M128I(v) _mm_loadu_si128(reinterpret_cast<const __m128i*>(static_cast<const __m128i*>(&(v))))
#else // x64, FWIW also works for non-x86.
// No workaround needed: the macros expand to the argument itself.
#define SAFE_M128(v)  (v)
#define SAFE_M128I(v) (v)
#endif

namespace Math3D {
// Helper for Vec classes to clamp values.
// Returns `high` if v > high, otherwise `low` if v < low, otherwise v itself.
// The upper bound is tested first, so it wins if low > high.  A value for
// which both comparisons are false (e.g. a float NaN) is returned unchanged.
template<typename T>
inline static T VecClamp(const T &v, const T &low, const T &high)
{
	if (v > high)
		return high;
	if (v < low)
		return low;
	return v;
}

// Two-component vector of T with component-wise arithmetic.
// The default constructor leaves both components uninitialized.
template<typename T>
class Vec2 {
public:
	struct {
		T x,y;
	};

	// Pointer to x; operator[] and callers index y as element 1.
	T* AsArray() { return &x; }
	const T* AsArray() const { return &x; }

	Vec2() {}
	Vec2(const T a[2]) : x(a[0]), y(a[1]) {}
	Vec2(const T& _x, const T& _y) : x(_x), y(_y) {}

	// Converts each component to T2 with a C-style cast.
	template<typename T2>
	Vec2<T2> Cast() const
	{
		return Vec2<T2>((T2)x, (T2)y);
	}

	// Vector with every component set to f.
	static Vec2 AssignToAll(const T& f)
	{
		return Vec2<T>(f, f);
	}

	// Copies the components into a[0..1].
	void Write(T a[2]) const
	{
		a[0] = x; a[1] = y;
	}

	Vec2 operator +(const Vec2& other) const
	{
		return Vec2(x+other.x, y+other.y);
	}
	void operator += (const Vec2 &other)
	{
		x+=other.x; y+=other.y;
	}
	Vec2 operator -(const Vec2& other) const
	{
		return Vec2(x-other.x, y-other.y);
	}
	void operator -= (const Vec2& other)
	{
		x-=other.x; y-=other.y;
	}
	Vec2 operator -() const
	{
		return Vec2(-x,-y);
	}
	// Component-wise product (not a dot product).
	Vec2 operator * (const Vec2& other) const
	{
		return Vec2(x*other.x, y*other.y);
	}
	// Scaling by a scalar of arbitrary type V.
	template<typename V>
	Vec2 operator * (const V& f) const
	{
		return Vec2(x*f,y*f);
	}
	template<typename V>
	void operator *= (const V& f)
	{
		x*=f; y*=f;
	}
	template<typename V>
	Vec2 operator / (const V& f) const
	{
		return Vec2(x/f,y/f);
	}
	template<typename V>
	void operator /= (const V& f)
	{
		*this = *this / f;
	}

	// Squared length.
	T Length2() const
	{
		return x*x + y*y;
	}

	// Clamps each component to [l, h] via VecClamp.
	Vec2 Clamp(const T &l, const T &h) const
	{
		return Vec2(VecClamp(x, l, h), VecClamp(y, l, h));
	}

	// Only implemented for T=float
	float Length() const;
	void SetLength(const float l);
	Vec2 WithLength(const float l) const;
	float Distance2To(const Vec2 &other) const;
	Vec2 Normalized() const;
	float Normalize(); // returns the previous length, which is often useful

	// Unchecked component access: 0 -> x, 1 -> y.
	T& operator [] (int i) //allow vector[1] = 3   (vector.y=3)
	{
		return *((&x) + i);
	}
	T operator [] (const int i) const
	{
		return *((&x) + i);
	}

	void SetZero()
	{
		x=0; y=0;
	}

	// Common aliases: UV (texel coordinates), ST (texture coordinates)
	T& u() { return x; }
	T& v() { return y; }
	T& s() { return x; }
	T& t() { return y; }

	const T& u() const { return x; }
	const T& v() const { return y; }
	const T& s() const { return x; }
	const T& t() const { return y; }

	// swizzlers - create a subvector of specific components
	const Vec2 yx() const { return Vec2(y, x); }
	const Vec2 vu() const { return Vec2(y, x); }
	const Vec2 ts() const { return Vec2(y, x); }
};

template<typename T>
207
class Vec3Packed;
208

209
template<typename T>
210
class Vec3
211
{
212
public:
213
	union
214
	{
215
		struct
216
		{
217
			T x,y,z;
218
		};
219
#if defined(_M_SSE)
220
		__m128i ivec;
221
		__m128 vec;
222
#elif PPSSPP_ARCH(ARM_NEON)
223
		int32x4_t ivec;
224
		float32x4_t vec;
225
#endif
226
	};
227

228
	T* AsArray() { return &x; }
229
	const T* AsArray() const { return &x; }
230

231
	Vec3() {}
232
	Vec3(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {}
233
	constexpr Vec3(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {}
234
	Vec3(const Vec2<T>& _xy, const T& _z) : x(_xy.x), y(_xy.y), z(_z) {}
235
#if defined(_M_SSE)
236
	constexpr Vec3(const __m128 &_vec) : vec(_vec) {}
237
	constexpr Vec3(const __m128i &_ivec) : ivec(_ivec) {}
238
	Vec3(const Vec3Packed<T> &_xyz) {
239
		vec = _mm_loadu_ps(_xyz.AsArray());
240
	}
241
#elif PPSSPP_ARCH(ARM_NEON)
242
	Vec3(const float32x4_t &_vec) : vec(_vec) {}
243
#if !defined(_MSC_VER)
244
	Vec3(const int32x4_t &_ivec) : ivec(_ivec) {}
245
#endif
246
	Vec3(const Vec3Packed<T> &_xyz) {
247
		vec = vld1q_f32(_xyz.AsArray());
248
	}
249
#else
250
	Vec3(const Vec3Packed<T> &_xyz) : x(_xyz.x), y(_xyz.y), z(_xyz.z) {}
251
#endif
252

253
	template<typename T2>
254
	constexpr Vec3<T2> Cast() const
255
	{
256
		return Vec3<T2>((T2)x, (T2)y, (T2)z);
257
	}
258

259
	// Only implemented for T=int and T=float
260
	static Vec3 FromRGB(unsigned int rgb);
261
	unsigned int ToRGB() const; // alpha bits set to zero
262

263
	static constexpr Vec3 AssignToAll(const T& f)
264
	{
265
		return Vec3<T>(f, f, f);
266
	}
267

268
	void Write(T a[3])
269
	{
270
		a[0] = x; a[1] = y; a[2] = z;
271
	}
272

273
	Vec3 operator +(const Vec3 &other) const
274
	{
275
		return Vec3(x+other.x, y+other.y, z+other.z);
276
	}
277
	void operator += (const Vec3 &other)
278
	{
279
		x+=other.x; y+=other.y; z+=other.z;
280
	}
281
	Vec3 operator -(const Vec3 &other) const
282
	{
283
		return Vec3(x-other.x, y-other.y, z-other.z);
284
	}
285
	void operator -= (const Vec3 &other)
286
	{
287
		x-=other.x; y-=other.y; z-=other.z;
288
	}
289
	Vec3 operator -() const
290
	{
291
		return Vec3(-x,-y,-z);
292
	}
293
	Vec3 operator * (const Vec3 &other) const
294
	{
295
		return Vec3(x*other.x, y*other.y, z*other.z);
296
	}
297
	template<typename V>
298
	Vec3 operator * (const V& f) const
299
	{
300
		return Vec3(x*f,y*f,z*f);
301
	}
302
	template<typename V>
303
	void operator *= (const V& f)
304
	{
305
		x*=f; y*=f; z*=f;
306
	}
307
	template<typename V>
308
	Vec3 operator / (const V& f) const
309
	{
310
		return Vec3(x/f,y/f,z/f);
311
	}
312
	template<typename V>
313
	void operator /= (const V& f)
314
	{
315
		*this = *this / f;
316
	}
317

318
	bool operator ==(const Vec3 &other) const {
319
		return x == other.x && y == other.y && z == other.z;
320
	}
321

322
	T Length2() const
323
	{
324
		return x*x + y*y + z*z;
325
	}
326

327
	Vec3 Clamp(const T &l, const T &h) const
328
	{
329
		return Vec3(VecClamp(x, l, h), VecClamp(y, l, h), VecClamp(z, l, h));
330
	}
331

332
	// Only implemented for T=float
333
	float Length() const;
334
	void SetLength(const float l);
335
	Vec3 WithLength(const float l) const;
336
	float Distance2To(const Vec3 &other) const;
337
	Vec3 Normalized(bool useSSE4 = false) const;
338
	Vec3 NormalizedOr001(bool useSSE4 = false) const;
339
	float Normalize(); // returns the previous length, which is often useful
340
	float NormalizeOr001();
341

342
	T& operator [] (int i) //allow vector[2] = 3   (vector.z=3)
343
	{
344
		return *((&x) + i);
345
	}
346
	T operator [] (const int i) const
347
	{
348
		return *((&x) + i);
349
	}
350

351
	void SetZero()
352
	{
353
		x=0; y=0; z=0;
354
	}
355

356
	// Common aliases: UVW (texel coordinates), RGB (colors), STQ (texture coordinates)
357
	T& u() { return x; }
358
	T& v() { return y; }
359
	T& w() { return z; }
360

361
	T& r() { return x; }
362
	T& g() { return y; }
363
	T& b() { return z; }
364

365
	T& s() { return x; }
366
	T& t() { return y; }
367
	T& q() { return z; }
368

369
	const T& u() const { return x; }
370
	const T& v() const { return y; }
371
	const T& w() const { return z; }
372

373
	const T& r() const { return x; }
374
	const T& g() const { return y; }
375
	const T& b() const { return z; }
376

377
	const T& s() const { return x; }
378
	const T& t() const { return y; }
379
	const T& q() const { return z; }
380

381
	// swizzlers - create a subvector of specific components
382
	// e.g. Vec2 uv() { return Vec2(x,y); }
383
	// _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)
384
#define _DEFINE_SWIZZLER2(a, b, name) const Vec2<T> name() const { return Vec2<T>(a, b); }
385
#define DEFINE_SWIZZLER2(a, b, a2, b2, a3, b3, a4, b4) \
386
	_DEFINE_SWIZZLER2(a, b, a##b); \
387
	_DEFINE_SWIZZLER2(a, b, a2##b2); \
388
	_DEFINE_SWIZZLER2(a, b, a3##b3); \
389
	_DEFINE_SWIZZLER2(a, b, a4##b4); \
390
	_DEFINE_SWIZZLER2(b, a, b##a); \
391
	_DEFINE_SWIZZLER2(b, a, b2##a2); \
392
	_DEFINE_SWIZZLER2(b, a, b3##a3); \
393
	_DEFINE_SWIZZLER2(b, a, b4##a4);
394

395
	DEFINE_SWIZZLER2(x, y, r, g, u, v, s, t);
396
	DEFINE_SWIZZLER2(x, z, r, b, u, w, s, q);
397
	DEFINE_SWIZZLER2(y, z, g, b, v, w, t, q);
398
#undef DEFINE_SWIZZLER2
399
#undef _DEFINE_SWIZZLER2
400
};
401

402
template<typename T>
403
class Vec3Packed
404
{
405
public:
406
	union
407
	{
408
		struct
409
		{
410
			T x,y,z;
411
		};
412
	};
413

414
	T* AsArray() { return &x; }
415
	const T* AsArray() const { return &x; }
416

417
	Vec3Packed() {}
418
	Vec3Packed(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {}
419
	Vec3Packed(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {}
420
	Vec3Packed(const Vec2<T>& _xy, const T& _z) : x(_xy.x), y(_xy.y), z(_z) {}
421
	Vec3Packed(const Vec3<T>& _xyz) {
422
		memcpy(&x, _xyz.AsArray(), sizeof(float) * 3);
423
	}
424

425
	template<typename T2>
426
	Vec3Packed<T2> Cast() const
427
	{
428
		return Vec3Packed<T2>((T2)x, (T2)y, (T2)z);
429
	}
430

431
	// Only implemented for T=int and T=float
432
	static Vec3Packed FromRGB(unsigned int rgb);
433
	unsigned int ToRGB() const; // alpha bits set to zero
434

435
	static Vec3Packed AssignToAll(const T& f)
436
	{
437
		return Vec3Packed<T>(f, f, f);
438
	}
439

440
	void Write(T a[3])
441
	{
442
		a[0] = x; a[1] = y; a[2] = z;
443
	}
444

445
	Vec3Packed operator +(const Vec3Packed &other) const
446
	{
447
		return Vec3Packed(x+other.x, y+other.y, z+other.z);
448
	}
449
	void operator += (const Vec3Packed &other)
450
	{
451
		x+=other.x; y+=other.y; z+=other.z;
452
	}
453
	Vec3Packed operator -(const Vec3Packed &other) const
454
	{
455
		return Vec3Packed(x-other.x, y-other.y, z-other.z);
456
	}
457
	void operator -= (const Vec3Packed &other)
458
	{
459
		x-=other.x; y-=other.y; z-=other.z;
460
	}
461
	Vec3Packed operator -() const
462
	{
463
		return Vec3Packed(-x,-y,-z);
464
	}
465
	Vec3Packed operator * (const Vec3Packed &other) const
466
	{
467
		return Vec3Packed(x*other.x, y*other.y, z*other.z);
468
	}
469
	template<typename V>
470
	Vec3Packed operator * (const V& f) const
471
	{
472
		return Vec3Packed(x*f,y*f,z*f);
473
	}
474
	template<typename V>
475
	void operator *= (const V& f)
476
	{
477
		x*=f; y*=f; z*=f;
478
	}
479
	template<typename V>
480
	Vec3Packed operator / (const V& f) const
481
	{
482
		return Vec3Packed(x/f,y/f,z/f);
483
	}
484
	template<typename V>
485
	void operator /= (const V& f)
486
	{
487
		*this = *this / f;
488
	}
489

490
	T Length2() const
491
	{
492
		return x*x + y*y + z*z;
493
	}
494

495
	Vec3Packed Clamp(const T &l, const T &h) const
496
	{
497
		return Vec3Packed(VecClamp(x, l, h), VecClamp(y, l, h), VecClamp(z, l, h));
498
	}
499

500
	// Only implemented for T=float
501
	float Length() const;
502
	void SetLength(const float l);
503
	Vec3Packed WithLength(const float l) const;
504
	float Distance2To(const Vec3Packed &other) const;
505
	Vec3Packed Normalized() const;
506
	float Normalize(); // returns the previous length, which is often useful
507

508
	T& operator [] (int i) //allow vector[2] = 3   (vector.z=3)
509
	{
510
		return *((&x) + i);
511
	}
512
	T operator [] (const int i) const
513
	{
514
		return *((&x) + i);
515
	}
516

517
	void SetZero()
518
	{
519
		x=0; y=0; z=0;
520
	}
521

522
	// Common aliases: UVW (texel coordinates), RGB (colors), STQ (texture coordinates)
523
	T& u() { return x; }
524
	T& v() { return y; }
525
	T& w() { return z; }
526

527
	T& r() { return x; }
528
	T& g() { return y; }
529
	T& b() { return z; }
530

531
	T& s() { return x; }
532
	T& t() { return y; }
533
	T& q() { return z; }
534

535
	const T& u() const { return x; }
536
	const T& v() const { return y; }
537
	const T& w() const { return z; }
538

539
	const T& r() const { return x; }
540
	const T& g() const { return y; }
541
	const T& b() const { return z; }
542

543
	const T& s() const { return x; }
544
	const T& t() const { return y; }
545
	const T& q() const { return z; }
546

547
	// swizzlers - create a subvector of specific components
548
	// e.g. Vec2 uv() { return Vec2(x,y); }
549
	// _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)
550
#define _DEFINE_SWIZZLER2(a, b, name) const Vec2<T> name() const { return Vec2<T>(a, b); }
551
#define DEFINE_SWIZZLER2(a, b, a2, b2, a3, b3, a4, b4) \
552
	_DEFINE_SWIZZLER2(a, b, a##b); \
553
	_DEFINE_SWIZZLER2(a, b, a2##b2); \
554
	_DEFINE_SWIZZLER2(a, b, a3##b3); \
555
	_DEFINE_SWIZZLER2(a, b, a4##b4); \
556
	_DEFINE_SWIZZLER2(b, a, b##a); \
557
	_DEFINE_SWIZZLER2(b, a, b2##a2); \
558
	_DEFINE_SWIZZLER2(b, a, b3##a3); \
559
	_DEFINE_SWIZZLER2(b, a, b4##a4);
560

561
	DEFINE_SWIZZLER2(x, y, r, g, u, v, s, t);
562
	DEFINE_SWIZZLER2(x, z, r, b, u, w, s, q);
563
	DEFINE_SWIZZLER2(y, z, g, b, v, w, t, q);
564
#undef DEFINE_SWIZZLER2
565
#undef _DEFINE_SWIZZLER2
566
};
567

568
template<typename T>
569
class Vec4
570
{
571
public:
572
	union
573
	{
574
		struct
575
		{
576
			T x,y,z,w;
577
		};
578
#if defined(_M_SSE)
579
		__m128i ivec;
580
		__m128 vec;
581
#elif PPSSPP_ARCH(ARM_NEON)
582
		int32x4_t ivec;
583
		float32x4_t vec;
584
#endif
585
	};
586

587
	T* AsArray() { return &x; }
588
	const T* AsArray() const { return &x; }
589

590
	Vec4() {}
591
	Vec4(const T a[4]) : x(a[0]), y(a[1]), z(a[2]), w(a[3]) {}
592
	Vec4(const T& _x, const T& _y, const T& _z, const T& _w) : x(_x), y(_y), z(_z), w(_w) {}
593
	Vec4(const Vec2<T>& _xy, const T& _z, const T& _w) : x(_xy.x), y(_xy.y), z(_z), w(_w) {}
594
	Vec4(const Vec3<T>& _xyz, const T& _w) : x(_xyz.x), y(_xyz.y), z(_xyz.z), w(_w) {}
595
#if defined(_M_SSE)
596
	Vec4(const __m128 &_vec) : vec(_vec) {}
597
	Vec4(const __m128i &_ivec) : ivec(_ivec) {}
598
#elif PPSSPP_ARCH(ARM_NEON)
599
	Vec4(const float32x4_t &_vec) : vec(_vec) {}
600
#if !defined(_MSC_VER)
601
	Vec4(const int32x4_t &_ivec) : ivec(_ivec) {}
602
#endif
603
#endif
604

605
	template<typename T2>
606
	Vec4<T2> Cast() const {
607
		if constexpr (std::is_same<T, float>::value && std::is_same<T2, int>::value) {
608
#if defined(_M_SSE)
609
			return _mm_cvtps_epi32(SAFE_M128(vec));
610
#elif PPSSPP_ARCH(ARM_NEON)
611
			return vcvtq_s32_f32(vec);
612
#endif
613
		}
614
		if constexpr (std::is_same<T, int>::value && std::is_same<T2, float>::value) {
615
#if defined(_M_SSE)
616
			return _mm_cvtepi32_ps(SAFE_M128I(ivec));
617
#elif PPSSPP_ARCH(ARM_NEON)
618
			return vcvtq_f32_s32(ivec);
619
#endif
620
		}
621
		return Vec4<T2>((T2)x, (T2)y, (T2)z, (T2)w);
622
	}
623

624
	// Only implemented for T=int and T=float
625
	static Vec4 FromRGBA(unsigned int rgba);
626
	static Vec4 FromRGBA(const u8 *rgba);
627
	unsigned int ToRGBA() const;
628
	void ToRGBA(u8 *rgba) const;
629

630
	static Vec4 AssignToAll(const T& f)
631
	{
632
		return Vec4<T>(f, f, f, f);
633
	}
634

635
	void Write(T a[4])
636
	{
637
		a[0] = x; a[1] = y; a[2] = z; a[3] = w;
638
	}
639

640
	Vec4 operator +(const Vec4& other) const
641
	{
642
		return Vec4(x+other.x, y+other.y, z+other.z, w+other.w);
643
	}
644
	void operator += (const Vec4& other)
645
	{
646
		x+=other.x; y+=other.y; z+=other.z; w+=other.w;
647
	}
648
	Vec4 operator -(const Vec4 &other) const
649
	{
650
		return Vec4(x-other.x, y-other.y, z-other.z, w-other.w);
651
	}
652
	void operator -= (const Vec4 &other)
653
	{
654
		x-=other.x; y-=other.y; z-=other.z; w-=other.w;
655
	}
656
	Vec4 operator -() const
657
	{
658
		return Vec4(-x,-y,-z,-w);
659
	}
660
	Vec4 operator * (const Vec4 &other) const
661
	{
662
		return Vec4(x*other.x, y*other.y, z*other.z, w*other.w);
663
	}
664
	Vec4 operator | (const Vec4 &other) const
665
	{
666
		return Vec4(x | other.x, y | other.y, z | other.z, w | other.w);
667
	}
668
	Vec4 operator & (const Vec4 &other) const
669
	{
670
		return Vec4(x & other.x, y & other.y, z & other.z, w & other.w);
671
	}
672
	Vec4 operator << (const int amount) const
673
	{
674
		// NOTE: x*(1<<amount), etc., might be safer, since
675
		// left-shifting negatives is UB pre-C++20.
676
		return Vec4(x << amount, y << amount, z << amount, w << amount);
677
	}
678
	Vec4 operator >> (const int amount) const
679
	{
680
		return Vec4(x >> amount, y >> amount, z >> amount, w >> amount);
681
	}
682
	template<typename V>
683
	Vec4 operator * (const V& f) const
684
	{
685
		return Vec4(x*f,y*f,z*f,w*f);
686
	}
687
	template<typename V>
688
	void operator *= (const V& f)
689
	{
690
		x*=f; y*=f; z*=f; w*=f;
691
	}
692
	template<typename V>
693
	Vec4 operator / (const V& f) const
694
	{
695
		return Vec4(x/f,y/f,z/f,w/f);
696
	}
697
	template<typename V>
698
	void operator /= (const V& f)
699
	{
700
		*this = *this / f;
701
	}
702

703
	bool operator ==(const Vec4 &other) const {
704
		return x == other.x && y == other.y && z == other.z && w == other.w;
705
	}
706

707
	T Length2() const
708
	{
709
		return x*x + y*y + z*z + w*w;
710
	}
711

712
	Vec4 Clamp(const T &l, const T &h) const
713
	{
714
		return Vec4(VecClamp(x, l, h), VecClamp(y, l, h), VecClamp(z, l, h), VecClamp(w, l, h));
715
	}
716

717
	Vec4 Reciprocal() const
718
	{
719
		const T one = 1.0f;
720
		return Vec4(one / x, one / y, one / z, one / w);
721
	}
722

723
	// Only implemented for T=float
724
	float Length() const;
725
	void SetLength(const float l);
726
	Vec4 WithLength(const float l) const;
727
	float Distance2To(const Vec4 &other) const;
728
	Vec4 Normalized() const;
729
	float Normalize(); // returns the previous length, which is often useful
730

731
	T& operator [] (int i) //allow vector[2] = 3   (vector.z=3)
732
	{
733
		return *((&x) + i);
734
	}
735
	T operator [] (const int i) const
736
	{
737
		return *((&x) + i);
738
	}
739

740
	void SetZero()
741
	{
742
		x=0; y=0; z=0; w=0;
743
	}
744

745
	// Common alias: RGBA (colors)
746
	T& r() { return x; }
747
	T& g() { return y; }
748
	T& b() { return z; }
749
	T& a() { return w; }
750

751
	const T& r() const { return x; }
752
	const T& g() const { return y; }
753
	const T& b() const { return z; }
754
	const T& a() const { return w; }
755

756
	// swizzlers - create a subvector of specific components
757
	// e.g. Vec2 uv() { return Vec2(x,y); }
758
	// _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)
759
#define _DEFINE_SWIZZLER2(a, b, name) const Vec2<T> name() const { return Vec2<T>(a, b); }
760
#define DEFINE_SWIZZLER2(a, b, a2, b2) \
761
	_DEFINE_SWIZZLER2(a, b, a##b); \
762
	_DEFINE_SWIZZLER2(a, b, a2##b2); \
763
	_DEFINE_SWIZZLER2(b, a, b##a); \
764
	_DEFINE_SWIZZLER2(b, a, b2##a2);
765

766
	DEFINE_SWIZZLER2(x, y, r, g);
767
	DEFINE_SWIZZLER2(x, z, r, b);
768
	DEFINE_SWIZZLER2(x, w, r, a);
769
	DEFINE_SWIZZLER2(y, z, g, b);
770
	DEFINE_SWIZZLER2(y, w, g, a);
771
	DEFINE_SWIZZLER2(z, w, b, a);
772
#undef DEFINE_SWIZZLER2
773
#undef _DEFINE_SWIZZLER2
774

775
#define _DEFINE_SWIZZLER3(a, b, c, name) const Vec3<T> name() const { return Vec3<T>(a, b, c); }
776
#define DEFINE_SWIZZLER3(a, b, c, a2, b2, c2) \
777
	_DEFINE_SWIZZLER3(a, b, c, a##b##c); \
778
	_DEFINE_SWIZZLER3(a, c, b, a##c##b); \
779
	_DEFINE_SWIZZLER3(b, a, c, b##a##c); \
780
	_DEFINE_SWIZZLER3(b, c, a, b##c##a); \
781
	_DEFINE_SWIZZLER3(c, a, b, c##a##b); \
782
	_DEFINE_SWIZZLER3(c, b, a, c##b##a); \
783
	_DEFINE_SWIZZLER3(a, b, c, a2##b2##c2); \
784
	_DEFINE_SWIZZLER3(a, c, b, a2##c2##b2); \
785
	_DEFINE_SWIZZLER3(b, a, c, b2##a2##c2); \
786
	_DEFINE_SWIZZLER3(b, c, a, b2##c2##a2); \
787
	_DEFINE_SWIZZLER3(c, a, b, c2##a2##b2); \
788
	_DEFINE_SWIZZLER3(c, b, a, c2##b2##a2);
789

790
	DEFINE_SWIZZLER3(x, y, z, r, g, b);
791
	DEFINE_SWIZZLER3(x, y, w, r, g, a);
792
	DEFINE_SWIZZLER3(x, z, w, r, b, a);
793
	DEFINE_SWIZZLER3(y, z, w, g, b, a);
794
#undef DEFINE_SWIZZLER3
795
#undef _DEFINE_SWIZZLER3
796
};
797

798

799
template<typename BaseType>
800
class Mat3x3
801
{
802
public:
803
	// Convention: first three values = first column
804
	Mat3x3(const BaseType values[])
805
	{
806
		for (unsigned int i = 0; i < 3*3; ++i)
807
		{
808
			this->values[i] = values[i];
809
		}
810
	}
811

812
	Mat3x3(BaseType _00, BaseType _01, BaseType _02, BaseType _10, BaseType _11, BaseType _12, BaseType _20, BaseType _21, BaseType _22)
813
	{
814
		values[0] = _00;
815
		values[1] = _01;
816
		values[2] = _02;
817
		values[3] = _10;
818
		values[4] = _11;
819
		values[5] = _12;
820
		values[6] = _20;
821
		values[7] = _21;
822
		values[8] = _22;
823
	}
824

825
	template<typename T>
826
	Vec3<T> operator * (const Vec3<T>& vec) const
827
	{
828
		Vec3<T> ret;
829
		ret.x = values[0]*vec.x + values[3]*vec.y + values[6]*vec.z;
830
		ret.y = values[1]*vec.x + values[4]*vec.y + values[7]*vec.z;
831
		ret.z = values[2]*vec.x + values[5]*vec.y + values[8]*vec.z;
832
		return ret;
833
	}
834

835
	Mat3x3 Inverse() const
836
	{
837
		float a = values[0];
838
		float b = values[1];
839
		float c = values[2];
840
		float d = values[3];
841
		float e = values[4];
842
		float f = values[5];
843
		float g = values[6];
844
		float h = values[7];
845
		float i = values[8];
846
		return Mat3x3(e*i-f*h, f*g-d*i, d*h-e*g,
847
						c*h-b*i, a*i-c*g, b*g-a*h,
848
						b*f-c*e, c*d-a*f, a*e-b*d) / Det();
849
	}
850

851
	BaseType Det() const
852
	{
853
		return values[0]*values[4]*values[8] + values[3]*values[7]*values[2] +
854
				values[6]*values[1]*values[5] - values[2]*values[4]*values[6] -
855
				values[5]*values[7]*values[0] - values[8]*values[1]*values[3];
856
	}
857

858
	Mat3x3 operator / (const BaseType& val) const
859
	{
860
		return Mat3x3(values[0]/val, values[1]/val, values[2]/val,
861
						values[3]/val, values[4]/val, values[5]/val,
862
						values[6]/val, values[7]/val, values[8]/val);
863
	}
864

865
private:
866
	BaseType values[3*3];
867
};
868

869

870
template<typename BaseType>
871
class Mat4x4
872
{
873
public:
874
	// Convention: first four values in arrow = first column
875
	Mat4x4(const BaseType values[])
876
	{
877
		for (unsigned int i = 0; i < 4*4; ++i)
878
		{
879
			this->values[i] = values[i];
880
		}
881
	}
882

883
	template<typename T>
884
	Vec4<T> operator * (const Vec4<T>& vec) const
885
	{
886
		Vec4<T> ret;
887
		ret.x = values[0]*vec.x + values[4]*vec.y + values[8]*vec.z + values[12]*vec.w;
888
		ret.y = values[1]*vec.x + values[5]*vec.y + values[9]*vec.z + values[13]*vec.w;
889
		ret.z = values[2]*vec.x + values[6]*vec.y + values[10]*vec.z + values[14]*vec.w;
890
		ret.w = values[3]*vec.x + values[7]*vec.y + values[11]*vec.z + values[15]*vec.w;
891
		return ret;
892
	}
893

894
private:
895
	BaseType values[4*4];
896
};
897

898
}; // namespace Math3D
// Global shorthand names for the float instantiations of the vector classes.
typedef Math3D::Vec2<float> Vec2f;
typedef Math3D::Vec3<float> Vec3f;
typedef Math3D::Vec3Packed<float> Vec3Packedf;
typedef Math3D::Vec4<float> Vec4f;

#if defined(_M_SSE)
// Returns lane `i` (a compile-time index) of an SSE float vector.
template<unsigned i>
float MATH3D_CALL vectorGetByIndex(__m128 v) {
	// shuffle V so that the element that we want is moved to the bottom
	return _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(i, i, i, i)));
}
#endif

// SIMD cores of Vec3ByMatrix43: compute col0*x + col1*y + col2*z + col3 for a
// matrix stored as four consecutive 3-float columns.  Each column is fetched
// with a 4-wide load, so lane 3 of the result is a by-product, and the load
// of the last column (m + 9) touches m[12], one float past the 12 declared.
#if defined(_M_SSE)
// x, y, and z should be broadcast.  Should only be used through Vec3f version.
// Note that this will read an extra float from the matrix, so it better not be at the end of an allocation!
inline __m128 MATH3D_CALL Vec3ByMatrix43Internal(__m128 x, __m128 y, __m128 z, const float m[12]) {
	__m128 col0 = _mm_loadu_ps(m);
	__m128 col1 = _mm_loadu_ps(m + 3);
	__m128 col2 = _mm_loadu_ps(m + 6);
	__m128 col3 = _mm_loadu_ps(m + 9);
	__m128 sum = _mm_add_ps(
		_mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)),
		_mm_add_ps(_mm_mul_ps(col2, z), col3));
	return sum;
}
#elif PPSSPP_ARCH(ARM64_NEON)
// vec holds x, y, z in lanes 0..2; lane 3 is not used.
// Like the SSE version, this reads one float past the 12-float matrix.
inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[12]) {
	float32x4_t col0 = vld1q_f32(m);
	float32x4_t col1 = vld1q_f32(m + 3);
	float32x4_t col2 = vld1q_f32(m + 6);
	float32x4_t col3 = vld1q_f32(m + 9);
	float32x4_t sum = vaddq_f32(
		vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)),
		vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
	return sum;
}
#elif PPSSPP_ARCH(ARM_NEON)
// vec holds x, y, z in lanes 0..2; lane 3 is not used.
// Like the SSE version, this reads one float past the 12-float matrix.
inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[12]) {
	float32x4_t col0 = vld1q_f32(m);
	float32x4_t col1 = vld1q_f32(m + 3);
	float32x4_t col2 = vld1q_f32(m + 6);
	float32x4_t col3 = vld1q_f32(m + 9);
	// Lane-multiplies take a 64-bit half here: x, y come from the low half,
	// z is lane 0 of the high half.
	float32x4_t sum = vaddq_f32(
		vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1)),
		vaddq_f32(vmulq_lane_f32(col2, vget_high_f32(vec), 0), col3));
	return sum;
}
#endif

// Transforms the point v by the 4x3 matrix m (four consecutive 3-float
// columns; the fourth column is added as-is) and writes x, y, z to vecOut.
// v and vecOut must point to different memory.
// (The scalar fallback writes vecOut[0] before it has finished reading v.)
inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) {
#if defined(_M_SSE)
	__m128 x = _mm_set1_ps(v[0]);
	__m128 y = _mm_set1_ps(v[1]);
	__m128 z = _mm_set1_ps(v[2]);
	__m128 sum = Vec3ByMatrix43Internal(x, y, z, m);
	// Not sure what the best way to store 3 elements is. Ideally, we should
	// probably store all four.
	vecOut[0] = _mm_cvtss_f32(sum);
	vecOut[1] = vectorGetByIndex<1>(sum);
	vecOut[2] = vectorGetByIndex<2>(sum);
#elif PPSSPP_ARCH(ARM_NEON)
	// Pad to four lanes so the input can be loaded as one vector.
	float vecIn[4] = {v[0], v[1], v[2], 1.0f};
	float32x4_t sum = Vec3ByMatrix43Internal(vld1q_f32(vecIn), m);
	vecOut[0] = vgetq_lane_f32(sum, 0);
	vecOut[1] = vgetq_lane_f32(sum, 1);
	vecOut[2] = vgetq_lane_f32(sum, 2);
#else
	vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6] + m[9];
	vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7] + m[10];
	vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8] + m[11];
#endif
}

// Vec3f overload: transforms the point v by the 4x3 matrix m and returns the
// result by value.  The SIMD paths hand the register straight to the
// internal helper; the fallback forwards to the array overload.
inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) {
#if defined(_M_SSE)
	const __m128 vv = SAFE_M128(v.vec);
	// Broadcast each component across all four lanes.
	__m128 x = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
	__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
	return Vec3ByMatrix43Internal(x, y, z, m);
#elif PPSSPP_ARCH(ARM_NEON)
	return Vec3ByMatrix43Internal(v.vec, m);
#else
	Vec3f vecOut;
	Vec3ByMatrix43(vecOut.AsArray(), v.AsArray(), m);
	return vecOut;
#endif
}

// SIMD cores of Vec3ByMatrix44: compute col0*x + col1*y + col2*z + col3 for a
// matrix stored as four consecutive 4-float columns.  All four result lanes
// are meaningful, and every load stays inside the 16 floats.
#if defined(_M_SSE)
// x, y, and z should be broadcast.  Should only be used through Vec3f version.
inline __m128 MATH3D_CALL Vec3ByMatrix44Internal(__m128 x, __m128 y, __m128 z, const float m[16]) {
	__m128 col0 = _mm_loadu_ps(m);
	__m128 col1 = _mm_loadu_ps(m + 4);
	__m128 col2 = _mm_loadu_ps(m + 8);
	__m128 col3 = _mm_loadu_ps(m + 12);
	__m128 sum = _mm_add_ps(
		_mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)),
		_mm_add_ps(_mm_mul_ps(col2, z), col3));
	return sum;
}
#elif PPSSPP_ARCH(ARM64_NEON)
// vec holds x, y, z in lanes 0..2; lane 3 is not used.
inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {
	float32x4_t col0 = vld1q_f32(m);
	float32x4_t col1 = vld1q_f32(m + 4);
	float32x4_t col2 = vld1q_f32(m + 8);
	float32x4_t col3 = vld1q_f32(m + 12);
	float32x4_t sum = vaddq_f32(
		vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)),
		vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
	return sum;
}
#elif PPSSPP_ARCH(ARM_NEON)
// vec holds x, y, z in lanes 0..2; lane 3 is not used.
inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {
	float32x4_t col0 = vld1q_f32(m);
	float32x4_t col1 = vld1q_f32(m + 4);
	float32x4_t col2 = vld1q_f32(m + 8);
	float32x4_t col3 = vld1q_f32(m + 12);
	// Lane-multiplies take a 64-bit half here: x, y come from the low half,
	// z is lane 0 of the high half.
	float32x4_t sum = vaddq_f32(
		vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1)),
		vaddq_f32(vmulq_lane_f32(col2, vget_high_f32(vec), 0), col3));
	return sum;
}
#endif

// Transforms the point v by the 4x4 matrix m (four consecutive 4-float
// columns; the fourth column is added as-is, i.e. an implicit w of 1) and
// writes all four result components to vecOut.
// v and vecOut should not overlap: the scalar fallback writes vecOut[0]
// before it has finished reading v.
inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16]) {
#if defined(_M_SSE)
	__m128 x = _mm_set1_ps(v[0]);
	__m128 y = _mm_set1_ps(v[1]);
	__m128 z = _mm_set1_ps(v[2]);
	__m128 sum = Vec3ByMatrix44Internal(x, y, z, m);
	_mm_storeu_ps(vecOut, sum);
#elif PPSSPP_ARCH(ARM_NEON)
	// Pad to four lanes so the input can be loaded as one vector.
	float vecIn[4] = {v[0], v[1], v[2], 1.0f};
	float32x4_t sum = Vec3ByMatrix44Internal(vld1q_f32(vecIn), m);
	vst1q_f32(vecOut, sum);
#else
	vecOut[0] = v[0] * m[0] + v[1] * m[4] + v[2] * m[8] + m[12];
	vecOut[1] = v[0] * m[1] + v[1] * m[5] + v[2] * m[9] + m[13];
	vecOut[2] = v[0] * m[2] + v[1] * m[6] + v[2] * m[10] + m[14];
	vecOut[3] = v[0] * m[3] + v[1] * m[7] + v[2] * m[11] + m[15];
#endif
}

// Vec3f overload: transforms the point v by the 4x4 matrix m and returns all
// four result components.  The SIMD paths hand the register straight to the
// internal helper; the fallback forwards to the array overload.
inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) {
#if defined(_M_SSE)
	const __m128 vv = SAFE_M128(v.vec);
	// Broadcast each component across all four lanes.
	__m128 x = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
	__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
	return Vec3ByMatrix44Internal(x, y, z, m);
#elif PPSSPP_ARCH(ARM_NEON)
	return Vec3ByMatrix44Internal(v.vec, m);
#else
	Vec4f vecOut;
	Vec3ByMatrix44(vecOut.AsArray(), v.AsArray(), m);
	return vecOut;
#endif
}

#if defined(_M_SSE)
1063
// x, y, and z should be broadcast.  Should only be used through Vec3f version.
1064
inline __m128 MATH3D_CALL Norm3ByMatrix43Internal(__m128 x, __m128 y, __m128 z, const float m[12]) {
1065
	__m128 col0 = _mm_loadu_ps(m);
1066
	__m128 col1 = _mm_loadu_ps(m + 3);
1067
	__m128 col2 = _mm_loadu_ps(m + 6);
1068
	__m128 sum = _mm_add_ps(
1069
		_mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)),
1070
		_mm_mul_ps(col2, z));
1071
	return sum;
1072
}
1073
#elif PPSSPP_ARCH(ARM64_NEON)
1074
// AArch64 NEON: rotates the normal held in lanes 0..2 of vec by the upper 3x3 part
// of the 4x3 matrix m (no translation is added). Lane 3 of vec is never read.
// The three 4-wide loads start 3 floats apart and overlap; the highest element
// touched is m[9], so a 12-float matrix is sufficient. Lane 3 of the result mixes
// elements from the following group and carries no useful value.
inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[12]) {
	const float32x4_t c0 = vld1q_f32(m);      // m[0..3]
	const float32x4_t c1 = vld1q_f32(m + 3);  // m[3..6]
	const float32x4_t c2 = vld1q_f32(m + 6);  // m[6..9]
	// Same association as before: (c0*x + c1*y) + c2*z.
	const float32x4_t xyPart = vaddq_f32(vmulq_laneq_f32(c0, vec, 0), vmulq_laneq_f32(c1, vec, 1));
	const float32x4_t zPart = vmulq_laneq_f32(c2, vec, 2);
	return vaddq_f32(xyPart, zPart);
}
1083
#elif PPSSPP_ARCH(ARM_NEON)
1084
// 32-bit NEON: rotates the normal held in lanes 0..2 of vec by the upper 3x3 part
// of the 4x3 matrix m (no translation is added). Lane 3 of vec is never read.
// The three 4-wide loads start 3 floats apart and overlap; the highest element
// touched is m[9], so a 12-float matrix is sufficient. Lane 3 of the result mixes
// elements from the following group and carries no useful value.
inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[12]) {
	const float32x4_t c0 = vld1q_f32(m);      // m[0..3]
	const float32x4_t c1 = vld1q_f32(m + 3);  // m[3..6]
	const float32x4_t c2 = vld1q_f32(m + 6);  // m[6..9]
	// vmulq_lane_f32 wants a 64-bit source vector, so split vec into halves.
	const float32x2_t lo = vget_low_f32(vec);   // x, y
	const float32x2_t hi = vget_high_f32(vec);  // z, (unused)
	// Same association as before: (c0*x + c1*y) + c2*z.
	const float32x4_t xyPart = vaddq_f32(vmulq_lane_f32(c0, lo, 0), vmulq_lane_f32(c1, lo, 1));
	const float32x4_t zPart = vmulq_lane_f32(c2, hi, 0);
	return vaddq_f32(xyPart, zPart);
}
1093
#endif
1094

1095
// Rotates the 3-component normal v by the upper 3x3 part of the 4x3 matrix m and
// writes the three resulting components to vecOut. No translation is applied.
inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) {
#if defined(_M_SSE)
	// The SSE helper expects each component broadcast across a full register.
	__m128 x = _mm_set1_ps(v[0]);
	__m128 y = _mm_set1_ps(v[1]);
	__m128 z = _mm_set1_ps(v[2]);
	__m128 sum = Norm3ByMatrix43Internal(x, y, z, m);
	// Only lanes 0..2 are meaningful; extract them one by one so that nothing
	// is written past vecOut[2].
	vecOut[0] = _mm_cvtss_f32(sum);
	vecOut[1] = vectorGetByIndex<1>(sum);
	vecOut[2] = vectorGetByIndex<2>(sum);
#elif PPSSPP_ARCH(ARM_NEON)
	// v holds only three floats, so loading it directly with vld1q_f32 would read
	// one float past the end. Pad into a local 4-float buffer first; the helper
	// never reads lane 3, so the padding value is irrelevant.
	float padded[4] = {v[0], v[1], v[2], 0.0f};
	float32x4_t sum = Norm3ByMatrix43Internal(vld1q_f32(padded), m);
	vecOut[0] = vgetq_lane_f32(sum, 0);
	vecOut[1] = vgetq_lane_f32(sum, 1);
	vecOut[2] = vgetq_lane_f32(sum, 2);
#else
	vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6];
	vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7];
	vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8];
#endif
}
1115

1116
// Vec3f overload: rotates the normal v by the upper 3x3 part of the 4x3 matrix m.
inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) {
#if defined(_M_SSE)
	const __m128 xyz = SAFE_M128(v.vec);
	// Broadcast each of x, y, z to every lane for the column-wise multiply.
	const __m128 xxxx = _mm_shuffle_ps(xyz, xyz, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 yyyy = _mm_shuffle_ps(xyz, xyz, _MM_SHUFFLE(1, 1, 1, 1));
	const __m128 zzzz = _mm_shuffle_ps(xyz, xyz, _MM_SHUFFLE(2, 2, 2, 2));
	return Norm3ByMatrix43Internal(xxxx, yyyy, zzzz, m);
#elif PPSSPP_ARCH(ARM_NEON)
	// The NEON helper selects lanes itself, so the register is passed straight through.
	return Norm3ByMatrix43Internal(v.vec, m);
#else
	// No SIMD: defer to the array-based overload.
	Vec3f result;
	Norm3ByMatrix43(result.AsArray(), v.AsArray(), m);
	return result;
#endif
}
1131

1132
inline void Matrix4ByMatrix4(float out[16], const float a[16], const float b[16]) {
1133
	fast_matrix_mul_4x4(out, b, a);
1134
}
1135

1136
// Expands a 12-float 4x3 matrix into a 16-float 4x4 matrix. Each group of three
// source floats becomes a group of four, padded with 0.0f, except the last group
// which is padded with 1.0f.
inline void ConvertMatrix4x3To4x4(float *m4x4, const float *m4x3) {
	for (int group = 0; group < 4; group++) {
		float *dst = m4x4 + group * 4;
		const float *src = m4x3 + group * 3;
		dst[0] = src[0];
		dst[1] = src[1];
		dst[2] = src[2];
		// Fourth element: 0 for the first three groups, 1 for the final one.
		dst[3] = (group == 3) ? 1.0f : 0.0f;
	}
}
1154

1155
// Expands a 12-float 4x3 matrix into a 16-float 4x4 matrix while transposing it:
// output element [row * 4 + col] takes source element [col * 3 + row] for the
// first three output rows, and the last output row is set to (0, 0, 0, 1).
inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) {
#if PPSSPP_ARCH(ARM_NEON)
	// vld3q de-interleaves with a stride of 3, which yields the three transposed
	// rows directly.
	const float32x4x3_t rows = vld3q_f32(m4x3);
	for (int row = 0; row < 3; row++) {
		vst1q_f32(m4x4 + row * 4, rows.val[row]);
	}
#else
	for (int row = 0; row < 3; row++) {
		for (int col = 0; col < 4; col++) {
			m4x4[row * 4 + col] = m4x3[col * 3 + row];
		}
	}
#endif
	// Final row is constant.
	for (int col = 0; col < 3; col++) {
		m4x4[12 + col] = 0.0f;
	}
	m4x4[15] = 1.0f;
}
1181

1182
// Source element index stored at each output position:
// 0369
// 147A
// 258B
// Output position indices:
// 0123
// 4567
// 89AB
// Transposes a 12-float 4x3 matrix into a 12-float 3x4 matrix. Only 12 floats
// are written, despite the destination parameter being called m4x4.
// NEON handles this with a single de-interleaving load; other targets copy
// element by element, which should be pretty fast anyway.
inline void ConvertMatrix4x3To3x4Transposed(float *m4x4, const float *m4x3) {
#if PPSSPP_ARCH(ARM_NEON)
	// vld3q de-interleaves with a stride of 3, which yields the three transposed
	// rows directly.
	const float32x4x3_t rows = vld3q_f32(m4x3);
	for (int row = 0; row < 3; row++) {
		vst1q_f32(m4x4 + row * 4, rows.val[row]);
	}
#else
	for (int row = 0; row < 3; row++) {
		for (int col = 0; col < 4; col++) {
			m4x4[row * 4 + col] = m4x3[col * 3 + row];
		}
	}
#endif
}
1212

1213
// Writes the transpose of the 4x4 matrix `in` to `out`.
// `out` and `in` may refer to the same storage: the input is snapshotted first,
// so an in-place transpose no longer reads elements that were already overwritten.
inline void Transpose4x4(float out[16], const float in[16]) {
	float src[16];
	for (int i = 0; i < 16; i++) {
		src[i] = in[i];
	}
	for (int i = 0; i < 4; i++) {
		for (int j = 0; j < 4; j++) {
			out[i * 4 + j] = src[j * 4 + i];
		}
	}
}
1220

1221
namespace Math3D {
1222

1223
template<typename T>
1224
inline T Dot(const Vec2<T>& a, const Vec2<T>& b)
1225
{
1226
	return a.x*b.x + a.y*b.y;
1227
}
1228

1229
template<typename T>
1230
inline T Dot(const Vec3<T>& a, const Vec3<T>& b)
1231
{
1232
	return a.x*b.x + a.y*b.y + a.z*b.z;
1233
}
1234

1235
template<typename T>
1236
inline T Dot(const Vec4<T>& a, const Vec4<T>& b)
1237
{
1238
	return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w;
1239
}
1240

1241
template<typename T>
1242
inline Vec3<T> Cross(const Vec3<T>& a, const Vec3<T>& b)
1243
{
1244
	return Vec3<T>(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
1245
}
1246

1247
template<typename T>
1248
inline Vec3Packed<T> Cross(const Vec3Packed<T>& a, const Vec3Packed<T>& b)
1249
{
1250
	return Vec3Packed<T>(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
1251
}
1252

1253
template<>
1254
inline Vec3<float> Vec3<float>::FromRGB(unsigned int rgb)
1255
{
1256
#if defined(_M_SSE)
1257
	__m128i z = _mm_setzero_si128();
1258
	__m128i c = _mm_cvtsi32_si128(rgb);
1259
	c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
1260
	return Vec3<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));
1261
#elif PPSSPP_ARCH(ARM_NEON)
1262
	uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));
1263
	uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
1264
	return Vec3<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));
1265
#else
1266
	return Vec3((rgb & 0xFF) * (1.0f/255.0f),
1267
				((rgb >> 8) & 0xFF) * (1.0f/255.0f),
1268
				((rgb >> 16) & 0xFF) * (1.0f/255.0f));
1269
#endif
1270
}
1271

1272
template<>
1273
inline Vec3<int> Vec3<int>::FromRGB(unsigned int rgb)
1274
{
1275
#if defined(_M_SSE)
1276
	__m128i z = _mm_setzero_si128();
1277
	__m128i c = _mm_cvtsi32_si128(rgb);
1278
	c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
1279
	return Vec3<int>(c);
1280
#elif PPSSPP_ARCH(ARM_NEON)
1281
	uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));
1282
	uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
1283
	return Vec3<int>(vreinterpretq_s32_u32(u));
1284
#else
1285
	return Vec3(rgb & 0xFF, (rgb >> 8) & 0xFF, (rgb >> 16) & 0xFF);
1286
#endif
1287
}
1288

1289
template<>
1290
__forceinline unsigned int Vec3<float>::ToRGB() const
1291
{
1292
#if defined(_M_SSE)
1293
	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
1294
	__m128i c16 = _mm_packs_epi32(c, c);
1295
	return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
1296
#elif PPSSPP_ARCH(ARM_NEON)
1297
	uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vsetq_lane_f32(0.0f, vec, 3), vdupq_n_f32(255.0f))));
1298
	uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
1299
	return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
1300
#else
1301
	return (clamp_u8((int)(r() * 255.f)) << 0) |
1302
			(clamp_u8((int)(g() * 255.f)) << 8) |
1303
			(clamp_u8((int)(b() * 255.f)) << 16);
1304
#endif
1305
}
1306

1307
template<>
1308
__forceinline unsigned int Vec3<int>::ToRGB() const
1309
{
1310
#if defined(_M_SSE)
1311
	__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
1312
	return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
1313
#elif PPSSPP_ARCH(ARM_NEON)
1314
	uint16x4_t c16 = vqmovun_s32(vsetq_lane_s32(0, ivec, 3));
1315
	uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
1316
	return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
1317
#else
1318
	return clamp_u8(r()) | (clamp_u8(g()) << 8) | (clamp_u8(b()) << 16);
1319
#endif
1320
}
1321

1322
template<>
1323
inline Vec4<float> Vec4<float>::FromRGBA(unsigned int rgba)
1324
{
1325
#if defined(_M_SSE)
1326
	__m128i z = _mm_setzero_si128();
1327
	__m128i c = _mm_cvtsi32_si128(rgba);
1328
	c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
1329
	return Vec4<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));
1330
#elif PPSSPP_ARCH(ARM_NEON)
1331
	uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba));
1332
	uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
1333
	return Vec4<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));
1334
#else
1335
	return Vec4((rgba & 0xFF) * (1.0f/255.0f),
1336
				((rgba >> 8) & 0xFF) * (1.0f/255.0f),
1337
				((rgba >> 16) & 0xFF) * (1.0f/255.0f),
1338
				((rgba >> 24) & 0xFF) * (1.0f/255.0f));
1339
#endif
1340
}
1341

1342
template<typename T>
1343
inline Vec4<T> Vec4<T>::FromRGBA(const u8 *rgba)
1344
{
1345
	return Vec4<T>::FromRGBA(*(unsigned int *)rgba);
1346
}
1347

1348
template<>
1349
inline Vec4<int> Vec4<int>::FromRGBA(unsigned int rgba)
1350
{
1351
#if defined(_M_SSE)
1352
	__m128i z = _mm_setzero_si128();
1353
	__m128i c = _mm_cvtsi32_si128(rgba);
1354
	c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
1355
	return Vec4<int>(c);
1356
#elif PPSSPP_ARCH(ARM_NEON)
1357
	uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba));
1358
	uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
1359
	return Vec4<int>(vreinterpretq_s32_u32(u));
1360
#else
1361
	return Vec4(rgba & 0xFF, (rgba >> 8) & 0xFF, (rgba >> 16) & 0xFF, (rgba >> 24) & 0xFF);
1362
#endif
1363
}
1364

1365
template<>
1366
__forceinline unsigned int Vec4<float>::ToRGBA() const
1367
{
1368
#if defined(_M_SSE)
1369
	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
1370
	__m128i c16 = _mm_packs_epi32(c, c);
1371
	return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
1372
#elif PPSSPP_ARCH(ARM_NEON)
1373
	uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vec, vdupq_n_f32(255.0f))));
1374
	uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
1375
	return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
1376
#else
1377
	return (clamp_u8((int)(r() * 255.f)) << 0) |
1378
			(clamp_u8((int)(g() * 255.f)) << 8) |
1379
			(clamp_u8((int)(b() * 255.f)) << 16) |
1380
			(clamp_u8((int)(a() * 255.f)) << 24);
1381
#endif
1382
}
1383

1384
template<>
1385
__forceinline unsigned int Vec4<int>::ToRGBA() const
1386
{
1387
#if defined(_M_SSE)
1388
	__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
1389
	return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
1390
#elif PPSSPP_ARCH(ARM_NEON)
1391
	uint16x4_t c16 = vqmovun_s32(ivec);
1392
	uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
1393
	return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
1394
#else
1395
	return clamp_u8(r()) | (clamp_u8(g()) << 8) | (clamp_u8(b()) << 16) | (clamp_u8(a()) << 24);
1396
#endif
1397
}
1398

1399
template<typename T>
1400
__forceinline void Vec4<T>::ToRGBA(u8 *rgba) const
1401
{
1402
	*(u32 *)rgba = ToRGBA();
1403
}
1404

1405
#if defined(_M_SSE)
1406
// Specialized for SIMD optimization
1407

1408
// Vec3<float> operation
1409
template<>
1410
inline void Vec3<float>::operator += (const Vec3<float> &other) {
1411
	vec = _mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec));
1412
}
1413

1414
template<>
1415
inline Vec3<float> Vec3<float>::operator + (const Vec3 &other) const {
1416
	return Vec3<float>(_mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
1417
}
1418

1419
template<>
1420
inline void Vec3<float>::operator -= (const Vec3<float> &other) {
1421
	vec = _mm_sub_ps(SAFE_M128(vec), SAFE_M128(other.vec));
1422
}
1423

1424
template<>
1425
inline Vec3<float> Vec3<float>::operator - (const Vec3 &other) const {
1426
	return Vec3<float>(_mm_sub_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
1427
}
1428

1429
template<>
1430
inline Vec3<float> Vec3<float>::operator * (const Vec3 &other) const {
1431
	return Vec3<float>(_mm_mul_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
1432
}
1433

1434
template<> template<>
1435
inline Vec3<float> Vec3<float>::operator * (const float &other) const {
1436
	return Vec3<float>(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(other)));
1437
}
1438

1439
// Vec4<int> operation
1440
template<>
1441
inline Vec4<int> Vec4<int>::operator + (const Vec4 &other) const {
1442
	return Vec4<int>(_mm_add_epi32(SAFE_M128I(ivec), SAFE_M128I(other.ivec)));
1443
}
1444

1445
template<>
1446
inline Vec4<int> Vec4<int>::operator * (const Vec4 &other) const {
1447
	__m128i a = SAFE_M128I(ivec);
1448
	__m128i b = SAFE_M128I(other.ivec);
1449
	// Intel in its immense wisdom decided that
1450
	// SSE2 does not get _mm_mullo_epi32(),
1451
	// so we do it this way. This is what clang does,
1452
	// which seems about as good as it gets.
1453
	__m128i m02 = _mm_mul_epu32(a, b);
1454
	__m128i m13 = _mm_mul_epu32(
1455
		_mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 1, 1)),
1456
		_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)));
1457
	__m128i ret = _mm_unpacklo_epi32(
1458
		_mm_shuffle_epi32(m02, _MM_SHUFFLE(3, 2, 2, 0)),
1459
		_mm_shuffle_epi32(m13, _MM_SHUFFLE(3, 2, 2, 0)));
1460
	return Vec4<int>(ret);
1461
}
1462

1463
template<> template<>
1464
inline Vec4<int> Vec4<int>::operator * (const int &other) const {
1465
	return (*this) * Vec4<int>(_mm_set1_epi32(other));
1466
}
1467

1468
template<>
1469
inline Vec4<int> Vec4<int>::operator | (const Vec4 &other) const {
1470
	return Vec4<int>(_mm_or_si128(SAFE_M128I(ivec), SAFE_M128I(other.ivec)));
1471
}
1472

1473
template<>
1474
inline Vec4<int> Vec4<int>::operator & (const Vec4 &other) const {
1475
	return Vec4<int>(_mm_and_si128(SAFE_M128I(ivec), SAFE_M128I(other.ivec)));
1476
}
1477

1478
// NOTE: modern GCC, clang, and MSVC are all ok with
1479
// non-compile-time-const amount for _mm_slli_epi32/_mm_srli_epi32.
1480
template<>
1481
inline Vec4<int> Vec4<int>::operator << (const int amount) const {
1482
	return Vec4<int>(_mm_slli_epi32(SAFE_M128I(ivec), amount));
1483
}
1484

1485
template<>
1486
inline Vec4<int> Vec4<int>::operator >> (const int amount) const {
1487
	return Vec4<int>(_mm_srli_epi32(SAFE_M128I(ivec), amount));
1488
}
1489

1490
// Vec4<float> operation
1491
template<>
1492
inline void Vec4<float>::operator += (const Vec4<float> &other) {
1493
	vec = _mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec));
1494
}
1495

1496
template<>
1497
inline Vec4<float> Vec4<float>::operator + (const Vec4 &other) const {
1498
	return Vec4<float>(_mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
1499
}
1500

1501
template<>
1502
inline Vec4<float> Vec4<float>::operator * (const Vec4 &other) const {
1503
	return Vec4<float>(_mm_mul_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
1504
}
1505

1506
template<> template<>
1507
inline Vec4<float> Vec4<float>::operator * (const float &other) const {
1508
	return Vec4<float>(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(other)));
1509
}
1510

1511
// Vec3<float> cross product
1512
template<>
1513
inline Vec3<float> Cross(const Vec3<float> &a, const Vec3<float> &b)
1514
{
1515
#if PPSSPP_ARCH(X86)
1516
	__m128 avec = _mm_loadu_ps(&a.x);
1517
	__m128 bvec = _mm_loadu_ps(&b.x);
1518
#else
1519
	__m128 avec = a.vec;
1520
	__m128 bvec = b.vec;
1521
#endif
1522
	const __m128 left = _mm_mul_ps(_mm_shuffle_ps(avec, avec, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(bvec, bvec, _MM_SHUFFLE(3, 1, 0, 2)));
1523
	const __m128 right = _mm_mul_ps(_mm_shuffle_ps(avec, avec, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(bvec, bvec, _MM_SHUFFLE(3, 0, 2, 1)));
1524
	return _mm_sub_ps(left, right);
1525
}
1526
#endif
1527

1528
}; // namespace Math3D
1529

1530
// Linear interpolation with a float weight: t = 0.0 yields begin, t = 1.0 yields end.
// X needs to support multiplication by float and addition.
template<typename X>
inline X Lerp(const X &begin, const X &end, const float t)
{
	const float beginWeight = 1.f - t;
	return begin * beginWeight + end * t;
}
1536

1537
// Linear interpolation with an integer weight: t = 0 yields begin, t = base yields end.
// The weighted sum is formed first and divided by base once at the end.
template<typename X, int base>
inline X LerpInt(const X &begin, const X &end, const int t)
{
	const int beginWeight = base - t;
	return (begin * beginWeight + end * t) / base;
}
1543

1544
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

Product

Resources

Company