// Copyright (c) 2012- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#pragma once

#include "ppsspp_config.h"
#include <cmath>

#include "Common/Common.h"
#include "Core/Util/AudioFormat.h"  // for clamp_u8
#include "Common/Math/fast/fast_matrix.h"

#if defined(_M_SSE)
#include <emmintrin.h>
#include <smmintrin.h>
#endif

#if PPSSPP_ARCH(ARM_NEON)
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif

#if PPSSPP_PLATFORM(WINDOWS) && (defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER))
#define MATH3D_CALL __vectorcall
#else
#define MATH3D_CALL
#endif

// There's probably a better place to define these macros.
#if PPSSPP_ARCH(X86)
// On 32-bit x86, MSVC does not guarantee alignment for
// SSE arguments passed on stack (Compiler Error C2719), see e.g.:
// https://stackoverflow.com/questions/10484422/msvc-cannot-send-function-parameters-of-16byte-alignment-on-x86
// https://stackoverflow.com/questions/28488986/formal-parameter-with-declspecalign16-wont-be-aligned
// So, as a workaround, "dangerous" cases are loaded via loadu* on 32-bit x86.
// Compilers are decently ok at eliminating these extra loads, at least
// in trivial cases.
// NOTE: not to be outdone, GCC has its own flavor of broken, see e.g.:
// http://www.peterstock.co.uk/games/mingw_sse/
// https://github.com/nothings/stb/issues/81
// which is probably worse since it breaks alignment of locals and/or
// spills, but that, hopefully, does not affect PPSSPP (modern GCC+Linux
// is 16-byte aligned on x86, and MinGW is not a supported PPSSPP target).
// NOTE: weird double-casts add a bit of type-safety.
#define SAFE_M128(v)  _mm_loadu_ps(reinterpret_cast<const float*>(static_cast<const __m128*>(&(v))))
#define SAFE_M128I(v) _mm_loadu_si128(reinterpret_cast<const __m128i*>(static_cast<const __m128i*>(&(v))))
#else  // x64, FWIW also works for non-x86.
#define SAFE_M128(v)  (v)
#define SAFE_M128I(v) (v)
#endif
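
// Illustrative use of the macros above (hypothetical snippet, not part of this header's API):
//   Vec4f v = ...;
//   __m128 doubled = _mm_mul_ps(SAFE_M128(v.vec), _mm_set_ps1(2.0f));
// On 32-bit x86, SAFE_M128 expands to an unaligned load of v.vec; on other targets it is just the value itself.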

namespace Math3D {

// Helper for Vec classes to clamp values.
template<typename T>
inline static T VecClamp(const T &v, const T &low, const T &high)
{
	if (v > high)
		return high;
	if (v < low)
		return low;
	return v;
}
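
// For example, VecClamp(1.5f, 0.0f, 1.0f) == 1.0f and VecClamp(-2, 0, 255) == 0.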

template<typename T>
class Vec2 {
public:
	struct {
		T x, y;
	};

	T* AsArray() { return &x; }
	const T* AsArray() const { return &x; }

	Vec2() {}
	Vec2(const T a[2]) : x(a[0]), y(a[1]) {}
	Vec2(const T& _x, const T& _y) : x(_x), y(_y) {}

	template<typename T2>
	Vec2<T2> Cast() const
	{
		return Vec2<T2>((T2)x, (T2)y);
	}

	static Vec2 AssignToAll(const T& f)
	{
		return Vec2<T>(f, f);
	}

	void Write(T a[2])
	{
		a[0] = x; a[1] = y;
	}

	Vec2 operator +(const Vec2& other) const
	{
		return Vec2(x+other.x, y+other.y);
	}
	void operator += (const Vec2 &other)
	{
		x+=other.x; y+=other.y;
	}
	Vec2 operator -(const Vec2& other) const
	{
		return Vec2(x-other.x, y-other.y);
	}
	void operator -= (const Vec2& other)
	{
		x-=other.x; y-=other.y;
	}
	Vec2 operator -() const
	{
		return Vec2(-x, -y);
	}
	Vec2 operator * (const Vec2& other) const
	{
		return Vec2(x*other.x, y*other.y);
	}
	template<typename V>
	Vec2 operator * (const V& f) const
	{
		return Vec2(x*f, y*f);
	}
	template<typename V>
	void operator *= (const V& f)
	{
		x*=f; y*=f;
	}
	template<typename V>
	Vec2 operator / (const V& f) const
	{
		return Vec2(x/f, y/f);
	}
	template<typename V>
	void operator /= (const V& f)
	{
		*this = *this / f;
	}

	T Length2() const
	{
		return x*x + y*y;
	}

	Vec2 Clamp(const T &l, const T &h) const
	{
		return Vec2(VecClamp(x, l, h), VecClamp(y, l, h));
	}

	// Only implemented for T=float
	float Length() const;
	void SetLength(const float l);
	Vec2 WithLength(const float l) const;
	float Distance2To(const Vec2 &other) const;
	Vec2 Normalized() const;
	float Normalize();  // returns the previous length, which is often useful

	T& operator [] (int i)  // allow vector[1] = 3  (vector.y = 3)
	{
		return *((&x) + i);
	}
	T operator [] (const int i) const
	{
		return *((&x) + i);
	}

	void SetZero()
	{
		x = 0; y = 0;
	}

	// Common aliases: UV (texel coordinates), ST (texture coordinates)
	T& u() { return x; }
	T& v() { return y; }
	T& s() { return x; }
	T& t() { return y; }

	const T& u() const { return x; }
	const T& v() const { return y; }
	const T& s() const { return x; }
	const T& t() const { return y; }

	// swizzlers - create a subvector of specific components
	const Vec2 yx() const { return Vec2(y, x); }
	const Vec2 vu() const { return Vec2(y, x); }
	const Vec2 ts() const { return Vec2(y, x); }
};

template<typename T>
class Vec3Packed;

template<typename T>
class Vec3
{
public:
	union
	{
		struct
		{
			T x, y, z;
		};
#if defined(_M_SSE)
		__m128i ivec;
		__m128 vec;
#elif PPSSPP_ARCH(ARM_NEON)
		int32x4_t ivec;
		float32x4_t vec;
#endif
	};

	T* AsArray() { return &x; }
	const T* AsArray() const { return &x; }

	Vec3() {}
	Vec3(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {}
	constexpr Vec3(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {}
	Vec3(const Vec2<T>& _xy, const T& _z) : x(_xy.x), y(_xy.y), z(_z) {}
#if defined(_M_SSE)
	constexpr Vec3(const __m128 &_vec) : vec(_vec) {}
	constexpr Vec3(const __m128i &_ivec) : ivec(_ivec) {}
	Vec3(const Vec3Packed<T> &_xyz) {
		vec = _mm_loadu_ps(_xyz.AsArray());
	}
#elif PPSSPP_ARCH(ARM_NEON)
	Vec3(const float32x4_t &_vec) : vec(_vec) {}
#if !defined(_MSC_VER)
	Vec3(const int32x4_t &_ivec) : ivec(_ivec) {}
#endif
	Vec3(const Vec3Packed<T> &_xyz) {
		vec = vld1q_f32(_xyz.AsArray());
	}
#else
	Vec3(const Vec3Packed<T> &_xyz) : x(_xyz.x), y(_xyz.y), z(_xyz.z) {}
#endif

	template<typename T2>
	constexpr Vec3<T2> Cast() const
	{
		return Vec3<T2>((T2)x, (T2)y, (T2)z);
	}

	// Only implemented for T=int and T=float
	static Vec3 FromRGB(unsigned int rgb);
	unsigned int ToRGB() const;  // alpha bits set to zero

	static constexpr Vec3 AssignToAll(const T& f)
	{
		return Vec3<T>(f, f, f);
	}

	void Write(T a[3])
	{
		a[0] = x; a[1] = y; a[2] = z;
	}

	Vec3 operator +(const Vec3 &other) const
	{
		return Vec3(x+other.x, y+other.y, z+other.z);
	}
	void operator += (const Vec3 &other)
	{
		x+=other.x; y+=other.y; z+=other.z;
	}
	Vec3 operator -(const Vec3 &other) const
	{
		return Vec3(x-other.x, y-other.y, z-other.z);
	}
	void operator -= (const Vec3 &other)
	{
		x-=other.x; y-=other.y; z-=other.z;
	}
	Vec3 operator -() const
	{
		return Vec3(-x, -y, -z);
	}
	Vec3 operator * (const Vec3 &other) const
	{
		return Vec3(x*other.x, y*other.y, z*other.z);
	}
	template<typename V>
	Vec3 operator * (const V& f) const
	{
		return Vec3(x*f, y*f, z*f);
	}
	template<typename V>
	void operator *= (const V& f)
	{
		x*=f; y*=f; z*=f;
	}
	template<typename V>
	Vec3 operator / (const V& f) const
	{
		return Vec3(x/f, y/f, z/f);
	}
	template<typename V>
	void operator /= (const V& f)
	{
		*this = *this / f;
	}

	bool operator ==(const Vec3 &other) const {
		return x == other.x && y == other.y && z == other.z;
	}

	T Length2() const
	{
		return x*x + y*y + z*z;
	}

	Vec3 Clamp(const T &l, const T &h) const
	{
		return Vec3(VecClamp(x, l, h), VecClamp(y, l, h), VecClamp(z, l, h));
	}

	// Only implemented for T=float
	float Length() const;
	void SetLength(const float l);
	Vec3 WithLength(const float l) const;
	float Distance2To(const Vec3 &other) const;
	Vec3 Normalized(bool useSSE4 = false) const;
	Vec3 NormalizedOr001(bool useSSE4 = false) const;
	float Normalize();  // returns the previous length, which is often useful
	float NormalizeOr001();

	T& operator [] (int i)  // allow vector[2] = 3  (vector.z = 3)
	{
		return *((&x) + i);
	}
	T operator [] (const int i) const
	{
		return *((&x) + i);
	}

	void SetZero()
	{
		x = 0; y = 0; z = 0;
	}

	// Common aliases: UVW (texel coordinates), RGB (colors), STQ (texture coordinates)
	T& u() { return x; }
	T& v() { return y; }
	T& w() { return z; }

	T& r() { return x; }
	T& g() { return y; }
	T& b() { return z; }

	T& s() { return x; }
	T& t() { return y; }
	T& q() { return z; }

	const T& u() const { return x; }
	const T& v() const { return y; }
	const T& w() const { return z; }

	const T& r() const { return x; }
	const T& g() const { return y; }
	const T& b() const { return z; }

	const T& s() const { return x; }
	const T& t() const { return y; }
	const T& q() const { return z; }

	// swizzlers - create a subvector of specific components
	// e.g. Vec2 uv() { return Vec2(x,y); }
	// _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)
#define _DEFINE_SWIZZLER2(a, b, name) const Vec2<T> name() const { return Vec2<T>(a, b); }
#define DEFINE_SWIZZLER2(a, b, a2, b2, a3, b3, a4, b4) \
	_DEFINE_SWIZZLER2(a, b, a##b); \
	_DEFINE_SWIZZLER2(a, b, a2##b2); \
	_DEFINE_SWIZZLER2(a, b, a3##b3); \
	_DEFINE_SWIZZLER2(a, b, a4##b4); \
	_DEFINE_SWIZZLER2(b, a, b##a); \
	_DEFINE_SWIZZLER2(b, a, b2##a2); \
	_DEFINE_SWIZZLER2(b, a, b3##a3); \
	_DEFINE_SWIZZLER2(b, a, b4##a4);

	DEFINE_SWIZZLER2(x, y, r, g, u, v, s, t);
	DEFINE_SWIZZLER2(x, z, r, b, u, w, s, q);
	DEFINE_SWIZZLER2(y, z, g, b, v, w, t, q);
#undef DEFINE_SWIZZLER2
#undef _DEFINE_SWIZZLER2
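
	// For reference, DEFINE_SWIZZLER2(x, y, r, g, u, v, s, t) above expands to eight members, e.g.:
	//   const Vec2<T> xy() const { return Vec2<T>(x, y); }
	//   const Vec2<T> rg() const { return Vec2<T>(x, y); }
	//   const Vec2<T> yx() const { return Vec2<T>(y, x); }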
};

template<typename T>
class Vec3Packed
{
public:
	union
	{
		struct
		{
			T x, y, z;
		};
	};

	T* AsArray() { return &x; }
	const T* AsArray() const { return &x; }

	Vec3Packed() {}
	Vec3Packed(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {}
	Vec3Packed(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {}
	Vec3Packed(const Vec2<T>& _xy, const T& _z) : x(_xy.x), y(_xy.y), z(_z) {}
	Vec3Packed(const Vec3<T>& _xyz) {
		memcpy(&x, _xyz.AsArray(), sizeof(float) * 3);
	}

	template<typename T2>
	Vec3Packed<T2> Cast() const
	{
		return Vec3Packed<T2>((T2)x, (T2)y, (T2)z);
	}

	// Only implemented for T=int and T=float
	static Vec3Packed FromRGB(unsigned int rgb);
	unsigned int ToRGB() const;  // alpha bits set to zero

	static Vec3Packed AssignToAll(const T& f)
	{
		return Vec3Packed<T>(f, f, f);
	}

	void Write(T a[3])
	{
		a[0] = x; a[1] = y; a[2] = z;
	}

	Vec3Packed operator +(const Vec3Packed &other) const
	{
		return Vec3Packed(x+other.x, y+other.y, z+other.z);
	}
	void operator += (const Vec3Packed &other)
	{
		x+=other.x; y+=other.y; z+=other.z;
	}
	Vec3Packed operator -(const Vec3Packed &other) const
	{
		return Vec3Packed(x-other.x, y-other.y, z-other.z);
	}
	void operator -= (const Vec3Packed &other)
	{
		x-=other.x; y-=other.y; z-=other.z;
	}
	Vec3Packed operator -() const
	{
		return Vec3Packed(-x, -y, -z);
	}
	Vec3Packed operator * (const Vec3Packed &other) const
	{
		return Vec3Packed(x*other.x, y*other.y, z*other.z);
	}
	template<typename V>
	Vec3Packed operator * (const V& f) const
	{
		return Vec3Packed(x*f, y*f, z*f);
	}
	template<typename V>
	void operator *= (const V& f)
	{
		x*=f; y*=f; z*=f;
	}
	template<typename V>
	Vec3Packed operator / (const V& f) const
	{
		return Vec3Packed(x/f, y/f, z/f);
	}
	template<typename V>
	void operator /= (const V& f)
	{
		*this = *this / f;
	}

	T Length2() const
	{
		return x*x + y*y + z*z;
	}

	Vec3Packed Clamp(const T &l, const T &h) const
	{
		return Vec3Packed(VecClamp(x, l, h), VecClamp(y, l, h), VecClamp(z, l, h));
	}

	// Only implemented for T=float
	float Length() const;
	void SetLength(const float l);
	Vec3Packed WithLength(const float l) const;
	float Distance2To(const Vec3Packed &other) const;
	Vec3Packed Normalized() const;
	float Normalize();  // returns the previous length, which is often useful

	T& operator [] (int i)  // allow vector[2] = 3  (vector.z = 3)
	{
		return *((&x) + i);
	}
	T operator [] (const int i) const
	{
		return *((&x) + i);
	}

	void SetZero()
	{
		x = 0; y = 0; z = 0;
	}

	// Common aliases: UVW (texel coordinates), RGB (colors), STQ (texture coordinates)
	T& u() { return x; }
	T& v() { return y; }
	T& w() { return z; }

	T& r() { return x; }
	T& g() { return y; }
	T& b() { return z; }

	T& s() { return x; }
	T& t() { return y; }
	T& q() { return z; }

	const T& u() const { return x; }
	const T& v() const { return y; }
	const T& w() const { return z; }

	const T& r() const { return x; }
	const T& g() const { return y; }
	const T& b() const { return z; }

	const T& s() const { return x; }
	const T& t() const { return y; }
	const T& q() const { return z; }

	// swizzlers - create a subvector of specific components
	// e.g. Vec2 uv() { return Vec2(x,y); }
	// _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)
#define _DEFINE_SWIZZLER2(a, b, name) const Vec2<T> name() const { return Vec2<T>(a, b); }
#define DEFINE_SWIZZLER2(a, b, a2, b2, a3, b3, a4, b4) \
	_DEFINE_SWIZZLER2(a, b, a##b); \
	_DEFINE_SWIZZLER2(a, b, a2##b2); \
	_DEFINE_SWIZZLER2(a, b, a3##b3); \
	_DEFINE_SWIZZLER2(a, b, a4##b4); \
	_DEFINE_SWIZZLER2(b, a, b##a); \
	_DEFINE_SWIZZLER2(b, a, b2##a2); \
	_DEFINE_SWIZZLER2(b, a, b3##a3); \
	_DEFINE_SWIZZLER2(b, a, b4##a4);

	DEFINE_SWIZZLER2(x, y, r, g, u, v, s, t);
	DEFINE_SWIZZLER2(x, z, r, b, u, w, s, q);
	DEFINE_SWIZZLER2(y, z, g, b, v, w, t, q);
#undef DEFINE_SWIZZLER2
#undef _DEFINE_SWIZZLER2
};

template<typename T>
class Vec4
{
public:
	union
	{
		struct
		{
			T x, y, z, w;
		};
#if defined(_M_SSE)
		__m128i ivec;
		__m128 vec;
#elif PPSSPP_ARCH(ARM_NEON)
		int32x4_t ivec;
		float32x4_t vec;
#endif
	};

	T* AsArray() { return &x; }
	const T* AsArray() const { return &x; }

	Vec4() {}
	Vec4(const T a[4]) : x(a[0]), y(a[1]), z(a[2]), w(a[3]) {}
	Vec4(const T& _x, const T& _y, const T& _z, const T& _w) : x(_x), y(_y), z(_z), w(_w) {}
	Vec4(const Vec2<T>& _xy, const T& _z, const T& _w) : x(_xy.x), y(_xy.y), z(_z), w(_w) {}
	Vec4(const Vec3<T>& _xyz, const T& _w) : x(_xyz.x), y(_xyz.y), z(_xyz.z), w(_w) {}
#if defined(_M_SSE)
	Vec4(const __m128 &_vec) : vec(_vec) {}
	Vec4(const __m128i &_ivec) : ivec(_ivec) {}
#elif PPSSPP_ARCH(ARM_NEON)
	Vec4(const float32x4_t &_vec) : vec(_vec) {}
#if !defined(_MSC_VER)
	Vec4(const int32x4_t &_ivec) : ivec(_ivec) {}
#endif
#endif

	template<typename T2>
	Vec4<T2> Cast() const {
		if constexpr (std::is_same<T, float>::value && std::is_same<T2, int>::value) {
#if defined(_M_SSE)
			return _mm_cvtps_epi32(SAFE_M128(vec));
#elif PPSSPP_ARCH(ARM_NEON)
			return vcvtq_s32_f32(vec);
#endif
		}
		if constexpr (std::is_same<T, int>::value && std::is_same<T2, float>::value) {
#if defined(_M_SSE)
			return _mm_cvtepi32_ps(SAFE_M128I(ivec));
#elif PPSSPP_ARCH(ARM_NEON)
			return vcvtq_f32_s32(ivec);
#endif
		}
		return Vec4<T2>((T2)x, (T2)y, (T2)z, (T2)w);
	}

	// Only implemented for T=int and T=float
	static Vec4 FromRGBA(unsigned int rgba);
	static Vec4 FromRGBA(const u8 *rgba);
	unsigned int ToRGBA() const;
	void ToRGBA(u8 *rgba) const;

	static Vec4 AssignToAll(const T& f)
	{
		return Vec4<T>(f, f, f, f);
	}

	void Write(T a[4])
	{
		a[0] = x; a[1] = y; a[2] = z; a[3] = w;
	}

	Vec4 operator +(const Vec4& other) const
	{
		return Vec4(x+other.x, y+other.y, z+other.z, w+other.w);
	}
	void operator += (const Vec4& other)
	{
		x+=other.x; y+=other.y; z+=other.z; w+=other.w;
	}
	Vec4 operator -(const Vec4 &other) const
	{
		return Vec4(x-other.x, y-other.y, z-other.z, w-other.w);
	}
	void operator -= (const Vec4 &other)
	{
		x-=other.x; y-=other.y; z-=other.z; w-=other.w;
	}
	Vec4 operator -() const
	{
		return Vec4(-x, -y, -z, -w);
	}
	Vec4 operator * (const Vec4 &other) const
	{
		return Vec4(x*other.x, y*other.y, z*other.z, w*other.w);
	}
	Vec4 operator | (const Vec4 &other) const
	{
		return Vec4(x | other.x, y | other.y, z | other.z, w | other.w);
	}
	Vec4 operator & (const Vec4 &other) const
	{
		return Vec4(x & other.x, y & other.y, z & other.z, w & other.w);
	}
	Vec4 operator << (const int amount) const
	{
		// NOTE: x*(1<<amount), etc., might be safer, since
		// left-shifting negatives is UB pre-C++20.
		return Vec4(x << amount, y << amount, z << amount, w << amount);
	}
	Vec4 operator >> (const int amount) const
	{
		return Vec4(x >> amount, y >> amount, z >> amount, w >> amount);
	}
	template<typename V>
	Vec4 operator * (const V& f) const
	{
		return Vec4(x*f, y*f, z*f, w*f);
	}
	template<typename V>
	void operator *= (const V& f)
	{
		x*=f; y*=f; z*=f; w*=f;
	}
	template<typename V>
	Vec4 operator / (const V& f) const
	{
		return Vec4(x/f, y/f, z/f, w/f);
	}
	template<typename V>
	void operator /= (const V& f)
	{
		*this = *this / f;
	}

	bool operator ==(const Vec4 &other) const {
		return x == other.x && y == other.y && z == other.z && w == other.w;
	}

	T Length2() const
	{
		return x*x + y*y + z*z + w*w;
	}

	Vec4 Clamp(const T &l, const T &h) const
	{
		return Vec4(VecClamp(x, l, h), VecClamp(y, l, h), VecClamp(z, l, h), VecClamp(w, l, h));
	}

	Vec4 Reciprocal() const
	{
		const T one = 1.0f;
		return Vec4(one / x, one / y, one / z, one / w);
	}

	// Only implemented for T=float
	float Length() const;
	void SetLength(const float l);
	Vec4 WithLength(const float l) const;
	float Distance2To(const Vec4 &other) const;
	Vec4 Normalized() const;
	float Normalize();  // returns the previous length, which is often useful

	T& operator [] (int i)  // allow vector[2] = 3  (vector.z = 3)
	{
		return *((&x) + i);
	}
	T operator [] (const int i) const
	{
		return *((&x) + i);
	}

	void SetZero()
	{
		x = 0; y = 0; z = 0; w = 0;
	}

	// Common alias: RGBA (colors)
	T& r() { return x; }
	T& g() { return y; }
	T& b() { return z; }
	T& a() { return w; }

	const T& r() const { return x; }
	const T& g() const { return y; }
	const T& b() const { return z; }
	const T& a() const { return w; }

	// swizzlers - create a subvector of specific components
	// e.g. Vec2 uv() { return Vec2(x,y); }
	// _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)
#define _DEFINE_SWIZZLER2(a, b, name) const Vec2<T> name() const { return Vec2<T>(a, b); }
#define DEFINE_SWIZZLER2(a, b, a2, b2) \
	_DEFINE_SWIZZLER2(a, b, a##b); \
	_DEFINE_SWIZZLER2(a, b, a2##b2); \
	_DEFINE_SWIZZLER2(b, a, b##a); \
	_DEFINE_SWIZZLER2(b, a, b2##a2);

	DEFINE_SWIZZLER2(x, y, r, g);
	DEFINE_SWIZZLER2(x, z, r, b);
	DEFINE_SWIZZLER2(x, w, r, a);
	DEFINE_SWIZZLER2(y, z, g, b);
	DEFINE_SWIZZLER2(y, w, g, a);
	DEFINE_SWIZZLER2(z, w, b, a);
#undef DEFINE_SWIZZLER2
#undef _DEFINE_SWIZZLER2

#define _DEFINE_SWIZZLER3(a, b, c, name) const Vec3<T> name() const { return Vec3<T>(a, b, c); }
#define DEFINE_SWIZZLER3(a, b, c, a2, b2, c2) \
	_DEFINE_SWIZZLER3(a, b, c, a##b##c); \
	_DEFINE_SWIZZLER3(a, c, b, a##c##b); \
	_DEFINE_SWIZZLER3(b, a, c, b##a##c); \
	_DEFINE_SWIZZLER3(b, c, a, b##c##a); \
	_DEFINE_SWIZZLER3(c, a, b, c##a##b); \
	_DEFINE_SWIZZLER3(c, b, a, c##b##a); \
	_DEFINE_SWIZZLER3(a, b, c, a2##b2##c2); \
	_DEFINE_SWIZZLER3(a, c, b, a2##c2##b2); \
	_DEFINE_SWIZZLER3(b, a, c, b2##a2##c2); \
	_DEFINE_SWIZZLER3(b, c, a, b2##c2##a2); \
	_DEFINE_SWIZZLER3(c, a, b, c2##a2##b2); \
	_DEFINE_SWIZZLER3(c, b, a, c2##b2##a2);

	DEFINE_SWIZZLER3(x, y, z, r, g, b);
	DEFINE_SWIZZLER3(x, y, w, r, g, a);
	DEFINE_SWIZZLER3(x, z, w, r, b, a);
	DEFINE_SWIZZLER3(y, z, w, g, b, a);
#undef DEFINE_SWIZZLER3
#undef _DEFINE_SWIZZLER3
};


template<typename BaseType>
class Mat3x3
{
public:
	// Convention: first three values = first column
	Mat3x3(const BaseType values[])
	{
		for (unsigned int i = 0; i < 3*3; ++i)
		{
			this->values[i] = values[i];
		}
	}

	Mat3x3(BaseType _00, BaseType _01, BaseType _02, BaseType _10, BaseType _11, BaseType _12, BaseType _20, BaseType _21, BaseType _22)
	{
		values[0] = _00;
		values[1] = _01;
		values[2] = _02;
		values[3] = _10;
		values[4] = _11;
		values[5] = _12;
		values[6] = _20;
		values[7] = _21;
		values[8] = _22;
	}

	template<typename T>
	Vec3<T> operator * (const Vec3<T>& vec) const
	{
		Vec3<T> ret;
		ret.x = values[0]*vec.x + values[3]*vec.y + values[6]*vec.z;
		ret.y = values[1]*vec.x + values[4]*vec.y + values[7]*vec.z;
		ret.z = values[2]*vec.x + values[5]*vec.y + values[8]*vec.z;
		return ret;
	}

	Mat3x3 Inverse() const
	{
		float a = values[0];
		float b = values[1];
		float c = values[2];
		float d = values[3];
		float e = values[4];
		float f = values[5];
		float g = values[6];
		float h = values[7];
		float i = values[8];
		return Mat3x3(e*i-f*h, f*g-d*i, d*h-e*g,
			c*h-b*i, a*i-c*g, b*g-a*h,
			b*f-c*e, c*d-a*f, a*e-b*d) / Det();
	}

	BaseType Det() const
	{
		return values[0]*values[4]*values[8] + values[3]*values[7]*values[2] +
			values[6]*values[1]*values[5] - values[2]*values[4]*values[6] -
			values[5]*values[7]*values[0] - values[8]*values[1]*values[3];
	}

	Mat3x3 operator / (const BaseType& val) const
	{
		return Mat3x3(values[0]/val, values[1]/val, values[2]/val,
			values[3]/val, values[4]/val, values[5]/val,
			values[6]/val, values[7]/val, values[8]/val);
	}

private:
	BaseType values[3*3];
};


template<typename BaseType>
class Mat4x4
{
public:
	// Convention: first four values in array = first column
	Mat4x4(const BaseType values[])
	{
		for (unsigned int i = 0; i < 4*4; ++i)
		{
			this->values[i] = values[i];
		}
	}

	template<typename T>
	Vec4<T> operator * (const Vec4<T>& vec) const
	{
		Vec4<T> ret;
		ret.x = values[0]*vec.x + values[4]*vec.y + values[8]*vec.z + values[12]*vec.w;
		ret.y = values[1]*vec.x + values[5]*vec.y + values[9]*vec.z + values[13]*vec.w;
		ret.z = values[2]*vec.x + values[6]*vec.y + values[10]*vec.z + values[14]*vec.w;
		ret.w = values[3]*vec.x + values[7]*vec.y + values[11]*vec.z + values[15]*vec.w;
		return ret;
	}

private:
	BaseType values[4*4];
};

};  // namespace Math3D

typedef Math3D::Vec2<float> Vec2f;
typedef Math3D::Vec3<float> Vec3f;
typedef Math3D::Vec3Packed<float> Vec3Packedf;
typedef Math3D::Vec4<float> Vec4f;

#if defined(_M_SSE)
template<unsigned i>
float MATH3D_CALL vectorGetByIndex(__m128 v) {
	// shuffle V so that the element that we want is moved to the bottom
	return _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(i, i, i, i)));
}
#endif
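
// Example (illustrative): vectorGetByIndex<2>(v) extracts lane 2 of v, i.e. the z component, as a float.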

#if defined(_M_SSE)
// x, y, and z should be broadcast. Should only be used through Vec3f version.
// Note that this will read an extra float from the matrix, so it better not be at the end of an allocation!
inline __m128 MATH3D_CALL Vec3ByMatrix43Internal(__m128 x, __m128 y, __m128 z, const float m[12]) {
	__m128 col0 = _mm_loadu_ps(m);
	__m128 col1 = _mm_loadu_ps(m + 3);
	__m128 col2 = _mm_loadu_ps(m + 6);
	__m128 col3 = _mm_loadu_ps(m + 9);
	__m128 sum = _mm_add_ps(
		_mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)),
		_mm_add_ps(_mm_mul_ps(col2, z), col3));
	return sum;
}
#elif PPSSPP_ARCH(ARM64_NEON)
inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
	float32x4_t col0 = vld1q_f32(m);
	float32x4_t col1 = vld1q_f32(m + 3);
	float32x4_t col2 = vld1q_f32(m + 6);
	float32x4_t col3 = vld1q_f32(m + 9);
	float32x4_t sum = vaddq_f32(
		vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)),
		vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
	return sum;
}
#elif PPSSPP_ARCH(ARM_NEON)
inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
	float32x4_t col0 = vld1q_f32(m);
	float32x4_t col1 = vld1q_f32(m + 3);
	float32x4_t col2 = vld1q_f32(m + 6);
	float32x4_t col3 = vld1q_f32(m + 9);
	float32x4_t sum = vaddq_f32(
		vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1)),
		vaddq_f32(vmulq_lane_f32(col2, vget_high_f32(vec), 0), col3));
	return sum;
}
#endif

// v and vecOut must point to different memory.
inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) {
#if defined(_M_SSE)
	__m128 x = _mm_set1_ps(v[0]);
	__m128 y = _mm_set1_ps(v[1]);
	__m128 z = _mm_set1_ps(v[2]);
	__m128 sum = Vec3ByMatrix43Internal(x, y, z, m);
	// Not sure what the best way to store 3 elements is. Ideally, we should
	// probably store all four.
	vecOut[0] = _mm_cvtss_f32(sum);
	vecOut[1] = vectorGetByIndex<1>(sum);
	vecOut[2] = vectorGetByIndex<2>(sum);
#elif PPSSPP_ARCH(ARM_NEON)
	float vecIn[4] = {v[0], v[1], v[2], 1.0f};
	float32x4_t sum = Vec3ByMatrix43Internal(vld1q_f32(vecIn), m);
	vecOut[0] = vgetq_lane_f32(sum, 0);
	vecOut[1] = vgetq_lane_f32(sum, 1);
	vecOut[2] = vgetq_lane_f32(sum, 2);
#else
	vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6] + m[9];
	vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7] + m[10];
	vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8] + m[11];
#endif
}
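
// Hypothetical usage sketch of the scalar path above: transform a point by a
// column-major 4x3 matrix (names below are illustrative only, not part of this header):
//   float world[12] = { /* 4 columns x 3 rows */ };
//   float pos[3] = { 1.0f, 2.0f, 3.0f };
//   float out[3];
//   Vec3ByMatrix43(out, pos, world);  // out = M * (pos, 1)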

inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) {
#if defined(_M_SSE)
	const __m128 vv = SAFE_M128(v.vec);
	__m128 x = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
	__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
	return Vec3ByMatrix43Internal(x, y, z, m);
#elif PPSSPP_ARCH(ARM_NEON)
	return Vec3ByMatrix43Internal(v.vec, m);
#else
	Vec3f vecOut;
	Vec3ByMatrix43(vecOut.AsArray(), v.AsArray(), m);
	return vecOut;
#endif
}

#if defined(_M_SSE)
// x, y, and z should be broadcast. Should only be used through Vec3f version.
inline __m128 MATH3D_CALL Vec3ByMatrix44Internal(__m128 x, __m128 y, __m128 z, const float m[16]) {
	__m128 col0 = _mm_loadu_ps(m);
	__m128 col1 = _mm_loadu_ps(m + 4);
	__m128 col2 = _mm_loadu_ps(m + 8);
	__m128 col3 = _mm_loadu_ps(m + 12);
	__m128 sum = _mm_add_ps(
		_mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)),
		_mm_add_ps(_mm_mul_ps(col2, z), col3));
	return sum;
}
#elif PPSSPP_ARCH(ARM64_NEON)
inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {
	float32x4_t col0 = vld1q_f32(m);
	float32x4_t col1 = vld1q_f32(m + 4);
	float32x4_t col2 = vld1q_f32(m + 8);
	float32x4_t col3 = vld1q_f32(m + 12);
	float32x4_t sum = vaddq_f32(
		vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)),
		vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
	return sum;
}
#elif PPSSPP_ARCH(ARM_NEON)
inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {
	float32x4_t col0 = vld1q_f32(m);
	float32x4_t col1 = vld1q_f32(m + 4);
	float32x4_t col2 = vld1q_f32(m + 8);
	float32x4_t col3 = vld1q_f32(m + 12);
	float32x4_t sum = vaddq_f32(
		vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1)),
		vaddq_f32(vmulq_lane_f32(col2, vget_high_f32(vec), 0), col3));
	return sum;
}
#endif

inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16]) {
#if defined(_M_SSE)
	__m128 x = _mm_set1_ps(v[0]);
	__m128 y = _mm_set1_ps(v[1]);
	__m128 z = _mm_set1_ps(v[2]);
	__m128 sum = Vec3ByMatrix44Internal(x, y, z, m);
	_mm_storeu_ps(vecOut, sum);
#elif PPSSPP_ARCH(ARM_NEON)
	float vecIn[4] = {v[0], v[1], v[2], 1.0f};
	float32x4_t sum = Vec3ByMatrix44Internal(vld1q_f32(vecIn), m);
	vst1q_f32(vecOut, sum);
#else
	vecOut[0] = v[0] * m[0] + v[1] * m[4] + v[2] * m[8] + m[12];
	vecOut[1] = v[0] * m[1] + v[1] * m[5] + v[2] * m[9] + m[13];
	vecOut[2] = v[0] * m[2] + v[1] * m[6] + v[2] * m[10] + m[14];
	vecOut[3] = v[0] * m[3] + v[1] * m[7] + v[2] * m[11] + m[15];
#endif
}

inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) {
#if defined(_M_SSE)
	const __m128 vv = SAFE_M128(v.vec);
	__m128 x = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
	__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
	return Vec3ByMatrix44Internal(x, y, z, m);
#elif PPSSPP_ARCH(ARM_NEON)
	return Vec3ByMatrix44Internal(v.vec, m);
#else
	Vec4f vecOut;
	Vec3ByMatrix44(vecOut.AsArray(), v.AsArray(), m);
	return vecOut;
#endif
}

#if defined(_M_SSE)
// x, y, and z should be broadcast. Should only be used through Vec3f version.
inline __m128 MATH3D_CALL Norm3ByMatrix43Internal(__m128 x, __m128 y, __m128 z, const float m[12]) {
	__m128 col0 = _mm_loadu_ps(m);
	__m128 col1 = _mm_loadu_ps(m + 3);
	__m128 col2 = _mm_loadu_ps(m + 6);
	__m128 sum = _mm_add_ps(
		_mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)),
		_mm_mul_ps(col2, z));
	return sum;
}
#elif PPSSPP_ARCH(ARM64_NEON)
inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
	float32x4_t col0 = vld1q_f32(m);
	float32x4_t col1 = vld1q_f32(m + 3);
	float32x4_t col2 = vld1q_f32(m + 6);
	float32x4_t sum = vaddq_f32(
		vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)),
		vmulq_laneq_f32(col2, vec, 2));
	return sum;
}
#elif PPSSPP_ARCH(ARM_NEON)
inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
	float32x4_t col0 = vld1q_f32(m);
	float32x4_t col1 = vld1q_f32(m + 3);
	float32x4_t col2 = vld1q_f32(m + 6);
	float32x4_t sum = vaddq_f32(
		vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1)),
		vmulq_lane_f32(col2, vget_high_f32(vec), 0));
	return sum;
}
#endif

inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) {
#if defined(_M_SSE)
	__m128 x = _mm_set1_ps(v[0]);
	__m128 y = _mm_set1_ps(v[1]);
	__m128 z = _mm_set1_ps(v[2]);
	__m128 sum = Norm3ByMatrix43Internal(x, y, z, m);
	vecOut[0] = _mm_cvtss_f32(sum);
	vecOut[1] = vectorGetByIndex<1>(sum);
	vecOut[2] = vectorGetByIndex<2>(sum);
#elif PPSSPP_ARCH(ARM_NEON)
	float32x4_t sum = Norm3ByMatrix43Internal(vld1q_f32(v), m);
	vecOut[0] = vgetq_lane_f32(sum, 0);
	vecOut[1] = vgetq_lane_f32(sum, 1);
	vecOut[2] = vgetq_lane_f32(sum, 2);
#else
	vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6];
	vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7];
	vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8];
#endif
}

inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) {
#if defined(_M_SSE)
	const __m128 vv = SAFE_M128(v.vec);
	__m128 x = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
	__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
	return Norm3ByMatrix43Internal(x, y, z, m);
#elif PPSSPP_ARCH(ARM_NEON)
	return Norm3ByMatrix43Internal(v.vec, m);
#else
	Vec3f vecOut;
	Norm3ByMatrix43(vecOut.AsArray(), v.AsArray(), m);
	return vecOut;
#endif
}

inline void Matrix4ByMatrix4(float out[16], const float a[16], const float b[16]) {
	fast_matrix_mul_4x4(out, b, a);
}

inline void ConvertMatrix4x3To4x4(float *m4x4, const float *m4x3) {
	m4x4[0] = m4x3[0];
	m4x4[1] = m4x3[1];
	m4x4[2] = m4x3[2];
	m4x4[3] = 0.0f;
	m4x4[4] = m4x3[3];
	m4x4[5] = m4x3[4];
	m4x4[6] = m4x3[5];
	m4x4[7] = 0.0f;
	m4x4[8] = m4x3[6];
	m4x4[9] = m4x3[7];
	m4x4[10] = m4x3[8];
	m4x4[11] = 0.0f;
	m4x4[12] = m4x3[9];
	m4x4[13] = m4x3[10];
	m4x4[14] = m4x3[11];
	m4x4[15] = 1.0f;
}
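
// Layout note (column-major): column i of the 4x4 result is
// { m4x3[3*i], m4x3[3*i + 1], m4x3[3*i + 2], (i == 3) ? 1.0f : 0.0f }, matching the assignments above.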

inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) {
#if PPSSPP_ARCH(ARM_NEON)
	// vld3q is a perfect match here!
	float32x4x3_t packed = vld3q_f32(m4x3);
	vst1q_f32(m4x4, packed.val[0]);
	vst1q_f32(m4x4 + 4, packed.val[1]);
	vst1q_f32(m4x4 + 8, packed.val[2]);
#else
	m4x4[0] = m4x3[0];
	m4x4[1] = m4x3[3];
	m4x4[2] = m4x3[6];
	m4x4[3] = m4x3[9];
	m4x4[4] = m4x3[1];
	m4x4[5] = m4x3[4];
	m4x4[6] = m4x3[7];
	m4x4[7] = m4x3[10];
	m4x4[8] = m4x3[2];
	m4x4[9] = m4x3[5];
	m4x4[10] = m4x3[8];
	m4x4[11] = m4x3[11];
#endif
	m4x4[12] = 0.0f;
	m4x4[13] = 0.0f;
	m4x4[14] = 0.0f;
	m4x4[15] = 1.0f;
}

// 0369
// 147A
// 258B
// ->>-
// 0123
// 4567
// 89AB
// Don't see a way to SIMD that. Should be pretty fast anyway.
inline void ConvertMatrix4x3To3x4Transposed(float *m4x4, const float *m4x3) {
#if PPSSPP_ARCH(ARM_NEON)
	// vld3q is a perfect match here!
	float32x4x3_t packed = vld3q_f32(m4x3);
	vst1q_f32(m4x4, packed.val[0]);
	vst1q_f32(m4x4 + 4, packed.val[1]);
	vst1q_f32(m4x4 + 8, packed.val[2]);
#else
	m4x4[0] = m4x3[0];
	m4x4[1] = m4x3[3];
	m4x4[2] = m4x3[6];
	m4x4[3] = m4x3[9];
	m4x4[4] = m4x3[1];
	m4x4[5] = m4x3[4];
	m4x4[6] = m4x3[7];
	m4x4[7] = m4x3[10];
	m4x4[8] = m4x3[2];
	m4x4[9] = m4x3[5];
	m4x4[10] = m4x3[8];
	m4x4[11] = m4x3[11];
#endif
}

inline void Transpose4x4(float out[16], const float in[16]) {
	for (int i = 0; i < 4; i++) {
		for (int j = 0; j < 4; j++) {
			out[i * 4 + j] = in[j * 4 + i];
		}
	}
}

namespace Math3D {

template<typename T>
inline T Dot(const Vec2<T>& a, const Vec2<T>& b)
{
	return a.x*b.x + a.y*b.y;
}

template<typename T>
inline T Dot(const Vec3<T>& a, const Vec3<T>& b)
{
	return a.x*b.x + a.y*b.y + a.z*b.z;
}

template<typename T>
inline T Dot(const Vec4<T>& a, const Vec4<T>& b)
{
	return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w;
}

template<typename T>
inline Vec3<T> Cross(const Vec3<T>& a, const Vec3<T>& b)
{
	return Vec3<T>(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
}

template<typename T>
inline Vec3Packed<T> Cross(const Vec3Packed<T>& a, const Vec3Packed<T>& b)
{
	return Vec3Packed<T>(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
}

template<>
inline Vec3<float> Vec3<float>::FromRGB(unsigned int rgb)
{
#if defined(_M_SSE)
	__m128i z = _mm_setzero_si128();
	__m128i c = _mm_cvtsi32_si128(rgb);
	c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
	return Vec3<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));
#elif PPSSPP_ARCH(ARM_NEON)
	uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));
	uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
	return Vec3<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));
#else
	return Vec3((rgb & 0xFF) * (1.0f/255.0f),
		((rgb >> 8) & 0xFF) * (1.0f/255.0f),
		((rgb >> 16) & 0xFF) * (1.0f/255.0f));
#endif
}

template<>
inline Vec3<int> Vec3<int>::FromRGB(unsigned int rgb)
{
#if defined(_M_SSE)
	__m128i z = _mm_setzero_si128();
	__m128i c = _mm_cvtsi32_si128(rgb);
	c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
	return Vec3<int>(c);
#elif PPSSPP_ARCH(ARM_NEON)
	uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));
	uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
	return Vec3<int>(vreinterpretq_s32_u32(u));
#else
	return Vec3(rgb & 0xFF, (rgb >> 8) & 0xFF, (rgb >> 16) & 0xFF);
#endif
}

template<>
__forceinline unsigned int Vec3<float>::ToRGB() const
{
#if defined(_M_SSE)
	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
	__m128i c16 = _mm_packs_epi32(c, c);
	return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
#elif PPSSPP_ARCH(ARM_NEON)
	uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vsetq_lane_f32(0.0f, vec, 3), vdupq_n_f32(255.0f))));
	uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
	return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
#else
	return (clamp_u8((int)(r() * 255.f)) << 0) |
		(clamp_u8((int)(g() * 255.f)) << 8) |
		(clamp_u8((int)(b() * 255.f)) << 16);
#endif
}

template<>
__forceinline unsigned int Vec3<int>::ToRGB() const
{
#if defined(_M_SSE)
	__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
	return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
#elif PPSSPP_ARCH(ARM_NEON)
	uint16x4_t c16 = vqmovun_s32(vsetq_lane_s32(0, ivec, 3));
	uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
	return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
#else
	return clamp_u8(r()) | (clamp_u8(g()) << 8) | (clamp_u8(b()) << 16);
#endif
}

template<>
inline Vec4<float> Vec4<float>::FromRGBA(unsigned int rgba)
{
#if defined(_M_SSE)
	__m128i z = _mm_setzero_si128();
	__m128i c = _mm_cvtsi32_si128(rgba);
	c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
	return Vec4<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));
#elif PPSSPP_ARCH(ARM_NEON)
	uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba));
	uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
	return Vec4<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));
#else
	return Vec4((rgba & 0xFF) * (1.0f/255.0f),
		((rgba >> 8) & 0xFF) * (1.0f/255.0f),
		((rgba >> 16) & 0xFF) * (1.0f/255.0f),
		((rgba >> 24) & 0xFF) * (1.0f/255.0f));
#endif
}

template<typename T>
inline Vec4<T> Vec4<T>::FromRGBA(const u8 *rgba)
{
	return Vec4<T>::FromRGBA(*(unsigned int *)rgba);
}

template<>
inline Vec4<int> Vec4<int>::FromRGBA(unsigned int rgba)
{
#if defined(_M_SSE)
	__m128i z = _mm_setzero_si128();
	__m128i c = _mm_cvtsi32_si128(rgba);
	c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
	return Vec4<int>(c);
#elif PPSSPP_ARCH(ARM_NEON)
	uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba));
	uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
	return Vec4<int>(vreinterpretq_s32_u32(u));
#else
	return Vec4(rgba & 0xFF, (rgba >> 8) & 0xFF, (rgba >> 16) & 0xFF, (rgba >> 24) & 0xFF);
#endif
}

template<>
__forceinline unsigned int Vec4<float>::ToRGBA() const
{
#if defined(_M_SSE)
	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
	__m128i c16 = _mm_packs_epi32(c, c);
	return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
#elif PPSSPP_ARCH(ARM_NEON)
	uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vec, vdupq_n_f32(255.0f))));
	uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
	return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
#else
	return (clamp_u8((int)(r() * 255.f)) << 0) |
		(clamp_u8((int)(g() * 255.f)) << 8) |
		(clamp_u8((int)(b() * 255.f)) << 16) |
		(clamp_u8((int)(a() * 255.f)) << 24);
#endif
}

template<>
__forceinline unsigned int Vec4<int>::ToRGBA() const
{
#if defined(_M_SSE)
	__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
	return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
#elif PPSSPP_ARCH(ARM_NEON)
	uint16x4_t c16 = vqmovun_s32(ivec);
	uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
	return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
#else
	return clamp_u8(r()) | (clamp_u8(g()) << 8) | (clamp_u8(b()) << 16) | (clamp_u8(a()) << 24);
#endif
}

template<typename T>
__forceinline void Vec4<T>::ToRGBA(u8 *rgba) const
{
	*(u32 *)rgba = ToRGBA();
}

#if defined(_M_SSE)
// Specialized for SIMD optimization

// Vec3<float> operation
template<>
inline void Vec3<float>::operator += (const Vec3<float> &other) {
	vec = _mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec));
}

template<>
inline Vec3<float> Vec3<float>::operator + (const Vec3 &other) const {
	return Vec3<float>(_mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
}

template<>
inline void Vec3<float>::operator -= (const Vec3<float> &other) {
	vec = _mm_sub_ps(SAFE_M128(vec), SAFE_M128(other.vec));
}

template<>
inline Vec3<float> Vec3<float>::operator - (const Vec3 &other) const {
	return Vec3<float>(_mm_sub_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
}

template<>
inline Vec3<float> Vec3<float>::operator * (const Vec3 &other) const {
	return Vec3<float>(_mm_mul_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
}

template<> template<>
inline Vec3<float> Vec3<float>::operator * (const float &other) const {
	return Vec3<float>(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(other)));
}

// Vec4<int> operation
template<>
inline Vec4<int> Vec4<int>::operator + (const Vec4 &other) const {
	return Vec4<int>(_mm_add_epi32(SAFE_M128I(ivec), SAFE_M128I(other.ivec)));
}

template<>
inline Vec4<int> Vec4<int>::operator * (const Vec4 &other) const {
	__m128i a = SAFE_M128I(ivec);
	__m128i b = SAFE_M128I(other.ivec);
	// Intel in its immense wisdom decided that
	// SSE2 does not get _mm_mullo_epi32(),
	// so we do it this way. This is what clang does,
	// which seems about as good as it gets.
	__m128i m02 = _mm_mul_epu32(a, b);
	__m128i m13 = _mm_mul_epu32(
		_mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 1, 1)),
		_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)));
	__m128i ret = _mm_unpacklo_epi32(
		_mm_shuffle_epi32(m02, _MM_SHUFFLE(3, 2, 2, 0)),
		_mm_shuffle_epi32(m13, _MM_SHUFFLE(3, 2, 2, 0)));
	return Vec4<int>(ret);
}

template<> template<>
inline Vec4<int> Vec4<int>::operator * (const int &other) const {
	return (*this) * Vec4<int>(_mm_set1_epi32(other));
}

template<>
inline Vec4<int> Vec4<int>::operator | (const Vec4 &other) const {
	return Vec4<int>(_mm_or_si128(SAFE_M128I(ivec), SAFE_M128I(other.ivec)));
}

template<>
inline Vec4<int> Vec4<int>::operator & (const Vec4 &other) const {
	return Vec4<int>(_mm_and_si128(SAFE_M128I(ivec), SAFE_M128I(other.ivec)));
}

// NOTE: modern GCC, clang, and MSVC are all ok with
// non-compile-time-const amount for _mm_slli_epi32/_mm_srli_epi32.
template<>
inline Vec4<int> Vec4<int>::operator << (const int amount) const {
	return Vec4<int>(_mm_slli_epi32(SAFE_M128I(ivec), amount));
}

template<>
inline Vec4<int> Vec4<int>::operator >> (const int amount) const {
	return Vec4<int>(_mm_srli_epi32(SAFE_M128I(ivec), amount));
}

// Vec4<float> operation
template<>
inline void Vec4<float>::operator += (const Vec4<float> &other) {
	vec = _mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec));
}

template<>
inline Vec4<float> Vec4<float>::operator + (const Vec4 &other) const {
	return Vec4<float>(_mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
}

template<>
inline Vec4<float> Vec4<float>::operator * (const Vec4 &other) const {
	return Vec4<float>(_mm_mul_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
}

template<> template<>
inline Vec4<float> Vec4<float>::operator * (const float &other) const {
	return Vec4<float>(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(other)));
}

// Vec3<float> cross product
template<>
inline Vec3<float> Cross(const Vec3<float> &a, const Vec3<float> &b)
{
#if PPSSPP_ARCH(X86)
	__m128 avec = _mm_loadu_ps(&a.x);
	__m128 bvec = _mm_loadu_ps(&b.x);
#else
	__m128 avec = a.vec;
	__m128 bvec = b.vec;
#endif
	const __m128 left = _mm_mul_ps(_mm_shuffle_ps(avec, avec, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(bvec, bvec, _MM_SHUFFLE(3, 1, 0, 2)));
	const __m128 right = _mm_mul_ps(_mm_shuffle_ps(avec, avec, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(bvec, bvec, _MM_SHUFFLE(3, 0, 2, 1)));
	return _mm_sub_ps(left, right);
}
#endif

};  // namespace Math3D

// linear interpolation via float: 0.0=begin, 1.0=end
template<typename X>
inline X Lerp(const X& begin, const X& end, const float t)
{
	return begin*(1.f-t) + end*t;
}

// linear interpolation via int: 0=begin, base=end
template<typename X, int base>
inline X LerpInt(const X& begin, const X& end, const int t)
{
	return (begin*(base-t) + end*t) / base;
}
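
// Example (illustrative): blending two colors with the helpers above.
//   Vec4f mixed = Lerp(colorA, colorB, 0.25f);                       // 25% of the way from colorA to colorB
//   Math3D::Vec4<int> mixedInt = LerpInt<Math3D::Vec4<int>, 16>(a, b, 4);  // integer blend, 4/16 of the way
// colorA/colorB/a/b are hypothetical values, not defined in this header.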