CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/GPU/Math3D.h
Views: 1401
// Copyright (c) 2012- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#pragma once

#include "ppsspp_config.h"
#include <cmath>
#include <cstring>      // memcpy
#include <type_traits>  // std::is_same

#include "Common/Common.h"
#include "Core/Util/AudioFormat.h"  // for clamp_u8
#include "Common/Math/fast/fast_matrix.h"

#if defined(_M_SSE)
#include <emmintrin.h>
#include <smmintrin.h>
#endif

#if PPSSPP_ARCH(ARM_NEON)
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif

#if PPSSPP_PLATFORM(WINDOWS) && (defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER))
#define MATH3D_CALL __vectorcall
#else
#define MATH3D_CALL
#endif

// There's probably a better place to define these macros.
#if PPSSPP_ARCH(X86)
// On 32-bit x86, MSVC does not guarantee alignment for
// SSE arguments passed on stack (Compiler Error C2719), see e.g.:
// https://stackoverflow.com/questions/10484422/msvc-cannot-send-function-parameters-of-16byte-alignment-on-x86
// https://stackoverflow.com/questions/28488986/formal-parameter-with-declspecalign16-wont-be-aligned
// So, as a workaround, "dangerous" cases are loaded via loadu* on 32-bit x86.
// Compilers are decently ok at eliminating these extra loads, at least
// in trivial cases.
// NOTE: not to be outdone, GCC has its own flavor of
broken, see e.g.:55// http://www.peterstock.co.uk/games/mingw_sse/56// https://github.com/nothings/stb/issues/8157// which is probably worse since it breaks alignment of locals and/or58// spills, but that, hopefully, does not affect PPSSPP (modern GCC+Linux59// is 16-byte aligned on x86, and MinGW is not a supported PPSSPP target).60// NOTE: weird double-casts add a bit of type-safety.61#define SAFE_M128(v) _mm_loadu_ps (reinterpret_cast<const float*> (static_cast<const __m128*> (&(v))))62#define SAFE_M128I(v) _mm_loadu_si128(reinterpret_cast<const __m128i*>(static_cast<const __m128i*>(&(v))))63#else // x64, FWIW also works for non-x86.64#define SAFE_M128(v) (v)65#define SAFE_M128I(v) (v)66#endif6768namespace Math3D {6970// Helper for Vec classes to clamp values.71template<typename T>72inline static T VecClamp(const T &v, const T &low, const T &high)73{74if (v > high)75return high;76if (v < low)77return low;78return v;79}8081template<typename T>82class Vec2 {83public:84struct {85T x,y;86};8788T* AsArray() { return &x; }89const T* AsArray() const { return &x; }9091Vec2() {}92Vec2(const T a[2]) : x(a[0]), y(a[1]) {}93Vec2(const T& _x, const T& _y) : x(_x), y(_y) {}9495template<typename T2>96Vec2<T2> Cast() const97{98return Vec2<T2>((T2)x, (T2)y);99}100101static Vec2 AssignToAll(const T& f)102{103return Vec2<T>(f, f);104}105106void Write(T a[2])107{108a[0] = x; a[1] = y;109}110111Vec2 operator +(const Vec2& other) const112{113return Vec2(x+other.x, y+other.y);114}115void operator += (const Vec2 &other)116{117x+=other.x; y+=other.y;118}119Vec2 operator -(const Vec2& other) const120{121return Vec2(x-other.x, y-other.y);122}123void operator -= (const Vec2& other)124{125x-=other.x; y-=other.y;126}127Vec2 operator -() const128{129return Vec2(-x,-y);130}131Vec2 operator * (const Vec2& other) const132{133return Vec2(x*other.x, y*other.y);134}135template<typename V>136Vec2 operator * (const V& f) const137{138return Vec2(x*f,y*f);139}140template<typename V>141void operator *= 
(const V& f)142{143x*=f; y*=f;144}145template<typename V>146Vec2 operator / (const V& f) const147{148return Vec2(x/f,y/f);149}150template<typename V>151void operator /= (const V& f)152{153*this = *this / f;154}155156T Length2() const157{158return x*x + y*y;159}160161Vec2 Clamp(const T &l, const T &h) const162{163return Vec2(VecClamp(x, l, h), VecClamp(y, l, h));164}165166// Only implemented for T=float167float Length() const;168void SetLength(const float l);169Vec2 WithLength(const float l) const;170float Distance2To(const Vec2 &other) const;171Vec2 Normalized() const;172float Normalize(); // returns the previous length, which is often useful173174T& operator [] (int i) //allow vector[1] = 3 (vector.y=3)175{176return *((&x) + i);177}178T operator [] (const int i) const179{180return *((&x) + i);181}182183void SetZero()184{185x=0; y=0;186}187188// Common aliases: UV (texel coordinates), ST (texture coordinates)189T& u() { return x; }190T& v() { return y; }191T& s() { return x; }192T& t() { return y; }193194const T& u() const { return x; }195const T& v() const { return y; }196const T& s() const { return x; }197const T& t() const { return y; }198199// swizzlers - create a subvector of specific components200const Vec2 yx() const { return Vec2(y, x); }201const Vec2 vu() const { return Vec2(y, x); }202const Vec2 ts() const { return Vec2(y, x); }203};204205template<typename T>206class Vec3Packed;207208template<typename T>209class Vec3210{211public:212union213{214struct215{216T x,y,z;217};218#if defined(_M_SSE)219__m128i ivec;220__m128 vec;221#elif PPSSPP_ARCH(ARM_NEON)222int32x4_t ivec;223float32x4_t vec;224#endif225};226227T* AsArray() { return &x; }228const T* AsArray() const { return &x; }229230Vec3() {}231Vec3(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {}232constexpr Vec3(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {}233Vec3(const Vec2<T>& _xy, const T& _z) : x(_xy.x), y(_xy.y), z(_z) {}234#if defined(_M_SSE)235constexpr Vec3(const __m128 &_vec) : 
vec(_vec) {}236constexpr Vec3(const __m128i &_ivec) : ivec(_ivec) {}237Vec3(const Vec3Packed<T> &_xyz) {238vec = _mm_loadu_ps(_xyz.AsArray());239}240#elif PPSSPP_ARCH(ARM_NEON)241Vec3(const float32x4_t &_vec) : vec(_vec) {}242#if !defined(_MSC_VER)243Vec3(const int32x4_t &_ivec) : ivec(_ivec) {}244#endif245Vec3(const Vec3Packed<T> &_xyz) {246vec = vld1q_f32(_xyz.AsArray());247}248#else249Vec3(const Vec3Packed<T> &_xyz) : x(_xyz.x), y(_xyz.y), z(_xyz.z) {}250#endif251252template<typename T2>253constexpr Vec3<T2> Cast() const254{255return Vec3<T2>((T2)x, (T2)y, (T2)z);256}257258// Only implemented for T=int and T=float259static Vec3 FromRGB(unsigned int rgb);260unsigned int ToRGB() const; // alpha bits set to zero261262static constexpr Vec3 AssignToAll(const T& f)263{264return Vec3<T>(f, f, f);265}266267void Write(T a[3])268{269a[0] = x; a[1] = y; a[2] = z;270}271272Vec3 operator +(const Vec3 &other) const273{274return Vec3(x+other.x, y+other.y, z+other.z);275}276void operator += (const Vec3 &other)277{278x+=other.x; y+=other.y; z+=other.z;279}280Vec3 operator -(const Vec3 &other) const281{282return Vec3(x-other.x, y-other.y, z-other.z);283}284void operator -= (const Vec3 &other)285{286x-=other.x; y-=other.y; z-=other.z;287}288Vec3 operator -() const289{290return Vec3(-x,-y,-z);291}292Vec3 operator * (const Vec3 &other) const293{294return Vec3(x*other.x, y*other.y, z*other.z);295}296template<typename V>297Vec3 operator * (const V& f) const298{299return Vec3(x*f,y*f,z*f);300}301template<typename V>302void operator *= (const V& f)303{304x*=f; y*=f; z*=f;305}306template<typename V>307Vec3 operator / (const V& f) const308{309return Vec3(x/f,y/f,z/f);310}311template<typename V>312void operator /= (const V& f)313{314*this = *this / f;315}316317bool operator ==(const Vec3 &other) const {318return x == other.x && y == other.y && z == other.z;319}320321T Length2() const322{323return x*x + y*y + z*z;324}325326Vec3 Clamp(const T &l, const T &h) const327{328return 
Vec3(VecClamp(x, l, h), VecClamp(y, l, h), VecClamp(z, l, h));329}330331// Only implemented for T=float332float Length() const;333void SetLength(const float l);334Vec3 WithLength(const float l) const;335float Distance2To(const Vec3 &other) const;336Vec3 Normalized(bool useSSE4 = false) const;337Vec3 NormalizedOr001(bool useSSE4 = false) const;338float Normalize(); // returns the previous length, which is often useful339float NormalizeOr001();340341T& operator [] (int i) //allow vector[2] = 3 (vector.z=3)342{343return *((&x) + i);344}345T operator [] (const int i) const346{347return *((&x) + i);348}349350void SetZero()351{352x=0; y=0; z=0;353}354355// Common aliases: UVW (texel coordinates), RGB (colors), STQ (texture coordinates)356T& u() { return x; }357T& v() { return y; }358T& w() { return z; }359360T& r() { return x; }361T& g() { return y; }362T& b() { return z; }363364T& s() { return x; }365T& t() { return y; }366T& q() { return z; }367368const T& u() const { return x; }369const T& v() const { return y; }370const T& w() const { return z; }371372const T& r() const { return x; }373const T& g() const { return y; }374const T& b() const { return z; }375376const T& s() const { return x; }377const T& t() const { return y; }378const T& q() const { return z; }379380// swizzlers - create a subvector of specific components381// e.g. 
Vec2 uv() { return Vec2(x,y); }382// _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)383#define _DEFINE_SWIZZLER2(a, b, name) const Vec2<T> name() const { return Vec2<T>(a, b); }384#define DEFINE_SWIZZLER2(a, b, a2, b2, a3, b3, a4, b4) \385_DEFINE_SWIZZLER2(a, b, a##b); \386_DEFINE_SWIZZLER2(a, b, a2##b2); \387_DEFINE_SWIZZLER2(a, b, a3##b3); \388_DEFINE_SWIZZLER2(a, b, a4##b4); \389_DEFINE_SWIZZLER2(b, a, b##a); \390_DEFINE_SWIZZLER2(b, a, b2##a2); \391_DEFINE_SWIZZLER2(b, a, b3##a3); \392_DEFINE_SWIZZLER2(b, a, b4##a4);393394DEFINE_SWIZZLER2(x, y, r, g, u, v, s, t);395DEFINE_SWIZZLER2(x, z, r, b, u, w, s, q);396DEFINE_SWIZZLER2(y, z, g, b, v, w, t, q);397#undef DEFINE_SWIZZLER2398#undef _DEFINE_SWIZZLER2399};400401template<typename T>402class Vec3Packed403{404public:405union406{407struct408{409T x,y,z;410};411};412413T* AsArray() { return &x; }414const T* AsArray() const { return &x; }415416Vec3Packed() {}417Vec3Packed(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {}418Vec3Packed(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {}419Vec3Packed(const Vec2<T>& _xy, const T& _z) : x(_xy.x), y(_xy.y), z(_z) {}420Vec3Packed(const Vec3<T>& _xyz) {421memcpy(&x, _xyz.AsArray(), sizeof(float) * 3);422}423424template<typename T2>425Vec3Packed<T2> Cast() const426{427return Vec3Packed<T2>((T2)x, (T2)y, (T2)z);428}429430// Only implemented for T=int and T=float431static Vec3Packed FromRGB(unsigned int rgb);432unsigned int ToRGB() const; // alpha bits set to zero433434static Vec3Packed AssignToAll(const T& f)435{436return Vec3Packed<T>(f, f, f);437}438439void Write(T a[3])440{441a[0] = x; a[1] = y; a[2] = z;442}443444Vec3Packed operator +(const Vec3Packed &other) const445{446return Vec3Packed(x+other.x, y+other.y, z+other.z);447}448void operator += (const Vec3Packed &other)449{450x+=other.x; y+=other.y; z+=other.z;451}452Vec3Packed operator -(const Vec3Packed &other) 
const453{454return Vec3Packed(x-other.x, y-other.y, z-other.z);455}456void operator -= (const Vec3Packed &other)457{458x-=other.x; y-=other.y; z-=other.z;459}460Vec3Packed operator -() const461{462return Vec3Packed(-x,-y,-z);463}464Vec3Packed operator * (const Vec3Packed &other) const465{466return Vec3Packed(x*other.x, y*other.y, z*other.z);467}468template<typename V>469Vec3Packed operator * (const V& f) const470{471return Vec3Packed(x*f,y*f,z*f);472}473template<typename V>474void operator *= (const V& f)475{476x*=f; y*=f; z*=f;477}478template<typename V>479Vec3Packed operator / (const V& f) const480{481return Vec3Packed(x/f,y/f,z/f);482}483template<typename V>484void operator /= (const V& f)485{486*this = *this / f;487}488489T Length2() const490{491return x*x + y*y + z*z;492}493494Vec3Packed Clamp(const T &l, const T &h) const495{496return Vec3Packed(VecClamp(x, l, h), VecClamp(y, l, h), VecClamp(z, l, h));497}498499// Only implemented for T=float500float Length() const;501void SetLength(const float l);502Vec3Packed WithLength(const float l) const;503float Distance2To(const Vec3Packed &other) const;504Vec3Packed Normalized() const;505float Normalize(); // returns the previous length, which is often useful506507T& operator [] (int i) //allow vector[2] = 3 (vector.z=3)508{509return *((&x) + i);510}511T operator [] (const int i) const512{513return *((&x) + i);514}515516void SetZero()517{518x=0; y=0; z=0;519}520521// Common aliases: UVW (texel coordinates), RGB (colors), STQ (texture coordinates)522T& u() { return x; }523T& v() { return y; }524T& w() { return z; }525526T& r() { return x; }527T& g() { return y; }528T& b() { return z; }529530T& s() { return x; }531T& t() { return y; }532T& q() { return z; }533534const T& u() const { return x; }535const T& v() const { return y; }536const T& w() const { return z; }537538const T& r() const { return x; }539const T& g() const { return y; }540const T& b() const { return z; }541542const T& s() const { return x; }543const T& 
t() const { return y; }544const T& q() const { return z; }545546// swizzlers - create a subvector of specific components547// e.g. Vec2 uv() { return Vec2(x,y); }548// _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)549#define _DEFINE_SWIZZLER2(a, b, name) const Vec2<T> name() const { return Vec2<T>(a, b); }550#define DEFINE_SWIZZLER2(a, b, a2, b2, a3, b3, a4, b4) \551_DEFINE_SWIZZLER2(a, b, a##b); \552_DEFINE_SWIZZLER2(a, b, a2##b2); \553_DEFINE_SWIZZLER2(a, b, a3##b3); \554_DEFINE_SWIZZLER2(a, b, a4##b4); \555_DEFINE_SWIZZLER2(b, a, b##a); \556_DEFINE_SWIZZLER2(b, a, b2##a2); \557_DEFINE_SWIZZLER2(b, a, b3##a3); \558_DEFINE_SWIZZLER2(b, a, b4##a4);559560DEFINE_SWIZZLER2(x, y, r, g, u, v, s, t);561DEFINE_SWIZZLER2(x, z, r, b, u, w, s, q);562DEFINE_SWIZZLER2(y, z, g, b, v, w, t, q);563#undef DEFINE_SWIZZLER2564#undef _DEFINE_SWIZZLER2565};566567template<typename T>568class Vec4569{570public:571union572{573struct574{575T x,y,z,w;576};577#if defined(_M_SSE)578__m128i ivec;579__m128 vec;580#elif PPSSPP_ARCH(ARM_NEON)581int32x4_t ivec;582float32x4_t vec;583#endif584};585586T* AsArray() { return &x; }587const T* AsArray() const { return &x; }588589Vec4() {}590Vec4(const T a[4]) : x(a[0]), y(a[1]), z(a[2]), w(a[3]) {}591Vec4(const T& _x, const T& _y, const T& _z, const T& _w) : x(_x), y(_y), z(_z), w(_w) {}592Vec4(const Vec2<T>& _xy, const T& _z, const T& _w) : x(_xy.x), y(_xy.y), z(_z), w(_w) {}593Vec4(const Vec3<T>& _xyz, const T& _w) : x(_xyz.x), y(_xyz.y), z(_xyz.z), w(_w) {}594#if defined(_M_SSE)595Vec4(const __m128 &_vec) : vec(_vec) {}596Vec4(const __m128i &_ivec) : ivec(_ivec) {}597#elif PPSSPP_ARCH(ARM_NEON)598Vec4(const float32x4_t &_vec) : vec(_vec) {}599#if !defined(_MSC_VER)600Vec4(const int32x4_t &_ivec) : ivec(_ivec) {}601#endif602#endif603604template<typename T2>605Vec4<T2> Cast() const {606if constexpr (std::is_same<T, float>::value && std::is_same<T2, 
int>::value) {607#if defined(_M_SSE)608return _mm_cvtps_epi32(SAFE_M128(vec));609#elif PPSSPP_ARCH(ARM_NEON)610return vcvtq_s32_f32(vec);611#endif612}613if constexpr (std::is_same<T, int>::value && std::is_same<T2, float>::value) {614#if defined(_M_SSE)615return _mm_cvtepi32_ps(SAFE_M128I(ivec));616#elif PPSSPP_ARCH(ARM_NEON)617return vcvtq_f32_s32(ivec);618#endif619}620return Vec4<T2>((T2)x, (T2)y, (T2)z, (T2)w);621}622623// Only implemented for T=int and T=float624static Vec4 FromRGBA(unsigned int rgba);625static Vec4 FromRGBA(const u8 *rgba);626unsigned int ToRGBA() const;627void ToRGBA(u8 *rgba) const;628629static Vec4 AssignToAll(const T& f)630{631return Vec4<T>(f, f, f, f);632}633634void Write(T a[4])635{636a[0] = x; a[1] = y; a[2] = z; a[3] = w;637}638639Vec4 operator +(const Vec4& other) const640{641return Vec4(x+other.x, y+other.y, z+other.z, w+other.w);642}643void operator += (const Vec4& other)644{645x+=other.x; y+=other.y; z+=other.z; w+=other.w;646}647Vec4 operator -(const Vec4 &other) const648{649return Vec4(x-other.x, y-other.y, z-other.z, w-other.w);650}651void operator -= (const Vec4 &other)652{653x-=other.x; y-=other.y; z-=other.z; w-=other.w;654}655Vec4 operator -() const656{657return Vec4(-x,-y,-z,-w);658}659Vec4 operator * (const Vec4 &other) const660{661return Vec4(x*other.x, y*other.y, z*other.z, w*other.w);662}663Vec4 operator | (const Vec4 &other) const664{665return Vec4(x | other.x, y | other.y, z | other.z, w | other.w);666}667Vec4 operator & (const Vec4 &other) const668{669return Vec4(x & other.x, y & other.y, z & other.z, w & other.w);670}671Vec4 operator << (const int amount) const672{673// NOTE: x*(1<<amount), etc., might be safer, since674// left-shifting negatives is UB pre-C++20.675return Vec4(x << amount, y << amount, z << amount, w << amount);676}677Vec4 operator >> (const int amount) const678{679return Vec4(x >> amount, y >> amount, z >> amount, w >> amount);680}681template<typename V>682Vec4 operator * (const V& f) 
const683{684return Vec4(x*f,y*f,z*f,w*f);685}686template<typename V>687void operator *= (const V& f)688{689x*=f; y*=f; z*=f; w*=f;690}691template<typename V>692Vec4 operator / (const V& f) const693{694return Vec4(x/f,y/f,z/f,w/f);695}696template<typename V>697void operator /= (const V& f)698{699*this = *this / f;700}701702bool operator ==(const Vec4 &other) const {703return x == other.x && y == other.y && z == other.z && w == other.w;704}705706T Length2() const707{708return x*x + y*y + z*z + w*w;709}710711Vec4 Clamp(const T &l, const T &h) const712{713return Vec4(VecClamp(x, l, h), VecClamp(y, l, h), VecClamp(z, l, h), VecClamp(w, l, h));714}715716Vec4 Reciprocal() const717{718const T one = 1.0f;719return Vec4(one / x, one / y, one / z, one / w);720}721722// Only implemented for T=float723float Length() const;724void SetLength(const float l);725Vec4 WithLength(const float l) const;726float Distance2To(const Vec4 &other) const;727Vec4 Normalized() const;728float Normalize(); // returns the previous length, which is often useful729730T& operator [] (int i) //allow vector[2] = 3 (vector.z=3)731{732return *((&x) + i);733}734T operator [] (const int i) const735{736return *((&x) + i);737}738739void SetZero()740{741x=0; y=0; z=0; w=0;742}743744// Common alias: RGBA (colors)745T& r() { return x; }746T& g() { return y; }747T& b() { return z; }748T& a() { return w; }749750const T& r() const { return x; }751const T& g() const { return y; }752const T& b() const { return z; }753const T& a() const { return w; }754755// swizzlers - create a subvector of specific components756// e.g. 
Vec2 uv() { return Vec2(x,y); }757// _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)758#define _DEFINE_SWIZZLER2(a, b, name) const Vec2<T> name() const { return Vec2<T>(a, b); }759#define DEFINE_SWIZZLER2(a, b, a2, b2) \760_DEFINE_SWIZZLER2(a, b, a##b); \761_DEFINE_SWIZZLER2(a, b, a2##b2); \762_DEFINE_SWIZZLER2(b, a, b##a); \763_DEFINE_SWIZZLER2(b, a, b2##a2);764765DEFINE_SWIZZLER2(x, y, r, g);766DEFINE_SWIZZLER2(x, z, r, b);767DEFINE_SWIZZLER2(x, w, r, a);768DEFINE_SWIZZLER2(y, z, g, b);769DEFINE_SWIZZLER2(y, w, g, a);770DEFINE_SWIZZLER2(z, w, b, a);771#undef DEFINE_SWIZZLER2772#undef _DEFINE_SWIZZLER2773774#define _DEFINE_SWIZZLER3(a, b, c, name) const Vec3<T> name() const { return Vec3<T>(a, b, c); }775#define DEFINE_SWIZZLER3(a, b, c, a2, b2, c2) \776_DEFINE_SWIZZLER3(a, b, c, a##b##c); \777_DEFINE_SWIZZLER3(a, c, b, a##c##b); \778_DEFINE_SWIZZLER3(b, a, c, b##a##c); \779_DEFINE_SWIZZLER3(b, c, a, b##c##a); \780_DEFINE_SWIZZLER3(c, a, b, c##a##b); \781_DEFINE_SWIZZLER3(c, b, a, c##b##a); \782_DEFINE_SWIZZLER3(a, b, c, a2##b2##c2); \783_DEFINE_SWIZZLER3(a, c, b, a2##c2##b2); \784_DEFINE_SWIZZLER3(b, a, c, b2##a2##c2); \785_DEFINE_SWIZZLER3(b, c, a, b2##c2##a2); \786_DEFINE_SWIZZLER3(c, a, b, c2##a2##b2); \787_DEFINE_SWIZZLER3(c, b, a, c2##b2##a2);788789DEFINE_SWIZZLER3(x, y, z, r, g, b);790DEFINE_SWIZZLER3(x, y, w, r, g, a);791DEFINE_SWIZZLER3(x, z, w, r, b, a);792DEFINE_SWIZZLER3(y, z, w, g, b, a);793#undef DEFINE_SWIZZLER3794#undef _DEFINE_SWIZZLER3795};796797798template<typename BaseType>799class Mat3x3800{801public:802// Convention: first three values = first column803Mat3x3(const BaseType values[])804{805for (unsigned int i = 0; i < 3*3; ++i)806{807this->values[i] = values[i];808}809}810811Mat3x3(BaseType _00, BaseType _01, BaseType _02, BaseType _10, BaseType _11, BaseType _12, BaseType _20, BaseType _21, BaseType _22)812{813values[0] = _00;814values[1] = 
_01;815values[2] = _02;816values[3] = _10;817values[4] = _11;818values[5] = _12;819values[6] = _20;820values[7] = _21;821values[8] = _22;822}823824template<typename T>825Vec3<T> operator * (const Vec3<T>& vec) const826{827Vec3<T> ret;828ret.x = values[0]*vec.x + values[3]*vec.y + values[6]*vec.z;829ret.y = values[1]*vec.x + values[4]*vec.y + values[7]*vec.z;830ret.z = values[2]*vec.x + values[5]*vec.y + values[8]*vec.z;831return ret;832}833834Mat3x3 Inverse() const835{836float a = values[0];837float b = values[1];838float c = values[2];839float d = values[3];840float e = values[4];841float f = values[5];842float g = values[6];843float h = values[7];844float i = values[8];845return Mat3x3(e*i-f*h, f*g-d*i, d*h-e*g,846c*h-b*i, a*i-c*g, b*g-a*h,847b*f-c*e, c*d-a*f, a*e-b*d) / Det();848}849850BaseType Det() const851{852return values[0]*values[4]*values[8] + values[3]*values[7]*values[2] +853values[6]*values[1]*values[5] - values[2]*values[4]*values[6] -854values[5]*values[7]*values[0] - values[8]*values[1]*values[3];855}856857Mat3x3 operator / (const BaseType& val) const858{859return Mat3x3(values[0]/val, values[1]/val, values[2]/val,860values[3]/val, values[4]/val, values[5]/val,861values[6]/val, values[7]/val, values[8]/val);862}863864private:865BaseType values[3*3];866};867868869template<typename BaseType>870class Mat4x4871{872public:873// Convention: first four values in arrow = first column874Mat4x4(const BaseType values[])875{876for (unsigned int i = 0; i < 4*4; ++i)877{878this->values[i] = values[i];879}880}881882template<typename T>883Vec4<T> operator * (const Vec4<T>& vec) const884{885Vec4<T> ret;886ret.x = values[0]*vec.x + values[4]*vec.y + values[8]*vec.z + values[12]*vec.w;887ret.y = values[1]*vec.x + values[5]*vec.y + values[9]*vec.z + values[13]*vec.w;888ret.z = values[2]*vec.x + values[6]*vec.y + values[10]*vec.z + values[14]*vec.w;889ret.w = values[3]*vec.x + values[7]*vec.y + values[11]*vec.z + values[15]*vec.w;890return 
ret;891}892893private:894BaseType values[4*4];895};896897}; // namespace Math3D898899typedef Math3D::Vec2<float> Vec2f;900typedef Math3D::Vec3<float> Vec3f;901typedef Math3D::Vec3Packed<float> Vec3Packedf;902typedef Math3D::Vec4<float> Vec4f;903904#if defined(_M_SSE)905template<unsigned i>906float MATH3D_CALL vectorGetByIndex(__m128 v) {907// shuffle V so that the element that we want is moved to the bottom908return _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(i, i, i, i)));909}910#endif911912#if defined(_M_SSE)913// x, y, and z should be broadcast. Should only be used through Vec3f version.914// Note that this will read an extra float from the matrix, so it better not be at the end of an allocation!915inline __m128 MATH3D_CALL Vec3ByMatrix43Internal(__m128 x, __m128 y, __m128 z, const float m[12]) {916__m128 col0 = _mm_loadu_ps(m);917__m128 col1 = _mm_loadu_ps(m + 3);918__m128 col2 = _mm_loadu_ps(m + 6);919__m128 col3 = _mm_loadu_ps(m + 9);920__m128 sum = _mm_add_ps(921_mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)),922_mm_add_ps(_mm_mul_ps(col2, z), col3));923return sum;924}925#elif PPSSPP_ARCH(ARM64_NEON)926inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {927float32x4_t col0 = vld1q_f32(m);928float32x4_t col1 = vld1q_f32(m + 3);929float32x4_t col2 = vld1q_f32(m + 6);930float32x4_t col3 = vld1q_f32(m + 9);931float32x4_t sum = vaddq_f32(932vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)),933vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));934return sum;935}936#elif PPSSPP_ARCH(ARM_NEON)937inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {938float32x4_t col0 = vld1q_f32(m);939float32x4_t col1 = vld1q_f32(m + 3);940float32x4_t col2 = vld1q_f32(m + 6);941float32x4_t col3 = vld1q_f32(m + 9);942float32x4_t sum = vaddq_f32(943vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1)),944vaddq_f32(vmulq_lane_f32(col2, vget_high_f32(vec), 0), 
col3));945return sum;946}947#endif948949// v and vecOut must point to different memory.950inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) {951#if defined(_M_SSE)952__m128 x = _mm_set1_ps(v[0]);953__m128 y = _mm_set1_ps(v[1]);954__m128 z = _mm_set1_ps(v[2]);955__m128 sum = Vec3ByMatrix43Internal(x, y, z, m);956// Not sure what the best way to store 3 elements is. Ideally, we should957// probably store all four.958vecOut[0] = _mm_cvtss_f32(sum);959vecOut[1] = vectorGetByIndex<1>(sum);960vecOut[2] = vectorGetByIndex<2>(sum);961#elif PPSSPP_ARCH(ARM_NEON)962float vecIn[4] = {v[0], v[1], v[2], 1.0f};963float32x4_t sum = Vec3ByMatrix43Internal(vld1q_f32(vecIn), m);964vecOut[0] = vgetq_lane_f32(sum, 0);965vecOut[1] = vgetq_lane_f32(sum, 1);966vecOut[2] = vgetq_lane_f32(sum, 2);967#else968vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6] + m[9];969vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7] + m[10];970vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8] + m[11];971#endif972}973974inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) {975#if defined(_M_SSE)976const __m128 vv = SAFE_M128(v.vec);977__m128 x = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(0, 0, 0, 0));978__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));979__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));980return Vec3ByMatrix43Internal(x, y, z, m);981#elif PPSSPP_ARCH(ARM_NEON)982return Vec3ByMatrix43Internal(v.vec, m);983#else984Vec3f vecOut;985Vec3ByMatrix43(vecOut.AsArray(), v.AsArray(), m);986return vecOut;987#endif988}989990#if defined(_M_SSE)991// x, y, and z should be broadcast. 
Should only be used through Vec3f version.992inline __m128 MATH3D_CALL Vec3ByMatrix44Internal(__m128 x, __m128 y, __m128 z, const float m[16]) {993__m128 col0 = _mm_loadu_ps(m);994__m128 col1 = _mm_loadu_ps(m + 4);995__m128 col2 = _mm_loadu_ps(m + 8);996__m128 col3 = _mm_loadu_ps(m + 12);997__m128 sum = _mm_add_ps(998_mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)),999_mm_add_ps(_mm_mul_ps(col2, z), col3));1000return sum;1001}1002#elif PPSSPP_ARCH(ARM64_NEON)1003inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {1004float32x4_t col0 = vld1q_f32(m);1005float32x4_t col1 = vld1q_f32(m + 4);1006float32x4_t col2 = vld1q_f32(m + 8);1007float32x4_t col3 = vld1q_f32(m + 12);1008float32x4_t sum = vaddq_f32(1009vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)),1010vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));1011return sum;1012}1013#elif PPSSPP_ARCH(ARM_NEON)1014inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {1015float32x4_t col0 = vld1q_f32(m);1016float32x4_t col1 = vld1q_f32(m + 4);1017float32x4_t col2 = vld1q_f32(m + 8);1018float32x4_t col3 = vld1q_f32(m + 12);1019float32x4_t sum = vaddq_f32(1020vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1)),1021vaddq_f32(vmulq_lane_f32(col2, vget_high_f32(vec), 0), col3));1022return sum;1023}1024#endif10251026inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16]) {1027#if defined(_M_SSE)1028__m128 x = _mm_set1_ps(v[0]);1029__m128 y = _mm_set1_ps(v[1]);1030__m128 z = _mm_set1_ps(v[2]);1031__m128 sum = Vec3ByMatrix44Internal(x, y, z, m);1032_mm_storeu_ps(vecOut, sum);1033#elif PPSSPP_ARCH(ARM_NEON)1034float vecIn[4] = {v[0], v[1], v[2], 1.0f};1035float32x4_t sum = Vec3ByMatrix44Internal(vld1q_f32(vecIn), m);1036vst1q_f32(vecOut, sum);1037#else1038vecOut[0] = v[0] * m[0] + v[1] * m[4] + v[2] * m[8] + m[12];1039vecOut[1] = v[0] * m[1] + v[1] * m[5] + v[2] * m[9] + 
m[13];1040vecOut[2] = v[0] * m[2] + v[1] * m[6] + v[2] * m[10] + m[14];1041vecOut[3] = v[0] * m[3] + v[1] * m[7] + v[2] * m[11] + m[15];1042#endif1043}10441045inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) {1046#if defined(_M_SSE)1047const __m128 vv = SAFE_M128(v.vec);1048__m128 x = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(0, 0, 0, 0));1049__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));1050__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));1051return Vec3ByMatrix44Internal(x, y, z, m);1052#elif PPSSPP_ARCH(ARM_NEON)1053return Vec3ByMatrix44Internal(v.vec, m);1054#else1055Vec4f vecOut;1056Vec3ByMatrix44(vecOut.AsArray(), v.AsArray(), m);1057return vecOut;1058#endif1059}10601061#if defined(_M_SSE)1062// x, y, and z should be broadcast. Should only be used through Vec3f version.1063inline __m128 MATH3D_CALL Norm3ByMatrix43Internal(__m128 x, __m128 y, __m128 z, const float m[12]) {1064__m128 col0 = _mm_loadu_ps(m);1065__m128 col1 = _mm_loadu_ps(m + 3);1066__m128 col2 = _mm_loadu_ps(m + 6);1067__m128 sum = _mm_add_ps(1068_mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)),1069_mm_mul_ps(col2, z));1070return sum;1071}1072#elif PPSSPP_ARCH(ARM64_NEON)1073inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[16]) {1074float32x4_t col0 = vld1q_f32(m);1075float32x4_t col1 = vld1q_f32(m + 3);1076float32x4_t col2 = vld1q_f32(m + 6);1077float32x4_t sum = vaddq_f32(1078vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)),1079vmulq_laneq_f32(col2, vec, 2));1080return sum;1081}1082#elif PPSSPP_ARCH(ARM_NEON)1083inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[16]) {1084float32x4_t col0 = vld1q_f32(m);1085float32x4_t col1 = vld1q_f32(m + 3);1086float32x4_t col2 = vld1q_f32(m + 6);1087float32x4_t sum = vaddq_f32(1088vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1)),1089vmulq_lane_f32(col2, vget_high_f32(vec), 0));1090return 
sum;1091}1092#endif10931094inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) {1095#if defined(_M_SSE)1096__m128 x = _mm_set1_ps(v[0]);1097__m128 y = _mm_set1_ps(v[1]);1098__m128 z = _mm_set1_ps(v[2]);1099__m128 sum = Norm3ByMatrix43Internal(x, y, z, m);1100vecOut[0] = _mm_cvtss_f32(sum);1101vecOut[1] = vectorGetByIndex<1>(sum);1102vecOut[2] = vectorGetByIndex<2>(sum);1103#elif PPSSPP_ARCH(ARM_NEON)1104float32x4_t sum = Norm3ByMatrix43Internal(vld1q_f32(v), m);1105vecOut[0] = vgetq_lane_f32(sum, 0);1106vecOut[1] = vgetq_lane_f32(sum, 1);1107vecOut[2] = vgetq_lane_f32(sum, 2);1108#else1109vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6];1110vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7];1111vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8];1112#endif1113}11141115inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) {1116#if defined(_M_SSE)1117const __m128 vv = SAFE_M128(v.vec);1118__m128 x = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(0, 0, 0, 0));1119__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));1120__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));1121return Norm3ByMatrix43Internal(x, y, z, m);1122#elif PPSSPP_ARCH(ARM_NEON)1123return Norm3ByMatrix43Internal(v.vec, m);1124#else1125Vec3f vecOut;1126Norm3ByMatrix43(vecOut.AsArray(), v.AsArray(), m);1127return vecOut;1128#endif1129}11301131inline void Matrix4ByMatrix4(float out[16], const float a[16], const float b[16]) {1132fast_matrix_mul_4x4(out, b, a);1133}11341135inline void ConvertMatrix4x3To4x4(float *m4x4, const float *m4x3) {1136m4x4[0] = m4x3[0];1137m4x4[1] = m4x3[1];1138m4x4[2] = m4x3[2];1139m4x4[3] = 0.0f;1140m4x4[4] = m4x3[3];1141m4x4[5] = m4x3[4];1142m4x4[6] = m4x3[5];1143m4x4[7] = 0.0f;1144m4x4[8] = m4x3[6];1145m4x4[9] = m4x3[7];1146m4x4[10] = m4x3[8];1147m4x4[11] = 0.0f;1148m4x4[12] = m4x3[9];1149m4x4[13] = m4x3[10];1150m4x4[14] = m4x3[11];1151m4x4[15] = 1.0f;1152}11531154inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, 
const float *m4x3) {1155#if PPSSPP_ARCH(ARM_NEON)1156// vld3q is a perfect match here!1157float32x4x3_t packed = vld3q_f32(m4x3);1158vst1q_f32(m4x4, packed.val[0]);1159vst1q_f32(m4x4 + 4, packed.val[1]);1160vst1q_f32(m4x4 + 8, packed.val[2]);1161#else1162m4x4[0] = m4x3[0];1163m4x4[1] = m4x3[3];1164m4x4[2] = m4x3[6];1165m4x4[3] = m4x3[9];1166m4x4[4] = m4x3[1];1167m4x4[5] = m4x3[4];1168m4x4[6] = m4x3[7];1169m4x4[7] = m4x3[10];1170m4x4[8] = m4x3[2];1171m4x4[9] = m4x3[5];1172m4x4[10] = m4x3[8];1173m4x4[11] = m4x3[11];1174#endif1175m4x4[12] = 0.0f;1176m4x4[13] = 0.0f;1177m4x4[14] = 0.0f;1178m4x4[15] = 1.0f;1179}11801181// 03691182// 147A1183// 258B1184// ->>-1185// 01231186// 45671187// 89AB1188// Don't see a way to SIMD that. Should be pretty fast anyway.1189inline void ConvertMatrix4x3To3x4Transposed(float *m4x4, const float *m4x3) {1190#if PPSSPP_ARCH(ARM_NEON)1191// vld3q is a perfect match here!1192float32x4x3_t packed = vld3q_f32(m4x3);1193vst1q_f32(m4x4, packed.val[0]);1194vst1q_f32(m4x4 + 4, packed.val[1]);1195vst1q_f32(m4x4 + 8, packed.val[2]);1196#else1197m4x4[0] = m4x3[0];1198m4x4[1] = m4x3[3];1199m4x4[2] = m4x3[6];1200m4x4[3] = m4x3[9];1201m4x4[4] = m4x3[1];1202m4x4[5] = m4x3[4];1203m4x4[6] = m4x3[7];1204m4x4[7] = m4x3[10];1205m4x4[8] = m4x3[2];1206m4x4[9] = m4x3[5];1207m4x4[10] = m4x3[8];1208m4x4[11] = m4x3[11];1209#endif1210}12111212inline void Transpose4x4(float out[16], const float in[16]) {1213for (int i = 0; i < 4; i++) {1214for (int j = 0; j < 4; j++) {1215out[i * 4 + j] = in[j * 4 + i];1216}1217}1218}12191220namespace Math3D {12211222template<typename T>1223inline T Dot(const Vec2<T>& a, const Vec2<T>& b)1224{1225return a.x*b.x + a.y*b.y;1226}12271228template<typename T>1229inline T Dot(const Vec3<T>& a, const Vec3<T>& b)1230{1231return a.x*b.x + a.y*b.y + a.z*b.z;1232}12331234template<typename T>1235inline T Dot(const Vec4<T>& a, const Vec4<T>& b)1236{1237return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w;1238}12391240template<typename T>1241inline 
Vec3<T> Cross(const Vec3<T>& a, const Vec3<T>& b)1242{1243return Vec3<T>(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);1244}12451246template<typename T>1247inline Vec3Packed<T> Cross(const Vec3Packed<T>& a, const Vec3Packed<T>& b)1248{1249return Vec3Packed<T>(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);1250}12511252template<>1253inline Vec3<float> Vec3<float>::FromRGB(unsigned int rgb)1254{1255#if defined(_M_SSE)1256__m128i z = _mm_setzero_si128();1257__m128i c = _mm_cvtsi32_si128(rgb);1258c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);1259return Vec3<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));1260#elif PPSSPP_ARCH(ARM_NEON)1261uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));1262uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));1263return Vec3<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));1264#else1265return Vec3((rgb & 0xFF) * (1.0f/255.0f),1266((rgb >> 8) & 0xFF) * (1.0f/255.0f),1267((rgb >> 16) & 0xFF) * (1.0f/255.0f));1268#endif1269}12701271template<>1272inline Vec3<int> Vec3<int>::FromRGB(unsigned int rgb)1273{1274#if defined(_M_SSE)1275__m128i z = _mm_setzero_si128();1276__m128i c = _mm_cvtsi32_si128(rgb);1277c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);1278return Vec3<int>(c);1279#elif PPSSPP_ARCH(ARM_NEON)1280uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));1281uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));1282return Vec3<int>(vreinterpretq_s32_u32(u));1283#else1284return Vec3(rgb & 0xFF, (rgb >> 8) & 0xFF, (rgb >> 16) & 0xFF);1285#endif1286}12871288template<>1289__forceinline unsigned int Vec3<float>::ToRGB() const1290{1291#if defined(_M_SSE)1292__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));1293__m128i c16 = _mm_packs_epi32(c, c);1294return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;1295#elif PPSSPP_ARCH(ARM_NEON)1296uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vsetq_lane_f32(0.0f, vec, 3), vdupq_n_f32(255.0f))));1297uint8x8_t 
c8 = vqmovn_u16(vcombine_u16(c16, c16));1298return vget_lane_u32(vreinterpret_u32_u8(c8), 0);1299#else1300return (clamp_u8((int)(r() * 255.f)) << 0) |1301(clamp_u8((int)(g() * 255.f)) << 8) |1302(clamp_u8((int)(b() * 255.f)) << 16);1303#endif1304}13051306template<>1307__forceinline unsigned int Vec3<int>::ToRGB() const1308{1309#if defined(_M_SSE)1310__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));1311return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;1312#elif PPSSPP_ARCH(ARM_NEON)1313uint16x4_t c16 = vqmovun_s32(vsetq_lane_s32(0, ivec, 3));1314uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));1315return vget_lane_u32(vreinterpret_u32_u8(c8), 0);1316#else1317return clamp_u8(r()) | (clamp_u8(g()) << 8) | (clamp_u8(b()) << 16);1318#endif1319}13201321template<>1322inline Vec4<float> Vec4<float>::FromRGBA(unsigned int rgba)1323{1324#if defined(_M_SSE)1325__m128i z = _mm_setzero_si128();1326__m128i c = _mm_cvtsi32_si128(rgba);1327c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);1328return Vec4<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));1329#elif PPSSPP_ARCH(ARM_NEON)1330uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba));1331uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));1332return Vec4<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));1333#else1334return Vec4((rgba & 0xFF) * (1.0f/255.0f),1335((rgba >> 8) & 0xFF) * (1.0f/255.0f),1336((rgba >> 16) & 0xFF) * (1.0f/255.0f),1337((rgba >> 24) & 0xFF) * (1.0f/255.0f));1338#endif1339}13401341template<typename T>1342inline Vec4<T> Vec4<T>::FromRGBA(const u8 *rgba)1343{1344return Vec4<T>::FromRGBA(*(unsigned int *)rgba);1345}13461347template<>1348inline Vec4<int> Vec4<int>::FromRGBA(unsigned int rgba)1349{1350#if defined(_M_SSE)1351__m128i z = _mm_setzero_si128();1352__m128i c = _mm_cvtsi32_si128(rgba);1353c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);1354return Vec4<int>(c);1355#elif PPSSPP_ARCH(ARM_NEON)1356uint8x8_t c = 
vreinterpret_u8_u32(vdup_n_u32(rgba));1357uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));1358return Vec4<int>(vreinterpretq_s32_u32(u));1359#else1360return Vec4(rgba & 0xFF, (rgba >> 8) & 0xFF, (rgba >> 16) & 0xFF, (rgba >> 24) & 0xFF);1361#endif1362}13631364template<>1365__forceinline unsigned int Vec4<float>::ToRGBA() const1366{1367#if defined(_M_SSE)1368__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));1369__m128i c16 = _mm_packs_epi32(c, c);1370return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));1371#elif PPSSPP_ARCH(ARM_NEON)1372uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vec, vdupq_n_f32(255.0f))));1373uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));1374return vget_lane_u32(vreinterpret_u32_u8(c8), 0);1375#else1376return (clamp_u8((int)(r() * 255.f)) << 0) |1377(clamp_u8((int)(g() * 255.f)) << 8) |1378(clamp_u8((int)(b() * 255.f)) << 16) |1379(clamp_u8((int)(a() * 255.f)) << 24);1380#endif1381}13821383template<>1384__forceinline unsigned int Vec4<int>::ToRGBA() const1385{1386#if defined(_M_SSE)1387__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));1388return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));1389#elif PPSSPP_ARCH(ARM_NEON)1390uint16x4_t c16 = vqmovun_s32(ivec);1391uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));1392return vget_lane_u32(vreinterpret_u32_u8(c8), 0);1393#else1394return clamp_u8(r()) | (clamp_u8(g()) << 8) | (clamp_u8(b()) << 16) | (clamp_u8(a()) << 24);1395#endif1396}13971398template<typename T>1399__forceinline void Vec4<T>::ToRGBA(u8 *rgba) const1400{1401*(u32 *)rgba = ToRGBA();1402}14031404#if defined(_M_SSE)1405// Specialized for SIMD optimization14061407// Vec3<float> operation1408template<>1409inline void Vec3<float>::operator += (const Vec3<float> &other) {1410vec = _mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec));1411}14121413template<>1414inline Vec3<float> Vec3<float>::operator + (const Vec3 &other) const {1415return Vec3<float>(_mm_add_ps(SAFE_M128(vec), 
SAFE_M128(other.vec)));1416}14171418template<>1419inline void Vec3<float>::operator -= (const Vec3<float> &other) {1420vec = _mm_sub_ps(SAFE_M128(vec), SAFE_M128(other.vec));1421}14221423template<>1424inline Vec3<float> Vec3<float>::operator - (const Vec3 &other) const {1425return Vec3<float>(_mm_sub_ps(SAFE_M128(vec), SAFE_M128(other.vec)));1426}14271428template<>1429inline Vec3<float> Vec3<float>::operator * (const Vec3 &other) const {1430return Vec3<float>(_mm_mul_ps(SAFE_M128(vec), SAFE_M128(other.vec)));1431}14321433template<> template<>1434inline Vec3<float> Vec3<float>::operator * (const float &other) const {1435return Vec3<float>(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(other)));1436}14371438// Vec4<int> operation1439template<>1440inline Vec4<int> Vec4<int>::operator + (const Vec4 &other) const {1441return Vec4<int>(_mm_add_epi32(SAFE_M128I(ivec), SAFE_M128I(other.ivec)));1442}14431444template<>1445inline Vec4<int> Vec4<int>::operator * (const Vec4 &other) const {1446__m128i a = SAFE_M128I(ivec);1447__m128i b = SAFE_M128I(other.ivec);1448// Intel in its immense wisdom decided that1449// SSE2 does not get _mm_mullo_epi32(),1450// so we do it this way. 
This is what clang does,1451// which seems about as good as it gets.1452__m128i m02 = _mm_mul_epu32(a, b);1453__m128i m13 = _mm_mul_epu32(1454_mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 1, 1)),1455_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)));1456__m128i ret = _mm_unpacklo_epi32(1457_mm_shuffle_epi32(m02, _MM_SHUFFLE(3, 2, 2, 0)),1458_mm_shuffle_epi32(m13, _MM_SHUFFLE(3, 2, 2, 0)));1459return Vec4<int>(ret);1460}14611462template<> template<>1463inline Vec4<int> Vec4<int>::operator * (const int &other) const {1464return (*this) * Vec4<int>(_mm_set1_epi32(other));1465}14661467template<>1468inline Vec4<int> Vec4<int>::operator | (const Vec4 &other) const {1469return Vec4<int>(_mm_or_si128(SAFE_M128I(ivec), SAFE_M128I(other.ivec)));1470}14711472template<>1473inline Vec4<int> Vec4<int>::operator & (const Vec4 &other) const {1474return Vec4<int>(_mm_and_si128(SAFE_M128I(ivec), SAFE_M128I(other.ivec)));1475}14761477// NOTE: modern GCC, clang, and MSVC are all ok with1478// non-compile-time-const amount for _mm_slli_epi32/_mm_srli_epi32.1479template<>1480inline Vec4<int> Vec4<int>::operator << (const int amount) const {1481return Vec4<int>(_mm_slli_epi32(SAFE_M128I(ivec), amount));1482}14831484template<>1485inline Vec4<int> Vec4<int>::operator >> (const int amount) const {1486return Vec4<int>(_mm_srli_epi32(SAFE_M128I(ivec), amount));1487}14881489// Vec4<float> operation1490template<>1491inline void Vec4<float>::operator += (const Vec4<float> &other) {1492vec = _mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec));1493}14941495template<>1496inline Vec4<float> Vec4<float>::operator + (const Vec4 &other) const {1497return Vec4<float>(_mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec)));1498}14991500template<>1501inline Vec4<float> Vec4<float>::operator * (const Vec4 &other) const {1502return Vec4<float>(_mm_mul_ps(SAFE_M128(vec), SAFE_M128(other.vec)));1503}15041505template<> template<>1506inline Vec4<float> Vec4<float>::operator * (const float &other) const {1507return 
Vec4<float>(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(other)));1508}15091510// Vec3<float> cross product1511template<>1512inline Vec3<float> Cross(const Vec3<float> &a, const Vec3<float> &b)1513{1514#if PPSSPP_ARCH(X86)1515__m128 avec = _mm_loadu_ps(&a.x);1516__m128 bvec = _mm_loadu_ps(&b.x);1517#else1518__m128 avec = a.vec;1519__m128 bvec = b.vec;1520#endif1521const __m128 left = _mm_mul_ps(_mm_shuffle_ps(avec, avec, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(bvec, bvec, _MM_SHUFFLE(3, 1, 0, 2)));1522const __m128 right = _mm_mul_ps(_mm_shuffle_ps(avec, avec, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(bvec, bvec, _MM_SHUFFLE(3, 0, 2, 1)));1523return _mm_sub_ps(left, right);1524}1525#endif15261527}; // namespace Math3D15281529// linear interpolation via float: 0.0=begin, 1.0=end1530template<typename X>1531inline X Lerp(const X& begin, const X& end, const float t)1532{1533return begin*(1.f-t) + end*t;1534}15351536// linear interpolation via int: 0=begin, base=end1537template<typename X, int base>1538inline X LerpInt(const X& begin, const X& end, const int t)1539{1540return (begin*(base-t) + end*t) / base;1541}154215431544