CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/GPU/Math3D.cpp
Views: 1401
// Copyright (c) 2012- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617#include "Common/Common.h"18#include "GPU/Math3D.h"1920namespace Math3D {2122template<>23float Vec2<float>::Length() const24{25// Doubt this is worth it for a vec2 :/26#if defined(_M_SSE)27float ret;28__m128d tmp = _mm_load_sd((const double*)&x);29__m128 xy = _mm_castpd_ps(tmp);30__m128 sq = _mm_mul_ps(xy, xy);31const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1));32const __m128 res = _mm_add_ss(sq, r2);33_mm_store_ss(&ret, _mm_sqrt_ss(res));34return ret;35#elif PPSSPP_ARCH(ARM64_NEON)36float32x2_t vec = vld1_f32(&x);37float32x2_t sq = vmul_f32(vec, vec);38float32x2_t add2 = vpadd_f32(sq, sq);39float32x2_t res = vsqrt_f32(add2);40return vget_lane_f32(res, 0);41#else42return sqrtf(Length2());43#endif44}4546template<>47void Vec2<float>::SetLength(const float l)48{49(*this) *= l / Length();50}5152template<>53Vec2<float> Vec2<float>::WithLength(const float l) const54{55return (*this) * l / Length();56}5758template<>59float Vec2<float>::Distance2To(const Vec2<float> &other) const {60return Vec2<float>(other-(*this)).Length2();61}6263template<>64Vec2<float> Vec2<float>::Normalized() const65{66return (*this) / Length();67}6869template<>70float Vec2<float>::Normalize()71{72float len = Length();73(*this) = (*this)/len;74return len;75}7677template<>78float Vec3<float>::Length() const79{80#if defined(_M_SSE)81float ret;82__m128 xyz = _mm_loadu_ps(&x);83__m128 sq = _mm_mul_ps(xyz, xyz);84const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1));85const __m128 r3 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 2));86const __m128 res = _mm_add_ss(sq, _mm_add_ss(r2, r3));87_mm_store_ss(&ret, _mm_sqrt_ss(res));88return ret;89#elif PPSSPP_ARCH(ARM64_NEON)90float32x4_t sq = vsetq_lane_f32(0.0f, vmulq_f32(vec, vec), 3);91float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));92float32x2_t add2 = vpadd_f32(add1, add1);93float32x2_t res = vsqrt_f32(add2);94return vget_lane_f32(res, 0);95#else96return sqrtf(Length2());97#endif98}99100template<>101void Vec3<float>::SetLength(const float l)102{103(*this) *= l / Length();104}105106template<>107Vec3<float> Vec3<float>::WithLength(const float l) const108{109return (*this) * l / Length();110}111112template<>113float Vec3<float>::Distance2To(const Vec3<float> &other) const {114return Vec3<float>(other-(*this)).Length2();115}116117#if defined(_M_SSE)118__m128 SSENormalizeMultiplierSSE2(__m128 v)119{120const __m128 sq = _mm_mul_ps(v, v);121const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1));122const __m128 r3 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 2));123const __m128 res = _mm_add_ss(r3, _mm_add_ss(r2, sq));124125const __m128 rt = _mm_rsqrt_ss(res);126return _mm_shuffle_ps(rt, rt, _MM_SHUFFLE(0, 0, 0, 0));127}128129#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)130[[gnu::target("sse4.1")]]131#endif132__m128 SSENormalizeMultiplierSSE4(__m128 v)133{134// This is only used for Vec3f, so ignore the 4th component, might be garbage.135return _mm_rsqrt_ps(_mm_dp_ps(v, v, 0x77));136}137138__m128 SSENormalizeMultiplier(bool useSSE4, __m128 v)139{140if (useSSE4)141return SSENormalizeMultiplierSSE4(v);142return SSENormalizeMultiplierSSE2(v);143}144145template<>146Vec3<float> Vec3<float>::Normalized(bool useSSE4) const147{148const __m128 normalize = SSENormalizeMultiplier(useSSE4, vec);149return _mm_mul_ps(normalize, vec);150}151152template<>153Vec3<float> Vec3<float>::NormalizedOr001(bool useSSE4) const {154const __m128 normalize = SSENormalizeMultiplier(useSSE4, vec);155const __m128 result = _mm_mul_ps(normalize, vec);156const __m128 mask = _mm_cmpunord_ps(result, vec);157const __m128 replace = _mm_and_ps(_mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f), mask);158// Replace with the constant if the mask matched.159return _mm_or_ps(_mm_andnot_ps(mask, result), replace);160}161#elif PPSSPP_ARCH(ARM64_NEON)162template<>163Vec3<float> Vec3<float>::Normalized(bool useSSE4) const {164float32x4_t sq = vsetq_lane_f32(0.0f, vmulq_f32(vec, vec), 3);165float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));166float32x2_t summed = vpadd_f32(add1, add1);167168float32x2_t e = vrsqrte_f32(summed);169e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);170e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);171172float32x4_t factor = vdupq_lane_f32(e, 0);173return Vec3<float>(vmulq_f32(vec, factor));174}175176template<>177Vec3<float> Vec3<float>::NormalizedOr001(bool useSSE4) const {178float32x4_t sq = vsetq_lane_f32(0.0f, vmulq_f32(vec, vec), 3);179float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));180float32x2_t summed = vpadd_f32(add1, add1);181if (vget_lane_f32(summed, 0) == 0.0f) {182return Vec3<float>(vsetq_lane_f32(1.0f, vdupq_lane_f32(summed, 0), 2));183}184185float32x2_t e = vrsqrte_f32(summed);186e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);187e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);188189float32x4_t factor = vdupq_lane_f32(e, 0);190return Vec3<float>(vmulq_f32(vec, factor));191}192#else193template<>194Vec3<float> Vec3<float>::Normalized(bool useSSE4) const195{196return (*this) / Length();197}198199template<>200Vec3<float> Vec3<float>::NormalizedOr001(bool useSSE4) const {201float len = Length();202if (len == 0.0f) {203return Vec3<float>(0.0f, 0.0f, 1.0f);204}205return *this / len;206}207#endif208209template<>210float Vec3<float>::Normalize()211{212float len = Length();213(*this) = (*this)/len;214return len;215}216217template<>218float Vec3<float>::NormalizeOr001() {219float len = Length();220if (len == 0.0f) {221z = 1.0f;222} else {223*this /= len;224}225return len;226}227228template<>229Vec3Packed<float> Vec3Packed<float>::FromRGB(unsigned int rgb)230{231return Vec3Packed((rgb & 0xFF) * (1.0f/255.0f),232((rgb >> 8) & 0xFF) * (1.0f/255.0f),233((rgb >> 16) & 0xFF) * (1.0f/255.0f));234}235236template<>237Vec3Packed<int> Vec3Packed<int>::FromRGB(unsigned int rgb)238{239return Vec3Packed(rgb & 0xFF, (rgb >> 8) & 0xFF, (rgb >> 16) & 0xFF);240}241242template<>243unsigned int Vec3Packed<float>::ToRGB() const244{245return ((unsigned int)(r()*255.f)) +246((unsigned int)(g()*255.f*256.f)) +247((unsigned int)(b()*255.f*256.f*256.f));248}249250template<>251unsigned int Vec3Packed<int>::ToRGB() const252{253return (r()&0xFF) | ((g()&0xFF)<<8) | ((b()&0xFF)<<16);254}255256template<>257float Vec3Packed<float>::Length() const258{259return sqrtf(Length2());260}261262template<>263void Vec3Packed<float>::SetLength(const float l)264{265(*this) *= l / Length();266}267268template<>269Vec3Packed<float> Vec3Packed<float>::WithLength(const float l) const270{271return (*this) * l / Length();272}273274template<>275float Vec3Packed<float>::Distance2To(const Vec3Packed<float> &other) const {276return Vec3Packed<float>(other-(*this)).Length2();277}278279template<>280Vec3Packed<float> Vec3Packed<float>::Normalized() const281{282return (*this) / Length();283}284285template<>286float Vec3Packed<float>::Normalize()287{288float len = Length();289(*this) = (*this)/len;290return len;291}292293template<>294float Vec4<float>::Length() const295{296#if defined(_M_SSE)297float ret;298__m128 xyzw = _mm_loadu_ps(&x);299__m128 sq = _mm_mul_ps(xyzw, xyzw);300const __m128 r2 = _mm_add_ps(sq, _mm_movehl_ps(sq, sq));301const __m128 res = _mm_add_ss(r2, _mm_shuffle_ps(r2, r2, _MM_SHUFFLE(0, 0, 0, 1)));302_mm_store_ss(&ret, _mm_sqrt_ss(res));303return ret;304#elif PPSSPP_ARCH(ARM64_NEON)305float32x4_t sq = vmulq_f32(vec, vec);306float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));307float32x2_t add2 = vpadd_f32(add1, add1);308float32x2_t res = vsqrt_f32(add2);309return vget_lane_f32(res, 0);310#else311return sqrtf(Length2());312#endif313}314315template<>316void Vec4<float>::SetLength(const float l)317{318(*this) *= l / Length();319}320321template<>322Vec4<float> Vec4<float>::WithLength(const float l) const323{324return (*this) * l / Length();325}326327template<>328float Vec4<float>::Distance2To(const Vec4<float> &other) const {329return Vec4<float>(other-(*this)).Length2();330}331332template<>333Vec4<float> Vec4<float>::Normalized() const334{335return (*this) / Length();336}337338template<>339float Vec4<float>::Normalize()340{341float len = Length();342(*this) = (*this)/len;343return len;344}345346}; // namespace Math3D347348349