CoCalc Logo Icon
Store · Features · Docs · Share · Support · News · About · Sign Up · Sign In
hrydgard

CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

GitHub Repository: hrydgard/ppsspp
Path: blob/master/GPU/Math3D.cpp
Views: 1401
1
// Copyright (c) 2012- PPSSPP Project.
2
3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6
7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
// GNU General Public License 2.0 for more details.
11
12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14
15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17
18
#include "Common/Common.h"
19
#include "GPU/Math3D.h"
20
21
namespace Math3D {
22
23
template<>
24
float Vec2<float>::Length() const
25
{
26
// Doubt this is worth it for a vec2 :/
27
#if defined(_M_SSE)
28
float ret;
29
__m128d tmp = _mm_load_sd((const double*)&x);
30
__m128 xy = _mm_castpd_ps(tmp);
31
__m128 sq = _mm_mul_ps(xy, xy);
32
const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1));
33
const __m128 res = _mm_add_ss(sq, r2);
34
_mm_store_ss(&ret, _mm_sqrt_ss(res));
35
return ret;
36
#elif PPSSPP_ARCH(ARM64_NEON)
37
float32x2_t vec = vld1_f32(&x);
38
float32x2_t sq = vmul_f32(vec, vec);
39
float32x2_t add2 = vpadd_f32(sq, sq);
40
float32x2_t res = vsqrt_f32(add2);
41
return vget_lane_f32(res, 0);
42
#else
43
return sqrtf(Length2());
44
#endif
45
}
46
47
template<>
48
void Vec2<float>::SetLength(const float l)
49
{
50
(*this) *= l / Length();
51
}
52
53
template<>
54
Vec2<float> Vec2<float>::WithLength(const float l) const
55
{
56
return (*this) * l / Length();
57
}
58
59
template<>
60
float Vec2<float>::Distance2To(const Vec2<float> &other) const {
61
return Vec2<float>(other-(*this)).Length2();
62
}
63
64
template<>
65
Vec2<float> Vec2<float>::Normalized() const
66
{
67
return (*this) / Length();
68
}
69
70
template<>
71
float Vec2<float>::Normalize()
72
{
73
float len = Length();
74
(*this) = (*this)/len;
75
return len;
76
}
77
78
template<>
79
float Vec3<float>::Length() const
80
{
81
#if defined(_M_SSE)
82
float ret;
83
__m128 xyz = _mm_loadu_ps(&x);
84
__m128 sq = _mm_mul_ps(xyz, xyz);
85
const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1));
86
const __m128 r3 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 2));
87
const __m128 res = _mm_add_ss(sq, _mm_add_ss(r2, r3));
88
_mm_store_ss(&ret, _mm_sqrt_ss(res));
89
return ret;
90
#elif PPSSPP_ARCH(ARM64_NEON)
91
float32x4_t sq = vsetq_lane_f32(0.0f, vmulq_f32(vec, vec), 3);
92
float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));
93
float32x2_t add2 = vpadd_f32(add1, add1);
94
float32x2_t res = vsqrt_f32(add2);
95
return vget_lane_f32(res, 0);
96
#else
97
return sqrtf(Length2());
98
#endif
99
}
100
101
template<>
102
void Vec3<float>::SetLength(const float l)
103
{
104
(*this) *= l / Length();
105
}
106
107
template<>
108
Vec3<float> Vec3<float>::WithLength(const float l) const
109
{
110
return (*this) * l / Length();
111
}
112
113
template<>
114
float Vec3<float>::Distance2To(const Vec3<float> &other) const {
115
return Vec3<float>(other-(*this)).Length2();
116
}
117
118
#if defined(_M_SSE)
119
// Computes the factor that normalizes the xyz part of v, using only baseline
// SSE instructions. Returns the reciprocal square root estimate of
// (x*x + y*y + z*z) broadcast to all four lanes; lane 3 of v is ignored.
__m128 SSENormalizeMultiplierSSE2(__m128 v)
{
	const __m128 squared = _mm_mul_ps(v, v);
	// Bring the y and z squares down to lane 0 so scalar adds can sum them.
	const __m128 lane1 = _mm_shuffle_ps(squared, squared, _MM_SHUFFLE(0, 0, 0, 1));
	const __m128 lane2 = _mm_shuffle_ps(squared, squared, _MM_SHUFFLE(0, 0, 0, 2));
	const __m128 sum = _mm_add_ss(lane2, _mm_add_ss(lane1, squared));

	// _mm_rsqrt_ss is the hardware estimate, not an exact 1/sqrt.
	const __m128 invLength = _mm_rsqrt_ss(sum);
	return _mm_shuffle_ps(invLength, invLength, _MM_SHUFFLE(0, 0, 0, 0));
}
129
130
#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
131
[[gnu::target("sse4.1")]]
132
#endif
133
__m128 SSENormalizeMultiplierSSE4(__m128 v)
134
{
135
// This is only used for Vec3f, so ignore the 4th component, might be garbage.
136
return _mm_rsqrt_ps(_mm_dp_ps(v, v, 0x77));
137
}
138
139
// Picks the SSE4.1 or the baseline SSE2 normalize-multiplier implementation
// according to useSSE4 and forwards v to it.
__m128 SSENormalizeMultiplier(bool useSSE4, __m128 v)
{
	return useSSE4 ? SSENormalizeMultiplierSSE4(v) : SSENormalizeMultiplierSSE2(v);
}
145
146
template<>
147
Vec3<float> Vec3<float>::Normalized(bool useSSE4) const
148
{
149
const __m128 normalize = SSENormalizeMultiplier(useSSE4, vec);
150
return _mm_mul_ps(normalize, vec);
151
}
152
153
template<>
154
Vec3<float> Vec3<float>::NormalizedOr001(bool useSSE4) const {
155
const __m128 normalize = SSENormalizeMultiplier(useSSE4, vec);
156
const __m128 result = _mm_mul_ps(normalize, vec);
157
const __m128 mask = _mm_cmpunord_ps(result, vec);
158
const __m128 replace = _mm_and_ps(_mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f), mask);
159
// Replace with the constant if the mask matched.
160
return _mm_or_ps(_mm_andnot_ps(mask, result), replace);
161
}
162
#elif PPSSPP_ARCH(ARM64_NEON)
163
template<>
164
Vec3<float> Vec3<float>::Normalized(bool useSSE4) const {
165
float32x4_t sq = vsetq_lane_f32(0.0f, vmulq_f32(vec, vec), 3);
166
float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));
167
float32x2_t summed = vpadd_f32(add1, add1);
168
169
float32x2_t e = vrsqrte_f32(summed);
170
e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);
171
e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);
172
173
float32x4_t factor = vdupq_lane_f32(e, 0);
174
return Vec3<float>(vmulq_f32(vec, factor));
175
}
176
177
template<>
178
Vec3<float> Vec3<float>::NormalizedOr001(bool useSSE4) const {
179
float32x4_t sq = vsetq_lane_f32(0.0f, vmulq_f32(vec, vec), 3);
180
float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));
181
float32x2_t summed = vpadd_f32(add1, add1);
182
if (vget_lane_f32(summed, 0) == 0.0f) {
183
return Vec3<float>(vsetq_lane_f32(1.0f, vdupq_lane_f32(summed, 0), 2));
184
}
185
186
float32x2_t e = vrsqrte_f32(summed);
187
e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);
188
e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);
189
190
float32x4_t factor = vdupq_lane_f32(e, 0);
191
return Vec3<float>(vmulq_f32(vec, factor));
192
}
193
#else
194
template<>
195
Vec3<float> Vec3<float>::Normalized(bool useSSE4) const
196
{
197
return (*this) / Length();
198
}
199
200
template<>
201
Vec3<float> Vec3<float>::NormalizedOr001(bool useSSE4) const {
202
float len = Length();
203
if (len == 0.0f) {
204
return Vec3<float>(0.0f, 0.0f, 1.0f);
205
}
206
return *this / len;
207
}
208
#endif
209
210
template<>
211
float Vec3<float>::Normalize()
212
{
213
float len = Length();
214
(*this) = (*this)/len;
215
return len;
216
}
217
218
template<>
219
float Vec3<float>::NormalizeOr001() {
220
float len = Length();
221
if (len == 0.0f) {
222
z = 1.0f;
223
} else {
224
*this /= len;
225
}
226
return len;
227
}
228
229
template<>
230
Vec3Packed<float> Vec3Packed<float>::FromRGB(unsigned int rgb)
231
{
232
return Vec3Packed((rgb & 0xFF) * (1.0f/255.0f),
233
((rgb >> 8) & 0xFF) * (1.0f/255.0f),
234
((rgb >> 16) & 0xFF) * (1.0f/255.0f));
235
}
236
237
template<>
238
Vec3Packed<int> Vec3Packed<int>::FromRGB(unsigned int rgb)
239
{
240
return Vec3Packed(rgb & 0xFF, (rgb >> 8) & 0xFF, (rgb >> 16) & 0xFF);
241
}
242
243
template<>
244
unsigned int Vec3Packed<float>::ToRGB() const
245
{
246
return ((unsigned int)(r()*255.f)) +
247
((unsigned int)(g()*255.f*256.f)) +
248
((unsigned int)(b()*255.f*256.f*256.f));
249
}
250
251
template<>
252
unsigned int Vec3Packed<int>::ToRGB() const
253
{
254
return (r()&0xFF) | ((g()&0xFF)<<8) | ((b()&0xFF)<<16);
255
}
256
257
template<>
258
float Vec3Packed<float>::Length() const
259
{
260
return sqrtf(Length2());
261
}
262
263
template<>
264
void Vec3Packed<float>::SetLength(const float l)
265
{
266
(*this) *= l / Length();
267
}
268
269
template<>
270
Vec3Packed<float> Vec3Packed<float>::WithLength(const float l) const
271
{
272
return (*this) * l / Length();
273
}
274
275
template<>
276
float Vec3Packed<float>::Distance2To(const Vec3Packed<float> &other) const {
277
return Vec3Packed<float>(other-(*this)).Length2();
278
}
279
280
template<>
281
Vec3Packed<float> Vec3Packed<float>::Normalized() const
282
{
283
return (*this) / Length();
284
}
285
286
template<>
287
float Vec3Packed<float>::Normalize()
288
{
289
float len = Length();
290
(*this) = (*this)/len;
291
return len;
292
}
293
294
template<>
295
float Vec4<float>::Length() const
296
{
297
#if defined(_M_SSE)
298
float ret;
299
__m128 xyzw = _mm_loadu_ps(&x);
300
__m128 sq = _mm_mul_ps(xyzw, xyzw);
301
const __m128 r2 = _mm_add_ps(sq, _mm_movehl_ps(sq, sq));
302
const __m128 res = _mm_add_ss(r2, _mm_shuffle_ps(r2, r2, _MM_SHUFFLE(0, 0, 0, 1)));
303
_mm_store_ss(&ret, _mm_sqrt_ss(res));
304
return ret;
305
#elif PPSSPP_ARCH(ARM64_NEON)
306
float32x4_t sq = vmulq_f32(vec, vec);
307
float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));
308
float32x2_t add2 = vpadd_f32(add1, add1);
309
float32x2_t res = vsqrt_f32(add2);
310
return vget_lane_f32(res, 0);
311
#else
312
return sqrtf(Length2());
313
#endif
314
}
315
316
template<>
317
void Vec4<float>::SetLength(const float l)
318
{
319
(*this) *= l / Length();
320
}
321
322
template<>
323
Vec4<float> Vec4<float>::WithLength(const float l) const
324
{
325
return (*this) * l / Length();
326
}
327
328
template<>
329
float Vec4<float>::Distance2To(const Vec4<float> &other) const {
330
return Vec4<float>(other-(*this)).Length2();
331
}
332
333
template<>
334
Vec4<float> Vec4<float>::Normalized() const
335
{
336
return (*this) / Length();
337
}
338
339
template<>
340
float Vec4<float>::Normalize()
341
{
342
float len = Length();
343
(*this) = (*this)/len;
344
return len;
345
}
346
347
}; // namespace Math3D
348
349