CoCalc -- Math3D.cpp

CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

GitHub Repository: hrydgard/ppsspp
Path: blob/master/GPU/Math3D.cpp
Views: 1401
1
// Copyright (c) 2012- PPSSPP Project.
2

3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6

7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
// GNU General Public License 2.0 for more details.
11

12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14

15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17

18
#include "Common/Common.h"
19
#include "GPU/Math3D.h"
20

21
namespace Math3D {
22

23
template<>
24
float Vec2<float>::Length() const
25
{
26
	// Doubt this is worth it for a vec2 :/
27
#if defined(_M_SSE)
28
	float ret;
29
	__m128d tmp = _mm_load_sd((const double*)&x);
30
	__m128 xy = _mm_castpd_ps(tmp);
31
	__m128 sq = _mm_mul_ps(xy, xy);
32
	const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1));
33
	const __m128 res = _mm_add_ss(sq, r2);
34
	_mm_store_ss(&ret, _mm_sqrt_ss(res));
35
	return ret;
36
#elif PPSSPP_ARCH(ARM64_NEON)
37
	float32x2_t vec = vld1_f32(&x);
38
	float32x2_t sq = vmul_f32(vec, vec);
39
	float32x2_t add2 = vpadd_f32(sq, sq);
40
	float32x2_t res = vsqrt_f32(add2);
41
	return vget_lane_f32(res, 0);
42
#else
43
	return sqrtf(Length2());
44
#endif
45
}
46

47
template<>
48
void Vec2<float>::SetLength(const float l)
49
{
50
	(*this) *= l / Length();
51
}
52

53
template<>
54
Vec2<float> Vec2<float>::WithLength(const float l) const
55
{
56
	return (*this) * l / Length();
57
}
58

59
template<>
60
float Vec2<float>::Distance2To(const Vec2<float> &other) const {
61
	return Vec2<float>(other-(*this)).Length2();
62
}
63

64
template<>
65
Vec2<float> Vec2<float>::Normalized() const
66
{
67
	return (*this) / Length();
68
}
69

70
template<>
71
float Vec2<float>::Normalize()
72
{
73
	float len = Length();
74
	(*this) = (*this)/len;
75
	return len;
76
}
77

78
template<>
79
float Vec3<float>::Length() const
80
{
81
#if defined(_M_SSE)
82
	float ret;
83
	__m128 xyz = _mm_loadu_ps(&x);
84
	__m128 sq = _mm_mul_ps(xyz, xyz);
85
	const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1));
86
	const __m128 r3 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 2));
87
	const __m128 res = _mm_add_ss(sq, _mm_add_ss(r2, r3));
88
	_mm_store_ss(&ret, _mm_sqrt_ss(res));
89
	return ret;
90
#elif PPSSPP_ARCH(ARM64_NEON)
91
	float32x4_t sq = vsetq_lane_f32(0.0f, vmulq_f32(vec, vec), 3);
92
	float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));
93
	float32x2_t add2 = vpadd_f32(add1, add1);
94
	float32x2_t res = vsqrt_f32(add2);
95
	return vget_lane_f32(res, 0);
96
#else
97
	return sqrtf(Length2());
98
#endif
99
}
100

101
template<>
102
void Vec3<float>::SetLength(const float l)
103
{
104
	(*this) *= l / Length();
105
}
106

107
template<>
108
Vec3<float> Vec3<float>::WithLength(const float l) const
109
{
110
	return (*this) * l / Length();
111
}
112

113
template<>
114
float Vec3<float>::Distance2To(const Vec3<float> &other) const {
115
	return Vec3<float>(other-(*this)).Length2();
116
}
117

118
#if defined(_M_SSE)
119
// Returns the approximate reciprocal length of the xyz lanes of v,
// broadcast to all four lanes. Lane 3 of v does not enter the sum.
__m128 SSENormalizeMultiplierSSE2(__m128 v)
{
	const __m128 squared = _mm_mul_ps(v, v);
	const __m128 ySquared = _mm_shuffle_ps(squared, squared, _MM_SHUFFLE(0, 0, 0, 1));
	const __m128 zSquared = _mm_shuffle_ps(squared, squared, _MM_SHUFFLE(0, 0, 0, 2));
	// Lane 0: z*z + (y*y + x*x).
	const __m128 xySum = _mm_add_ss(ySquared, squared);
	const __m128 total = _mm_add_ss(zSquared, xySum);

	// Scalar reciprocal-sqrt estimate of lane 0, then splat it across the register.
	const __m128 invLength = _mm_rsqrt_ss(total);
	return _mm_shuffle_ps(invLength, invLength, _MM_SHUFFLE(0, 0, 0, 0));
}
129

130
#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
131
[[gnu::target("sse4.1")]]
132
#endif
133
__m128 SSENormalizeMultiplierSSE4(__m128 v)
134
{
135
	// This is only used for Vec3f, so ignore the 4th component, might be garbage.
136
	return _mm_rsqrt_ps(_mm_dp_ps(v, v, 0x77));
137
}
138

139
// Dispatches to the SSE4.1 or the SSE2 reciprocal-length helper based on useSSE4.
__m128 SSENormalizeMultiplier(bool useSSE4, __m128 v)
{
	return useSSE4 ? SSENormalizeMultiplierSSE4(v) : SSENormalizeMultiplierSSE2(v);
}
145

146
template<>
147
Vec3<float> Vec3<float>::Normalized(bool useSSE4) const
148
{
149
	const __m128 normalize = SSENormalizeMultiplier(useSSE4, vec);
150
	return _mm_mul_ps(normalize, vec);
151
}
152

153
template<>
154
Vec3<float> Vec3<float>::NormalizedOr001(bool useSSE4) const {
155
	const __m128 normalize = SSENormalizeMultiplier(useSSE4, vec);
156
	const __m128 result = _mm_mul_ps(normalize, vec);
157
	const __m128 mask = _mm_cmpunord_ps(result, vec);
158
	const __m128 replace = _mm_and_ps(_mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f), mask);
159
	// Replace with the constant if the mask matched.
160
	return _mm_or_ps(_mm_andnot_ps(mask, result), replace);
161
}
162
#elif PPSSPP_ARCH(ARM64_NEON)
163
template<>
164
Vec3<float> Vec3<float>::Normalized(bool useSSE4) const {
165
	float32x4_t sq = vsetq_lane_f32(0.0f, vmulq_f32(vec, vec), 3);
166
	float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));
167
	float32x2_t summed = vpadd_f32(add1, add1);
168

169
	float32x2_t e = vrsqrte_f32(summed);
170
	e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);
171
	e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);
172

173
	float32x4_t factor = vdupq_lane_f32(e, 0);
174
	return Vec3<float>(vmulq_f32(vec, factor));
175
}
176

177
template<>
178
Vec3<float> Vec3<float>::NormalizedOr001(bool useSSE4) const {
179
	float32x4_t sq = vsetq_lane_f32(0.0f, vmulq_f32(vec, vec), 3);
180
	float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));
181
	float32x2_t summed = vpadd_f32(add1, add1);
182
	if (vget_lane_f32(summed, 0) == 0.0f) {
183
		return Vec3<float>(vsetq_lane_f32(1.0f, vdupq_lane_f32(summed, 0), 2));
184
	}
185

186
	float32x2_t e = vrsqrte_f32(summed);
187
	e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);
188
	e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);
189

190
	float32x4_t factor = vdupq_lane_f32(e, 0);
191
	return Vec3<float>(vmulq_f32(vec, factor));
192
}
193
#else
194
template<>
195
Vec3<float> Vec3<float>::Normalized(bool useSSE4) const
196
{
197
	return (*this) / Length();
198
}
199

200
template<>
201
Vec3<float> Vec3<float>::NormalizedOr001(bool useSSE4) const {
202
	float len = Length();
203
	if (len == 0.0f) {
204
		return Vec3<float>(0.0f, 0.0f, 1.0f);
205
	}
206
	return *this / len;
207
}
208
#endif
209

210
template<>
211
float Vec3<float>::Normalize()
212
{
213
	float len = Length();
214
	(*this) = (*this)/len;
215
	return len;
216
}
217

218
template<>
219
float Vec3<float>::NormalizeOr001() {
220
	float len = Length();
221
	if (len == 0.0f) {
222
		z = 1.0f;
223
	} else {
224
		*this /= len;
225
	}
226
	return len;
227
}
228

229
template<>
230
Vec3Packed<float> Vec3Packed<float>::FromRGB(unsigned int rgb)
231
{
232
	return Vec3Packed((rgb & 0xFF) * (1.0f/255.0f),
233
				((rgb >> 8) & 0xFF) * (1.0f/255.0f),
234
				((rgb >> 16) & 0xFF) * (1.0f/255.0f));
235
}
236

237
template<>
238
Vec3Packed<int> Vec3Packed<int>::FromRGB(unsigned int rgb)
239
{
240
	return Vec3Packed(rgb & 0xFF, (rgb >> 8) & 0xFF, (rgb >> 16) & 0xFF);
241
}
242

243
template<>
244
unsigned int Vec3Packed<float>::ToRGB() const
245
{
246
	return ((unsigned int)(r()*255.f)) +
247
			((unsigned int)(g()*255.f*256.f)) +
248
			((unsigned int)(b()*255.f*256.f*256.f));
249
}
250

251
template<>
252
unsigned int Vec3Packed<int>::ToRGB() const
253
{
254
	return (r()&0xFF) | ((g()&0xFF)<<8) | ((b()&0xFF)<<16);
255
}
256

257
template<>
258
float Vec3Packed<float>::Length() const
259
{
260
	return sqrtf(Length2());
261
}
262

263
template<>
264
void Vec3Packed<float>::SetLength(const float l)
265
{
266
	(*this) *= l / Length();
267
}
268

269
template<>
270
Vec3Packed<float> Vec3Packed<float>::WithLength(const float l) const
271
{
272
	return (*this) * l / Length();
273
}
274

275
template<>
276
float Vec3Packed<float>::Distance2To(const Vec3Packed<float> &other) const {
277
	return Vec3Packed<float>(other-(*this)).Length2();
278
}
279

280
template<>
281
Vec3Packed<float> Vec3Packed<float>::Normalized() const
282
{
283
	return (*this) / Length();
284
}
285

286
template<>
287
float Vec3Packed<float>::Normalize()
288
{
289
	float len = Length();
290
	(*this) = (*this)/len;
291
	return len;
292
}
293

294
template<>
295
float Vec4<float>::Length() const
296
{
297
#if defined(_M_SSE)
298
	float ret;
299
	__m128 xyzw = _mm_loadu_ps(&x);
300
	__m128 sq = _mm_mul_ps(xyzw, xyzw);
301
	const __m128 r2 = _mm_add_ps(sq, _mm_movehl_ps(sq, sq));
302
	const __m128 res = _mm_add_ss(r2, _mm_shuffle_ps(r2, r2, _MM_SHUFFLE(0, 0, 0, 1)));
303
	_mm_store_ss(&ret, _mm_sqrt_ss(res));
304
	return ret;
305
#elif PPSSPP_ARCH(ARM64_NEON)
306
	float32x4_t sq = vmulq_f32(vec, vec);
307
	float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));
308
	float32x2_t add2 = vpadd_f32(add1, add1);
309
	float32x2_t res = vsqrt_f32(add2);
310
	return vget_lane_f32(res, 0);
311
#else
312
	return sqrtf(Length2());
313
#endif
314
}
315

316
template<>
317
void Vec4<float>::SetLength(const float l)
318
{
319
	(*this) *= l / Length();
320
}
321

322
template<>
323
Vec4<float> Vec4<float>::WithLength(const float l) const
324
{
325
	return (*this) * l / Length();
326
}
327

328
template<>
329
float Vec4<float>::Distance2To(const Vec4<float> &other) const {
330
	return Vec4<float>(other-(*this)).Length2();
331
}
332

333
template<>
334
Vec4<float> Vec4<float>::Normalized() const
335
{
336
	return (*this) / Length();
337
}
338

339
template<>
340
float Vec4<float>::Normalize()
341
{
342
	float len = Length();
343
	(*this) = (*this)/len;
344
	return len;
345
}
346

347
}; // namespace Math3D
348

349
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

Product

Resources

Company