Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/embree/common/math/vec3fa.h
9912 views
1
// Copyright 2009-2021 Intel Corporation
2
// SPDX-License-Identifier: Apache-2.0
3
4
#pragma once
5
6
#include "../sys/alloc.h"
7
#include "emath.h"
8
9
#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
10
# include "vec3fa_sycl.h"
11
#else
12
13
#include "../simd/sse.h"
14
15
namespace embree
16
{
17
////////////////////////////////////////////////////////////////////////////////
18
/// SSE Vec3fa Type
19
////////////////////////////////////////////////////////////////////////////////
20
21
/*! 3-wide SSE float vector with 16-byte alignment; the 4th SIMD lane is
 *  padding (cleared by load(), but otherwise unspecified). */
struct __aligned(16) Vec3fa
{
  ALIGNED_STRUCT_(16);

  typedef float Scalar;
  enum { N = 3 };
  union {
    __m128 m128;              // full 4-lane SSE register
    struct { float x,y,z; };  // named access to the 3 active lanes
  };

  ////////////////////////////////////////////////////////////////////////////////
  /// Constructors, Assignment & Cast Operators
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline Vec3fa( ) {}
  __forceinline Vec3fa( const __m128 a ) : m128(a) {}

  __forceinline Vec3fa ( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); }
  //__forceinline Vec3fa& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; }

  __forceinline Vec3fa ( const Vec3fa& other ) { m128 = other.m128; }
  __forceinline Vec3fa& operator =( const Vec3fa& other ) { m128 = other.m128; return *this; }

  __forceinline explicit Vec3fa( const float a ) : m128(_mm_set1_ps(a)) {}
  __forceinline Vec3fa( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {}

  /*! converts 4 packed int32 lanes to float */
  __forceinline explicit Vec3fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}

  __forceinline explicit operator const vfloat4() const { return vfloat4(m128); }
  __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); }
  __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); }
  __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); }

  //__forceinline operator const __m128&() const { return m128; }
  //__forceinline operator __m128&() { return m128; }

  ////////////////////////////////////////////////////////////////////////////////
  /// Loads and Stores
  ////////////////////////////////////////////////////////////////////////////////

  /*! aligned 16-byte load; the 4th (padding) lane is cleared to 0 */
  static __forceinline Vec3fa load( const void* const a ) {
#if defined(__aarch64__)
    __m128 t = _mm_load_ps((float*)a);
    t[3] = 0.0f;
    return Vec3fa(t);
#else
    // mask off lane 3 with an all-ones mask on lanes 0-2
    return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));
#endif
  }

  /*! unaligned load; NOTE: reads 16 bytes and keeps lane 3 as loaded */
  static __forceinline Vec3fa loadu( const void* const a ) {
    return Vec3fa(_mm_loadu_ps((float*)a));
  }

  /*! unaligned store; NOTE: writes all 16 bytes including the padding lane */
  static __forceinline void storeu ( void* ptr, const Vec3fa& v ) {
    _mm_storeu_ps((float*)ptr,v.m128);
  }

  ////////////////////////////////////////////////////////////////////////////////
  /// Constants
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline Vec3fa( ZeroTy   ) : m128(_mm_setzero_ps()) {}
  __forceinline Vec3fa( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
  __forceinline Vec3fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
  __forceinline Vec3fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}

  ////////////////////////////////////////////////////////////////////////////////
  /// Array Access
  ////////////////////////////////////////////////////////////////////////////////

  /*! indexed access to x/y/z; relies on x,y,z being contiguous in the union */
  __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
  __forceinline       float& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; }
};
96
97
////////////////////////////////////////////////////////////////////////////////
98
/// Unary Operators
99
////////////////////////////////////////////////////////////////////////////////
100
101
/*! unary plus: identity */
__forceinline Vec3fa operator +( const Vec3fa& a ) { return a; }

/*! componentwise negation (flips the sign bit of all 4 lanes) */
__forceinline Vec3fa operator -( const Vec3fa& a ) {
#if defined(__aarch64__)
  return vnegq_f32(a.m128);
#else
  const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
  return _mm_xor_ps(a.m128, mask);
#endif
}

/*! componentwise absolute value (clears the sign bit) */
__forceinline Vec3fa abs ( const Vec3fa& a ) {
#if defined(__aarch64__)
  return _mm_abs_ps(a.m128);
#else
  const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
  return _mm_and_ps(a.m128, mask);
#endif
}

/*! componentwise sign: -1 where a < 0, +1 otherwise (0 maps to +1) */
__forceinline Vec3fa sign ( const Vec3fa& a ) {
  return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128));
}
121
122
/*! componentwise reciprocal: hardware rcp approximation refined with one
 *  Newton-Raphson step (a full-precision divide on AArch64) */
__forceinline Vec3fa rcp ( const Vec3fa& a )
{
#if defined(__aarch64__)
  return vdivq_f32(vdupq_n_f32(1.0f),a.m128);
#else

#if defined(__AVX512VL__)
  const Vec3fa r = _mm_rcp14_ps(a.m128);
#else
  const Vec3fa r = _mm_rcp_ps(a.m128);
#endif

#if defined(__AVX2__)
  const Vec3fa h_n = _mm_fnmadd_ps(a.m128, r.m128, vfloat4(1.0)); // First, compute 1 - a * r (which will be very close to 0)
  const Vec3fa res = _mm_fmadd_ps(r.m128, h_n.m128, r.m128);      // Then compute r + r * h_n
#else
  const Vec3fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a.m128, r.m128)); // First, compute 1 - a * r (which will be very close to 0)
  const Vec3fa res = _mm_add_ps(r.m128,_mm_mul_ps(r.m128, h_n.m128));       // Then compute r + r * h_n
#endif

  return res;
#endif //defined(__aarch64__)
}
145
146
/*! componentwise square root */
__forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); }
/*! componentwise square (a*a) */
__forceinline Vec3fa sqr  ( const Vec3fa& a ) { return _mm_mul_ps(a.m128,a.m128); }
148
149
/*! componentwise reciprocal square root: hardware estimate refined by
 *  Newton-Raphson (two vrsqrts steps on AArch64, one polynomial step on x86) */
__forceinline Vec3fa rsqrt( const Vec3fa& a )
{
#if defined(__aarch64__)
  __m128 r = _mm_rsqrt_ps(a.m128);
  r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
  r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
  return r;
#else

#if defined(__AVX512VL__)
  __m128 r = _mm_rsqrt14_ps(a.m128);
#else
  __m128 r = _mm_rsqrt_ps(a.m128);
#endif
  // one Newton-Raphson iteration: r' = 1.5*r - 0.5*a*r^3
  return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
#endif
}
166
167
/*! replaces components with |a| < min_rcp_input by min_rcp_input so that a
 *  following rcp() cannot overflow */
__forceinline Vec3fa zero_fix(const Vec3fa& a) {
  return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
}
/*! reciprocal that is safe for (near-)zero input */
__forceinline Vec3fa rcp_safe(const Vec3fa& a) {
  return rcp(zero_fix(a));
}
/*! componentwise natural logarithm (scalar fallback over the 3 active lanes) */
__forceinline Vec3fa log ( const Vec3fa& a ) {
  return Vec3fa(logf(a.x),logf(a.y),logf(a.z));
}

/*! componentwise exponential (scalar fallback over the 3 active lanes) */
__forceinline Vec3fa exp ( const Vec3fa& a ) {
  return Vec3fa(expf(a.x),expf(a.y),expf(a.z));
}
180
181
////////////////////////////////////////////////////////////////////////////////
182
/// Binary Operators
183
////////////////////////////////////////////////////////////////////////////////
184
185
/*! componentwise arithmetic; float overloads broadcast the scalar */
__forceinline Vec3fa operator +( const Vec3fa& a, const Vec3fa& b ) { return _mm_add_ps(a.m128, b.m128); }
__forceinline Vec3fa operator -( const Vec3fa& a, const Vec3fa& b ) { return _mm_sub_ps(a.m128, b.m128); }
__forceinline Vec3fa operator *( const Vec3fa& a, const Vec3fa& b ) { return _mm_mul_ps(a.m128, b.m128); }
__forceinline Vec3fa operator *( const Vec3fa& a, const float b ) { return a * Vec3fa(b); }
__forceinline Vec3fa operator *( const float a, const Vec3fa& b ) { return Vec3fa(a) * b; }
__forceinline Vec3fa operator /( const Vec3fa& a, const Vec3fa& b ) { return _mm_div_ps(a.m128,b.m128); }
__forceinline Vec3fa operator /( const Vec3fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
__forceinline Vec3fa operator /( const float a, const Vec3fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }

__forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); }
__forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); }

#if defined(__aarch64__) || defined(__SSE4_1__)
/*! min via integer compare on the float bit patterns.
 *  NOTE(review): IEEE-754 bit ordering matches integer ordering only for
 *  non-negative values — presumably callers guarantee that; confirm. */
__forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) {
  const vint4 ai = _mm_castps_si128(a.m128);
  const vint4 bi = _mm_castps_si128(b.m128);
  const vint4 ci = _mm_min_epi32(ai,bi);
  return _mm_castsi128_ps(ci);
}
#endif

#if defined(__aarch64__) || defined(__SSE4_1__)
/*! max via integer compare on the float bit patterns (same caveat as mini) */
__forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) {
  const vint4 ai = _mm_castps_si128(a.m128);
  const vint4 bi = _mm_castps_si128(b.m128);
  const vint4 ci = _mm_max_epi32(ai,bi);
  return _mm_castsi128_ps(ci);
}
#endif

/*! componentwise power (scalar fallback) */
__forceinline Vec3fa pow ( const Vec3fa& a, const float& b ) {
  return Vec3fa(powf(a.x,b),powf(a.y,b),powf(a.z,b));
}
218
219
////////////////////////////////////////////////////////////////////////////////
220
/// Ternary Operators
221
////////////////////////////////////////////////////////////////////////////////
222
223
#if defined(__AVX2__) || defined(__ARM_NEON)
224
__forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); }
225
__forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }
226
__forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }
227
__forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }
228
#else
229
__forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; }
230
__forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;}
231
__forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; }
232
__forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; }
233
#endif
234
235
__forceinline Vec3fa madd ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); }
236
__forceinline Vec3fa msub ( const float a, const Vec3fa& b, const Vec3fa& c) { return msub(Vec3fa(a),b,c); }
237
__forceinline Vec3fa nmadd ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmadd(Vec3fa(a),b,c); }
238
__forceinline Vec3fa nmsub ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmsub(Vec3fa(a),b,c); }
239
240
////////////////////////////////////////////////////////////////////////////////
241
/// Assignment Operators
242
////////////////////////////////////////////////////////////////////////////////
243
244
/*! compound assignment, defined in terms of the binary operators */
__forceinline Vec3fa& operator +=( Vec3fa& a, const Vec3fa& b ) { return a = a + b; }
__forceinline Vec3fa& operator -=( Vec3fa& a, const Vec3fa& b ) { return a = a - b; }
__forceinline Vec3fa& operator *=( Vec3fa& a, const Vec3fa& b ) { return a = a * b; }
__forceinline Vec3fa& operator *=( Vec3fa& a, const float   b ) { return a = a * b; }
__forceinline Vec3fa& operator /=( Vec3fa& a, const Vec3fa& b ) { return a = a / b; }
__forceinline Vec3fa& operator /=( Vec3fa& a, const float   b ) { return a = a / b; }
250
251
////////////////////////////////////////////////////////////////////////////////
252
/// Reductions
253
////////////////////////////////////////////////////////////////////////////////
254
#if defined(__aarch64__)
255
__forceinline float reduce_add(const Vec3fa& v) {
256
float32x4_t t = v.m128;
257
t[3] = 0.0f;
258
return vaddvq_f32(t);
259
}
260
261
__forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
262
__forceinline float reduce_min(const Vec3fa& v) {
263
float32x4_t t = v.m128;
264
t[3] = t[2];
265
return vminvq_f32(t);
266
}
267
__forceinline float reduce_max(const Vec3fa& v) {
268
float32x4_t t = v.m128;
269
t[3] = t[2];
270
return vmaxvq_f32(t);
271
}
272
#else
273
__forceinline float reduce_add(const Vec3fa& v) {
274
const vfloat4 a(v.m128);
275
const vfloat4 b = shuffle<1>(a);
276
const vfloat4 c = shuffle<2>(a);
277
return _mm_cvtss_f32(a+b+c);
278
}
279
280
__forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
281
__forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); }
282
__forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); }
283
#endif
284
285
////////////////////////////////////////////////////////////////////////////////
286
/// Comparison Operators
287
////////////////////////////////////////////////////////////////////////////////
288
289
/*! equality/inequality over the 3 active lanes only (movemask bits 0-2) */
__forceinline bool operator ==( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; }
__forceinline bool operator !=( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; }

/*! componentwise comparison masks */
__forceinline Vec3ba eq_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpeq_ps (a.m128, b.m128); }
__forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); }
__forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); }
__forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); }
#if defined(__aarch64__)
__forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpgt_ps (a.m128, b.m128); }
__forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpge_ps (a.m128, b.m128); }
#else
// not-less-or-equal / not-less: differ from cmpgt/cmpge only for NaN inputs
__forceinline Vec3ba gt_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnle_ps(a.m128, b.m128); }
__forceinline Vec3ba ge_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnlt_ps(a.m128, b.m128); }
#endif

/*! true when all 3 components are strictly inside (-FLT_LARGE, +FLT_LARGE) */
__forceinline bool isvalid ( const Vec3fa& v ) {
  return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE)));
}

/*! true when all 3 components lie in [-FLT_MAX, +FLT_MAX] */
__forceinline bool is_finite ( const Vec3fa& a ) {
  return all(ge_mask(a,Vec3fa(-FLT_MAX)) & le_mask(a,Vec3fa(+FLT_MAX)));
}

/*! like isvalid, but also checks the 4th (padding) lane */
__forceinline bool isvalid4 ( const Vec3fa& v ) {
  return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE)));
}

/*! like is_finite, but also checks the 4th (padding) lane */
__forceinline bool is_finite4 ( const Vec3fa& a ) {
  return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX)));
}
319
320
////////////////////////////////////////////////////////////////////////////////
321
/// Euclidean Space Operators
322
////////////////////////////////////////////////////////////////////////////////
323
324
#if defined(__SSE4_1__)
325
__forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
326
return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F));
327
}
328
#else
329
__forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
330
return reduce_add(a*b);
331
}
332
#endif
333
334
__forceinline Vec3fa cross ( const Vec3fa& a, const Vec3fa& b )
335
{
336
vfloat4 a0 = vfloat4(a.m128);
337
vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128));
338
vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128));
339
vfloat4 b1 = vfloat4(b.m128);
340
return Vec3fa(shuffle<1,2,0,3>(msub(a0,b0,a1*b1)));
341
}
342
343
__forceinline float sqr_length ( const Vec3fa& a ) { return dot(a,a); }
344
__forceinline float rcp_length ( const Vec3fa& a ) { return rsqrt(dot(a,a)); }
345
__forceinline float rcp_length2( const Vec3fa& a ) { return rcp(dot(a,a)); }
346
__forceinline float length ( const Vec3fa& a ) { return sqrt(dot(a,a)); }
347
__forceinline Vec3fa normalize( const Vec3fa& a ) { return a*rsqrt(dot(a,a)); }
348
__forceinline float distance ( const Vec3fa& a, const Vec3fa& b ) { return length(a-b); }
349
__forceinline float halfArea ( const Vec3fa& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); }
350
__forceinline float area ( const Vec3fa& d ) { return 2.0f*halfArea(d); }
351
352
__forceinline Vec3fa normalize_safe( const Vec3fa& a ) {
353
const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d);
354
}
355
356
/*! differentiated normalization */
357
__forceinline Vec3fa dnormalize(const Vec3fa& p, const Vec3fa& dp)
358
{
359
const float pp = dot(p,p);
360
const float pdp = dot(p,dp);
361
return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp);
362
}
363
364
////////////////////////////////////////////////////////////////////////////////
365
/// Select
366
////////////////////////////////////////////////////////////////////////////////
367
368
/*! scalar select: t when s is true, f otherwise (mask is all-ones or all-zeros) */
__forceinline Vec3fa select( bool s, const Vec3fa& t, const Vec3fa& f ) {
  __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
  return blendv_ps(f.m128, t.m128, mask);
}

/*! componentwise select driven by the mask s */
__forceinline Vec3fa select( const Vec3ba& s, const Vec3fa& t, const Vec3fa& f ) {
  return blendv_ps(f.m128, t.m128, s);
}

/*! linear interpolation: (1-t)*v0 + t*v1 */
__forceinline Vec3fa lerp(const Vec3fa& v0, const Vec3fa& v1, const float t) {
  return madd(1.0f-t,v0,t*v1);
}

/*! index (0, 1, or 2) of the component with the largest magnitude */
__forceinline int maxDim ( const Vec3fa& a )
{
  const Vec3fa b = abs(a);
  if (b.x > b.y) {
    if (b.x > b.z) return 0; else return 2;
  } else {
    if (b.y > b.z) return 1; else return 2;
  }
}
390
391
////////////////////////////////////////////////////////////////////////////////
392
/// Rounding Functions
393
////////////////////////////////////////////////////////////////////////////////
394
395
#if defined(__aarch64__)
396
__forceinline Vec3fa floor(const Vec3fa& a) { return vrndmq_f32(a.m128); }
397
__forceinline Vec3fa ceil (const Vec3fa& a) { return vrndpq_f32(a.m128); }
398
__forceinline Vec3fa trunc(const Vec3fa& a) { return vrndq_f32(a.m128); }
399
#elif defined (__SSE4_1__)
400
__forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); }
401
__forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); }
402
__forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); }
403
#else
404
__forceinline Vec3fa trunc( const Vec3fa& a ) { return Vec3fa(truncf(a.x),truncf(a.y),truncf(a.z)); }
405
__forceinline Vec3fa floor( const Vec3fa& a ) { return Vec3fa(floorf(a.x),floorf(a.y),floorf(a.z)); }
406
__forceinline Vec3fa ceil ( const Vec3fa& a ) { return Vec3fa(ceilf (a.x),ceilf (a.y),ceilf (a.z)); }
407
#endif
408
409
////////////////////////////////////////////////////////////////////////////////
410
/// Output Operators
411
////////////////////////////////////////////////////////////////////////////////
412
413
__forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fa& a) {
414
return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
415
}
416
417
typedef Vec3fa Vec3fa_t;
418
419
420
////////////////////////////////////////////////////////////////////////////////
421
/// SSE Vec3fx Type
422
////////////////////////////////////////////////////////////////////////////////
423
424
/*! 3-wide SSE float vector whose 4th lane carries an extra payload
 *  (int a / unsigned u / float w), e.g. an ID stored alongside a point. */
struct __aligned(16) Vec3fx
{
  ALIGNED_STRUCT_(16);

  typedef float Scalar;
  enum { N = 3 };
  union {
    __m128 m128;
    struct { float x,y,z; union { int a; unsigned u; float w; }; };  // lane 3 = payload
  };

  ////////////////////////////////////////////////////////////////////////////////
  /// Constructors, Assignment & Cast Operators
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline Vec3fx( ) {}
  __forceinline Vec3fx( const __m128 a ) : m128(a) {}

  /*! conversion to/from Vec3fa keeps all 16 bytes, payload included */
  __forceinline explicit Vec3fx(const Vec3fa& v) : m128(v.m128) {}
  __forceinline operator Vec3fa () const { return Vec3fa(m128); }

  __forceinline explicit Vec3fx ( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); }
  //__forceinline Vec3fx& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; }

  __forceinline Vec3fx ( const Vec3fx& other ) { m128 = other.m128; }
  __forceinline Vec3fx& operator =( const Vec3fx& other ) { m128 = other.m128; return *this; }

  __forceinline explicit Vec3fx( const float a ) : m128(_mm_set1_ps(a)) {}
  __forceinline Vec3fx( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {}

  /*! construct from a Vec3fa plus an explicit payload for lane 3 */
  __forceinline Vec3fx( const Vec3fa& other, const int      a1) { m128 = other.m128; a = a1; }
  __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; }
  __forceinline Vec3fx( const Vec3fa& other, const float    w1) {
#if defined (__aarch64__)
    m128 = other.m128; m128[3] = w1;
#elif defined (__SSE4_1__)
    m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4);  // insert w1 into lane 3
#else
    const vint4 mask(-1,-1,-1,0);  // keep lanes 0-2 from other, take w1 for lane 3
    m128 = select(vboolf4(_mm_castsi128_ps(mask)),vfloat4(other.m128),vfloat4(w1));
#endif
  }
  //__forceinline Vec3fx( const float x, const float y, const float z, const int a) : x(x), y(y), z(z), a(a) {} // not working properly!
  //__forceinline Vec3fx( const float x, const float y, const float z, const unsigned a) : x(x), y(y), z(z), u(a) {} // not working properly!
  __forceinline Vec3fx( const float x, const float y, const float z, const float w) : m128(_mm_set_ps(w, z, y, x)) {}

  //__forceinline explicit Vec3fx( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}

  __forceinline explicit operator const vfloat4() const { return vfloat4(m128); }
  __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); }
  __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); }
  __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); }

  //__forceinline operator const __m128&() const { return m128; }
  //__forceinline operator __m128&() { return m128; }

  ////////////////////////////////////////////////////////////////////////////////
  /// Loads and Stores
  ////////////////////////////////////////////////////////////////////////////////

  /*! aligned load; the 4th (payload) lane is cleared to 0 */
  static __forceinline Vec3fx load( const void* const a ) {
    return Vec3fx(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));
  }

  /*! unaligned load keeping all 4 lanes as stored */
  static __forceinline Vec3fx loadu( const void* const a ) {
    return Vec3fx(_mm_loadu_ps((float*)a));
  }

  /*! unaligned store of all 16 bytes, payload lane included */
  static __forceinline void storeu ( void* ptr, const Vec3fx& v ) {
    _mm_storeu_ps((float*)ptr,v.m128);
  }

  ////////////////////////////////////////////////////////////////////////////////
  /// Constants
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline Vec3fx( ZeroTy   ) : m128(_mm_setzero_ps()) {}
  __forceinline Vec3fx( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
  __forceinline Vec3fx( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
  __forceinline Vec3fx( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}

  ////////////////////////////////////////////////////////////////////////////////
  /// Array Access
  ////////////////////////////////////////////////////////////////////////////////

  /*! indexed access to x/y/z only; the payload lane is not reachable here */
  __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
  __forceinline       float& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; }
};
512
513
////////////////////////////////////////////////////////////////////////////////
514
/// Unary Operators
515
////////////////////////////////////////////////////////////////////////////////
516
517
/*! unary plus: identity */
__forceinline Vec3fx operator +( const Vec3fx& a ) { return a; }

/*! componentwise negation (flips the sign bit of all 4 lanes) */
__forceinline Vec3fx operator -( const Vec3fx& a ) {
  const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
  return _mm_xor_ps(a.m128, mask);
}

/*! componentwise absolute value (clears the sign bit) */
__forceinline Vec3fx abs ( const Vec3fx& a ) {
  const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
  return _mm_and_ps(a.m128, mask);
}

/*! componentwise sign: -1 where a < 0, +1 otherwise (0 maps to +1) */
__forceinline Vec3fx sign ( const Vec3fx& a ) {
  return blendv_ps(Vec3fx(one).m128, (-Vec3fx(one)).m128, _mm_cmplt_ps (a.m128,Vec3fx(zero).m128));
}

/*! reciprocal approximation refined with one Newton-Raphson step: r*(2 - r*a) */
__forceinline Vec3fx rcp ( const Vec3fx& a )
{
#if defined(__AVX512VL__)
  const Vec3fx r = _mm_rcp14_ps(a.m128);
#else
  const Vec3fx r = _mm_rcp_ps(a.m128);
#endif

#if defined(__AVX2__)
  const Vec3fx res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f)));
#else
  const Vec3fx res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128)));
  //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
#endif

  return res;
}

/*! componentwise square root and square */
__forceinline Vec3fx sqrt ( const Vec3fx& a ) { return _mm_sqrt_ps(a.m128); }
__forceinline Vec3fx sqr  ( const Vec3fx& a ) { return _mm_mul_ps(a.m128,a.m128); }

/*! reciprocal square root estimate with one Newton-Raphson step:
 *  r' = 1.5*r - 0.5*a*r^3 */
__forceinline Vec3fx rsqrt( const Vec3fx& a )
{
#if defined(__AVX512VL__)
  __m128 r = _mm_rsqrt14_ps(a.m128);
#else
  __m128 r = _mm_rsqrt_ps(a.m128);
#endif
  return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
}

/*! clamps tiny components up to min_rcp_input so a following rcp() cannot overflow */
__forceinline Vec3fx zero_fix(const Vec3fx& a) {
  return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
}
/*! reciprocal that is safe for (near-)zero input */
__forceinline Vec3fx rcp_safe(const Vec3fx& a) {
  return rcp(zero_fix(a));
}
/*! componentwise natural logarithm (scalar fallback over the 3 active lanes) */
__forceinline Vec3fx log ( const Vec3fx& a ) {
  return Vec3fx(logf(a.x),logf(a.y),logf(a.z));
}

/*! componentwise exponential (scalar fallback over the 3 active lanes) */
__forceinline Vec3fx exp ( const Vec3fx& a ) {
  return Vec3fx(expf(a.x),expf(a.y),expf(a.z));
}
574
575
////////////////////////////////////////////////////////////////////////////////
576
/// Binary Operators
577
////////////////////////////////////////////////////////////////////////////////
578
579
/*! componentwise arithmetic; float overloads broadcast the scalar */
__forceinline Vec3fx operator +( const Vec3fx& a, const Vec3fx& b ) { return _mm_add_ps(a.m128, b.m128); }
__forceinline Vec3fx operator -( const Vec3fx& a, const Vec3fx& b ) { return _mm_sub_ps(a.m128, b.m128); }
__forceinline Vec3fx operator *( const Vec3fx& a, const Vec3fx& b ) { return _mm_mul_ps(a.m128, b.m128); }
__forceinline Vec3fx operator *( const Vec3fx& a, const float b ) { return a * Vec3fx(b); }
__forceinline Vec3fx operator *( const float a, const Vec3fx& b ) { return Vec3fx(a) * b; }
__forceinline Vec3fx operator /( const Vec3fx& a, const Vec3fx& b ) { return _mm_div_ps(a.m128,b.m128); }
__forceinline Vec3fx operator /( const Vec3fx& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
__forceinline Vec3fx operator /( const float a, const Vec3fx& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }

__forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); }
__forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); }

#if defined(__SSE4_1__) || defined(__aarch64__)
/*! min via integer compare on the float bit patterns.
 *  NOTE(review): order-correct only for non-negative values — presumably
 *  callers guarantee that; confirm. */
__forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) {
  const vint4 ai = _mm_castps_si128(a.m128);
  const vint4 bi = _mm_castps_si128(b.m128);
  const vint4 ci = _mm_min_epi32(ai,bi);
  return _mm_castsi128_ps(ci);
}
#endif

#if defined(__SSE4_1__) || defined(__aarch64__)
/*! max via integer compare on the float bit patterns (same caveat as mini) */
__forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) {
  const vint4 ai = _mm_castps_si128(a.m128);
  const vint4 bi = _mm_castps_si128(b.m128);
  const vint4 ci = _mm_max_epi32(ai,bi);
  return _mm_castsi128_ps(ci);
}
#endif

/*! componentwise power (scalar fallback) */
__forceinline Vec3fx pow ( const Vec3fx& a, const float& b ) {
  return Vec3fx(powf(a.x,b),powf(a.y,b),powf(a.z,b));
}
612
613
////////////////////////////////////////////////////////////////////////////////
614
/// Ternary Operators
615
////////////////////////////////////////////////////////////////////////////////
616
617
#if defined(__AVX2__)
618
__forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); }
619
__forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }
620
__forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }
621
__forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }
622
#else
623
__forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b+c; }
624
__forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b-c; }
625
__forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b+c;}
626
__forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b-c; }
627
#endif
628
629
__forceinline Vec3fx madd ( const float a, const Vec3fx& b, const Vec3fx& c) { return madd(Vec3fx(a),b,c); }
630
__forceinline Vec3fx msub ( const float a, const Vec3fx& b, const Vec3fx& c) { return msub(Vec3fx(a),b,c); }
631
__forceinline Vec3fx nmadd ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmadd(Vec3fx(a),b,c); }
632
__forceinline Vec3fx nmsub ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmsub(Vec3fx(a),b,c); }
633
634
////////////////////////////////////////////////////////////////////////////////
635
/// Assignment Operators
636
////////////////////////////////////////////////////////////////////////////////
637
638
/*! compound assignment, defined in terms of the binary operators */
__forceinline Vec3fx& operator +=( Vec3fx& a, const Vec3fx& b ) { return a = a + b; }
__forceinline Vec3fx& operator -=( Vec3fx& a, const Vec3fx& b ) { return a = a - b; }
__forceinline Vec3fx& operator *=( Vec3fx& a, const Vec3fx& b ) { return a = a * b; }
__forceinline Vec3fx& operator *=( Vec3fx& a, const float   b ) { return a = a * b; }
__forceinline Vec3fx& operator /=( Vec3fx& a, const Vec3fx& b ) { return a = a / b; }
__forceinline Vec3fx& operator /=( Vec3fx& a, const float   b ) { return a = a / b; }
644
645
////////////////////////////////////////////////////////////////////////////////
646
/// Reductions
647
////////////////////////////////////////////////////////////////////////////////
648
649
/*! horizontal sum x+y+z via lane broadcasts; only lane 0 of the result is read */
__forceinline float reduce_add(const Vec3fx& v) {
  const vfloat4 a(v.m128);
  const vfloat4 b = shuffle<1>(a);
  const vfloat4 c = shuffle<2>(a);
  return _mm_cvtss_f32(a+b+c);
}

/*! horizontal product / min / max over the 3 active lanes (scalar) */
__forceinline float reduce_mul(const Vec3fx& v) { return v.x*v.y*v.z; }
__forceinline float reduce_min(const Vec3fx& v) { return min(v.x,v.y,v.z); }
__forceinline float reduce_max(const Vec3fx& v) { return max(v.x,v.y,v.z); }
659
660
////////////////////////////////////////////////////////////////////////////////
661
/// Comparison Operators
662
////////////////////////////////////////////////////////////////////////////////
663
664
__forceinline bool operator ==( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; }
665
__forceinline bool operator !=( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; }
666
667
__forceinline Vec3ba eq_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpeq_ps (a.m128, b.m128); }
668
__forceinline Vec3ba neq_mask(const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpneq_ps(a.m128, b.m128); }
669
__forceinline Vec3ba lt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmplt_ps (a.m128, b.m128); }
670
__forceinline Vec3ba le_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmple_ps (a.m128, b.m128); }
671
__forceinline Vec3ba gt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnle_ps(a.m128, b.m128); }
672
__forceinline Vec3ba ge_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); }
673
674
__forceinline bool isvalid ( const Vec3fx& v ) {
675
return all(gt_mask(v,Vec3fx(-FLT_LARGE)) & lt_mask(v,Vec3fx(+FLT_LARGE)));
676
}
677
678
__forceinline bool is_finite ( const Vec3fx& a ) {
679
return all(ge_mask(a,Vec3fx(-FLT_MAX)) & le_mask(a,Vec3fx(+FLT_MAX)));
680
}
681
682
__forceinline bool isvalid4 ( const Vec3fx& v ) {
683
return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE)));
684
}
685
686
__forceinline bool is_finite4 ( const Vec3fx& a ) {
687
return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX)));
688
}
689
690
////////////////////////////////////////////////////////////////////////////////
691
/// Euclidean Space Operators
692
////////////////////////////////////////////////////////////////////////////////
693
694
#if defined(__SSE4_1__)
695
__forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) {
696
return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F));
697
}
698
#else
699
__forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) {
700
return reduce_add(a*b);
701
}
702
#endif
703
704
__forceinline Vec3fx cross ( const Vec3fx& a, const Vec3fx& b )
705
{
706
vfloat4 a0 = vfloat4(a.m128);
707
vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128));
708
vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128));
709
vfloat4 b1 = vfloat4(b.m128);
710
return Vec3fx(shuffle<1,2,0,3>(msub(a0,b0,a1*b1)));
711
}
712
713
__forceinline float sqr_length ( const Vec3fx& a ) { return dot(a,a); }
714
__forceinline float rcp_length ( const Vec3fx& a ) { return rsqrt(dot(a,a)); }
715
__forceinline float rcp_length2( const Vec3fx& a ) { return rcp(dot(a,a)); }
716
__forceinline float length ( const Vec3fx& a ) { return sqrt(dot(a,a)); }
717
__forceinline Vec3fx normalize( const Vec3fx& a ) { return a*rsqrt(dot(a,a)); }
718
__forceinline float distance ( const Vec3fx& a, const Vec3fx& b ) { return length(a-b); }
719
__forceinline float halfArea ( const Vec3fx& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); }
720
__forceinline float area ( const Vec3fx& d ) { return 2.0f*halfArea(d); }
721
722
__forceinline Vec3fx normalize_safe( const Vec3fx& a ) {
723
const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d);
724
}
725
726
/*! differentiated normalization */
727
__forceinline Vec3fx dnormalize(const Vec3fx& p, const Vec3fx& dp)
728
{
729
const float pp = dot(p,p);
730
const float pdp = dot(p,dp);
731
return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp);
732
}
733
734
////////////////////////////////////////////////////////////////////////////////
735
/// Select
736
////////////////////////////////////////////////////////////////////////////////
737
738
__forceinline Vec3fx select( bool s, const Vec3fx& t, const Vec3fx& f ) {
739
__m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
740
return blendv_ps(f.m128, t.m128, mask);
741
}
742
743
__forceinline Vec3fx select( const Vec3ba& s, const Vec3fx& t, const Vec3fx& f ) {
744
return blendv_ps(f.m128, t.m128, s);
745
}
746
747
__forceinline Vec3fx lerp(const Vec3fx& v0, const Vec3fx& v1, const float t) {
748
return madd(1.0f-t,v0,t*v1);
749
}
750
751
__forceinline int maxDim ( const Vec3fx& a )
752
{
753
const Vec3fx b = abs(a);
754
if (b.x > b.y) {
755
if (b.x > b.z) return 0; else return 2;
756
} else {
757
if (b.y > b.z) return 1; else return 2;
758
}
759
}
760
761
////////////////////////////////////////////////////////////////////////////////
762
/// Rounding Functions
763
////////////////////////////////////////////////////////////////////////////////
764
765
#if defined(__aarch64__)
766
__forceinline Vec3fx trunc(const Vec3fx& a) { return vrndq_f32(a.m128); }
767
__forceinline Vec3fx floor(const Vec3fx& a) { return vrndmq_f32(a.m128); }
768
__forceinline Vec3fx ceil (const Vec3fx& a) { return vrndpq_f32(a.m128); }
769
#elif defined (__SSE4_1__)
770
__forceinline Vec3fx trunc( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); }
771
__forceinline Vec3fx floor( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); }
772
__forceinline Vec3fx ceil ( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); }
773
#else
774
__forceinline Vec3fx trunc( const Vec3fx& a ) { return Vec3fx(truncf(a.x),truncf(a.y),truncf(a.z)); }
775
__forceinline Vec3fx floor( const Vec3fx& a ) { return Vec3fx(floorf(a.x),floorf(a.y),floorf(a.z)); }
776
__forceinline Vec3fx ceil ( const Vec3fx& a ) { return Vec3fx(ceilf (a.x),ceilf (a.y),ceilf (a.z)); }
777
#endif
778
779
////////////////////////////////////////////////////////////////////////////////
780
/// Output Operators
781
////////////////////////////////////////////////////////////////////////////////
782
783
__forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fx& a) {
784
return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
785
}
786
787
788
/*! Vec3ff is an alias of Vec3fx (same storage and operations). */
typedef Vec3fx Vec3ff;
789
}
790
791
#endif
792
793