Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/embree/common/simd/vllong8_avx512.h
9912 views
1
// Copyright 2009-2021 Intel Corporation
2
// SPDX-License-Identifier: Apache-2.0
3
4
#pragma once
5
6
#define vboolf vboolf_impl
7
#define vboold vboold_impl
8
#define vint vint_impl
9
#define vuint vuint_impl
10
#define vllong vllong_impl
11
#define vfloat vfloat_impl
12
#define vdouble vdouble_impl
13
14
namespace embree
15
{
16
/* 8-wide AVX-512 64-bit long long type */
17
template<>
18
struct vllong<8>
19
{
20
ALIGNED_STRUCT_(64);
21
22
typedef vboold8 Bool;
23
24
enum { size = 8 }; // number of SIMD elements
25
union { // data
26
__m512i v;
27
long long i[8];
28
};
29
30
////////////////////////////////////////////////////////////////////////////////
31
/// Constructors, Assignment & Cast Operators
32
////////////////////////////////////////////////////////////////////////////////
33
34
__forceinline vllong() {}
35
__forceinline vllong(const vllong8& t) { v = t.v; }
36
__forceinline vllong8& operator =(const vllong8& f) { v = f.v; return *this; }
37
38
__forceinline vllong(const __m512i& t) { v = t; }
39
__forceinline operator __m512i() const { return v; }
40
__forceinline operator __m256i() const { return _mm512_castsi512_si256(v); }
41
42
__forceinline vllong(long long i) {
43
v = _mm512_set1_epi64(i);
44
}
45
46
__forceinline vllong(long long a, long long b, long long c, long long d) {
47
v = _mm512_set4_epi64(d,c,b,a);
48
}
49
50
__forceinline vllong(long long a0, long long a1, long long a2, long long a3,
51
long long a4, long long a5, long long a6, long long a7)
52
{
53
v = _mm512_set_epi64(a7,a6,a5,a4,a3,a2,a1,a0);
54
}
55
56
__forceinline vllong(const vllong<4>& i) {
57
v = _mm512_broadcast_i64x4(i);
58
}
59
60
////////////////////////////////////////////////////////////////////////////////
61
/// Constants
62
////////////////////////////////////////////////////////////////////////////////
63
64
__forceinline vllong(ZeroTy) : v(_mm512_setzero_epi32()) {}
65
__forceinline vllong(OneTy) : v(_mm512_set1_epi64(1)) {}
66
__forceinline vllong(StepTy) : v(_mm512_set_epi64(7,6,5,4,3,2,1,0)) {}
67
__forceinline vllong(ReverseStepTy) : v(_mm512_setr_epi64(7,6,5,4,3,2,1,0)) {}
68
69
////////////////////////////////////////////////////////////////////////////////
70
/// Loads and Stores
71
////////////////////////////////////////////////////////////////////////////////
72
73
static __forceinline void store_nt(void* __restrict__ ptr, const vllong8& a) {
74
_mm512_stream_si512((__m512i*)ptr,a);
75
}
76
77
static __forceinline vllong8 loadu(const void* addr) {
78
return _mm512_loadu_si512(addr);
79
}
80
81
static __forceinline vllong8 load(const vllong8* addr) {
82
return _mm512_load_si512(addr);
83
}
84
85
static __forceinline vllong8 load(const long long* addr) {
86
return _mm512_load_si512(addr);
87
}
88
89
static __forceinline vllong8 load(const unsigned char* ptr) {
90
return _mm512_cvtepu8_epi64(*(__m128i*)ptr);
91
}
92
93
static __forceinline void store(void* ptr, const vllong8& v) {
94
_mm512_store_si512(ptr,v);
95
}
96
97
static __forceinline void storeu(void* ptr, const vllong8& v) {
98
_mm512_storeu_si512(ptr,v);
99
}
100
101
static __forceinline void storeu(const vboold8& mask, long long* ptr, const vllong8& f) {
102
_mm512_mask_storeu_epi64(ptr,mask,f);
103
}
104
105
static __forceinline void store(const vboold8& mask, void* addr, const vllong8& v2) {
106
_mm512_mask_store_epi64(addr,mask,v2);
107
}
108
109
static __forceinline vllong8 compact(const vboold8& mask, vllong8& v) {
110
return _mm512_mask_compress_epi64(v,mask,v);
111
}
112
113
static __forceinline vllong8 compact(const vboold8& mask, const vllong8& a, vllong8& b) {
114
return _mm512_mask_compress_epi64(a,mask,b);
115
}
116
117
static __forceinline vllong8 expand(const vboold8& mask, const vllong8& a, vllong8& b) {
118
return _mm512_mask_expand_epi64(b,mask,a);
119
}
120
121
////////////////////////////////////////////////////////////////////////////////
122
/// Array Access
123
////////////////////////////////////////////////////////////////////////////////
124
125
__forceinline long long& operator [](size_t index) { assert(index < 8); return i[index]; }
126
__forceinline const long long& operator [](size_t index) const { assert(index < 8); return i[index]; }
127
128
};
129
130
////////////////////////////////////////////////////////////////////////////////
131
/// Unary Operators
132
////////////////////////////////////////////////////////////////////////////////
133
134
/* Builds a lane mask from the most significant (sign) bit of every 64-bit lane. */
__forceinline vboold8 asBool(const vllong8& a)
{
  const vboold8 signBits = _mm512_movepi64_mask(a);
  return signBits;
}
135
136
/* Unary plus: hands back an unchanged copy of the operand. */
__forceinline vllong8 operator +(const vllong8& a)
{
  return a;
}
137
/* Unary minus: negates every lane by subtracting it from zero. */
__forceinline vllong8 operator -(const vllong8& a)
{
  const __m512i zero = _mm512_setzero_epi32();
  const vllong8 negated = _mm512_sub_epi64(zero, a);
  return negated;
}
138
139
////////////////////////////////////////////////////////////////////////////////
140
/// Binary Operators
141
////////////////////////////////////////////////////////////////////////////////
142
143
/* Lane-wise 64-bit addition. */
__forceinline vllong8 operator +(const vllong8& a, const vllong8& b)
{
  const vllong8 sum = _mm512_add_epi64(a, b);
  return sum;
}

/* Scalar right operand: broadcast to all lanes, then add. */
__forceinline vllong8 operator +(const vllong8& a, long long b)
{
  const vllong8 rhs(b);
  return a + rhs;
}

/* Scalar left operand: broadcast to all lanes, then add. */
__forceinline vllong8 operator +(long long a, const vllong8& b)
{
  const vllong8 lhs(a);
  return lhs + b;
}
146
147
/* Lane-wise 64-bit subtraction (a - b). */
__forceinline vllong8 operator -(const vllong8& a, const vllong8& b)
{
  const vllong8 difference = _mm512_sub_epi64(a, b);
  return difference;
}

/* Scalar right operand: broadcast to all lanes, then subtract. */
__forceinline vllong8 operator -(const vllong8& a, long long b)
{
  const vllong8 rhs(b);
  return a - rhs;
}

/* Scalar left operand: broadcast to all lanes, then subtract. */
__forceinline vllong8 operator -(long long a, const vllong8& b)
{
  const vllong8 lhs(a);
  return lhs - b;
}
150
151
/* Lane-wise multiplication keeping the low 64 bits of each product. */
__forceinline vllong8 operator *(const vllong8& a, const vllong8& b)
{
  const vllong8 product = _mm512_mullo_epi64(a, b);
  return product;
}

/* Scalar right operand: broadcast to all lanes, then multiply. */
__forceinline vllong8 operator *(const vllong8& a, long long b)
{
  const vllong8 rhs(b);
  return a * rhs;
}

/* Scalar left operand: broadcast to all lanes, then multiply. */
__forceinline vllong8 operator *(long long a, const vllong8& b)
{
  const vllong8 lhs(a);
  return lhs * b;
}
154
155
/* Bitwise AND of the two registers. */
__forceinline vllong8 operator &(const vllong8& a, const vllong8& b)
{
  const vllong8 bits = _mm512_and_epi64(a, b);
  return bits;
}

/* Scalar right operand: broadcast to all lanes, then AND. */
__forceinline vllong8 operator &(const vllong8& a, long long b)
{
  const vllong8 rhs(b);
  return a & rhs;
}

/* Scalar left operand: broadcast to all lanes, then AND. */
__forceinline vllong8 operator &(long long a, const vllong8& b)
{
  const vllong8 lhs(a);
  return lhs & b;
}
158
159
/* Bitwise OR of the two registers. */
__forceinline vllong8 operator |(const vllong8& a, const vllong8& b)
{
  const vllong8 bits = _mm512_or_epi64(a, b);
  return bits;
}

/* Scalar right operand: broadcast to all lanes, then OR. */
__forceinline vllong8 operator |(const vllong8& a, long long b)
{
  const vllong8 rhs(b);
  return a | rhs;
}

/* Scalar left operand: broadcast to all lanes, then OR. */
__forceinline vllong8 operator |(long long a, const vllong8& b)
{
  const vllong8 lhs(a);
  return lhs | b;
}
162
163
/* Bitwise XOR of the two registers. */
__forceinline vllong8 operator ^(const vllong8& a, const vllong8& b)
{
  const vllong8 bits = _mm512_xor_epi64(a, b);
  return bits;
}

/* Scalar right operand: broadcast to all lanes, then XOR. */
__forceinline vllong8 operator ^(const vllong8& a, long long b)
{
  const vllong8 rhs(b);
  return a ^ rhs;
}

/* Scalar left operand: broadcast to all lanes, then XOR. */
__forceinline vllong8 operator ^(long long a, const vllong8& b)
{
  const vllong8 lhs(a);
  return lhs ^ b;
}
166
167
/* Shifts every lane left by the same count n, filling with zeros. */
__forceinline vllong8 operator <<(const vllong8& a, long long n)
{
  const vllong8 shifted = _mm512_slli_epi64(a, n);
  return shifted;
}

/* Shifts every lane right by the same count n; this is the arithmetic
 * variant, so the sign bit is replicated into the vacated bits. */
__forceinline vllong8 operator >>(const vllong8& a, long long n)
{
  const vllong8 shifted = _mm512_srai_epi64(a, n);
  return shifted;
}
169
170
/* Per-lane left shift: lane k of a is shifted by lane k of n, zero filled. */
__forceinline vllong8 operator <<(const vllong8& a, const vllong8& n)
{
  const vllong8 shifted = _mm512_sllv_epi64(a, n);
  return shifted;
}

/* Per-lane arithmetic right shift: lane k of a is shifted by lane k of n,
 * replicating the sign bit. */
__forceinline vllong8 operator >>(const vllong8& a, const vllong8& n)
{
  const vllong8 shifted = _mm512_srav_epi64(a, n);
  return shifted;
}
172
173
/* Shift left logical: all lanes shifted left by b, zero filled. */
__forceinline vllong8 sll (const vllong8& a, long long b)
{
  const vllong8 shifted = _mm512_slli_epi64(a, b);
  return shifted;
}

/* Shift right arithmetic: all lanes shifted right by b, sign bit replicated. */
__forceinline vllong8 sra (const vllong8& a, long long b)
{
  const vllong8 shifted = _mm512_srai_epi64(a, b);
  return shifted;
}

/* Shift right logical: all lanes shifted right by b, zero filled. */
__forceinline vllong8 srl (const vllong8& a, long long b)
{
  const vllong8 shifted = _mm512_srli_epi64(a, b);
  return shifted;
}
176
177
/* Lane-wise signed minimum. */
__forceinline vllong8 min(const vllong8& a, const vllong8& b)
{
  const vllong8 smaller = _mm512_min_epi64(a, b);
  return smaller;
}

/* Scalar right operand: broadcast to all lanes, then take the minimum. */
__forceinline vllong8 min(const vllong8& a, long long b)
{
  const vllong8 rhs(b);
  return min(a, rhs);
}

/* Scalar left operand: broadcast to all lanes, then take the minimum. */
__forceinline vllong8 min(long long a, const vllong8& b)
{
  const vllong8 lhs(a);
  return min(lhs, b);
}
180
181
/* Lane-wise signed maximum. */
__forceinline vllong8 max(const vllong8& a, const vllong8& b)
{
  const vllong8 larger = _mm512_max_epi64(a, b);
  return larger;
}

/* Scalar right operand: broadcast to all lanes, then take the maximum. */
__forceinline vllong8 max(const vllong8& a, long long b)
{
  const vllong8 rhs(b);
  return max(a, rhs);
}

/* Scalar left operand: broadcast to all lanes, then take the maximum. */
__forceinline vllong8 max(long long a, const vllong8& b)
{
  const vllong8 lhs(a);
  return max(lhs, b);
}
184
185
/* Masked addition: lanes with their bit set in m receive a+b, all other
 * lanes are copied from c. */
__forceinline vllong8 mask_add(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b)
{
  const vllong8 blended = _mm512_mask_add_epi64(c,m,a,b);
  return blended;
}

/* Masked subtraction: lanes with their bit set in m receive a-b, all other
 * lanes are copied from c. */
__forceinline vllong8 mask_sub(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b)
{
  const vllong8 blended = _mm512_mask_sub_epi64(c,m,a,b);
  return blended;
}
187
188
/* Masked AND: lanes with their bit set in m receive a&b, all other lanes
 * are copied from c. */
__forceinline vllong8 mask_and(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b)
{
  const vllong8 blended = _mm512_mask_and_epi64(c,m,a,b);
  return blended;
}

/* Masked OR: lanes with their bit set in m receive a|b, all other lanes
 * are copied from c. */
__forceinline vllong8 mask_or (const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b)
{
  const vllong8 blended = _mm512_mask_or_epi64(c,m,a,b);
  return blended;
}
190
191
////////////////////////////////////////////////////////////////////////////////
192
/// Assignment Operators
193
////////////////////////////////////////////////////////////////////////////////
194
195
/* In-place lane-wise addition; returns the updated left operand. */
__forceinline vllong8& operator +=(vllong8& a, const vllong8& b)
{
  a = a + b;
  return a;
}

/* In-place addition of a scalar to every lane. */
__forceinline vllong8& operator +=(vllong8& a, long long b)
{
  a = a + b;
  return a;
}
197
198
/* In-place lane-wise subtraction; returns the updated left operand. */
__forceinline vllong8& operator -=(vllong8& a, const vllong8& b)
{
  a = a - b;
  return a;
}

/* In-place subtraction of a scalar from every lane. */
__forceinline vllong8& operator -=(vllong8& a, long long b)
{
  a = a - b;
  return a;
}
200
201
/* In-place lane-wise multiplication; returns the updated left operand. */
__forceinline vllong8& operator *=(vllong8& a, const vllong8& b)
{
  a = a * b;
  return a;
}

/* In-place multiplication of every lane by a scalar. */
__forceinline vllong8& operator *=(vllong8& a, long long b)
{
  a = a * b;
  return a;
}
203
204
/* In-place bitwise AND; returns the updated left operand. */
__forceinline vllong8& operator &=(vllong8& a, const vllong8& b)
{
  a = a & b;
  return a;
}

/* In-place bitwise AND of every lane with a scalar. */
__forceinline vllong8& operator &=(vllong8& a, long long b)
{
  a = a & b;
  return a;
}
206
207
/* In-place bitwise OR; returns the updated left operand. */
__forceinline vllong8& operator |=(vllong8& a, const vllong8& b)
{
  a = a | b;
  return a;
}

/* In-place bitwise OR of every lane with a scalar. */
__forceinline vllong8& operator |=(vllong8& a, long long b)
{
  a = a | b;
  return a;
}
209
210
/* In-place left shift of every lane by b; returns the updated operand. */
__forceinline vllong8& operator <<=(vllong8& a, long long b)
{
  a = a << b;
  return a;
}

/* In-place right shift of every lane by b (uses the scalar operator>>
 * defined for this type); returns the updated operand. */
__forceinline vllong8& operator >>=(vllong8& a, long long b)
{
  a = a >> b;
  return a;
}
212
213
////////////////////////////////////////////////////////////////////////////////
214
/// Comparison Operators + Select
215
////////////////////////////////////////////////////////////////////////////////
216
217
/* Lane-wise equality test; bit k of the result is set when a[k] == b[k]. */
__forceinline vboold8 operator ==(const vllong8& a, const vllong8& b)
{
  const vboold8 hits = _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ);
  return hits;
}

/* Compares every lane against a broadcast scalar. */
__forceinline vboold8 operator ==(const vllong8& a, long long b)
{
  const vllong8 rhs(b);
  return a == rhs;
}

/* Compares a broadcast scalar against every lane. */
__forceinline vboold8 operator ==(long long a, const vllong8& b)
{
  const vllong8 lhs(a);
  return lhs == b;
}
220
221
/* Lane-wise inequality test; bit k of the result is set when a[k] != b[k]. */
__forceinline vboold8 operator !=(const vllong8& a, const vllong8& b)
{
  const vboold8 hits = _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE);
  return hits;
}

/* Compares every lane against a broadcast scalar. */
__forceinline vboold8 operator !=(const vllong8& a, long long b)
{
  const vllong8 rhs(b);
  return a != rhs;
}

/* Compares a broadcast scalar against every lane. */
__forceinline vboold8 operator !=(long long a, const vllong8& b)
{
  const vllong8 lhs(a);
  return lhs != b;
}
224
225
/* Lane-wise signed less-than; bit k of the result is set when a[k] < b[k]. */
__forceinline vboold8 operator < (const vllong8& a, const vllong8& b)
{
  const vboold8 hits = _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT);
  return hits;
}

/* Compares every lane against a broadcast scalar. */
__forceinline vboold8 operator < (const vllong8& a, long long b)
{
  const vllong8 rhs(b);
  return a < rhs;
}

/* Compares a broadcast scalar against every lane. */
__forceinline vboold8 operator < (long long a, const vllong8& b)
{
  const vllong8 lhs(a);
  return lhs < b;
}
228
229
/* Lane-wise signed greater-or-equal; bit k is set when a[k] >= b[k]. */
__forceinline vboold8 operator >=(const vllong8& a, const vllong8& b)
{
  const vboold8 hits = _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE);
  return hits;
}

/* Compares every lane against a broadcast scalar. */
__forceinline vboold8 operator >=(const vllong8& a, long long b)
{
  const vllong8 rhs(b);
  return a >= rhs;
}

/* Compares a broadcast scalar against every lane. */
__forceinline vboold8 operator >=(long long a, const vllong8& b)
{
  const vllong8 lhs(a);
  return lhs >= b;
}
232
233
/* Lane-wise signed greater-than; bit k is set when a[k] > b[k]. */
__forceinline vboold8 operator > (const vllong8& a, const vllong8& b)
{
  const vboold8 hits = _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT);
  return hits;
}

/* Compares every lane against a broadcast scalar. */
__forceinline vboold8 operator > (const vllong8& a, long long b)
{
  const vllong8 rhs(b);
  return a > rhs;
}

/* Compares a broadcast scalar against every lane. */
__forceinline vboold8 operator > (long long a, const vllong8& b)
{
  const vllong8 lhs(a);
  return lhs > b;
}
236
237
/* Lane-wise signed less-or-equal; bit k is set when a[k] <= b[k]. */
__forceinline vboold8 operator <=(const vllong8& a, const vllong8& b)
{
  const vboold8 hits = _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE);
  return hits;
}

/* Compares every lane against a broadcast scalar. */
__forceinline vboold8 operator <=(const vllong8& a, long long b)
{
  const vllong8 rhs(b);
  return a <= rhs;
}

/* Compares a broadcast scalar against every lane. */
__forceinline vboold8 operator <=(long long a, const vllong8& b)
{
  const vllong8 lhs(a);
  return lhs <= b;
}
240
241
/* Named forms of the lane-wise signed comparisons. Each returns a mask with
 * bit k set when the predicate holds for lane k of a and b. */

/* a == b */
__forceinline vboold8 eq(const vllong8& a, const vllong8& b)
{
  const vboold8 hits = _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ);
  return hits;
}

/* a != b */
__forceinline vboold8 ne(const vllong8& a, const vllong8& b)
{
  const vboold8 hits = _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE);
  return hits;
}

/* a < b */
__forceinline vboold8 lt(const vllong8& a, const vllong8& b)
{
  const vboold8 hits = _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT);
  return hits;
}

/* a >= b */
__forceinline vboold8 ge(const vllong8& a, const vllong8& b)
{
  const vboold8 hits = _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE);
  return hits;
}

/* a > b */
__forceinline vboold8 gt(const vllong8& a, const vllong8& b)
{
  const vboold8 hits = _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT);
  return hits;
}

/* a <= b */
__forceinline vboold8 le(const vllong8& a, const vllong8& b)
{
  const vboold8 hits = _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE);
  return hits;
}
247
248
/* Masked forms of the lane-wise signed comparisons. Bit k of the result is
 * set only when bit k of mask is set AND the predicate holds for lane k;
 * lanes that are inactive in mask always yield 0. */

/* mask & (a == b) */
__forceinline vboold8 eq(const vboold8 mask, const vllong8& a, const vllong8& b)
{
  const vboold8 hits = _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_EQ);
  return hits;
}

/* mask & (a != b) */
__forceinline vboold8 ne(const vboold8 mask, const vllong8& a, const vllong8& b)
{
  const vboold8 hits = _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_NE);
  return hits;
}

/* mask & (a < b) */
__forceinline vboold8 lt(const vboold8 mask, const vllong8& a, const vllong8& b)
{
  const vboold8 hits = _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LT);
  return hits;
}

/* mask & (a >= b) */
__forceinline vboold8 ge(const vboold8 mask, const vllong8& a, const vllong8& b)
{
  const vboold8 hits = _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GE);
  return hits;
}

/* mask & (a > b) */
__forceinline vboold8 gt(const vboold8 mask, const vllong8& a, const vllong8& b)
{
  const vboold8 hits = _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GT);
  return hits;
}

/* mask & (a <= b) */
__forceinline vboold8 le(const vboold8 mask, const vllong8& a, const vllong8& b)
{
  const vboold8 hits = _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LE);
  return hits;
}
254
255
/* Per-lane choice between two vectors: lanes whose bit is set in m come from
 * t, the others from f. Implemented as a masked OR of t with itself (t|t == t)
 * using f as the pass-through source. */
__forceinline vllong8 select(const vboold8& m, const vllong8& t, const vllong8& f)
{
  const vllong8 chosen = _mm512_mask_or_epi64(f,m,t,t);
  return chosen;
}
258
259
////////////////////////////////////////////////////////////////////////////////
260
// Movement/Shifting/Shuffling Functions
261
////////////////////////////////////////////////////////////////////////////////
262
263
template<int i0, int i1>
264
__forceinline vllong8 shuffle(const vllong8& v) {
265
return _mm512_castpd_si512(_mm512_permute_pd(_mm512_castsi512_pd(v), (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0));
266
}
267
268
template<int i>
269
__forceinline vllong8 shuffle(const vllong8& v) {
270
return shuffle<i, i>(v);
271
}
272
273
template<int i0, int i1, int i2, int i3>
274
__forceinline vllong8 shuffle(const vllong8& v) {
275
return _mm512_permutex_epi64(v, _MM_SHUFFLE(i3, i2, i1, i0));
276
}
277
278
template<int i0, int i1>
279
__forceinline vllong8 shuffle4(const vllong8& v) {
280
return _mm512_shuffle_i64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2));
281
}
282
283
template<int i>
284
__forceinline vllong8 shuffle4(const vllong8& v) {
285
return shuffle4<i, i>(v);
286
}
287
288
template<int i>
289
__forceinline vllong8 align_shift_right(const vllong8& a, const vllong8& b) {
290
return _mm512_alignr_epi64(a, b, i);
291
};
292
293
/* Extracts lane 0 as a scalar: narrow the register to its low 128 bits, then
 * move the low 64 bits out. */
__forceinline long long toScalar(const vllong8& v)
{
  const __m128i low128 = _mm512_castsi512_si128(v);
  return _mm_cvtsi128_si64(low128);
}
296
297
////////////////////////////////////////////////////////////////////////////////
298
/// Reductions
299
////////////////////////////////////////////////////////////////////////////////
300
301
/* Minimum over each adjacent lane pair (0,1), (2,3), ...; both lanes of a
 * pair end up holding the pair's minimum. */
__forceinline vllong8 vreduce_min2(vllong8 x)
{
  const vllong8 neighbours = shuffle<1,0,3,2>(x);
  return min(x, neighbours);
}

/* Minimum over each 256-bit half; all four lanes of a half hold it. */
__forceinline vllong8 vreduce_min4(vllong8 x)
{
  const vllong8 pairMin = vreduce_min2(x);
  const vllong8 crossed = shuffle<2,3,0,1>(pairMin);
  return min(pairMin, crossed);
}

/* Minimum over all eight lanes, replicated into every lane. */
__forceinline vllong8 vreduce_min (vllong8 x)
{
  const vllong8 halfMin = vreduce_min4(x);
  const vllong8 swappedHalves = shuffle4<1,0>(halfMin);
  return min(halfMin, swappedHalves);
}
304
305
/* Maximum over each adjacent lane pair (0,1), (2,3), ...; both lanes of a
 * pair end up holding the pair's maximum. */
__forceinline vllong8 vreduce_max2(vllong8 x)
{
  const vllong8 neighbours = shuffle<1,0,3,2>(x);
  return max(x, neighbours);
}

/* Maximum over each 256-bit half; all four lanes of a half hold it. */
__forceinline vllong8 vreduce_max4(vllong8 x)
{
  const vllong8 pairMax = vreduce_max2(x);
  const vllong8 crossed = shuffle<2,3,0,1>(pairMax);
  return max(pairMax, crossed);
}

/* Maximum over all eight lanes, replicated into every lane. */
__forceinline vllong8 vreduce_max (vllong8 x)
{
  const vllong8 halfMax = vreduce_max4(x);
  const vllong8 swappedHalves = shuffle4<1,0>(halfMax);
  return max(halfMax, swappedHalves);
}
308
309
/* Bitwise AND over each adjacent lane pair; both lanes of a pair hold it. */
__forceinline vllong8 vreduce_and2(vllong8 x)
{
  const vllong8 neighbours = shuffle<1,0,3,2>(x);
  return x & neighbours;
}

/* Bitwise AND over each 256-bit half; all four lanes of a half hold it. */
__forceinline vllong8 vreduce_and4(vllong8 x)
{
  const vllong8 pairAnd = vreduce_and2(x);
  const vllong8 crossed = shuffle<2,3,0,1>(pairAnd);
  return pairAnd & crossed;
}

/* Bitwise AND over all eight lanes, replicated into every lane. */
__forceinline vllong8 vreduce_and (vllong8 x)
{
  const vllong8 halfAnd = vreduce_and4(x);
  const vllong8 swappedHalves = shuffle4<1,0>(halfAnd);
  return halfAnd & swappedHalves;
}
312
313
/* Bitwise OR over each adjacent lane pair; both lanes of a pair hold it. */
__forceinline vllong8 vreduce_or2(vllong8 x)
{
  const vllong8 neighbours = shuffle<1,0,3,2>(x);
  return x | neighbours;
}

/* Bitwise OR over each 256-bit half; all four lanes of a half hold it. */
__forceinline vllong8 vreduce_or4(vllong8 x)
{
  const vllong8 pairOr = vreduce_or2(x);
  const vllong8 crossed = shuffle<2,3,0,1>(pairOr);
  return pairOr | crossed;
}

/* Bitwise OR over all eight lanes, replicated into every lane. */
__forceinline vllong8 vreduce_or (vllong8 x)
{
  const vllong8 halfOr = vreduce_or4(x);
  const vllong8 swappedHalves = shuffle4<1,0>(halfOr);
  return halfOr | swappedHalves;
}
316
317
/* Sum over each adjacent lane pair; both lanes of a pair hold the sum. */
__forceinline vllong8 vreduce_add2(vllong8 x)
{
  const vllong8 neighbours = shuffle<1,0,3,2>(x);
  return x + neighbours;
}

/* Sum over each 256-bit half; all four lanes of a half hold the sum. */
__forceinline vllong8 vreduce_add4(vllong8 x)
{
  const vllong8 pairSum = vreduce_add2(x);
  const vllong8 crossed = shuffle<2,3,0,1>(pairSum);
  return pairSum + crossed;
}

/* Sum over all eight lanes, replicated into every lane. */
__forceinline vllong8 vreduce_add (vllong8 x)
{
  const vllong8 halfSum = vreduce_add4(x);
  const vllong8 swappedHalves = shuffle4<1,0>(halfSum);
  return halfSum + swappedHalves;
}
320
321
/* Scalar reductions: run the matching vector reduction (which leaves the
 * result in every lane) and read it back from lane 0. */

/* smallest of the eight lanes */
__forceinline long long reduce_min(const vllong8& v)
{
  const vllong8 reduced = vreduce_min(v);
  return toScalar(reduced);
}

/* largest of the eight lanes */
__forceinline long long reduce_max(const vllong8& v)
{
  const vllong8 reduced = vreduce_max(v);
  return toScalar(reduced);
}

/* bitwise AND of the eight lanes */
__forceinline long long reduce_and(const vllong8& v)
{
  const vllong8 reduced = vreduce_and(v);
  return toScalar(reduced);
}

/* bitwise OR of the eight lanes */
__forceinline long long reduce_or (const vllong8& v)
{
  const vllong8 reduced = vreduce_or(v);
  return toScalar(reduced);
}

/* sum of the eight lanes */
__forceinline long long reduce_add(const vllong8& v)
{
  const vllong8 reduced = vreduce_add(v);
  return toScalar(reduced);
}
326
327
////////////////////////////////////////////////////////////////////////////////
328
/// Permutation Operations
329
////////////////////////////////////////////////////////////////////////////////
330
331
/* General cross-lane gather: lane k of the result is the lane of v selected
 * by lane k of index (only the low three index bits are used). */
__forceinline vllong8 permute(const vllong8& v, const vllong8& index)
{
  const vllong8 gathered = _mm512_permutexvar_epi64(index,v);
  return gathered;
}
334
335
/* Reverses the lane order by permuting with an index vector built from
 * reverse_step (declared elsewhere; presumably the ReverseStepTy tag, which
 * the constructor turns into indices 7,6,...,0). */
__forceinline vllong8 reverse(const vllong8& a)
{
  const vllong8 descending(reverse_step);
  return permute(a, descending);
}
338
339
////////////////////////////////////////////////////////////////////////////////
340
/// Output Operators
341
////////////////////////////////////////////////////////////////////////////////
342
343
/* Writes the vector as "<l0, l1, ..., l7>" to the stream and returns the
 * stream so that further output can be chained. */
__forceinline embree_ostream operator <<(embree_ostream cout, const vllong8& v)
{
  cout << "<";
  for (size_t lane = 0; lane < 8; ++lane)
  {
    // separator goes before every lane except the first
    if (lane != 0) cout << ", ";
    cout << v[lane];
  }
  cout << ">";
  return cout;
}
350
}
351
352
#undef vboolf
353
#undef vboold
354
#undef vint
355
#undef vuint
356
#undef vllong
357
#undef vfloat
358
#undef vdouble
359
360