GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/embree/common/sys/intrinsics.h
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "platform.h"

#if defined(__WIN32__)
#include <intrin.h>
#endif

#if defined(__ARM_NEON)
#include "../simd/arm/emulation.h"
#else
#include <immintrin.h>
#if defined(__EMSCRIPTEN__)
#include "../simd/wasm/emulation.h"
#endif
#endif

#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER)
#if !defined(_tzcnt_u32)
#define _tzcnt_u32 __tzcnt_u32
#endif
#if !defined(_tzcnt_u64)
#define _tzcnt_u64 __tzcnt_u64
#endif
#endif

#if defined(__aarch64__)
#if !defined(_lzcnt_u32)
#define _lzcnt_u32 __builtin_clz
#endif
#else
#if defined(__LZCNT__)
#if !defined(_lzcnt_u32)
#define _lzcnt_u32 __lzcnt32
#endif
#if !defined(_lzcnt_u64)
#define _lzcnt_u64 __lzcnt64
#endif
#endif
#endif

#if defined(__WIN32__)
# if !defined(NOMINMAX)
#  define NOMINMAX
# endif
# include <windows.h>
#endif

/* normally defined in pmmintrin.h, but we always need this */
#if !defined(_MM_SET_DENORMALS_ZERO_MODE)
#define _MM_DENORMALS_ZERO_ON (0x0040)
#define _MM_DENORMALS_ZERO_OFF (0x0000)
#define _MM_DENORMALS_ZERO_MASK (0x0040)
#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
#endif
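/* Illustrative usage (a minimal sketch, not part of the macros above): denormal
   handling is typically configured once per worker thread, e.g.
     _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);        // from xmmintrin.h
     _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
   so that denormal inputs and results are treated as zero. */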

namespace embree
{

  ////////////////////////////////////////////////////////////////////////////////
  /// Windows Platform
  ////////////////////////////////////////////////////////////////////////////////

#if defined(__WIN32__) && !defined(__INTEL_LLVM_COMPILER)

  __forceinline size_t read_tsc()
  {
    LARGE_INTEGER li;
    QueryPerformanceCounter(&li);
    return (size_t)li.QuadPart;
  }

  __forceinline int bsf(int v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

  __forceinline unsigned bsf(unsigned v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

#if defined(__X86_64__) || defined (__aarch64__)
  __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__)
    return _tzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanForward64(&r,v); return r;
#endif
  }
#endif

  __forceinline int bscf(int& v)
  {
    int i = bsf(v);
    v &= v-1;
    return i;
  }

  __forceinline unsigned bscf(unsigned& v)
  {
    unsigned i = bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__X86_64__) || defined (__aarch64__)
  __forceinline size_t bscf(size_t& v)
  {
    size_t i = bsf(v);
    v &= v-1;
    return i;
  }
#endif
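  /* bscf ("bit scan and clear forward") returns the index of the lowest set bit
     of v and clears that bit in v. Illustrative idiom (a sketch, assuming a local
     mask variable) for visiting every set bit of a mask:
       for (size_t mask = bits; mask != 0; ) {
         const size_t i = bscf(mask);   // index of the next set bit
         // ... process bit i ...
       }
  */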

  __forceinline int bsr(int v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

  __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

#if defined(__X86_64__) || defined (__aarch64__)
  __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__)
    return 63 - _lzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanReverse64(&r, v); return r;
#endif
  }
#endif

  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - bsr(x);
#endif
  }

  __forceinline int btc(int v, int i) {
    long r = v; _bittestandcomplement(&r,i); return r;
  }

  __forceinline int bts(int v, int i) {
    long r = v; _bittestandset(&r,i); return r;
  }

  __forceinline int btr(int v, int i) {
    long r = v; _bittestandreset(&r,i); return r;
  }

#if defined(__X86_64__)

  __forceinline size_t btc(size_t v, size_t i) {
    size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r;
  }

  __forceinline size_t bts(size_t v, size_t i) {
    __int64 r = v; _bittestandset64(&r,i); return r;
  }

  __forceinline size_t btr(size_t v, size_t i) {
    __int64 r = v; _bittestandreset64(&r,i); return r;
  }

#endif
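  /* btc/bts/btr operate on a copy and leave the argument unchanged: they return
     v with bit i complemented, set, or reset respectively. Illustrative values:
     bts(0,3) == 8, btr(8,3) == 0, btc(8,3) == 0. */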

  __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) {
    return _InterlockedCompareExchange((volatile long*)p,v,c);
  }
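  /* atomic_cmpxchg atomically stores v into *p if *p equals c and always returns
     the previous value of *p, so the exchange succeeded iff the return value
     equals c (the Unix version below has the same contract). */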

  ////////////////////////////////////////////////////////////////////////////////
  /// Unix Platform
  ////////////////////////////////////////////////////////////////////////////////

#else

  __forceinline uint64_t read_tsc() {
#if defined(__X86_ASM__)
    uint32_t high,low;
    asm volatile ("rdtsc" : "=d"(high), "=a"(low));
    return (((uint64_t)high) << 32) + (uint64_t)low;
#else
    /* Not supported yet, meaning measuring traversal cost per pixel does not work. */
    return 0;
#endif
  }

  __forceinline int bsf(int v) {
#if defined(__ARM_NEON)
    return __builtin_ctz(v);
#else
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#elif defined(__X86_ASM__)
    int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctz(v);
#endif
#endif
  }

#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
  __forceinline unsigned int bsf(unsigned v) {
    return sycl::ctz(v);
  }

#else

#if defined(__64BIT__)
  __forceinline unsigned bsf(unsigned v)
  {
#if defined(__ARM_NEON)
    return __builtin_ctz(v);
#else
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#elif defined(__X86_ASM__)
    unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctz(v);
#endif
#endif
  }
#endif
#endif

#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
  __forceinline size_t bsf(size_t v) {
    return sycl::ctz(v);
  }
#else

  __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__X86_64__)
    return _tzcnt_u64(v);
#else
    return _tzcnt_u32(v);
#endif
#elif defined(__X86_ASM__)
    size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctzl(v);
#endif
  }
#endif

  __forceinline int bscf(int& v)
  {
    int i = bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__64BIT__)
  __forceinline unsigned int bscf(unsigned int& v)
  {
    unsigned int i = bsf(v);
    v &= v-1;
    return i;
  }
#endif

  __forceinline size_t bscf(size_t& v)
  {
    size_t i = bsf(v);
    v &= v-1;
    return i;
  }

  __forceinline int bsr(int v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return 31 - _lzcnt_u32(v);
#elif defined(__X86_ASM__)
    int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_clz(v) ^ 31;
#endif
  }

#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
  __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#elif defined(__X86_ASM__)
    unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_clz(v) ^ 31;
#endif
  }
#endif

  __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__X86_64__)
    return 63 - _lzcnt_u64(v);
#else
    return 31 - _lzcnt_u32(v);
#endif
#elif defined(__X86_ASM__)
    size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return (sizeof(v) * 8 - 1) - __builtin_clzl(v);
#endif
  }

  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - bsr(x);
#endif
  }

  __forceinline size_t blsr(size_t v) {
#if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__INTEL_COMPILER)
    return _blsr_u64(v);
#else
#if defined(__X86_64__)
    return __blsr_u64(v);
#else
    return __blsr_u32(v);
#endif
#endif
#else
    return v & (v-1);
#endif
  }
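  /* blsr clears the lowest set bit of v (equivalent to v & (v-1)); e.g.
     blsr(0b1100) == 0b1000 and blsr(0) == 0. */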

  __forceinline int btc(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
#else
    return (v ^ (1 << i));
#endif
  }

  __forceinline int bts(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v | (1 << i));
#endif
  }

  __forceinline int btr(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v & ~(1 << i));
#endif
  }

  __forceinline size_t btc(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
#else
    return (v ^ (((size_t)1) << i));
#endif
  }

  __forceinline size_t bts(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v | (((size_t)1) << i));
#endif
  }

  __forceinline size_t btr(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v & ~(((size_t)1) << i));
#endif
  }

  __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) {
    return __sync_val_compare_and_swap(value, comparand, input);
  }

#endif

#if !defined(__WIN32__)

#if defined(__i386__) && defined(__PIC__)

  __forceinline void __cpuid(int out[4], int op)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                  : "0"(op));
  }

  __forceinline void __cpuid_count(int out[4], int op1, int op2)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3])
                  : "0" (op1), "2" (op2));
  }

#elif defined(__X86_ASM__)

  __forceinline void __cpuid(int out[4], int op) {
    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op));
  }

  __forceinline void __cpuid_count(int out[4], int op1, int op2) {
    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2));
  }

#endif
#endif
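  /* Illustrative: __cpuid(out, 0) fills out[0] with the highest supported standard
     leaf and out[1], out[3], out[2] with the 12-byte vendor string (e.g. "GenuineIntel");
     __cpuid_count additionally passes a sub-leaf index in ecx. */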

  ////////////////////////////////////////////////////////////////////////////////
  /// All Platforms
  ////////////////////////////////////////////////////////////////////////////////

#if defined(__clang__) || defined(__GNUC__)
#if !defined(_mm_undefined_ps)
  __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); }
#endif
#if !defined(_mm_undefined_si128)
  __forceinline __m128i _mm_undefined_si128() { return _mm_setzero_si128(); }
#endif
#if !defined(_mm256_undefined_ps) && defined(__AVX__)
  __forceinline __m256 _mm256_undefined_ps() { return _mm256_setzero_ps(); }
#endif
#if !defined(_mm256_undefined_si256) && defined(__AVX__)
  __forceinline __m256i _mm256_undefined_si256() { return _mm256_setzero_si256(); }
#endif
#if !defined(_mm512_undefined_ps) && defined(__AVX512F__)
  __forceinline __m512 _mm512_undefined_ps() { return _mm512_setzero_ps(); }
#endif
#if !defined(_mm512_undefined_epi32) && defined(__AVX512F__)
  __forceinline __m512i _mm512_undefined_epi32() { return _mm512_setzero_si512(); }
#endif
#endif

#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)

  __forceinline unsigned int popcnt(unsigned int in) {
    return sycl::popcount(in);
  }

#else

#if defined(__SSE4_2__) || defined(__ARM_NEON)

  __forceinline int popcnt(int in) {
    return _mm_popcnt_u32(in);
  }

  __forceinline unsigned popcnt(unsigned in) {
    return _mm_popcnt_u32(in);
  }

#if defined(__64BIT__)
  __forceinline size_t popcnt(size_t in) {
    return _mm_popcnt_u64(in);
  }
#endif

#endif

#endif
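  /* popcnt returns the number of set bits, e.g. popcnt(0xF0F0u) == 8. */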

#if defined(__X86_ASM__)
  __forceinline uint64_t rdtsc()
  {
    int dummy[4];
    __cpuid(dummy,0);
    uint64_t clock = read_tsc();
    __cpuid(dummy,0);
    return clock;
  }
#endif
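  /* In rdtsc() above, the surrounding __cpuid calls act as serializing barriers
     so that the timestamp read is not reordered with the code being measured. */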

  __forceinline void pause_cpu(const size_t N = 8)
  {
    for (size_t i=0; i<N; i++)
      _mm_pause();
  }
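  /* Illustrative spin-wait sketch (assumes some std::atomic<bool> flag defined
     elsewhere):
       while (!flag.load(std::memory_order_acquire)) pause_cpu();
     _mm_pause lowers power use and reduces memory-order mis-speculation
     penalties while spinning. */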

  /* prefetches */
  __forceinline void prefetchL1 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T0); }
  __forceinline void prefetchL2 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T1); }
  __forceinline void prefetchL3 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T2); }
  __forceinline void prefetchNTA(const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_NTA); }
  __forceinline void prefetchEX (const void* ptr) {
#if defined(__INTEL_COMPILER)
    _mm_prefetch((const char*)ptr,_MM_HINT_ET0);
#else
    _mm_prefetch((const char*)ptr,_MM_HINT_T0);
#endif
  }

  __forceinline void prefetchL1EX(const void* ptr) {
    prefetchEX(ptr);
  }

  __forceinline void prefetchL2EX(const void* ptr) {
    prefetchEX(ptr);
  }
#if defined(__AVX2__) && !defined(__aarch64__)
  __forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); }
  __forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); }
#if defined(__X86_64__)
  __forceinline size_t pext(size_t a, size_t b) { return _pext_u64(a, b); }
  __forceinline size_t pdep(size_t a, size_t b) { return _pdep_u64(a, b); }
#endif
#endif
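  /* pext gathers the bits of a selected by the mask b into the low bits of the
     result; pdep scatters low bits back under the mask. Illustrative values:
     pext(0b11010110, 0b11110000) == 0b1101 and pdep(0b1101, 0b11110000) == 0b11010000. */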

#if defined(__AVX512F__)
#if defined(__INTEL_COMPILER)
  __forceinline float mm512_cvtss_f32(__m512 v) {
    return _mm512_cvtss_f32(v);
  }
  __forceinline int mm512_mask2int(__mmask16 k1) {
    return _mm512_mask2int(k1);
  }
  __forceinline __mmask16 mm512_int2mask(int mask) {
    return _mm512_int2mask(mask);
  }
#else
  __forceinline float mm512_cvtss_f32(__m512 v) { // FIXME: _mm512_cvtss_f32 neither supported by clang v4.0.0 nor GCC 6.3
    return _mm_cvtss_f32(_mm512_castps512_ps128(v));
  }
  __forceinline int mm512_mask2int(__mmask16 k1) { // FIXME: _mm512_mask2int not yet supported by GCC 6.3
    return (int)k1;
  }
  __forceinline __mmask16 mm512_int2mask(int mask) { // FIXME: _mm512_int2mask not yet supported by GCC 6.3
    return (__mmask16)mask;
  }
#endif
#endif
}