Path: blob/master/thirdparty/embree/common/sys/intrinsics.h
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "platform.h"

#if defined(__WIN32__)
#include <intrin.h>
#endif

#if defined(__ARM_NEON)
#include "../simd/arm/emulation.h"
#else
#include <immintrin.h>
#if defined(__EMSCRIPTEN__)
#include "../simd/wasm/emulation.h"
#endif
#endif

#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER)
#if !defined(_tzcnt_u32)
#define _tzcnt_u32 __tzcnt_u32
#endif
#if !defined(_tzcnt_u64)
#define _tzcnt_u64 __tzcnt_u64
#endif
#endif

#if defined(__aarch64__)
#if !defined(_lzcnt_u32)
#define _lzcnt_u32 __builtin_clz
#endif
#else
#if defined(__LZCNT__)
#if !defined(_lzcnt_u32)
#define _lzcnt_u32 __lzcnt32
#endif
#if !defined(_lzcnt_u64)
#define _lzcnt_u64 __lzcnt64
#endif
#endif
#endif

#if defined(__WIN32__)
#  if !defined(NOMINMAX)
#    define NOMINMAX
#  endif
#  include <windows.h>
#endif

/* normally defined in pmmintrin.h, but we always need this */
#if !defined(_MM_SET_DENORMALS_ZERO_MODE)
#define _MM_DENORMALS_ZERO_ON   (0x0040)
#define _MM_DENORMALS_ZERO_OFF  (0x0000)
#define _MM_DENORMALS_ZERO_MASK (0x0040)
#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
#endif

namespace embree
{
  ////////////////////////////////////////////////////////////////////////////////
  /// Windows Platform
  ////////////////////////////////////////////////////////////////////////////////

#if defined(__WIN32__) && !defined(__INTEL_LLVM_COMPILER)

  __forceinline size_t read_tsc()
  {
    LARGE_INTEGER li;
    QueryPerformanceCounter(&li);
    return (size_t)li.QuadPart;
  }

  __forceinline int bsf(int v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

  __forceinline unsigned bsf(unsigned v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

#if defined(__X86_64__) || defined(__aarch64__)
  __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__)
    return _tzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanForward64(&r,v); return r;
#endif
  }
#endif

  __forceinline int bscf(int& v)
  {
    int i = bsf(v);
    v &= v-1;
    return i;
  }

  __forceinline unsigned bscf(unsigned& v)
  {
    unsigned i = bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__X86_64__) || defined(__aarch64__)
  __forceinline size_t bscf(size_t& v)
  {
    size_t i = bsf(v);
    v &= v-1;
    return i;
  }
#endif

  __forceinline int bsr(int v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

  __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

#if defined(__X86_64__) || defined(__aarch64__)
  __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__)
    return 63 - _lzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanReverse64(&r, v); return r;
#endif
  }
#endif

  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - bsr(x);
#endif
  }

  __forceinline int btc(int v, int i) {
    long r = v; _bittestandcomplement(&r,i); return r;
  }
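
  /* Illustrative examples (not in the original header): btc/bts/btr mirror the
     x86 bit-test instructions, returning the modified value, e.g.
     btc(0b0101, 0) == 0b0100 (complement bit 0),
     bts(0b0100, 1) == 0b0110 (set bit 1),
     btr(0b0110, 2) == 0b0010 (reset bit 2). */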

  __forceinline int bts(int v, int i) {
    long r = v; _bittestandset(&r,i); return r;
  }

  __forceinline int btr(int v, int i) {
    long r = v; _bittestandreset(&r,i); return r;
  }

#if defined(__X86_64__)

  __forceinline size_t btc(size_t v, size_t i) {
    size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r;
  }

  __forceinline size_t bts(size_t v, size_t i) {
    __int64 r = v; _bittestandset64(&r,i); return r;
  }

  __forceinline size_t btr(size_t v, size_t i) {
    __int64 r = v; _bittestandreset64(&r,i); return r;
  }

#endif

  __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) {
    return _InterlockedCompareExchange((volatile long*)p,v,c);
  }

  ////////////////////////////////////////////////////////////////////////////////
  /// Unix Platform
  ////////////////////////////////////////////////////////////////////////////////

#else

  __forceinline uint64_t read_tsc() {
#if defined(__X86_ASM__)
    uint32_t high,low;
    asm volatile ("rdtsc" : "=d"(high), "=a"(low));
    return (((uint64_t)high) << 32) + (uint64_t)low;
#else
    /* Not supported yet, meaning measuring traversal cost per pixel does not work. */
    return 0;
#endif
  }

  __forceinline int bsf(int v) {
#if defined(__ARM_NEON)
    return __builtin_ctz(v);
#else
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#elif defined(__X86_ASM__)
    int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctz(v);
#endif
#endif
  }

#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
  __forceinline unsigned int bsf(unsigned v) {
    return sycl::ctz(v);
  }

#else

#if defined(__64BIT__)
  __forceinline unsigned bsf(unsigned v)
  {
#if defined(__ARM_NEON)
    return __builtin_ctz(v);
#else
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#elif defined(__X86_ASM__)
    unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctz(v);
#endif
#endif
  }
#endif
#endif

#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
  __forceinline size_t bsf(size_t v) {
    return sycl::ctz(v);
  }
#else

  __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__X86_64__)
    return _tzcnt_u64(v);
#else
    return _tzcnt_u32(v);
#endif
#elif defined(__X86_ASM__)
    size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctzl(v);
#endif
  }
#endif

  __forceinline int bscf(int& v)
  {
    int i = bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__64BIT__)
  __forceinline unsigned int bscf(unsigned int& v)
  {
    unsigned int i = bsf(v);
    v &= v-1;
    return i;
  }
#endif

  __forceinline size_t bscf(size_t& v)
  {
    size_t i = bsf(v);
    v &= v-1;
    return i;
  }

  __forceinline int bsr(int v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return 31 - _lzcnt_u32(v);
#elif defined(__X86_ASM__)
    int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_clz(v) ^ 31;
#endif
  }

#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
  __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#elif defined(__X86_ASM__)
    unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_clz(v) ^ 31;
#endif
  }
#endif
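
  /* Illustrative note (not in the original header): on the lzcnt paths bsr is
     derived from the leading-zero count, since for nonzero x the index of the
     top set bit is 31 - lzcnt(x), e.g. bsr(1) == 0 and bsr(0x80000000u) == 31.
     The asm and __builtin_clz fallbacks agree for nonzero inputs; bsr(0) is
     undefined on every path. */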

  __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__X86_64__)
    return 63 - _lzcnt_u64(v);
#else
    return 31 - _lzcnt_u32(v);
#endif
#elif defined(__X86_ASM__)
    size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return (sizeof(v) * 8 - 1) - __builtin_clzl(v);
#endif
  }

  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - bsr(x);
#endif
  }

  __forceinline size_t blsr(size_t v) {
#if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__INTEL_COMPILER)
    return _blsr_u64(v);
#else
#if defined(__X86_64__)
    return __blsr_u64(v);
#else
    return __blsr_u32(v);
#endif
#endif
#else
    return v & (v-1);
#endif
  }

  __forceinline int btc(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v ^ (1 << i));
#endif
  }

  __forceinline int bts(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v | (1 << i));
#endif
  }

  __forceinline int btr(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v & ~(1 << i));
#endif
  }

  __forceinline size_t btc(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v ^ ((size_t)1 << i)); /* widen the shifted bit to size_t so the shift stays defined for i >= 32 */
#endif
  }

  __forceinline size_t bts(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v | ((size_t)1 << i));
#endif
  }

  __forceinline size_t btr(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v & ~((size_t)1 << i));
#endif
  }

  __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) {
    return __sync_val_compare_and_swap(value, comparand, input);
  }

#endif

#if !defined(__WIN32__)

#if defined(__i386__) && defined(__PIC__)

  __forceinline void __cpuid(int out[4], int op)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                  : "0"(op));
  }

  __forceinline void __cpuid_count(int out[4], int op1, int op2)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3])
                  : "0" (op1), "2" (op2));
  }

#elif defined(__X86_ASM__)

  __forceinline void __cpuid(int out[4], int op) {
    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op));
  }

  __forceinline void __cpuid_count(int out[4], int op1, int op2) {
    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2));
  }

#endif
#endif

  ////////////////////////////////////////////////////////////////////////////////
  /// All Platforms
  ////////////////////////////////////////////////////////////////////////////////

#if defined(__clang__) || defined(__GNUC__)
#if !defined(_mm_undefined_ps)
  __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); }
#endif
#if !defined(_mm_undefined_si128)
  __forceinline __m128i _mm_undefined_si128() { return _mm_setzero_si128(); }
#endif
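
  /* Illustrative note (not in the original header): these shims only cover
     toolchains that lack the _mm*_undefined_* intrinsics. Returning a zeroed
     register is a safe stand-in, costing one xor where the real intrinsic
     would leave the register uninitialized. */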
#if !defined(_mm256_undefined_ps) && defined(__AVX__)
  __forceinline __m256 _mm256_undefined_ps() { return _mm256_setzero_ps(); }
#endif
#if !defined(_mm256_undefined_si256) && defined(__AVX__)
  __forceinline __m256i _mm256_undefined_si256() { return _mm256_setzero_si256(); }
#endif
#if !defined(_mm512_undefined_ps) && defined(__AVX512F__)
  __forceinline __m512 _mm512_undefined_ps() { return _mm512_setzero_ps(); }
#endif
#if !defined(_mm512_undefined_epi32) && defined(__AVX512F__)
  __forceinline __m512i _mm512_undefined_epi32() { return _mm512_setzero_si512(); }
#endif
#endif

#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)

  __forceinline unsigned int popcnt(unsigned int in) {
    return sycl::popcount(in);
  }

#else

#if defined(__SSE4_2__) || defined(__ARM_NEON)

  __forceinline int popcnt(int in) {
    return _mm_popcnt_u32(in);
  }

  __forceinline unsigned popcnt(unsigned in) {
    return _mm_popcnt_u32(in);
  }

#if defined(__64BIT__)
  __forceinline size_t popcnt(size_t in) {
    return _mm_popcnt_u64(in);
  }
#endif

#endif

#endif

#if defined(__X86_ASM__)
  __forceinline uint64_t rdtsc()
  {
    int dummy[4];
    __cpuid(dummy,0);
    uint64_t clock = read_tsc();
    __cpuid(dummy,0);
    return clock;
  }
#endif

  __forceinline void pause_cpu(const size_t N = 8)
  {
    for (size_t i=0; i<N; i++)
      _mm_pause();
  }

  /* prefetches */
  __forceinline void prefetchL1 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T0); }
  __forceinline void prefetchL2 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T1); }
  __forceinline void prefetchL3 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T2); }
  __forceinline void prefetchNTA(const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_NTA); }

  __forceinline void prefetchEX (const void* ptr) {
#if defined(__INTEL_COMPILER)
    _mm_prefetch((const char*)ptr,_MM_HINT_ET0);
#else
    _mm_prefetch((const char*)ptr,_MM_HINT_T0);
#endif
  }

  __forceinline void prefetchL1EX(const void* ptr) {
    prefetchEX(ptr);
  }

  __forceinline void prefetchL2EX(const void* ptr) {
    prefetchEX(ptr);
  }

#if defined(__AVX2__) && !defined(__aarch64__)
  __forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); }
  __forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); }
#if defined(__X86_64__)
  __forceinline size_t pext(size_t a, size_t b) { return _pext_u64(a, b); }
  __forceinline size_t pdep(size_t a, size_t b) { return _pdep_u64(a, b); }
#endif
#endif

#if defined(__AVX512F__)
#if defined(__INTEL_COMPILER)
  __forceinline float mm512_cvtss_f32(__m512 v) {
    return _mm512_cvtss_f32(v);
  }
  __forceinline int mm512_mask2int(__mmask16 k1) {
    return _mm512_mask2int(k1);
  }
  __forceinline __mmask16 mm512_int2mask(int mask) {
    return _mm512_int2mask(mask);
  }
#else
  __forceinline float mm512_cvtss_f32(__m512 v) { // FIXME: _mm512_cvtss_f32 neither supported by clang v4.0.0 nor GCC 6.3
    return _mm_cvtss_f32(_mm512_castps512_ps128(v));
  }
  __forceinline int mm512_mask2int(__mmask16 k1) { // FIXME: _mm512_mask2int not yet supported by GCC 6.3
    return (int)k1;
  }
  __forceinline __mmask16 mm512_int2mask(int mask) { // FIXME: _mm512_int2mask not yet supported by GCC 6.3
    return (__mmask16)mask;
  }
#endif
#endif
}
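
/* Usage sketch (illustrative, not part of the original header; the mask value
   is an assumption for the example): the bit-scan helpers above are typically
   used to iterate over the set bits of a SIMD hit mask:

     unsigned mask = 0b1011;               // e.g. lanes 0, 1 and 3 are active
     while (mask) {
       const unsigned lane = bscf(mask);   // returns 0, then 1, then 3; clears each bit
       // ... process 'lane' ...
     }

   popcnt(0b1011) == 3 gives the trip count of this loop up front. */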