Path: blob/master/thirdparty/embree/common/simd/arm/emulation.h
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

/* Make precision match SSE, at the cost of some performance */
#if !defined(__aarch64__)
#  define SSE2NEON_PRECISE_DIV 1
#  define SSE2NEON_PRECISE_SQRT 1
#endif

#include "sse2neon.h"

__forceinline __m128 _mm_abs_ps(__m128 a) { return vabsq_f32(a); }

__forceinline __m128 _mm_fmadd_ps (__m128 a, __m128 b, __m128 c) { return vfmaq_f32(c, a, b); }
__forceinline __m128 _mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) { return vfmsq_f32(c, a, b); }
__forceinline __m128 _mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) { return vnegq_f32(vfmaq_f32(c, a, b)); }
__forceinline __m128 _mm_fmsub_ps (__m128 a, __m128 b, __m128 c) { return vnegq_f32(vfmsq_f32(c, a, b)); }

__forceinline __m128 _mm_broadcast_ss (float const * mem_addr)
{
  return vdupq_n_f32(*mem_addr);
}

// AVX2 emulation leverages Intel FMA defs above. Include after them.
#include "avx2neon.h"

/* Dummy defines for floating point control */
#define _MM_MASK_MASK 0x1f80
#define _MM_MASK_DIV_ZERO 0x200
// #define _MM_FLUSH_ZERO_ON 0x8000
#define _MM_MASK_DENORM 0x100
#define _MM_SET_EXCEPTION_MASK(x)
// #define _MM_SET_FLUSH_ZERO_MODE(x)

/*
__forceinline int _mm_getcsr()
{
  return 0;
}

__forceinline void _mm_mfence()
{
  __sync_synchronize();
}
*/

__forceinline __m128i _mm_load4epu8_epi32(__m128i *ptr)
{
  uint8x8_t t0 = vld1_u8((uint8_t*)ptr);
  uint16x8_t t1 = vmovl_u8(t0);
  uint32x4_t t2 = vmovl_u16(vget_low_u16(t1));
  return vreinterpretq_s32_u32(t2);
}

__forceinline __m128i _mm_load4epu16_epi32(__m128i *ptr)
{
  uint16x8_t t0 = vld1q_u16((uint16_t*)ptr);
  uint32x4_t t1 = vmovl_u16(vget_low_u16(t0));
  return vreinterpretq_s32_u32(t1);
}

__forceinline __m128i _mm_load4epi8_f32(__m128i *ptr)
{
  int8x8_t t0 = vld1_s8((int8_t*)ptr);
  int16x8_t t1 = vmovl_s8(t0);
  int32x4_t t2 = vmovl_s16(vget_low_s16(t1));
  float32x4_t t3 = vcvtq_f32_s32(t2);
  return vreinterpretq_s32_f32(t3);
}

__forceinline __m128i _mm_load4epu8_f32(__m128i *ptr)
{
  uint8x8_t t0 = vld1_u8((uint8_t*)ptr);
  uint16x8_t t1 = vmovl_u8(t0);
  uint32x4_t t2 = vmovl_u16(vget_low_u16(t1));
  return vreinterpretq_s32_u32(t2);
}

__forceinline __m128i _mm_load4epi16_f32(__m128i *ptr)
{
  int16x8_t t0 = vld1q_s16((int16_t*)ptr);
  int32x4_t t1 = vmovl_s16(vget_low_s16(t0));
  float32x4_t t2 = vcvtq_f32_s32(t1);
  return vreinterpretq_s32_f32(t2);
}
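/* Usage sketch (illustrative, not part of the original header): a minimal
   check of the emulated intrinsics above. It assumes an AArch64 build where
   this header and its sse2neon/avx2neon dependencies are on the include
   path; the standalone test harness below is hypothetical.

#include <cstdio>
#include <cstdint>
#include "emulation.h"

int main()
{
  // _mm_fmadd_ps maps to NEON vfmaq_f32 and computes a*b + c per lane.
  __m128 a = _mm_set1_ps(2.0f);
  __m128 b = _mm_set1_ps(3.0f);
  __m128 c = _mm_set1_ps(1.0f);
  float out[4];
  _mm_storeu_ps(out, _mm_fmadd_ps(a, b, c));
  std::printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]); // 7.0 in each lane

  // _mm_load4epu8_epi32 widens the first 4 unsigned bytes to 32-bit ints.
  uint8_t bytes[16] = { 10, 20, 30, 40 };
  __m128i v = _mm_load4epu8_epi32((__m128i*)bytes);
  int32_t lanes[4];
  _mm_storeu_si128((__m128i*)lanes, v); // lanes = {10, 20, 30, 40}
  std::printf("%d %d %d %d\n", lanes[0], lanes[1], lanes[2], lanes[3]);
  return 0;
}
*/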