Path: blob/master/thirdparty/embree/common/simd/arm/emulation.h
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

/* Make precision match SSE, at the cost of some performance */
#if !defined(__aarch64__)
#  define SSE2NEON_PRECISE_DIV 1
#  define SSE2NEON_PRECISE_SQRT 1
#endif

#include "sse2neon.h"

__forceinline __m128 _mm_abs_ps(__m128 a) { return vabsq_f32(a); }

__forceinline __m128 _mm_fmadd_ps (__m128 a, __m128 b, __m128 c) { return vfmaq_f32(c, a, b); }
__forceinline __m128 _mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) { return vfmsq_f32(c, a, b); }
__forceinline __m128 _mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) { return vnegq_f32(vfmaq_f32(c, a, b)); }
__forceinline __m128 _mm_fmsub_ps (__m128 a, __m128 b, __m128 c) { return vnegq_f32(vfmsq_f32(c, a, b)); }

__forceinline __m128 _mm_broadcast_ss (float const * mem_addr)
{
  return vdupq_n_f32(*mem_addr);
}

// AVX2 emulation leverages Intel FMA defs above. Include after them.
#include "avx2neon.h"

/* Dummy defines for floating point control */
#define _MM_MASK_MASK 0x1f80
#define _MM_MASK_DIV_ZERO 0x200
// #define _MM_FLUSH_ZERO_ON 0x8000
#define _MM_MASK_DENORM 0x100
#define _MM_SET_EXCEPTION_MASK(x)
// #define _MM_SET_FLUSH_ZERO_MODE(x)

/*
__forceinline int _mm_getcsr()
{
  return 0;
}

__forceinline void _mm_mfence()
{
  __sync_synchronize();
}
*/

__forceinline __m128i _mm_load4epu8_epi32(__m128i *ptr)
{
  uint8x8_t t0 = vld1_u8((uint8_t*)ptr);
  uint16x8_t t1 = vmovl_u8(t0);
  uint32x4_t t2 = vmovl_u16(vget_low_u16(t1));
  return vreinterpretq_s32_u32(t2);
}

__forceinline __m128i _mm_load4epu16_epi32(__m128i *ptr)
{
  uint16x8_t t0 = vld1q_u16((uint16_t*)ptr);
  uint32x4_t t1 = vmovl_u16(vget_low_u16(t0));
  return vreinterpretq_s32_u32(t1);
}

__forceinline __m128i _mm_load4epi8_f32(__m128i *ptr)
{
  int8x8_t t0 = vld1_s8((int8_t*)ptr);
  int16x8_t t1 = vmovl_s8(t0);
  int32x4_t t2 = vmovl_s16(vget_low_s16(t1));
  float32x4_t t3 = vcvtq_f32_s32(t2);
  return vreinterpretq_s32_f32(t3);
}

__forceinline __m128i _mm_load4epu8_f32(__m128i *ptr)
{
  uint8x8_t t0 = vld1_u8((uint8_t*)ptr);
  uint16x8_t t1 = vmovl_u8(t0);
  uint32x4_t t2 = vmovl_u16(vget_low_u16(t1));
  return vreinterpretq_s32_u32(t2);
}

__forceinline __m128i _mm_load4epi16_f32(__m128i *ptr)
{
  int16x8_t t0 = vld1q_s16((int16_t*)ptr);
  int32x4_t t1 = vmovl_s16(vget_low_s16(t0));
  float32x4_t t2 = vcvtq_f32_s32(t1);
  return vreinterpretq_s32_f32(t2);
}
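/* Usage sketch (illustrative, not part of the original header): a minimal
   check of the emulated intrinsics above. It assumes an AArch64 build where
   this header and its sse2neon/avx2neon dependencies are on the include
   path; the standalone test harness below is hypothetical.

#include <cstdio>
#include <cstdint>
#include "emulation.h"

int main()
{
  // _mm_fmadd_ps maps to NEON vfmaq_f32 and computes a*b + c per lane.
  __m128 a = _mm_set1_ps(2.0f);
  __m128 b = _mm_set1_ps(3.0f);
  __m128 c = _mm_set1_ps(1.0f);
  float out[4];
  _mm_storeu_ps(out, _mm_fmadd_ps(a, b, c));
  std::printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]); // 7.0 in each lane

  // _mm_load4epu8_epi32 widens the first 4 unsigned bytes to 32-bit ints.
  uint8_t bytes[16] = { 10, 20, 30, 40 };
  __m128i v = _mm_load4epu8_epi32((__m128i*)bytes);
  int32_t lanes[4];
  _mm_storeu_si128((__m128i*)lanes, v); // lanes = {10, 20, 30, 40}
  std::printf("%d %d %d %d\n", lanes[0], lanes[1], lanes[2], lanes[3]);
  return 0;
}
*/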