CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Common/Math/CrossSIMD.h
Views: 1401
// CrossSIMD1//2// Compatibility wrappers for SIMD dialects.3//4// In the long run, might do a more general single-source-SIMD wrapper here consisting5// of defines that translate to either NEON or SSE. It would be possible to write quite a lot of6// our various color conversion functions and so on in a pretty generic manner.78#include "ppsspp_config.h"910#include "stdint.h"1112#ifdef __clang__13// Weird how you can't just use #pragma in a macro.14#define DO_NOT_VECTORIZE_LOOP _Pragma("clang loop vectorize(disable)")15#else16#define DO_NOT_VECTORIZE_LOOP17#endif1819#if PPSSPP_ARCH(SSE2)20#include <emmintrin.h>21#endif2223#if PPSSPP_ARCH(ARM_NEON)24#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)25#include <arm64_neon.h>26#else27#include <arm_neon.h>28#endif29#endif3031// Basic types3233#if PPSSPP_ARCH(ARM64_NEON)3435// No special ones here.3637#elif PPSSPP_ARCH(ARM_NEON)3839// Compatibility wrappers making ARM64 NEON code run on ARM3240// With optimization on, these should compile down to the optimal code.4142inline float32x4_t vmulq_laneq_f32(float32x4_t a, float32x4_t b, int lane) {43switch (lane & 3) {44case 0: return vmulq_lane_f32(a, vget_low_f32(b), 0);45case 1: return vmulq_lane_f32(a, vget_low_f32(b), 1);46case 2: return vmulq_lane_f32(a, vget_high_f32(b), 0);47default: return vmulq_lane_f32(a, vget_high_f32(b), 1);48}49}5051inline float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t c, int lane) {52switch (lane & 3) {53case 0: return vmlaq_lane_f32(a, b, vget_low_f32(c), 0);54case 1: return vmlaq_lane_f32(a, b, vget_low_f32(c), 1);55case 2: return vmlaq_lane_f32(a, b, vget_high_f32(c), 0);56default: return vmlaq_lane_f32(a, b, vget_high_f32(c), 1);57}58}5960inline uint32x4_t vcgezq_f32(float32x4_t v) {61return vcgeq_f32(v, vdupq_n_f32(0.0f));62}6364#endif656667