// cppspmd_sse.h
// Copyright 2020-2022 Binomial LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Notes for Basis Universal:
// All of the "cppspmd" code and headers are OPTIONAL to Basis Universal. If BASISU_SUPPORT_SSE is 0, it will never be included and does not impact compilation.
// The techniques used in this code were originally demonstrated for AVX2 by Nicolas Guillemot and Jefferson Amstutz in their "CppSPMD" project.
// This is new code for use in Basis Universal, although it uses the same general SPMD techniques in SSE 2/4.1.

#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <math.h>
#include <utility>
#include <algorithm>

#if CPPSPMD_SSE2
#include <xmmintrin.h>	// SSE
#include <emmintrin.h>	// SSE2
#else
#include <xmmintrin.h>	// SSE
#include <emmintrin.h>	// SSE2
#include <pmmintrin.h>	// SSE3
#include <tmmintrin.h>	// SSSE3
#include <smmintrin.h>	// SSE4.1
//#include <nmmintrin.h>	// SSE4.2
#endif

#undef CPPSPMD_SSE
#undef CPPSPMD_AVX1
#undef CPPSPMD_AVX2
#undef CPPSPMD_AVX
#undef CPPSPMD_FLOAT4
#undef CPPSPMD_INT16

#define CPPSPMD_SSE 1
#define CPPSPMD_AVX 0
#define CPPSPMD_AVX1 0
#define CPPSPMD_AVX2 0
#define CPPSPMD_FLOAT4 0
#define CPPSPMD_INT16 0

#ifdef _MSC_VER
#ifndef CPPSPMD_DECL
#define CPPSPMD_DECL(type, name) __declspec(align(16)) type name
#endif

#ifndef CPPSPMD_ALIGN
#define CPPSPMD_ALIGN(v) __declspec(align(v))
#endif

#define _mm_undefined_si128 _mm_setzero_si128
#define _mm_undefined_ps _mm_setzero_ps
#else
#ifndef CPPSPMD_DECL
#define CPPSPMD_DECL(type, name) type name __attribute__((aligned(32)))
#endif

#ifndef CPPSPMD_ALIGN
#define CPPSPMD_ALIGN(v) __attribute__((aligned(v)))
#endif
#endif

#ifndef CPPSPMD_FORCE_INLINE
#ifdef _DEBUG
#define CPPSPMD_FORCE_INLINE inline
#else
#ifdef _MSC_VER
#define CPPSPMD_FORCE_INLINE __forceinline
#else
#define CPPSPMD_FORCE_INLINE inline
#endif
#endif
#endif

#undef CPPSPMD
#undef CPPSPMD_ARCH

#if CPPSPMD_SSE2
#define CPPSPMD_SSE41 0
#define CPPSPMD cppspmd_sse2
#define CPPSPMD_ARCH _sse2
#else
#define CPPSPMD_SSE41 1
#define CPPSPMD cppspmd_sse41
#define CPPSPMD_ARCH _sse41
#endif

#ifndef CPPSPMD_GLUER
#define CPPSPMD_GLUER(a, b) a##b
#endif

#ifndef CPPSPMD_GLUER2
#define CPPSPMD_GLUER2(a, b) CPPSPMD_GLUER(a, b)
#endif

#ifndef CPPSPMD_NAME
#define CPPSPMD_NAME(a) CPPSPMD_GLUER2(a, CPPSPMD_ARCH)
#endif

#undef VASSERT
#define VCOND(cond) ((exec_mask(vbool(cond)) & m_exec).get_movemask() == m_exec.get_movemask())
#define VASSERT(cond) assert( VCOND(cond) )

#define CPPSPMD_ALIGNMENT (16)

#define storeu_si32(p, a) (void)(*(int*)(p) = _mm_cvtsi128_si32((a)))

namespace CPPSPMD
{

const int PROGRAM_COUNT_SHIFT = 2;
const int PROGRAM_COUNT = 1 << PROGRAM_COUNT_SHIFT;

template <typename N> inline N* aligned_new() { void* p = _mm_malloc(sizeof(N), 64); new (p) N; return static_cast<N*>(p); }
template <typename N> void aligned_delete(N* p) { if (p) { p->~N(); _mm_free(p); } }
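
// Usage sketch (illustrative, not part of the original header). Kernel objects
// contain 16-byte aligned __m128i members, so they should be allocated and freed
// with the helpers above rather than plain new/delete ("my_kernel" is hypothetical):
//
//	my_kernel* pKernel = aligned_new<my_kernel>();
//	// ... run the kernel ...
//	aligned_delete(pKernel);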

CPPSPMD_DECL(const uint32_t, g_allones_128[4]) = { UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX };
CPPSPMD_DECL(const uint32_t, g_x_128[4]) = { UINT32_MAX, 0, 0, 0 };
CPPSPMD_DECL(const float, g_onef_128[4]) = { 1.0f, 1.0f, 1.0f, 1.0f };
CPPSPMD_DECL(const uint32_t, g_oneu_128[4]) = { 1, 1, 1, 1 };

CPPSPMD_DECL(const uint32_t, g_lane_masks_128[4][4]) =
{
	{ UINT32_MAX, 0, 0, 0 },
	{ 0, UINT32_MAX, 0, 0 },
	{ 0, 0, UINT32_MAX, 0 },
	{ 0, 0, 0, UINT32_MAX },
};

#if CPPSPMD_SSE41
CPPSPMD_FORCE_INLINE __m128i _mm_blendv_epi32(__m128i a, __m128i b, __m128i c) { return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(c))); }
#endif

CPPSPMD_FORCE_INLINE __m128i blendv_epi8(__m128i a, __m128i b, __m128i mask)
{
#if CPPSPMD_SSE2
	return _mm_castps_si128(_mm_or_ps(_mm_and_ps(_mm_castsi128_ps(mask), _mm_castsi128_ps(b)), _mm_andnot_ps(_mm_castsi128_ps(mask), _mm_castsi128_ps(a))));
#else
	return _mm_blendv_epi8(a, b, mask);
#endif
}

CPPSPMD_FORCE_INLINE __m128 blendv_mask_ps(__m128 a, __m128 b, __m128 mask)
{
#if CPPSPMD_SSE2
	// We know it's a mask, so we can just emulate the blend.
	return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a));
#else
	return _mm_blendv_ps(a, b, mask);
#endif
}

CPPSPMD_FORCE_INLINE __m128 blendv_ps(__m128 a, __m128 b, __m128 mask)
{
#if CPPSPMD_SSE2
	// Input is not a mask, but MSB bits - so emulate _mm_blendv_ps() by replicating bit 31.
	mask = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(mask), 31));
	return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a));
#else
	return _mm_blendv_ps(a, b, mask);
#endif
}

CPPSPMD_FORCE_INLINE __m128i blendv_mask_epi32(__m128i a, __m128i b, __m128i mask)
{
	return _mm_castps_si128(blendv_mask_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(mask)));
}

CPPSPMD_FORCE_INLINE __m128i blendv_epi32(__m128i a, __m128i b, __m128i mask)
{
	return _mm_castps_si128(blendv_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(mask)));
}
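
// Quick reference for the blend helpers above (illustrative note, not from the
// original header): the *_mask_* variants assume 'mask' is a full lane mask (each
// lane all-0s or all-1s, e.g. a compare result), so a plain and/andnot/or suffices.
// blendv_ps() assumes only the sign bit of each lane is meaningful and replicates
// bit 31 down the lane first, matching _mm_blendv_ps() semantics. For example,
// with mask lanes of 0x80000000, blendv_ps(a, b, mask) selects b in every lane,
// while blendv_mask_ps(a, b, mask) would only merge the sign bits.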

#if CPPSPMD_SSE2
CPPSPMD_FORCE_INLINE int extract_x(const __m128i& vec) { return _mm_cvtsi128_si32(vec); }
CPPSPMD_FORCE_INLINE int extract_y(const __m128i& vec) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(vec, 0x55)); }
CPPSPMD_FORCE_INLINE int extract_z(const __m128i& vec) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(vec, 0xAA)); }
CPPSPMD_FORCE_INLINE int extract_w(const __m128i& vec) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(vec, 0xFF)); }

// Returns float bits as int, to emulate _mm_extract_ps()
CPPSPMD_FORCE_INLINE int extract_ps_x(const __m128& vec) { float f = _mm_cvtss_f32(vec); return *(const int*)&f; }
CPPSPMD_FORCE_INLINE int extract_ps_y(const __m128& vec) { float f = _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0x55)); return *(const int*)&f; }
CPPSPMD_FORCE_INLINE int extract_ps_z(const __m128& vec) { float f = _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xAA)); return *(const int*)&f; }
CPPSPMD_FORCE_INLINE int extract_ps_w(const __m128& vec) { float f = _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xFF)); return *(const int*)&f; }

// Returns floats
CPPSPMD_FORCE_INLINE float extractf_ps_x(const __m128& vec) { return _mm_cvtss_f32(vec); }
CPPSPMD_FORCE_INLINE float extractf_ps_y(const __m128& vec) { return _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0x55)); }
CPPSPMD_FORCE_INLINE float extractf_ps_z(const __m128& vec) { return _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xAA)); }
CPPSPMD_FORCE_INLINE float extractf_ps_w(const __m128& vec) { return _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xFF)); }
#else
CPPSPMD_FORCE_INLINE int extract_x(const __m128i& vec) { return _mm_extract_epi32(vec, 0); }
CPPSPMD_FORCE_INLINE int extract_y(const __m128i& vec) { return _mm_extract_epi32(vec, 1); }
CPPSPMD_FORCE_INLINE int extract_z(const __m128i& vec) { return _mm_extract_epi32(vec, 2); }
CPPSPMD_FORCE_INLINE int extract_w(const __m128i& vec) { return _mm_extract_epi32(vec, 3); }

// Returns float bits as int
CPPSPMD_FORCE_INLINE int extract_ps_x(const __m128& vec) { return _mm_extract_ps(vec, 0); }
CPPSPMD_FORCE_INLINE int extract_ps_y(const __m128& vec) { return _mm_extract_ps(vec, 1); }
CPPSPMD_FORCE_INLINE int extract_ps_z(const __m128& vec) { return _mm_extract_ps(vec, 2); }
CPPSPMD_FORCE_INLINE int extract_ps_w(const __m128& vec) { return _mm_extract_ps(vec, 3); }

// Returns floats
CPPSPMD_FORCE_INLINE float extractf_ps_x(const __m128& vec) { int v = extract_ps_x(vec); return *(const float*)&v; }
CPPSPMD_FORCE_INLINE float extractf_ps_y(const __m128& vec) { int v = extract_ps_y(vec); return *(const float*)&v; }
CPPSPMD_FORCE_INLINE float extractf_ps_z(const __m128& vec) { int v = extract_ps_z(vec); return *(const float*)&v; }
CPPSPMD_FORCE_INLINE float extractf_ps_w(const __m128& vec) { int v = extract_ps_w(vec); return *(const float*)&v; }
#endif

#if CPPSPMD_SSE2
CPPSPMD_FORCE_INLINE __m128i insert_x(const __m128i& vec, int v) { return _mm_insert_epi16(_mm_insert_epi16(vec, v, 0), (uint32_t)v >> 16U, 1); }
CPPSPMD_FORCE_INLINE __m128i insert_y(const __m128i& vec, int v) { return _mm_insert_epi16(_mm_insert_epi16(vec, v, 2), (uint32_t)v >> 16U, 3); }
CPPSPMD_FORCE_INLINE __m128i insert_z(const __m128i& vec, int v) { return _mm_insert_epi16(_mm_insert_epi16(vec, v, 4), (uint32_t)v >> 16U, 5); }
CPPSPMD_FORCE_INLINE __m128i insert_w(const __m128i& vec, int v) { return _mm_insert_epi16(_mm_insert_epi16(vec, v, 6), (uint32_t)v >> 16U, 7); }
#else
CPPSPMD_FORCE_INLINE __m128i insert_x(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 0); }
CPPSPMD_FORCE_INLINE __m128i insert_y(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 1); }
CPPSPMD_FORCE_INLINE __m128i insert_z(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 2); }
CPPSPMD_FORCE_INLINE __m128i insert_w(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 3); }
#endif

#if CPPSPMD_SSE2
inline __m128i shuffle_epi8(const __m128i& a, const __m128i& b)
{
	// Just emulate _mm_shuffle_epi8. This is very slow, but what else can we do?
	CPPSPMD_ALIGN(16) uint8_t av[16];
	_mm_store_si128((__m128i*)av, a);

	CPPSPMD_ALIGN(16) uint8_t bvi[16];
	_mm_store_ps((float*)bvi, _mm_and_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(_mm_set1_epi32(0x0F0F0F0F))));

	CPPSPMD_ALIGN(16) uint8_t result[16];

	result[0] = av[bvi[0]];
	result[1] = av[bvi[1]];
	result[2] = av[bvi[2]];
	result[3] = av[bvi[3]];

	result[4] = av[bvi[4]];
	result[5] = av[bvi[5]];
	result[6] = av[bvi[6]];
	result[7] = av[bvi[7]];

	result[8] = av[bvi[8]];
	result[9] = av[bvi[9]];
	result[10] = av[bvi[10]];
	result[11] = av[bvi[11]];

	result[12] = av[bvi[12]];
	result[13] = av[bvi[13]];
	result[14] = av[bvi[14]];
	result[15] = av[bvi[15]];

	return _mm_andnot_si128(_mm_cmplt_epi8(b, _mm_setzero_si128()), _mm_load_si128((__m128i*)result));
}
#else
CPPSPMD_FORCE_INLINE __m128i shuffle_epi8(const __m128i& a, const __m128i& b)
{
	return _mm_shuffle_epi8(a, b);
}
#endif

#if CPPSPMD_SSE2
CPPSPMD_FORCE_INLINE __m128i min_epi32(__m128i a, __m128i b)
{
	return blendv_mask_epi32(b, a, _mm_cmplt_epi32(a, b));
}
CPPSPMD_FORCE_INLINE __m128i max_epi32(__m128i a, __m128i b)
{
	return blendv_mask_epi32(b, a, _mm_cmpgt_epi32(a, b));
}
CPPSPMD_FORCE_INLINE __m128i min_epu32(__m128i a, __m128i b)
{
	__m128i n = _mm_set1_epi32(0x80000000);
	__m128i ac = _mm_add_epi32(a, n);
	__m128i bc = _mm_add_epi32(b, n);
	return blendv_mask_epi32(b, a, _mm_cmplt_epi32(ac, bc));
}
CPPSPMD_FORCE_INLINE __m128i max_epu32(__m128i a, __m128i b)
{
	__m128i n = _mm_set1_epi32(0x80000000);
	__m128i ac = _mm_add_epi32(a, n);
	__m128i bc = _mm_add_epi32(b, n);
	return blendv_mask_epi32(b, a, _mm_cmpgt_epi32(ac, bc));
}
#else
CPPSPMD_FORCE_INLINE __m128i min_epi32(__m128i a, __m128i b)
{
	return _mm_min_epi32(a, b);
}
CPPSPMD_FORCE_INLINE __m128i max_epi32(__m128i a, __m128i b)
{
	return _mm_max_epi32(a, b);
}
CPPSPMD_FORCE_INLINE __m128i min_epu32(__m128i a, __m128i b)
{
	return _mm_min_epu32(a, b);
}
CPPSPMD_FORCE_INLINE __m128i max_epu32(__m128i a, __m128i b)
{
	return _mm_max_epu32(a, b);
}
#endif

#if CPPSPMD_SSE2
CPPSPMD_FORCE_INLINE __m128i abs_epi32(__m128i a)
{
	__m128i sign_mask = _mm_srai_epi32(a, 31);
	return _mm_sub_epi32(_mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(sign_mask))), sign_mask);
}
#else
CPPSPMD_FORCE_INLINE __m128i abs_epi32(__m128i a)
{
	return _mm_abs_epi32(a);
}
#endif

#if CPPSPMD_SSE2
CPPSPMD_FORCE_INLINE __m128i mullo_epi32(__m128i a, __m128i b)
{
	__m128i tmp1 = _mm_mul_epu32(a, b);
	__m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
	return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0)));
}
#else
CPPSPMD_FORCE_INLINE __m128i mullo_epi32(__m128i a, __m128i b)
{
	return _mm_mullo_epi32(a, b);
}
#endif

CPPSPMD_FORCE_INLINE __m128i mulhi_epu32(__m128i a, __m128i b)
{
	__m128i tmp1 = _mm_mul_epu32(a, b);
	__m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
	return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 3, 1)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 3, 1)));
}
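
// How the SSE2 mullo_epi32() fallback above works (illustrative note, not from the
// original header): _mm_mul_epu32 computes 64-bit products of lanes 0 and 2;
// shifting both inputs right by 4 bytes does the same for lanes 1 and 3. The
// _MM_SHUFFLE(0, 0, 2, 0) shuffles gather the low 32 bits of each product, and
// unpacklo interleaves them back into lane order 0,1,2,3. mulhi_epu32() is the
// same trick, keeping the high halves via _MM_SHUFFLE(0, 0, 3, 1).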

#if CPPSPMD_SSE2
inline __m128i load_rgba32(const void* p)
{
	__m128i xmm = _mm_cvtsi32_si128(*(const int*)p);
	xmm = _mm_unpacklo_epi8(xmm, _mm_setzero_si128());
	xmm = _mm_unpacklo_epi16(xmm, _mm_setzero_si128());
	return xmm;
}
#else
inline __m128i load_rgba32(const void* p)
{
	return _mm_cvtepu8_epi32(_mm_castps_si128(_mm_load_ss((const float*)p)));
}
#endif

inline void transpose4x4(__m128i& x, __m128i& y, __m128i& z, __m128i& w, const __m128i& r0, const __m128i& r1, const __m128i& r2, const __m128i& r3)
{
	__m128i t0 = _mm_unpacklo_epi32(r0, r1);
	__m128i t1 = _mm_unpacklo_epi32(r2, r3);
	__m128i t2 = _mm_unpackhi_epi32(r0, r1);
	__m128i t3 = _mm_unpackhi_epi32(r2, r3);
	x = _mm_unpacklo_epi64(t0, t1);
	y = _mm_unpackhi_epi64(t0, t1);
	z = _mm_unpacklo_epi64(t2, t3);
	w = _mm_unpackhi_epi64(t2, t3);
}
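
// Illustrative example (not part of the original header): load_rgba32() expands one
// RGBA32 pixel into four uint32 lanes, so loading four pixels and transposing yields
// one vector per channel. Assumes pPixels points at 4 consecutive RGBA32 pixels.
inline void example_load_rgba32_channels(const void* pPixels, __m128i& r, __m128i& g, __m128i& b, __m128i& a)
{
	const uint8_t* p = (const uint8_t*)pPixels;
	transpose4x4(r, g, b, a, load_rgba32(p), load_rgba32(p + 4), load_rgba32(p + 8), load_rgba32(p + 12));
}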

const uint32_t ALL_ON_MOVEMASK = 0xF;

struct spmd_kernel
{
	struct vint;
	struct lint;
	struct vbool;
	struct vfloat;

	typedef int int_t;
	typedef vint vint_t;
	typedef lint lint_t;

	// Exec mask
	struct exec_mask
	{
		__m128i m_mask;

		exec_mask() = default;

		CPPSPMD_FORCE_INLINE explicit exec_mask(const vbool& b);
		CPPSPMD_FORCE_INLINE explicit exec_mask(const __m128i& mask) : m_mask(mask) { }

		CPPSPMD_FORCE_INLINE void enable_lane(uint32_t lane) { m_mask = _mm_load_si128((const __m128i *)&g_lane_masks_128[lane][0]); }

		static CPPSPMD_FORCE_INLINE exec_mask all_on() { return exec_mask{ _mm_load_si128((const __m128i*)g_allones_128) }; }
		static CPPSPMD_FORCE_INLINE exec_mask all_off() { return exec_mask{ _mm_setzero_si128() }; }

		CPPSPMD_FORCE_INLINE uint32_t get_movemask() const { return _mm_movemask_ps(_mm_castsi128_ps(m_mask)); }
	};

	friend CPPSPMD_FORCE_INLINE bool all(const exec_mask& e);
	friend CPPSPMD_FORCE_INLINE bool any(const exec_mask& e);

	CPPSPMD_FORCE_INLINE bool spmd_all() const { return all(m_exec); }
	CPPSPMD_FORCE_INLINE bool spmd_any() const { return any(m_exec); }
	CPPSPMD_FORCE_INLINE bool spmd_none() { return !any(m_exec); }

	// true if cond is true for all active lanes - false if no active lanes
	CPPSPMD_FORCE_INLINE bool spmd_all(const vbool& e) { uint32_t m = m_exec.get_movemask(); return (m != 0) && ((exec_mask(e) & m_exec).get_movemask() == m); }
	// true if cond is true for any active lanes
	CPPSPMD_FORCE_INLINE bool spmd_any(const vbool& e) { return (exec_mask(e) & m_exec).get_movemask() != 0; }
	CPPSPMD_FORCE_INLINE bool spmd_none(const vbool& e) { return !spmd_any(e); }

	friend CPPSPMD_FORCE_INLINE exec_mask operator^ (const exec_mask& a, const exec_mask& b);
	friend CPPSPMD_FORCE_INLINE exec_mask operator& (const exec_mask& a, const exec_mask& b);
	friend CPPSPMD_FORCE_INLINE exec_mask operator| (const exec_mask& a, const exec_mask& b);

	exec_mask m_exec;
	exec_mask m_kernel_exec;
	exec_mask m_continue_mask;
#ifdef _DEBUG
	bool m_in_loop;
#endif

	CPPSPMD_FORCE_INLINE uint32_t get_movemask() const { return m_exec.get_movemask(); }

	void init(const exec_mask& kernel_exec);

	// Varying bool
	struct vbool
	{
		__m128i m_value;

		vbool() = default;

		CPPSPMD_FORCE_INLINE vbool(bool value) : m_value(_mm_set1_epi32(value ? UINT32_MAX : 0)) { }

		CPPSPMD_FORCE_INLINE explicit vbool(const __m128i& value) : m_value(value) { }

		CPPSPMD_FORCE_INLINE explicit operator vfloat() const;
		CPPSPMD_FORCE_INLINE explicit operator vint() const;

	private:
		//vbool& operator=(const vbool&);
	};

	friend vbool operator!(const vbool& v);

	CPPSPMD_FORCE_INLINE vbool& store(vbool& dst, const vbool& src)
	{
		dst.m_value = blendv_mask_epi32(dst.m_value, src.m_value, m_exec.m_mask);
		return dst;
	}

	CPPSPMD_FORCE_INLINE vbool& store_all(vbool& dst, const vbool& src)
	{
		dst.m_value = src.m_value;
		return dst;
	}
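
	// Semantics sketch (illustrative, not from the original header): store() writes
	// only the lanes enabled in the current exec mask, store_all() writes every lane.
	// This is what makes assignment inside divergent control flow behave per-lane:
	//
	//	vbool result = false;
	//	spmd_if(x > 0, [&] { store(result, vbool(true)); });	// 'x' is a hypothetical vint;
	//	// only the x > 0 lanes of 'result' become true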

	// Varying float
	struct vfloat
	{
		__m128 m_value;

		vfloat() = default;

		CPPSPMD_FORCE_INLINE explicit vfloat(const __m128& v) : m_value(v) { }

		CPPSPMD_FORCE_INLINE vfloat(float value) : m_value(_mm_set1_ps(value)) { }

		CPPSPMD_FORCE_INLINE explicit vfloat(int value) : m_value(_mm_set1_ps((float)value)) { }

	private:
		//vfloat& operator=(const vfloat&);
	};

	CPPSPMD_FORCE_INLINE vfloat& store(vfloat& dst, const vfloat& src)
	{
		dst.m_value = blendv_mask_ps(dst.m_value, src.m_value, _mm_castsi128_ps(m_exec.m_mask));
		return dst;
	}

	CPPSPMD_FORCE_INLINE vfloat& store(vfloat&& dst, const vfloat& src)
	{
		dst.m_value = blendv_mask_ps(dst.m_value, src.m_value, _mm_castsi128_ps(m_exec.m_mask));
		return dst;
	}

	CPPSPMD_FORCE_INLINE vfloat& store_all(vfloat& dst, const vfloat& src)
	{
		dst.m_value = src.m_value;
		return dst;
	}

	CPPSPMD_FORCE_INLINE vfloat& store_all(vfloat&& dst, const vfloat& src)
	{
		dst.m_value = src.m_value;
		return dst;
	}

	// Linear ref to floats
	struct float_lref
	{
		float* m_pValue;

	private:
		//float_lref& operator=(const float_lref&);
	};

	CPPSPMD_FORCE_INLINE const float_lref& store(const float_lref& dst, const vfloat& src)
	{
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
		if (mask == ALL_ON_MOVEMASK)
			_mm_storeu_ps(dst.m_pValue, src.m_value);
		else
			_mm_storeu_ps(dst.m_pValue, blendv_mask_ps(_mm_loadu_ps(dst.m_pValue), src.m_value, _mm_castsi128_ps(m_exec.m_mask)));
		return dst;
	}

	CPPSPMD_FORCE_INLINE const float_lref& store(const float_lref&& dst, const vfloat& src)
	{
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
		if (mask == ALL_ON_MOVEMASK)
			_mm_storeu_ps(dst.m_pValue, src.m_value);
		else
			_mm_storeu_ps(dst.m_pValue, blendv_mask_ps(_mm_loadu_ps(dst.m_pValue), src.m_value, _mm_castsi128_ps(m_exec.m_mask)));
		return dst;
	}

	CPPSPMD_FORCE_INLINE const float_lref& store_all(const float_lref& dst, const vfloat& src)
	{
		_mm_storeu_ps(dst.m_pValue, src.m_value);
		return dst;
	}

	CPPSPMD_FORCE_INLINE const float_lref& store_all(const float_lref&& dst, const vfloat& src)
	{
		_mm_storeu_ps(dst.m_pValue, src.m_value);
		return dst;
	}

	CPPSPMD_FORCE_INLINE vfloat load(const float_lref& src)
	{
		return vfloat{ _mm_and_ps(_mm_loadu_ps(src.m_pValue), _mm_castsi128_ps(m_exec.m_mask)) };
	}

	// Varying ref to floats
	struct float_vref
	{
		__m128i m_vindex;
		float* m_pValue;

	private:
		//float_vref& operator=(const float_vref&);
	};

	// Varying ref to varying float
	struct vfloat_vref
	{
		__m128i m_vindex;
		vfloat* m_pValue;

	private:
		//vfloat_vref& operator=(const vfloat_vref&);
	};

	// Varying ref to varying int
	struct vint_vref
	{
		__m128i m_vindex;
		vint* m_pValue;

	private:
		//vint_vref& operator=(const vint_vref&);
	};

	CPPSPMD_FORCE_INLINE const float_vref& store(const float_vref& dst, const vfloat& src);
	CPPSPMD_FORCE_INLINE const float_vref& store(const float_vref&& dst, const vfloat& src);

	CPPSPMD_FORCE_INLINE const float_vref& store_all(const float_vref& dst, const vfloat& src);
	CPPSPMD_FORCE_INLINE const float_vref& store_all(const float_vref&& dst, const vfloat& src);

	CPPSPMD_FORCE_INLINE vfloat load(const float_vref& src)
	{
		CPPSPMD_ALIGN(16) int vindex[4];
		_mm_store_si128((__m128i *)vindex, src.m_vindex);

		CPPSPMD_ALIGN(16) float loaded[4];

		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
		for (int i = 0; i < 4; i++)
		{
			if (mask & (1 << i))
				loaded[i] = src.m_pValue[vindex[i]];
		}
		return vfloat{ _mm_and_ps(_mm_castsi128_ps(m_exec.m_mask), _mm_load_ps((const float*)loaded)) };
	}

	CPPSPMD_FORCE_INLINE vfloat load_all(const float_vref& src)
	{
		CPPSPMD_ALIGN(16) int vindex[4];
		_mm_store_si128((__m128i *)vindex, src.m_vindex);

		CPPSPMD_ALIGN(16) float loaded[4];

		for (int i = 0; i < 4; i++)
			loaded[i] = src.m_pValue[vindex[i]];
		return vfloat{ _mm_load_ps((const float*)loaded) };
	}
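
	// Implementation note (illustrative, not from the original header): SSE has no
	// gather/scatter instructions, so the varying-ref loads and stores here move the
	// indices to memory and perform up to 4 scalar accesses, skipping lanes that are
	// off in the exec mask. A masked load() returns 0 in inactive lanes.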

	// Linear ref to ints
	struct int_lref
	{
		int* m_pValue;

	private:
		//int_lref& operator=(const int_lref&);
	};

	CPPSPMD_FORCE_INLINE const int_lref& store(const int_lref& dst, const vint& src)
	{
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
		if (mask == ALL_ON_MOVEMASK)
		{
			_mm_storeu_si128((__m128i *)dst.m_pValue, src.m_value);
		}
		else
		{
			CPPSPMD_ALIGN(16) int stored[4];
			_mm_store_si128((__m128i *)stored, src.m_value);

			for (int i = 0; i < 4; i++)
			{
				if (mask & (1 << i))
					dst.m_pValue[i] = stored[i];
			}
		}
		return dst;
	}

	CPPSPMD_FORCE_INLINE vint load(const int_lref& src)
	{
		__m128i v = _mm_loadu_si128((const __m128i*)src.m_pValue);

		v = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(m_exec.m_mask)));

		return vint{ v };
	}

	// Linear ref to int16's
	struct int16_lref
	{
		int16_t* m_pValue;

	private:
		//int16_lref& operator=(const int16_lref&);
	};

	CPPSPMD_FORCE_INLINE const int16_lref& store(const int16_lref& dst, const vint& src)
	{
		CPPSPMD_ALIGN(16) int stored[4];
		_mm_store_si128((__m128i *)stored, src.m_value);

		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
		for (int i = 0; i < 4; i++)
		{
			if (mask & (1 << i))
				dst.m_pValue[i] = static_cast<int16_t>(stored[i]);
		}
		return dst;
	}

	CPPSPMD_FORCE_INLINE const int16_lref& store_all(const int16_lref& dst, const vint& src)
	{
		CPPSPMD_ALIGN(16) int stored[4];
		_mm_store_si128((__m128i *)stored, src.m_value);

		for (int i = 0; i < 4; i++)
			dst.m_pValue[i] = static_cast<int16_t>(stored[i]);
		return dst;
	}

	CPPSPMD_FORCE_INLINE vint load(const int16_lref& src)
	{
		CPPSPMD_ALIGN(16) int values[4];

		for (int i = 0; i < 4; i++)
			values[i] = static_cast<int16_t>(src.m_pValue[i]);

		__m128i t = _mm_load_si128( (const __m128i *)values );

		return vint{ _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps( t ), _mm_castsi128_ps(m_exec.m_mask))) };
	}

	CPPSPMD_FORCE_INLINE vint load_all(const int16_lref& src)
	{
		CPPSPMD_ALIGN(16) int values[4];

		for (int i = 0; i < 4; i++)
			values[i] = static_cast<int16_t>(src.m_pValue[i]);

		__m128i t = _mm_load_si128( (const __m128i *)values );

		return vint{ t };
	}

	// Linear ref to constant ints
	struct cint_lref
	{
		const int* m_pValue;

	private:
		//cint_lref& operator=(const cint_lref&);
	};

	CPPSPMD_FORCE_INLINE vint load(const cint_lref& src)
	{
		__m128i v = _mm_loadu_si128((const __m128i *)src.m_pValue);
		v = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(m_exec.m_mask)));
		return vint{ v };
	}

	CPPSPMD_FORCE_INLINE vint load_all(const cint_lref& src)
	{
		return vint{ _mm_loadu_si128((const __m128i *)src.m_pValue) };
	}

	// Varying ref to ints
	struct int_vref
	{
		__m128i m_vindex;
		int* m_pValue;

	private:
		//int_vref& operator=(const int_vref&);
	};

	// Varying ref to constant ints
	struct cint_vref
	{
		__m128i m_vindex;
		const int* m_pValue;

	private:
		//cint_vref& operator=(const cint_vref&);
	};

	// Varying int
	struct vint
	{
		__m128i m_value;

		vint() = default;

		CPPSPMD_FORCE_INLINE explicit vint(const __m128i& value) : m_value(value) { }

		CPPSPMD_FORCE_INLINE explicit vint(const lint &other) : m_value(other.m_value) { }

		CPPSPMD_FORCE_INLINE vint& operator=(const lint& other) { m_value = other.m_value; return *this; }

		CPPSPMD_FORCE_INLINE vint(int value) : m_value(_mm_set1_epi32(value)) { }

		CPPSPMD_FORCE_INLINE explicit vint(float value) : m_value(_mm_set1_epi32((int)value)) { }

		CPPSPMD_FORCE_INLINE explicit vint(const vfloat& other) : m_value(_mm_cvttps_epi32(other.m_value)) { }

		CPPSPMD_FORCE_INLINE explicit operator vbool() const
		{
			return vbool{ _mm_xor_si128( _mm_load_si128((const __m128i*)g_allones_128), _mm_cmpeq_epi32(m_value, _mm_setzero_si128())) };
		}

		CPPSPMD_FORCE_INLINE explicit operator vfloat() const
		{
			return vfloat{ _mm_cvtepi32_ps(m_value) };
		}

		CPPSPMD_FORCE_INLINE int_vref operator[](int* ptr) const
		{
			return int_vref{ m_value, ptr };
		}

		CPPSPMD_FORCE_INLINE cint_vref operator[](const int* ptr) const
		{
			return cint_vref{ m_value, ptr };
		}

		CPPSPMD_FORCE_INLINE float_vref operator[](float* ptr) const
		{
			return float_vref{ m_value, ptr };
		}

		CPPSPMD_FORCE_INLINE vfloat_vref operator[](vfloat* ptr) const
		{
			return vfloat_vref{ m_value, ptr };
		}

		CPPSPMD_FORCE_INLINE vint_vref operator[](vint* ptr) const
		{
			return vint_vref{ m_value, ptr };
		}

	private:
		//vint& operator=(const vint&);
	};
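
	// Usage sketch (illustrative, not from the original header): indexing a pointer
	// with a vint builds a varying reference, which load()/store() then treat as a
	// gather/scatter ("pFloats"/"pInts" are hypothetical):
	//
	//	vint idx = ...;					// per-lane element indices
	//	vfloat v = load(idx[pFloats]);	// gathers pFloats[idx[lane]] per active lane
	//	store(idx[pInts], vint(1));		// scatters 1 to pInts[idx[lane]] per active lane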

	// Load/store linear int
	CPPSPMD_FORCE_INLINE void storeu_linear(int *pDst, const vint& src)
	{
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
		if (mask == ALL_ON_MOVEMASK)
			_mm_storeu_si128((__m128i *)pDst, src.m_value);
		else
		{
			if (mask & 1) pDst[0] = extract_x(src.m_value);
			if (mask & 2) pDst[1] = extract_y(src.m_value);
			if (mask & 4) pDst[2] = extract_z(src.m_value);
			if (mask & 8) pDst[3] = extract_w(src.m_value);
		}
	}

	CPPSPMD_FORCE_INLINE void storeu_linear_all(int *pDst, const vint& src)
	{
		_mm_storeu_si128((__m128i*)pDst, src.m_value);
	}

	CPPSPMD_FORCE_INLINE void store_linear_all(int *pDst, const vint& src)
	{
		_mm_store_si128((__m128i*)pDst, src.m_value);
	}

	CPPSPMD_FORCE_INLINE vint loadu_linear(const int *pSrc)
	{
		__m128i v = _mm_loadu_si128((const __m128i*)pSrc);

		v = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(m_exec.m_mask)));

		return vint{ v };
	}

	CPPSPMD_FORCE_INLINE vint loadu_linear_all(const int *pSrc)
	{
		return vint{ _mm_loadu_si128((__m128i*)pSrc) };
	}

	CPPSPMD_FORCE_INLINE vint load_linear_all(const int *pSrc)
	{
		return vint{ _mm_load_si128((__m128i*)pSrc) };
	}

	// Load/store linear float
	CPPSPMD_FORCE_INLINE void storeu_linear(float *pDst, const vfloat& src)
	{
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
		if (mask == ALL_ON_MOVEMASK)
			_mm_storeu_ps((float*)pDst, src.m_value);
		else
		{
			int *pDstI = (int *)pDst;
			if (mask & 1) pDstI[0] = extract_ps_x(src.m_value);
			if (mask & 2) pDstI[1] = extract_ps_y(src.m_value);
			if (mask & 4) pDstI[2] = extract_ps_z(src.m_value);
			if (mask & 8) pDstI[3] = extract_ps_w(src.m_value);
		}
	}

	CPPSPMD_FORCE_INLINE void storeu_linear_all(float *pDst, const vfloat& src)
	{
		_mm_storeu_ps((float*)pDst, src.m_value);
	}

	CPPSPMD_FORCE_INLINE void store_linear_all(float *pDst, const vfloat& src)
	{
		_mm_store_ps((float*)pDst, src.m_value);
	}

	CPPSPMD_FORCE_INLINE vfloat loadu_linear(const float *pSrc)
	{
		__m128 v = _mm_loadu_ps((const float*)pSrc);

		v = _mm_and_ps(v, _mm_castsi128_ps(m_exec.m_mask));

		return vfloat{ v };
	}

	CPPSPMD_FORCE_INLINE vfloat loadu_linear_all(const float *pSrc)
	{
		return vfloat{ _mm_loadu_ps((float*)pSrc) };
	}

	CPPSPMD_FORCE_INLINE vfloat load_linear_all(const float *pSrc)
	{
		return vfloat{ _mm_load_ps((float*)pSrc) };
	}

	CPPSPMD_FORCE_INLINE vint& store(vint& dst, const vint& src)
	{
		dst.m_value = blendv_mask_epi32(dst.m_value, src.m_value, m_exec.m_mask);
		return dst;
	}

	CPPSPMD_FORCE_INLINE const int_vref& store(const int_vref& dst, const vint& src)
	{
		CPPSPMD_ALIGN(16) int vindex[4];
		_mm_store_si128((__m128i*)vindex, dst.m_vindex);

		CPPSPMD_ALIGN(16) int stored[4];
		_mm_store_si128((__m128i*)stored, src.m_value);

		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
		for (int i = 0; i < 4; i++)
		{
			if (mask & (1 << i))
				dst.m_pValue[vindex[i]] = stored[i];
		}
		return dst;
	}

	CPPSPMD_FORCE_INLINE vint& store_all(vint& dst, const vint& src)
	{
		dst.m_value = src.m_value;
		return dst;
	}

	CPPSPMD_FORCE_INLINE const int_vref& store_all(const int_vref& dst, const vint& src)
	{
		CPPSPMD_ALIGN(16) int vindex[4];
		_mm_store_si128((__m128i*)vindex, dst.m_vindex);

		CPPSPMD_ALIGN(16) int stored[4];
		_mm_store_si128((__m128i*)stored, src.m_value);

		for (int i = 0; i < 4; i++)
			dst.m_pValue[vindex[i]] = stored[i];

		return dst;
	}

	CPPSPMD_FORCE_INLINE vint load(const int_vref& src)
	{
		CPPSPMD_ALIGN(16) int values[4];

		CPPSPMD_ALIGN(16) int indices[4];
		_mm_store_si128((__m128i *)indices, src.m_vindex);

		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
		for (int i = 0; i < 4; i++)
		{
			if (mask & (1 << i))
				values[i] = src.m_pValue[indices[i]];
		}

		return vint{ _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(m_exec.m_mask), _mm_load_ps((const float*)values))) };
	}

	CPPSPMD_FORCE_INLINE vint load_all(const int_vref& src)
	{
		CPPSPMD_ALIGN(16) int values[4];

		CPPSPMD_ALIGN(16) int indices[4];
		_mm_store_si128((__m128i *)indices, src.m_vindex);

		for (int i = 0; i < 4; i++)
			values[i] = src.m_pValue[indices[i]];

		return vint{ _mm_castps_si128( _mm_load_ps((const float*)values)) };
	}

	CPPSPMD_FORCE_INLINE vint load(const cint_vref& src)
	{
		CPPSPMD_ALIGN(16) int values[4];

		CPPSPMD_ALIGN(16) int indices[4];
		_mm_store_si128((__m128i *)indices, src.m_vindex);

		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
		for (int i = 0; i < 4; i++)
		{
			if (mask & (1 << i))
				values[i] = src.m_pValue[indices[i]];
		}

		return vint{ _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(m_exec.m_mask), _mm_load_ps((const float*)values))) };
	}

	CPPSPMD_FORCE_INLINE vint load_all(const cint_vref& src)
	{
		CPPSPMD_ALIGN(16) int values[4];

		CPPSPMD_ALIGN(16) int indices[4];
		_mm_store_si128((__m128i *)indices, src.m_vindex);

		for (int i = 0; i < 4; i++)
			values[i] = src.m_pValue[indices[i]];

		return vint{ _mm_castps_si128( _mm_load_ps((const float*)values)) };
	}

	CPPSPMD_FORCE_INLINE vint load_bytes_all(const cint_vref& src)
	{
		__m128i v0_l;

		const uint8_t* pSrc = (const uint8_t*)src.m_pValue;
		v0_l = insert_x(_mm_undefined_si128(), ((int*)(pSrc + extract_x(src.m_vindex)))[0]);
		v0_l = insert_y(v0_l, ((int*)(pSrc + extract_y(src.m_vindex)))[0]);
		v0_l = insert_z(v0_l, ((int*)(pSrc + extract_z(src.m_vindex)))[0]);
		v0_l = insert_w(v0_l, ((int*)(pSrc + extract_w(src.m_vindex)))[0]);

		return vint{ v0_l };
	}

	CPPSPMD_FORCE_INLINE vint load_words_all(const cint_vref& src)
	{
		__m128i v0_l;

		const uint8_t* pSrc = (const uint8_t*)src.m_pValue;
		v0_l = insert_x(_mm_undefined_si128(), ((int16_t*)(pSrc + 2 * extract_x(src.m_vindex)))[0]);
		v0_l = insert_y(v0_l, ((int16_t*)(pSrc + 2 * extract_y(src.m_vindex)))[0]);
		v0_l = insert_z(v0_l, ((int16_t*)(pSrc + 2 * extract_z(src.m_vindex)))[0]);
		v0_l = insert_w(v0_l, ((int16_t*)(pSrc + 2 * extract_w(src.m_vindex)))[0]);

		return vint{ v0_l };
	}

	CPPSPMD_FORCE_INLINE void store_strided(int *pDst, uint32_t stride, const vint &v)
	{
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));

		if (mask & 1) pDst[0] = extract_x(v.m_value);
		if (mask & 2) pDst[stride] = extract_y(v.m_value);
		if (mask & 4) pDst[stride*2] = extract_z(v.m_value);
		if (mask & 8) pDst[stride*3] = extract_w(v.m_value);
	}

	CPPSPMD_FORCE_INLINE void store_strided(float *pDstF, uint32_t stride, const vfloat &v)
	{
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));

		if (mask & 1) ((int *)pDstF)[0] = extract_ps_x(v.m_value);
		if (mask & 2) ((int *)pDstF)[stride] = extract_ps_y(v.m_value);
		if (mask & 4) ((int *)pDstF)[stride*2] = extract_ps_z(v.m_value);
		if (mask & 8) ((int *)pDstF)[stride*3] = extract_ps_w(v.m_value);
	}

	CPPSPMD_FORCE_INLINE void store_all_strided(int *pDst, uint32_t stride, const vint &v)
	{
		pDst[0] = extract_x(v.m_value);
		pDst[stride] = extract_y(v.m_value);
		pDst[stride*2] = extract_z(v.m_value);
		pDst[stride*3] = extract_w(v.m_value);
	}

	CPPSPMD_FORCE_INLINE void store_all_strided(float *pDstF, uint32_t stride, const vfloat &v)
	{
		((int *)pDstF)[0] = extract_ps_x(v.m_value);
		((int *)pDstF)[stride] = extract_ps_y(v.m_value);
		((int *)pDstF)[stride*2] = extract_ps_z(v.m_value);
		((int *)pDstF)[stride*3] = extract_ps_w(v.m_value);
	}

	CPPSPMD_FORCE_INLINE vint load_strided(const int *pSrc, uint32_t stride)
	{
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));

#if CPPSPMD_SSE2
		CPPSPMD_ALIGN(16) int vals[4] = { 0, 0, 0, 0 };
		if (mask & 1) vals[0] = pSrc[0];
		if (mask & 2) vals[1] = pSrc[stride];
		if (mask & 4) vals[2] = pSrc[stride * 2];
		if (mask & 8) vals[3] = pSrc[stride * 3];
		return vint{ _mm_load_si128((__m128i*)vals) };
#else
		const float* pSrcF = (const float*)pSrc;
		__m128 v = _mm_setzero_ps();
		if (mask & 1) v = _mm_load_ss(pSrcF);
		if (mask & 2) v = _mm_insert_ps(v, _mm_load_ss(pSrcF + stride), 0x10);
		if (mask & 4) v = _mm_insert_ps(v, _mm_load_ss(pSrcF + 2 * stride), 0x20);
		if (mask & 8) v = _mm_insert_ps(v, _mm_load_ss(pSrcF + 3 * stride), 0x30);
		return vint{ _mm_castps_si128(v) };
#endif
	}

	CPPSPMD_FORCE_INLINE vfloat load_strided(const float *pSrc, uint32_t stride)
	{
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));

#if CPPSPMD_SSE2
		CPPSPMD_ALIGN(16) float vals[4] = { 0, 0, 0, 0 };
		if (mask & 1) vals[0] = pSrc[0];
		if (mask & 2) vals[1] = pSrc[stride];
		if (mask & 4) vals[2] = pSrc[stride * 2];
		if (mask & 8) vals[3] = pSrc[stride * 3];
		return vfloat{ _mm_load_ps(vals) };
#else
		__m128 v = _mm_setzero_ps();
		if (mask & 1) v = _mm_load_ss(pSrc);
		if (mask & 2) v = _mm_insert_ps(v, _mm_load_ss(pSrc + stride), 0x10);
		if (mask & 4) v = _mm_insert_ps(v, _mm_load_ss(pSrc + 2 * stride), 0x20);
		if (mask & 8) v = _mm_insert_ps(v, _mm_load_ss(pSrc + 3 * stride), 0x30);
		return vfloat{ v };
#endif
	}

	CPPSPMD_FORCE_INLINE vint load_all_strided(const int *pSrc, uint32_t stride)
	{
#if CPPSPMD_SSE2
		CPPSPMD_ALIGN(16) int vals[4];
		vals[0] = pSrc[0];
		vals[1] = pSrc[stride];
		vals[2] = pSrc[stride * 2];
		vals[3] = pSrc[stride * 3];
		return vint{ _mm_load_si128((__m128i*)vals) };
#else
		const float* pSrcF = (const float*)pSrc;
		__m128 v = _mm_load_ss(pSrcF);
		v = _mm_insert_ps(v, _mm_load_ss(pSrcF + stride), 0x10);
		v = _mm_insert_ps(v, _mm_load_ss(pSrcF + 2 * stride), 0x20);
		v = _mm_insert_ps(v, _mm_load_ss(pSrcF + 3 * stride), 0x30);
		return vint{ _mm_castps_si128(v) };
#endif
	}

	CPPSPMD_FORCE_INLINE vfloat load_all_strided(const float *pSrc, uint32_t stride)
	{
#if CPPSPMD_SSE2
		CPPSPMD_ALIGN(16) float vals[4];
		vals[0] = pSrc[0];
		vals[1] = pSrc[stride];
		vals[2] = pSrc[stride * 2];
		vals[3] = pSrc[stride * 3];
		return vfloat{ _mm_load_ps(vals) };
#else
		__m128 v = _mm_load_ss(pSrc);
		v = _mm_insert_ps(v, _mm_load_ss(pSrc + stride), 0x10);
		v = _mm_insert_ps(v, _mm_load_ss(pSrc + 2 * stride), 0x20);
		v = _mm_insert_ps(v, _mm_load_ss(pSrc + 3 * stride), 0x30);
		return vfloat{ v };
#endif
	}
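
	// Usage sketch (illustrative, not from the original header): the strided helpers
	// handle AoS data, where consecutive lanes access elements 'stride' ints/floats
	// apart. E.g. gathering member .x from an array of hypothetical structs S:
	//
	//	vfloat xs = load_strided(&pStructs[0].x, sizeof(S) / sizeof(float));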

	CPPSPMD_FORCE_INLINE const vfloat_vref& store(const vfloat_vref& dst, const vfloat& src)
	{
		// TODO: There's surely a better way
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));

		if (mask & 1) ((int *)(&dst.m_pValue[extract_x(dst.m_vindex)]))[0] = extract_x(_mm_castps_si128(src.m_value));
		if (mask & 2) ((int *)(&dst.m_pValue[extract_y(dst.m_vindex)]))[1] = extract_y(_mm_castps_si128(src.m_value));
		if (mask & 4) ((int *)(&dst.m_pValue[extract_z(dst.m_vindex)]))[2] = extract_z(_mm_castps_si128(src.m_value));
		if (mask & 8) ((int *)(&dst.m_pValue[extract_w(dst.m_vindex)]))[3] = extract_w(_mm_castps_si128(src.m_value));

		return dst;
	}

	CPPSPMD_FORCE_INLINE vfloat load(const vfloat_vref& src)
	{
		// TODO: There's surely a better way
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));

		__m128i k = _mm_setzero_si128();

		if (mask & 1) k = insert_x(k, ((int *)(&src.m_pValue[extract_x(src.m_vindex)]))[0]);
		if (mask & 2) k = insert_y(k, ((int *)(&src.m_pValue[extract_y(src.m_vindex)]))[1]);
		if (mask & 4) k = insert_z(k, ((int *)(&src.m_pValue[extract_z(src.m_vindex)]))[2]);
		if (mask & 8) k = insert_w(k, ((int *)(&src.m_pValue[extract_w(src.m_vindex)]))[3]);

		return vfloat{ _mm_castsi128_ps(k) };
	}

	CPPSPMD_FORCE_INLINE const vint_vref& store(const vint_vref& dst, const vint& src)
	{
		// TODO: There's surely a better way
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));

		if (mask & 1) ((int *)(&dst.m_pValue[extract_x(dst.m_vindex)]))[0] = extract_x(src.m_value);
		if (mask & 2) ((int *)(&dst.m_pValue[extract_y(dst.m_vindex)]))[1] = extract_y(src.m_value);
		if (mask & 4) ((int *)(&dst.m_pValue[extract_z(dst.m_vindex)]))[2] = extract_z(src.m_value);
		if (mask & 8) ((int *)(&dst.m_pValue[extract_w(dst.m_vindex)]))[3] = extract_w(src.m_value);

		return dst;
	}

	CPPSPMD_FORCE_INLINE vint load(const vint_vref& src)
	{
		// TODO: There's surely a better way
		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));

		__m128i k = _mm_setzero_si128();

		if (mask & 1) k = insert_x(k, ((int *)(&src.m_pValue[extract_x(src.m_vindex)]))[0]);
		if (mask & 2) k = insert_y(k, ((int *)(&src.m_pValue[extract_y(src.m_vindex)]))[1]);
		if (mask & 4) k = insert_z(k, ((int *)(&src.m_pValue[extract_z(src.m_vindex)]))[2]);
		if (mask & 8) k = insert_w(k, ((int *)(&src.m_pValue[extract_w(src.m_vindex)]))[3]);

		return vint{ k };
	}

	CPPSPMD_FORCE_INLINE vint load_all(const vint_vref& src)
	{
		// TODO: There's surely a better way
		__m128i k = _mm_setzero_si128();

		k = insert_x(k, ((int*)(&src.m_pValue[extract_x(src.m_vindex)]))[0]);
		k = insert_y(k, ((int*)(&src.m_pValue[extract_y(src.m_vindex)]))[1]);
		k = insert_z(k, ((int*)(&src.m_pValue[extract_z(src.m_vindex)]))[2]);
		k = insert_w(k, ((int*)(&src.m_pValue[extract_w(src.m_vindex)]))[3]);

		return vint{ k };
	}

	// Linear integer
	struct lint
	{
		__m128i m_value;

		CPPSPMD_FORCE_INLINE explicit lint(__m128i value)
			: m_value(value)
		{ }

		CPPSPMD_FORCE_INLINE explicit operator vfloat() const
		{
			return vfloat{ _mm_cvtepi32_ps(m_value) };
		}

		CPPSPMD_FORCE_INLINE explicit operator vint() const
		{
			return vint{ m_value };
		}

		CPPSPMD_FORCE_INLINE int get_first_value() const
		{
			return _mm_cvtsi128_si32(m_value);
		}

		CPPSPMD_FORCE_INLINE float_lref operator[](float* ptr) const
		{
			return float_lref{ ptr + get_first_value() };
		}

		CPPSPMD_FORCE_INLINE int_lref operator[](int* ptr) const
		{
			return int_lref{ ptr + get_first_value() };
		}

		CPPSPMD_FORCE_INLINE int16_lref operator[](int16_t* ptr) const
		{
			return int16_lref{ ptr + get_first_value() };
		}

		CPPSPMD_FORCE_INLINE cint_lref operator[](const int* ptr) const
		{
			return cint_lref{ ptr + get_first_value() };
		}

	private:
		//lint& operator=(const lint&);
	};

	CPPSPMD_FORCE_INLINE lint& store_all(lint& dst, const lint& src)
	{
		dst.m_value = src.m_value;
		return dst;
	}

	const lint program_index = lint{ _mm_set_epi32( 3, 2, 1, 0 ) };
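
	// Usage sketch (illustrative, not from the original header): program_index is the
	// lane index (0,1,2,3), the SPMD equivalent of a thread/lane id. A typical
	// strip-mined loop gives lane i element base + i ("pData", "n" hypothetical):
	//
	//	for (int base = 0; base + PROGRAM_COUNT <= n; base += PROGRAM_COUNT)
	//	{
	//		vfloat v = load((base + program_index)[pData]);	// linear load of pData[base..base+3]
	//		// ... process v ...
	//	}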

	// SPMD condition helpers

	template<typename IfBody>
	CPPSPMD_FORCE_INLINE void spmd_if(const vbool& cond, const IfBody& ifBody);

	CPPSPMD_FORCE_INLINE void spmd_if_break(const vbool& cond);

	// No breaks, continues, etc. allowed
	template<typename IfBody>
	CPPSPMD_FORCE_INLINE void spmd_sif(const vbool& cond, const IfBody& ifBody);

	// No breaks, continues, etc. allowed
	template<typename IfBody, typename ElseBody>
	CPPSPMD_FORCE_INLINE void spmd_sifelse(const vbool& cond, const IfBody& ifBody, const ElseBody &elseBody);

	template<typename IfBody, typename ElseBody>
	CPPSPMD_FORCE_INLINE void spmd_ifelse(const vbool& cond, const IfBody& ifBody, const ElseBody& elseBody);

	template<typename WhileCondBody, typename WhileBody>
	CPPSPMD_FORCE_INLINE void spmd_while(const WhileCondBody& whileCondBody, const WhileBody& whileBody);

	template<typename ForInitBody, typename ForCondBody, typename ForIncrBody, typename ForBody>
	CPPSPMD_FORCE_INLINE void spmd_for(const ForInitBody& forInitBody, const ForCondBody& forCondBody, const ForIncrBody& forIncrBody, const ForBody& forBody);

	template<typename ForeachBody>
	CPPSPMD_FORCE_INLINE void spmd_foreach(int begin, int end, const ForeachBody& foreachBody);

#ifdef _DEBUG
	CPPSPMD_FORCE_INLINE void check_masks();
#else
	CPPSPMD_FORCE_INLINE void check_masks() { }
#endif

	CPPSPMD_FORCE_INLINE void spmd_break();
	CPPSPMD_FORCE_INLINE void spmd_continue();

	CPPSPMD_FORCE_INLINE void spmd_return();

	template<typename UnmaskedBody>
	CPPSPMD_FORCE_INLINE void spmd_unmasked(const UnmaskedBody& unmaskedBody);

	template<typename SPMDKernel, typename... Args>
	//CPPSPMD_FORCE_INLINE decltype(auto) spmd_call(Args&&... args);
	CPPSPMD_FORCE_INLINE void spmd_call(Args&&... args);

	CPPSPMD_FORCE_INLINE void swap(vint &a, vint &b) { vint temp = a; store(a, b); store(b, temp); }
	CPPSPMD_FORCE_INLINE void swap(vfloat &a, vfloat &b) { vfloat temp = a; store(a, b); store(b, temp); }
	CPPSPMD_FORCE_INLINE void swap(vbool &a, vbool &b) { vbool temp = a; store(a, b); store(b, temp); }

	CPPSPMD_FORCE_INLINE float reduce_add(vfloat v)
	{
		__m128 k3210 = _mm_castsi128_ps(blendv_mask_epi32(_mm_setzero_si128(), _mm_castps_si128(v.m_value), m_exec.m_mask));
		__m128 temp = _mm_add_ps(_mm_shuffle_ps(k3210, k3210, _MM_SHUFFLE(0, 1, 2, 3)), k3210);
		return _mm_cvtss_f32(_mm_add_ss(_mm_movehl_ps(temp, temp), temp));
	}

	CPPSPMD_FORCE_INLINE int reduce_add(vint v)
	{
		__m128i k3210 = blendv_mask_epi32(_mm_setzero_si128(), v.m_value, m_exec.m_mask);
		__m128i temp = _mm_add_epi32(_mm_shuffle_epi32(k3210, _MM_SHUFFLE(0, 1, 2, 3)), k3210);
		return extract_x(_mm_add_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(temp), _mm_castsi128_ps(temp))), temp));
	}
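
	// Control-flow sketch (illustrative, not from the original header): the spmd_*
	// helpers take lambdas and run them with the exec mask narrowed to the lanes
	// where the condition holds ('v' is a hypothetical vint):
	//
	//	spmd_ifelse(v < 0,
	//		[&] { store(v, -v); },		// runs with only the v < 0 lanes enabled
	//		[&] { store(v, v * 2); });	// runs with the remaining lanes enabled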

#include "cppspmd_math_declares.h"

}; // struct spmd_kernel

using exec_mask = spmd_kernel::exec_mask;
using vint = spmd_kernel::vint;
using int_lref = spmd_kernel::int_lref;
using cint_vref = spmd_kernel::cint_vref;
using cint_lref = spmd_kernel::cint_lref;
using int_vref = spmd_kernel::int_vref;
using lint = spmd_kernel::lint;
using vbool = spmd_kernel::vbool;
using vfloat = spmd_kernel::vfloat;
using float_lref = spmd_kernel::float_lref;
using float_vref = spmd_kernel::float_vref;
using vfloat_vref = spmd_kernel::vfloat_vref;
using vint_vref = spmd_kernel::vint_vref;

CPPSPMD_FORCE_INLINE spmd_kernel::vbool::operator vfloat() const
{
	return vfloat { _mm_and_ps( _mm_castsi128_ps(m_value), *(const __m128 *)g_onef_128 ) };
}

// Returns UINT32_MAX's for true, 0 for false. (Should it return 1's?)
CPPSPMD_FORCE_INLINE spmd_kernel::vbool::operator vint() const
{
	return vint { m_value };
}

CPPSPMD_FORCE_INLINE vbool operator!(const vbool& v)
{
	return vbool{ _mm_castps_si128(_mm_xor_ps(_mm_load_ps((const float*)g_allones_128), _mm_castsi128_ps(v.m_value))) };
}

CPPSPMD_FORCE_INLINE exec_mask::exec_mask(const vbool& b) { m_mask = b.m_value; }

CPPSPMD_FORCE_INLINE exec_mask operator^(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_xor_si128(a.m_mask, b.m_mask) }; }
CPPSPMD_FORCE_INLINE exec_mask operator&(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_and_si128(a.m_mask, b.m_mask) }; }
CPPSPMD_FORCE_INLINE exec_mask operator|(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_or_si128(a.m_mask, b.m_mask) }; }

CPPSPMD_FORCE_INLINE bool all(const exec_mask& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_mask)) == ALL_ON_MOVEMASK; }
CPPSPMD_FORCE_INLINE bool any(const exec_mask& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_mask)) != 0; }

// Bad pattern - doesn't factor in the current exec mask. Prefer spmd_any() instead.
CPPSPMD_FORCE_INLINE bool all(const vbool& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_value)) == ALL_ON_MOVEMASK; }
CPPSPMD_FORCE_INLINE bool any(const vbool& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_value)) != 0; }

CPPSPMD_FORCE_INLINE exec_mask andnot(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_andnot_si128(a.m_mask, b.m_mask) }; }
CPPSPMD_FORCE_INLINE vbool operator||(const vbool& a, const vbool& b) { return vbool{ _mm_or_si128(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vbool operator&&(const vbool& a, const vbool& b) { return vbool{ _mm_and_si128(a.m_value, b.m_value) }; }
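
// Why all(vbool)/any(vbool) are flagged as a bad pattern (illustrative note): they
// test the raw lane bits, so lanes that are currently disabled by the exec mask
// still participate. Inside divergent control flow, prefer spmd_all()/spmd_any(),
// which AND the condition with m_exec first. The exec_mask overloads are fine,
// since an exec mask already encodes lane activity.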

CPPSPMD_FORCE_INLINE vfloat operator+(const vfloat& a, const vfloat& b) { return vfloat{ _mm_add_ps(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, const vfloat& b) { return vfloat{ _mm_sub_ps(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat operator+(float a, const vfloat& b) { return vfloat(a) + b; }
CPPSPMD_FORCE_INLINE vfloat operator+(const vfloat& a, float b) { return a + vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, const vint& b) { return a - vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator-(const vint& a, const vfloat& b) { return vfloat(a) - b; }
CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, int b) { return a - vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator-(int a, const vfloat& b) { return vfloat(a) - b; }
CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, float b) { return a - vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator-(float a, const vfloat& b) { return vfloat(a) - b; }

CPPSPMD_FORCE_INLINE vfloat operator*(const vfloat& a, const vfloat& b) { return vfloat{ _mm_mul_ps(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat operator*(const vfloat& a, float b) { return a * vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator*(float a, const vfloat& b) { return vfloat(a) * b; }
CPPSPMD_FORCE_INLINE vfloat operator*(const vfloat& a, int b) { return a * vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator*(int a, const vfloat& b) { return vfloat(a) * b; }

CPPSPMD_FORCE_INLINE vfloat operator/(const vfloat& a, const vfloat& b) { return vfloat{ _mm_div_ps(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat operator/(const vfloat& a, int b) { return a / vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator/(int a, const vfloat& b) { return vfloat(a) / b; }
CPPSPMD_FORCE_INLINE vfloat operator/(const vfloat& a, float b) { return a / vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator/(float a, const vfloat& b) { return vfloat(a) / b; }
CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& v) { return vfloat{ _mm_sub_ps(_mm_xor_ps(v.m_value, v.m_value), v.m_value) }; }

CPPSPMD_FORCE_INLINE vbool operator==(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmpeq_ps(a.m_value, b.m_value)) }; }
CPPSPMD_FORCE_INLINE vbool operator==(const vfloat& a, float b) { return a == vfloat(b); }

CPPSPMD_FORCE_INLINE vbool operator!=(const vfloat& a, const vfloat& b) { return !vbool{ _mm_castps_si128(_mm_cmpeq_ps(a.m_value, b.m_value)) }; }
CPPSPMD_FORCE_INLINE vbool operator!=(const vfloat& a, float b) { return a != vfloat(b); }

CPPSPMD_FORCE_INLINE vbool operator<(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmplt_ps(a.m_value, b.m_value)) }; }
CPPSPMD_FORCE_INLINE vbool operator<(const vfloat& a, float b) { return a < vfloat(b); }

CPPSPMD_FORCE_INLINE vbool operator>(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmpgt_ps(a.m_value, b.m_value)) }; }
CPPSPMD_FORCE_INLINE vbool operator>(const vfloat& a, float b) { return a > vfloat(b); }

CPPSPMD_FORCE_INLINE vbool operator<=(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmple_ps(a.m_value, b.m_value)) }; }
CPPSPMD_FORCE_INLINE vbool operator<=(const vfloat& a, float b) { return a <= vfloat(b); }

CPPSPMD_FORCE_INLINE vbool operator>=(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmpge_ps(a.m_value, b.m_value)) }; }
CPPSPMD_FORCE_INLINE vbool operator>=(const vfloat& a, float b) { return a >= vfloat(b); }

CPPSPMD_FORCE_INLINE vfloat spmd_ternaryf(const vbool& cond, const vfloat& a, const vfloat& b) { return vfloat{ blendv_mask_ps(b.m_value, a.m_value, _mm_castsi128_ps(cond.m_value)) }; }
CPPSPMD_FORCE_INLINE vint spmd_ternaryi(const vbool& cond, const vint& a, const vint& b) { return vint{ blendv_mask_epi32(b.m_value, a.m_value, cond.m_value) }; }

CPPSPMD_FORCE_INLINE vfloat sqrt(const vfloat& v) { return vfloat{ _mm_sqrt_ps(v.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat abs(const vfloat& v) { return vfloat{ _mm_andnot_ps(_mm_set1_ps(-0.0f), v.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat max(const vfloat& a, const vfloat& b) { return vfloat{ _mm_max_ps(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat min(const vfloat& a, const vfloat& b) { return vfloat{ _mm_min_ps(a.m_value, b.m_value) }; }

#if CPPSPMD_SSE2
CPPSPMD_FORCE_INLINE vfloat round_truncate(const vfloat& a)
{
	__m128i abs_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU) );
	__m128i has_fractional = _mm_cmplt_epi32(abs_a, _mm_castps_si128(_mm_set1_ps(8388608.0f)));

	__m128i ai = _mm_cvttps_epi32(a.m_value);

	__m128 af = _mm_cvtepi32_ps(ai);
	return vfloat{ blendv_mask_ps(a.m_value, af, _mm_castsi128_ps(has_fractional)) };
}

CPPSPMD_FORCE_INLINE vfloat floor(const vfloat& a)
{
	__m128i abs_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU));
	__m128i has_fractional = _mm_cmplt_epi32(abs_a, _mm_castps_si128(_mm_set1_ps(8388608.0f)));

	__m128i ai = _mm_cvtps_epi32(a.m_value);
	__m128 af = _mm_cvtepi32_ps(ai);
	__m128 changed = _mm_cvtepi32_ps(_mm_castps_si128(_mm_cmpgt_ps(af, a.m_value)));

	af = _mm_add_ps(af, changed);

	return vfloat{ blendv_mask_ps(a.m_value, af, _mm_castsi128_ps(has_fractional)) };
}

CPPSPMD_FORCE_INLINE vfloat ceil(const vfloat& a)
{
	__m128i abs_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU));
	__m128i has_fractional = _mm_cmplt_epi32(abs_a, _mm_castps_si128(_mm_set1_ps(8388608.0f)));

	__m128i ai = _mm_cvtps_epi32(a.m_value);
	__m128 af = _mm_cvtepi32_ps(ai);
	__m128 changed = _mm_cvtepi32_ps(_mm_castps_si128(_mm_cmplt_ps(af, a.m_value)));

	af = _mm_sub_ps(af, changed);

	return vfloat{ blendv_mask_ps(a.m_value, af, _mm_castsi128_ps(has_fractional)) };
}
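
// Why 8388608.0f (illustrative note, not from the original header): 2^23 is the
// smallest positive float whose spacing is 1.0, so every finite float with
// |x| >= 2^23 is already an integer (and _mm_cvtps_epi32/_mm_cvttps_epi32 would
// overflow for |x| >= 2^31 anyway). The SSE2 floor/ceil/round paths above
// therefore only convert lanes with |x| < 2^23 ("has_fractional") and pass larger
// magnitudes - including Inf/NaN bit patterns - through unchanged.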

// We need to disable unsafe math optimizations for the key operations used for rounding to nearest.
// I wish there was a better way.
#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
inline __m128 add_sub(__m128 a, __m128 b) __attribute__((optimize("-fno-unsafe-math-optimizations")))
#elif defined(__clang__)
inline __m128 add_sub(__m128 a, __m128 b) __attribute__((optnone))
#elif defined (_MSC_VER)
#pragma float_control(push)
#pragma float_control(precise, on)
inline __m128 add_sub(__m128 a, __m128 b)
#else
inline __m128 add_sub(__m128 a, __m128 b)
#endif
{
	return _mm_sub_ps(_mm_add_ps(a, b), b);
}

#if defined (_MSC_VER)
#pragma float_control(pop)
#endif

CPPSPMD_FORCE_INLINE vfloat round_nearest(const vfloat& a)
{
	__m128i no_fract_fp_bits = _mm_castps_si128(_mm_set1_ps(8388608.0f));

	__m128i sign_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x80000000U));
	__m128 force_int = _mm_castsi128_ps(_mm_or_si128(no_fract_fp_bits, sign_a));

	// Can't use individual _mm_add_ps/_mm_sub_ps - this will be optimized out with /fp:fast by clang and probably other compilers.
	//__m128 temp1 = _mm_add_ps(a.m_value, force_int);
	//__m128 temp2 = _mm_sub_ps(temp1, force_int);
	__m128 temp2 = add_sub(a.m_value, force_int);

	__m128i abs_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU));
	__m128i has_fractional = _mm_cmplt_epi32(abs_a, no_fract_fp_bits);
	return vfloat{ blendv_mask_ps(a.m_value, temp2, _mm_castsi128_ps(has_fractional)) };
}

#else
CPPSPMD_FORCE_INLINE vfloat floor(const vfloat& v) { return vfloat{ _mm_floor_ps(v.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat ceil(const vfloat& a) { return vfloat{ _mm_ceil_ps(a.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat round_nearest(const vfloat &a) { return vfloat{ _mm_round_ps(a.m_value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ) }; }
CPPSPMD_FORCE_INLINE vfloat round_truncate(const vfloat &a) { return vfloat{ _mm_round_ps(a.m_value, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ) }; }
#endif
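
// Note on round_nearest() (illustrative, not from the original header): the
// add_sub() trick of computing (a + 2^23) - 2^23 relies on the FPU's default
// round-to-nearest-even mode, so halfway cases round to even, e.g.
// round_nearest(0.5f) == 0.0f and round_nearest(1.5f) == 2.0f - matching the
// SSE4.1 _mm_round_ps() path. Under -ffast-math or /fp:fast the compiler may fold
// the add/sub away entirely, which is what the attributes/pragmas above prevent.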

CPPSPMD_FORCE_INLINE vfloat frac(const vfloat& a) { return a - floor(a); }
CPPSPMD_FORCE_INLINE vfloat fmod(vfloat a, vfloat b) { vfloat c = frac(abs(a / b)) * abs(b); return spmd_ternaryf(a < 0, -c, c); }
CPPSPMD_FORCE_INLINE vfloat sign(const vfloat& a) { return spmd_ternaryf(a < 0.0f, -1.0f, 1.0f); }

CPPSPMD_FORCE_INLINE vint max(const vint& a, const vint& b) { return vint{ max_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint min(const vint& a, const vint& b) { return vint{ min_epi32(a.m_value, b.m_value) }; }

CPPSPMD_FORCE_INLINE vint maxu(const vint& a, const vint& b) { return vint{ max_epu32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint minu(const vint& a, const vint& b) { return vint{ min_epu32(a.m_value, b.m_value) }; }

CPPSPMD_FORCE_INLINE vint abs(const vint& v) { return vint{ abs_epi32(v.m_value) }; }

CPPSPMD_FORCE_INLINE vint byteswap(const vint& v) { return vint{ shuffle_epi8(v.m_value, _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3)) }; }

CPPSPMD_FORCE_INLINE vint cast_vfloat_to_vint(const vfloat& v) { return vint{ _mm_castps_si128(v.m_value) }; }
CPPSPMD_FORCE_INLINE vfloat cast_vint_to_vfloat(const vint& v) { return vfloat{ _mm_castsi128_ps(v.m_value) }; }

CPPSPMD_FORCE_INLINE vfloat clamp(const vfloat& v, const vfloat& a, const vfloat& b)
{
	return vfloat{ _mm_min_ps(b.m_value, _mm_max_ps(v.m_value, a.m_value) ) };
}

CPPSPMD_FORCE_INLINE vint clamp(const vint& v, const vint& a, const vint& b)
{
	return vint{ min_epi32(b.m_value, max_epi32(v.m_value, a.m_value) ) };
}

CPPSPMD_FORCE_INLINE vfloat vfma(const vfloat& a, const vfloat& b, const vfloat& c)
{
	return vfloat{ _mm_add_ps(_mm_mul_ps(a.m_value, b.m_value), c.m_value) };
}

CPPSPMD_FORCE_INLINE vfloat vfms(const vfloat& a, const vfloat& b, const vfloat& c)
{
	return vfloat{ _mm_sub_ps(_mm_mul_ps(a.m_value, b.m_value), c.m_value) };
}

CPPSPMD_FORCE_INLINE vfloat vfnma(const vfloat& a, const vfloat& b, const vfloat& c)
{
	return vfloat{ _mm_sub_ps(c.m_value, _mm_mul_ps(a.m_value, b.m_value)) };
}

CPPSPMD_FORCE_INLINE vfloat vfnms(const vfloat& a, const vfloat& b, const vfloat& c)
{
	return vfloat{ _mm_sub_ps(_mm_sub_ps(_mm_xor_ps(a.m_value, a.m_value), _mm_mul_ps(a.m_value, b.m_value)), c.m_value) };
}

CPPSPMD_FORCE_INLINE vfloat lerp(const vfloat &x, const vfloat &y, const vfloat &s) { return vfma(y - x, s, x); }

CPPSPMD_FORCE_INLINE lint operator+(int a, const lint& b) { return lint{ _mm_add_epi32(_mm_set1_epi32(a), b.m_value) }; }
CPPSPMD_FORCE_INLINE lint operator+(const lint& a, int b) { return lint{ _mm_add_epi32(a.m_value, _mm_set1_epi32(b)) }; }
CPPSPMD_FORCE_INLINE vfloat operator+(float a, const lint& b) { return vfloat(a) + vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator+(const lint& a, float b) { return vfloat(a) + vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator*(const lint& a, float b) { return vfloat(a) * vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator*(float b, const lint& a) { return vfloat(a) * vfloat(b); }

CPPSPMD_FORCE_INLINE vint operator&(const vint& a, const vint& b) { return vint{ _mm_and_si128(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint operator&(const vint& a, int b) { return a & vint(b); }
CPPSPMD_FORCE_INLINE vint andnot(const vint& a, const vint& b) { return vint{ _mm_andnot_si128(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint operator|(const vint& a, const vint& b) { return vint{ _mm_or_si128(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint operator|(const vint& a, int b) { return a | vint(b); }
CPPSPMD_FORCE_INLINE vint operator^(const vint& a, const vint& b) { return vint{ _mm_xor_si128(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint operator^(const vint& a, int b) { return a ^ vint(b); }
CPPSPMD_FORCE_INLINE vbool operator==(const vint& a, const vint& b) { return vbool{ _mm_cmpeq_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vbool operator!=(const vint& a, const vint& b) { return !vbool{ _mm_cmpeq_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE lint operator+(int a, const lint& b) { return lint{ _mm_add_epi32(_mm_set1_epi32(a), b.m_value) }; }
CPPSPMD_FORCE_INLINE lint operator+(const lint& a, int b) { return lint{ _mm_add_epi32(a.m_value, _mm_set1_epi32(b)) }; }
CPPSPMD_FORCE_INLINE vfloat operator+(float a, const lint& b) { return vfloat(a) + vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator+(const lint& a, float b) { return vfloat(a) + vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator*(const lint& a, float b) { return vfloat(a) * vfloat(b); }
CPPSPMD_FORCE_INLINE vfloat operator*(float b, const lint& a) { return vfloat(a) * vfloat(b); }

CPPSPMD_FORCE_INLINE vint operator&(const vint& a, const vint& b) { return vint{ _mm_and_si128(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint operator&(const vint& a, int b) { return a & vint(b); }
CPPSPMD_FORCE_INLINE vint andnot(const vint& a, const vint& b) { return vint{ _mm_andnot_si128(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint operator|(const vint& a, const vint& b) { return vint{ _mm_or_si128(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint operator|(const vint& a, int b) { return a | vint(b); }
CPPSPMD_FORCE_INLINE vint operator^(const vint& a, const vint& b) { return vint{ _mm_xor_si128(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint operator^(const vint& a, int b) { return a ^ vint(b); }
CPPSPMD_FORCE_INLINE vbool operator==(const vint& a, const vint& b) { return vbool{ _mm_cmpeq_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vbool operator!=(const vint& a, const vint& b) { return !vbool{ _mm_cmpeq_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vbool operator<(const vint& a, const vint& b) { return vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; }
CPPSPMD_FORCE_INLINE vbool operator<=(const vint& a, const vint& b) { return !vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vbool operator>=(const vint& a, const vint& b) { return !vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; }
CPPSPMD_FORCE_INLINE vbool operator>(const vint& a, const vint& b) { return vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint operator+(const vint& a, const vint& b) { return vint{ _mm_add_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint operator-(const vint& a, const vint& b) { return vint{ _mm_sub_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint operator+(const vint& a, int b) { return a + vint(b); }
CPPSPMD_FORCE_INLINE vint operator-(const vint& a, int b) { return a - vint(b); }
CPPSPMD_FORCE_INLINE vint operator+(int a, const vint& b) { return vint(a) + b; }
CPPSPMD_FORCE_INLINE vint operator-(int a, const vint& b) { return vint(a) - b; }
CPPSPMD_FORCE_INLINE vint operator*(const vint& a, const vint& b) { return vint{ mullo_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint operator*(const vint& a, int b) { return a * vint(b); }
CPPSPMD_FORCE_INLINE vint operator*(int a, const vint& b) { return vint(a) * b; }

// High 32 bits of the unsigned 32x32 per-lane product.
CPPSPMD_FORCE_INLINE vint mulhiu(const vint& a, const vint& b) { return vint{ mulhi_epu32(a.m_value, b.m_value) }; }

CPPSPMD_FORCE_INLINE vint operator-(const vint& v) { return vint{ _mm_sub_epi32(_mm_setzero_si128(), v.m_value) }; }

// Bitwise NOT via the two's complement identity ~a == -a - 1.
CPPSPMD_FORCE_INLINE vint operator~(const vint& a) { return vint{ -a - 1 }; }
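// Scalar model (illustrative only, compiled out; mulhiu_scalar is not part of the API)
// of what mulhiu() computes per lane:
#if 0
inline uint32_t mulhiu_scalar(uint32_t a, uint32_t b)
{
	return (uint32_t)(((uint64_t)a * b) >> 32);
}
#endif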
// A few of these break the lane-based abstraction model. They are supported in SSE2, so it makes sense to support them and let the user figure it out.
CPPSPMD_FORCE_INLINE vint adds_epu8(const vint& a, const vint& b) { return vint{ _mm_adds_epu8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint subs_epu8(const vint& a, const vint& b) { return vint{ _mm_subs_epu8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint avg_epu8(const vint & a, const vint & b) { return vint{ _mm_avg_epu8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint max_epu8(const vint& a, const vint& b) { return vint{ _mm_max_epu8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint min_epu8(const vint& a, const vint& b) { return vint{ _mm_min_epu8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint sad_epu8(const vint& a, const vint& b) { return vint{ _mm_sad_epu8(a.m_value, b.m_value) }; }

CPPSPMD_FORCE_INLINE vint add_epi8(const vint& a, const vint& b) { return vint{ _mm_add_epi8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint adds_epi8(const vint& a, const vint& b) { return vint{ _mm_adds_epi8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint sub_epi8(const vint& a, const vint& b) { return vint{ _mm_sub_epi8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint subs_epi8(const vint& a, const vint& b) { return vint{ _mm_subs_epi8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint cmpeq_epi8(const vint& a, const vint& b) { return vint{ _mm_cmpeq_epi8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint cmpgt_epi8(const vint& a, const vint& b) { return vint{ _mm_cmpgt_epi8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint cmplt_epi8(const vint& a, const vint& b) { return vint{ _mm_cmplt_epi8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint unpacklo_epi8(const vint& a, const vint& b) { return vint{ _mm_unpacklo_epi8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint unpackhi_epi8(const vint& a, const vint& b) { return vint{ _mm_unpackhi_epi8(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE int movemask_epi8(const vint& a) { return _mm_movemask_epi8(a.m_value); }
CPPSPMD_FORCE_INLINE int movemask_epi32(const vint& a) { return _mm_movemask_ps(_mm_castsi128_ps(a.m_value)); }

// Unsigned byte compares, synthesized from min/max: a <= b exactly when min(a, b) == a.
CPPSPMD_FORCE_INLINE vint cmple_epu8(const vint& a, const vint& b) { return vint{ _mm_cmpeq_epi8(_mm_min_epu8(a.m_value, b.m_value), a.m_value) }; }
CPPSPMD_FORCE_INLINE vint cmpge_epu8(const vint& a, const vint& b) { return vint{ cmple_epu8(b, a) }; }
CPPSPMD_FORCE_INLINE vint cmpgt_epu8(const vint& a, const vint& b) { return vint{ _mm_andnot_si128(_mm_cmpeq_epi8(a.m_value, b.m_value), _mm_cmpeq_epi8(_mm_max_epu8(a.m_value, b.m_value), a.m_value)) }; }
CPPSPMD_FORCE_INLINE vint cmplt_epu8(const vint& a, const vint& b) { return vint{ cmpgt_epu8(b, a) }; }
CPPSPMD_FORCE_INLINE vint absdiff_epu8(const vint& a, const vint& b) { return vint{ _mm_or_si128(_mm_subs_epu8(a.m_value, b.m_value), _mm_subs_epu8(b.m_value, a.m_value)) }; }

CPPSPMD_FORCE_INLINE vint blendv_epi8(const vint& a, const vint& b, const vint &mask) { return vint{ blendv_epi8(a.m_value, b.m_value, _mm_cmplt_epi8(mask.m_value, _mm_setzero_si128())) }; }
CPPSPMD_FORCE_INLINE vint blendv_epi32(const vint& a, const vint& b, const vint &mask) { return vint{ blendv_epi32(a.m_value, b.m_value, mask.m_value) }; }
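// Scalar model (illustrative only, compiled out; cmple_epu8_scalar is not part of the API)
// of the unsigned byte compare trick used by cmple_epu8() above; SSE2 only provides signed
// byte compares directly.
#if 0
inline bool cmple_epu8_scalar(uint8_t a, uint8_t b)
{
	// min(a, b) == a exactly when a <= b
	return std::min(a, b) == a;
}
#endif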
CPPSPMD_FORCE_INLINE vint add_epi16(const vint& a, const vint& b) { return vint{ _mm_add_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint adds_epi16(const vint& a, const vint& b) { return vint{ _mm_adds_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint adds_epu16(const vint& a, const vint& b) { return vint{ _mm_adds_epu16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint avg_epu16(const vint& a, const vint& b) { return vint{ _mm_avg_epu16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint sub_epi16(const vint& a, const vint& b) { return vint{ _mm_sub_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint subs_epi16(const vint& a, const vint& b) { return vint{ _mm_subs_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint subs_epu16(const vint& a, const vint& b) { return vint{ _mm_subs_epu16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint mullo_epi16(const vint& a, const vint& b) { return vint{ _mm_mullo_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint mulhi_epi16(const vint& a, const vint& b) { return vint{ _mm_mulhi_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint mulhi_epu16(const vint& a, const vint& b) { return vint{ _mm_mulhi_epu16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint min_epi16(const vint& a, const vint& b) { return vint{ _mm_min_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint max_epi16(const vint& a, const vint& b) { return vint{ _mm_max_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint madd_epi16(const vint& a, const vint& b) { return vint{ _mm_madd_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint cmpeq_epi16(const vint& a, const vint& b) { return vint{ _mm_cmpeq_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint cmpgt_epi16(const vint& a, const vint& b) { return vint{ _mm_cmpgt_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint cmplt_epi16(const vint& a, const vint& b) { return vint{ _mm_cmplt_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint packs_epi16(const vint& a, const vint& b) { return vint{ _mm_packs_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint packus_epi16(const vint& a, const vint& b) { return vint{ _mm_packus_epi16(a.m_value, b.m_value) }; }

// "Uniform" shifts: all 16-bit lanes are shifted by the same count, taken from the low 64 bits of b.
CPPSPMD_FORCE_INLINE vint uniform_shift_left_epi16(const vint& a, const vint& b) { return vint{ _mm_sll_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint uniform_arith_shift_right_epi16(const vint& a, const vint& b) { return vint{ _mm_sra_epi16(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vint uniform_shift_right_epi16(const vint& a, const vint& b) { return vint{ _mm_srl_epi16(a.m_value, b.m_value) }; }

// Shift by an immediate constant count.
#define VINT_SHIFT_LEFT_EPI16(a, b) vint(_mm_slli_epi16((a).m_value, b))
#define VINT_SHIFT_RIGHT_EPI16(a, b) vint(_mm_srai_epi16((a).m_value, b))
#define VUINT_SHIFT_RIGHT_EPI16(a, b) vint(_mm_srli_epi16((a).m_value, b))

CPPSPMD_FORCE_INLINE vint undefined_vint() { return vint{ _mm_undefined_si128() }; }
CPPSPMD_FORCE_INLINE vfloat undefined_vfloat() { return vfloat{ _mm_undefined_ps() }; }

CPPSPMD_FORCE_INLINE vint zero_vint() { return vint{ _mm_setzero_si128() }; }
CPPSPMD_FORCE_INLINE vfloat zero_vfloat() { return vfloat{ _mm_setzero_ps() }; }

CPPSPMD_FORCE_INLINE vint vint_lane_set(int v0, int v1, int v2, int v3) { return vint{ _mm_set_epi32(v3, v2, v1, v0) }; }
CPPSPMD_FORCE_INLINE vfloat vfloat_lane_set(float v0, float v1, float v2, float v3) { return vfloat{ _mm_set_ps(v3, v2, v1, v0) }; }
CPPSPMD_FORCE_INLINE vint vint_lane_set_r(int v3, int v2, int v1, int v0) { return vint{ _mm_set_epi32(v3, v2, v1, v0) }; }
CPPSPMD_FORCE_INLINE vfloat vfloat_lane_set_r(float v3, float v2, float v1, float v0) { return vfloat{ _mm_set_ps(v3, v2, v1, v0) }; }

// control is an 8-bit immediate value containing 4 2-bit indices which shuffles the int32's in each 128-bit lane.
#define VINT_LANE_SHUFFLE_EPI32(a, control) vint(_mm_shuffle_epi32((a).m_value, control))
#define VFLOAT_LANE_SHUFFLE_PS(a, b, control) vfloat(_mm_shuffle_ps((a).m_value, (b).m_value, control))

// control is an 8-bit immediate value containing 4 2-bit indices which shuffles the int16's in either the high or low 64-bit lane.
#define VINT_LANE_SHUFFLELO_EPI16(a, control) vint(_mm_shufflelo_epi16((a).m_value, control))
#define VINT_LANE_SHUFFLEHI_EPI16(a, control) vint(_mm_shufflehi_epi16((a).m_value, control))

#define VINT_LANE_SHUFFLE_MASK(a, b, c, d) ((a) | ((b) << 2) | ((c) << 4) | ((d) << 6))
#define VINT_LANE_SHUFFLE_MASK_R(d, c, b, a) ((a) | ((b) << 2) | ((c) << 4) | ((d) << 6))

#define VINT_LANE_SHIFT_LEFT_BYTES(a, l) vint(_mm_slli_si128((a).m_value, l))
#define VINT_LANE_SHIFT_RIGHT_BYTES(a, l) vint(_mm_srli_si128((a).m_value, l))

// Unpack and interleave 8-bit integers from the low or high half of a and b
CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi8(const vint& a, const vint& b) { return vint(_mm_unpacklo_epi8(a.m_value, b.m_value)); }
CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi8(const vint& a, const vint& b) { return vint(_mm_unpackhi_epi8(a.m_value, b.m_value)); }

// Unpack and interleave 16-bit integers from the low or high half of a and b
CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi16(const vint& a, const vint& b) { return vint(_mm_unpacklo_epi16(a.m_value, b.m_value)); }
CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi16(const vint& a, const vint& b) { return vint(_mm_unpackhi_epi16(a.m_value, b.m_value)); }
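// Usage sketch (illustrative only, compiled out; reverse_lanes_example is not part of the
// API): VINT_LANE_SHUFFLE_MASK takes the source lane index for each destination lane, so
// reversing the four int32 lanes is:
#if 0
inline vint reverse_lanes_example(const vint& a)
{
	return VINT_LANE_SHUFFLE_EPI32(a, VINT_LANE_SHUFFLE_MASK(3, 2, 1, 0));
}
#endif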
// Unpack and interleave 32-bit integers from the low or high half of a and b
CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi32(const vint& a, const vint& b) { return vint(_mm_unpacklo_epi32(a.m_value, b.m_value)); }
CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi32(const vint& a, const vint& b) { return vint(_mm_unpackhi_epi32(a.m_value, b.m_value)); }

// Unpack and interleave 64-bit integers from the low or high half of a and b
CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi64(const vint& a, const vint& b) { return vint(_mm_unpacklo_epi64(a.m_value, b.m_value)); }
CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi64(const vint& a, const vint& b) { return vint(_mm_unpackhi_epi64(a.m_value, b.m_value)); }

CPPSPMD_FORCE_INLINE vint vint_set1_epi8(int8_t a) { return vint(_mm_set1_epi8(a)); }
CPPSPMD_FORCE_INLINE vint vint_set1_epi16(int16_t a) { return vint(_mm_set1_epi16(a)); }
CPPSPMD_FORCE_INLINE vint vint_set1_epi32(int32_t a) { return vint(_mm_set1_epi32(a)); }
CPPSPMD_FORCE_INLINE vint vint_set1_epi64(int64_t a) { return vint(_mm_set1_epi64x(a)); }

CPPSPMD_FORCE_INLINE vint mul_epu32(const vint &a, const vint& b) { return vint(_mm_mul_epu32(a.m_value, b.m_value)); }

// Truncating signed division, computed exactly via double-precision division (a double can represent any int32).
CPPSPMD_FORCE_INLINE vint div_epi32(const vint &a, const vint& b)
{
	__m128d al = _mm_cvtepi32_pd(a.m_value);
	__m128d ah = _mm_cvtepi32_pd(_mm_unpackhi_epi64(a.m_value, a.m_value));

	__m128d bl = _mm_cvtepi32_pd(b.m_value);
	__m128d bh = _mm_cvtepi32_pd(_mm_unpackhi_epi64(b.m_value, b.m_value));

	__m128d rl = _mm_div_pd(al, bl);
	__m128d rh = _mm_div_pd(ah, bh);

	__m128i rli = _mm_cvttpd_epi32(rl);
	__m128i rhi = _mm_cvttpd_epi32(rh);

	return vint(_mm_unpacklo_epi64(rli, rhi));
}

// C-style truncated modulo: the result takes the sign of the dividend a.
CPPSPMD_FORCE_INLINE vint mod_epi32(const vint &a, const vint& b)
{
	vint aa = abs(a), ab = abs(b);
	vint q = div_epi32(aa, ab);
	vint r = aa - q * ab;
	return spmd_ternaryi(a < 0, -r, r);
}

CPPSPMD_FORCE_INLINE vint operator/ (const vint& a, const vint& b)
{
	return div_epi32(a, b);
}

CPPSPMD_FORCE_INLINE vint operator/ (const vint& a, int b)
{
	return div_epi32(a, vint(b));
}

CPPSPMD_FORCE_INLINE vint operator% (const vint& a, const vint& b)
{
	return mod_epi32(a, b);
}

CPPSPMD_FORCE_INLINE vint operator% (const vint& a, int b)
{
	return mod_epi32(a, vint(b));
}

CPPSPMD_FORCE_INLINE vint operator<< (const vint& a, const vint& b)
{
#if 0
	CPPSPMD_ALIGN(32) int result[4];
	result[0] = extract_x(a.m_value) << extract_x(b.m_value);
	result[1] = extract_y(a.m_value) << extract_y(b.m_value);
	result[2] = extract_z(a.m_value) << extract_z(b.m_value);
	result[3] = extract_w(a.m_value) << extract_w(b.m_value);

	return vint{ _mm_load_si128((__m128i*)result) };
#elif 0
	int x = extract_x(a.m_value) << extract_x(b.m_value);
	int y = extract_y(a.m_value) << extract_y(b.m_value);
	int z = extract_z(a.m_value) << extract_z(b.m_value);
	int w = extract_w(a.m_value) << extract_w(b.m_value);

	__m128i v = insert_x(_mm_undefined_si128(), x);
	v = insert_y(v, y);
	v = insert_z(v, z);
	return vint{ insert_w(v, w) };
#else
	// What this does: shift each b lane left by 23 bits (moving the shift amount into the FP exponent position), add that to the integer rep of 1.0f, cast to float (giving a fast 2^b per lane), convert back to int, then multiply.
	return a * vint(cast_vint_to_vfloat(vint(_mm_slli_epi32(b.m_value, 23)) + cast_vfloat_to_vint(vfloat(1.0f))));
#endif
}
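// Scalar model (illustrative only, compiled out; shift_left_scalar is not part of the API)
// of the variable shift-left trick above. Adding (b << 23) to the bit pattern of 1.0f adds
// b to the FP exponent, producing exactly 2^b for 0 <= b <= 30; the int conversion then
// recovers the power-of-two multiplier.
#if 0
inline int32_t shift_left_scalar(int32_t a, int32_t b)
{
	uint32_t pow2_bits = 0x3F800000u + ((uint32_t)b << 23); // bit pattern of 2^b
	float pow2 = *(const float*)&pow2_bits;
	return a * (int32_t)pow2;
}
#endif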
// uniform shift left
CPPSPMD_FORCE_INLINE vint operator<< (const vint& a, int b)
{
	__m128i bv = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(b)), _mm_castsi128_ps(_mm_load_si128((const __m128i *)g_x_128))));
	return vint{ _mm_sll_epi32(a.m_value, bv) };
}

// uniform arithmetic shift right
CPPSPMD_FORCE_INLINE vint operator>> (const vint& a, int b)
{
	__m128i bv = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(b)), _mm_castsi128_ps(_mm_load_si128((const __m128i *)g_x_128))));
	return vint{ _mm_sra_epi32(a.m_value, bv) };
}

// uniform shift right
CPPSPMD_FORCE_INLINE vint vuint_shift_right(const vint& a, int b)
{
	__m128i bv = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(b)), _mm_castsi128_ps(_mm_load_si128((const __m128i *)g_x_128))));
	return vint{ _mm_srl_epi32(a.m_value, bv) };
}

CPPSPMD_FORCE_INLINE vint vuint_shift_right(const vint& a, const vint& b)
{
#if 0
	CPPSPMD_ALIGN(32) int result[4];
	result[0] = ((uint32_t)extract_x(a.m_value)) >> extract_x(b.m_value);
	result[1] = ((uint32_t)extract_y(a.m_value)) >> extract_y(b.m_value);
	result[2] = ((uint32_t)extract_z(a.m_value)) >> extract_z(b.m_value);
	result[3] = ((uint32_t)extract_w(a.m_value)) >> extract_w(b.m_value);

	return vint{ _mm_load_si128((__m128i*)result) };
#elif 0
	uint32_t x = ((uint32_t)extract_x(a.m_value)) >> ((uint32_t)extract_x(b.m_value));
	uint32_t y = ((uint32_t)extract_y(a.m_value)) >> ((uint32_t)extract_y(b.m_value));
	uint32_t z = ((uint32_t)extract_z(a.m_value)) >> ((uint32_t)extract_z(b.m_value));
	uint32_t w = ((uint32_t)extract_w(a.m_value)) >> ((uint32_t)extract_w(b.m_value));

	__m128i v = insert_x(_mm_undefined_si128(), x);
	v = insert_y(v, y);
	v = insert_z(v, z);
	return vint{ insert_w(v, w) };
#else
	//vint inv_shift = 32 - b;
	//vfloat f = cast_vint_to_vfloat(vint(_mm_slli_epi32(inv_shift.m_value, 23)) + cast_vfloat_to_vint(vfloat(1.0f)));

	// Take the float rep of 2^32 (0x4F800000, i.e. 1.0f's 0x3F800000 plus (32<<23)), subtract (shift<<23), then cast to float to get 2^(32 - shift).
	vfloat f = cast_vint_to_vfloat(vint(_mm_sub_epi32(_mm_set1_epi32(0x4f800000), _mm_slli_epi32(b.m_value, 23))));

	// Now convert the scale factor to an integer.
	vint r = vint(f);

	// mulhi_epu32 (using two _mm_mul_epu32) emulates a varying shift left by (32 - shift); keeping only the high 32 bits of the product turns it into a right shift by shift.
	vint q(mulhi_epu32(a.m_value, r.m_value));

	// Handle shift amounts of 0 (2^32 doesn't survive the float->int conversion above).
	return spmd_ternaryi(b > 0, q, a);
#endif
}

CPPSPMD_FORCE_INLINE vint vuint_shift_right_not_zero(const vint& a, const vint& b)
{
	//vint inv_shift = 32 - b;
	//vfloat f = cast_vint_to_vfloat(vint(_mm_slli_epi32(inv_shift.m_value, 23)) + cast_vfloat_to_vint(vfloat(1.0f)));

	// Take the float rep of 2^32 (0x4F800000), subtract (shift<<23), then cast to float to get 2^(32 - shift).
	vfloat f = cast_vint_to_vfloat(vint(_mm_sub_epi32(_mm_set1_epi32(0x4f800000), _mm_slli_epi32(b.m_value, 23))));

	// Now convert the scale factor to an integer.
	vint r = vint(f);

	// mulhi_epu32 (using two _mm_mul_epu32) keeps the high 32 bits of the product, i.e. a >> shift.
	return vint(mulhi_epu32(a.m_value, r.m_value));
}
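// Scalar model (illustrative only, compiled out; uint_shift_right_scalar is not part of the
// API) of the varying logical shift right above. 0x4F800000 is the bit pattern of 2^32, so
// subtracting (b << 23) yields 2^(32 - b); keeping the high 32 bits of the unsigned product
// then equals a >> b (valid for 1 <= b <= 31).
#if 0
inline uint32_t uint_shift_right_scalar(uint32_t a, uint32_t b)
{
	uint32_t scale_bits = 0x4F800000u - (b << 23); // bit pattern of 2^(32 - b)
	float scale = *(const float*)&scale_bits;
	return (uint32_t)(((uint64_t)a * (uint64_t)scale) >> 32);
}
#endif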
CPPSPMD_FORCE_INLINE vint operator>> (const vint& a, const vint& b)
{
#if 0
	CPPSPMD_ALIGN(32) int result[4];
	result[0] = extract_x(a.m_value) >> extract_x(b.m_value);
	result[1] = extract_y(a.m_value) >> extract_y(b.m_value);
	result[2] = extract_z(a.m_value) >> extract_z(b.m_value);
	result[3] = extract_w(a.m_value) >> extract_w(b.m_value);

	return vint{ _mm_load_si128((__m128i*)result) };
#elif 0
	int x = extract_x(a.m_value) >> extract_x(b.m_value);
	int y = extract_y(a.m_value) >> extract_y(b.m_value);
	int z = extract_z(a.m_value) >> extract_z(b.m_value);
	int w = extract_w(a.m_value) >> extract_w(b.m_value);

	__m128i v = insert_x(_mm_undefined_si128(), x);
	v = insert_y(v, y);
	v = insert_z(v, z);
	return vint{ insert_w(v, w) };
#else
	// Arithmetic shift via logical shift: flip the bits of negative lanes, shift in zeros, then flip back, which effectively shifts in ones.
	vint sign_mask(_mm_cmplt_epi32(a.m_value, _mm_setzero_si128()));
	vint a_shifted = vuint_shift_right(a ^ sign_mask, b) ^ sign_mask;
	return a_shifted;
#endif
}

#undef VINT_SHIFT_LEFT
#undef VINT_SHIFT_RIGHT
#undef VUINT_SHIFT_RIGHT

// Shift left/right by a uniform immediate constant
#define VINT_SHIFT_LEFT(a, b) vint(_mm_slli_epi32( (a).m_value, (b) ) )
#define VINT_SHIFT_RIGHT(a, b) vint( _mm_srai_epi32( (a).m_value, (b) ) )
#define VUINT_SHIFT_RIGHT(a, b) vint( _mm_srli_epi32( (a).m_value, (b) ) )
#define VINT_ROT(x, k) (VINT_SHIFT_LEFT((x), (k)) | VUINT_SHIFT_RIGHT((x), 32 - (k)))

CPPSPMD_FORCE_INLINE vbool operator==(const lint& a, const lint& b) { return vbool{ _mm_cmpeq_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vbool operator==(const lint& a, int b) { return vint(a) == vint(b); }
CPPSPMD_FORCE_INLINE vbool operator==(int a, const lint& b) { return vint(a) == vint(b); }
CPPSPMD_FORCE_INLINE vbool operator<(const lint& a, const lint& b) { return vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; }
CPPSPMD_FORCE_INLINE vbool operator>(const lint& a, const lint& b) { return vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vbool operator<=(const lint& a, const lint& b) { return !vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; }
CPPSPMD_FORCE_INLINE vbool operator>=(const lint& a, const lint& b) { return !vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; }

CPPSPMD_FORCE_INLINE float extract(const vfloat& v, int instance) { assert(instance < 4); CPPSPMD_ALIGN(16) float values[4]; _mm_store_ps(values, v.m_value); return values[instance]; }
CPPSPMD_FORCE_INLINE int extract(const vint& v, int instance) { assert(instance < 4); CPPSPMD_ALIGN(16) int values[4]; _mm_store_si128((__m128i*)values, v.m_value); return values[instance]; }
CPPSPMD_FORCE_INLINE int extract(const lint& v, int instance) { assert(instance < 4); CPPSPMD_ALIGN(16) int values[4]; _mm_store_si128((__m128i*)values, v.m_value); return values[instance]; }
CPPSPMD_FORCE_INLINE bool extract(const vbool& v, int instance) { assert(instance < 4); CPPSPMD_ALIGN(16) int values[4]; _mm_store_si128((__m128i*)values, v.m_value); return values[instance] != 0; }

#undef VINT_EXTRACT
#undef VBOOL_EXTRACT
#undef VFLOAT_EXTRACT
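// Usage sketch (illustrative only, compiled out; sum_lanes_example is not part of the API):
// per-lane access with extract(). These helpers go through memory, so the VINT_EXTRACT /
// VFLOAT_EXTRACT macros below are preferable when the lane index is a compile-time constant.
#if 0
inline float sum_lanes_example(const vfloat& v)
{
	float sum = 0.0f;
	for (int i = 0; i < 4; i++)
		sum += extract(v, i);
	return sum;
}
#endif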
#if CPPSPMD_SSE2
// Pass in an immediate constant and the compiler will optimize these expressions.
#define VINT_EXTRACT(v, instance) ( ((instance) == 0) ? extract_x((v).m_value) : (((instance) == 1) ? extract_y((v).m_value) : (((instance) == 2) ? extract_z((v).m_value) : extract_w((v).m_value))) )
#define VBOOL_EXTRACT(v, instance) ( ((instance) == 0) ? extract_x((v).m_value) : (((instance) == 1) ? extract_y((v).m_value) : (((instance) == 2) ? extract_z((v).m_value) : extract_w((v).m_value))) )
#define VFLOAT_EXTRACT(v, instance) ( ((instance) == 0) ? extractf_ps_x((v).m_value) : (((instance) == 1) ? extractf_ps_y((v).m_value) : (((instance) == 2) ? extractf_ps_z((v).m_value) : extractf_ps_w((v).m_value))) )
#else
CPPSPMD_FORCE_INLINE float cast_int_bits_as_float(int v) { return *(const float*)&v; }

#define VINT_EXTRACT(v, instance) _mm_extract_epi32((v).m_value, instance)
#define VBOOL_EXTRACT(v, instance) _mm_extract_epi32((v).m_value, instance)
#define VFLOAT_EXTRACT(v, instance) cast_int_bits_as_float(_mm_extract_ps((v).m_value, instance))
#endif

CPPSPMD_FORCE_INLINE vfloat &insert(vfloat& v, int instance, float f)
{
	assert(instance < 4);
	CPPSPMD_ALIGN(16) float values[4];
	_mm_store_ps(values, v.m_value);
	values[instance] = f;
	v.m_value = _mm_load_ps(values);
	return v;
}

CPPSPMD_FORCE_INLINE vint &insert(vint& v, int instance, int i)
{
	assert(instance < 4);
	CPPSPMD_ALIGN(16) int values[4];
	_mm_store_si128((__m128i *)values, v.m_value);
	values[instance] = i;
	v.m_value = _mm_load_si128((__m128i *)values);
	return v;
}

CPPSPMD_FORCE_INLINE vint init_lookup4(const uint8_t pTab[16])
{
	__m128i l = _mm_loadu_si128((const __m128i*)pTab);
	return vint{ l };
}

CPPSPMD_FORCE_INLINE vint table_lookup4_8(const vint& a, const vint& table)
{
	return vint{ shuffle_epi8(table.m_value, a.m_value) };
}

CPPSPMD_FORCE_INLINE void init_lookup5(const uint8_t pTab[32], vint& table_0, vint& table_1)
{
	__m128i l = _mm_loadu_si128((const __m128i*)pTab);
	__m128i h = _mm_loadu_si128((const __m128i*)(pTab + 16));
	table_0.m_value = l;
	table_1.m_value = h;
}

CPPSPMD_FORCE_INLINE vint table_lookup5_8(const vint& a, const vint& table_0, const vint& table_1)
{
	__m128i l_0 = shuffle_epi8(table_0.m_value, a.m_value);
	__m128i h_0 = shuffle_epi8(table_1.m_value, a.m_value);

	// Move index bit 4 up to the sign bit, so blendv selects the high table for indices >= 16.
	__m128i m_0 = _mm_slli_epi32(a.m_value, 31 - 4);

	__m128 v_0 = blendv_ps(_mm_castsi128_ps(l_0), _mm_castsi128_ps(h_0), _mm_castsi128_ps(m_0));

	return vint{ _mm_castps_si128(v_0) };
}

CPPSPMD_FORCE_INLINE void init_lookup6(const uint8_t pTab[64], vint& table_0, vint& table_1, vint& table_2, vint& table_3)
{
	__m128i a = _mm_loadu_si128((const __m128i*)pTab);
	__m128i b = _mm_loadu_si128((const __m128i*)(pTab + 16));
	__m128i c = _mm_loadu_si128((const __m128i*)(pTab + 32));
	__m128i d = _mm_loadu_si128((const __m128i*)(pTab + 48));

	table_0.m_value = a;
	table_1.m_value = b;
	table_2.m_value = c;
	table_3.m_value = d;
}

CPPSPMD_FORCE_INLINE vint table_lookup6_8(const vint& a, const vint& table_0, const vint& table_1, const vint& table_2, const vint& table_3)
{
	// Index bit 4 selects within each pair of tables, bit 5 selects between the pairs.
	__m128i m_0 = _mm_slli_epi32(a.m_value, 31 - 4);

	__m128 av_0;
	{
		__m128i al_0 = shuffle_epi8(table_0.m_value, a.m_value);
		__m128i ah_0 = shuffle_epi8(table_1.m_value, a.m_value);
		av_0 = blendv_ps(_mm_castsi128_ps(al_0), _mm_castsi128_ps(ah_0), _mm_castsi128_ps(m_0));
	}

	__m128 bv_0;
	{
		__m128i bl_0 = shuffle_epi8(table_2.m_value, a.m_value);
		__m128i bh_0 = shuffle_epi8(table_3.m_value, a.m_value);
		bv_0 = blendv_ps(_mm_castsi128_ps(bl_0), _mm_castsi128_ps(bh_0), _mm_castsi128_ps(m_0));
	}

	__m128i m2_0 = _mm_slli_epi32(a.m_value, 31 - 5);
	__m128 v2_0 = blendv_ps(av_0, bv_0, _mm_castsi128_ps(m2_0));

	return vint{ _mm_castps_si128(v2_0) };
}
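// Scalar model (illustrative only, compiled out; table_lookup5_scalar is not part of the
// API) of the 5-bit table lookup above: the low 4 index bits select a byte within a
// 16-entry half via shuffle_epi8, and index bit 4 (moved up to the sign bit) selects
// between the two halves via blendv.
#if 0
inline uint8_t table_lookup5_scalar(const uint8_t pTab[32], uint32_t idx)
{
	return pTab[idx & 31];
}
#endif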
#if 0
template<typename SPMDKernel, typename... Args>
CPPSPMD_FORCE_INLINE decltype(auto) spmd_call(Args&&... args)
{
	SPMDKernel kernel;
	kernel.init(exec_mask::all_on());
	return kernel._call(std::forward<Args>(args)...);
}
#else
template<typename SPMDKernel, typename... Args>
CPPSPMD_FORCE_INLINE void spmd_call(Args&&... args)
{
	SPMDKernel kernel;
	kernel.init(exec_mask::all_on());
	kernel._call(std::forward<Args>(args)...);
}
#endif

CPPSPMD_FORCE_INLINE void spmd_kernel::init(const spmd_kernel::exec_mask& kernel_exec)
{
	m_exec = kernel_exec;
	m_kernel_exec = kernel_exec;
	m_continue_mask = exec_mask::all_off();

#ifdef _DEBUG
	m_in_loop = false;
#endif
}

// Scatters src to dst's indexed locations, but only for lanes enabled in the current exec mask.
CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store(const float_vref& dst, const vfloat& src)
{
	CPPSPMD_ALIGN(16) int vindex[4];
	_mm_store_si128((__m128i*)vindex, dst.m_vindex);

	CPPSPMD_ALIGN(16) float stored[4];
	_mm_store_ps(stored, src.m_value);

	int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
	for (int i = 0; i < 4; i++)
	{
		if (mask & (1 << i))
			dst.m_pValue[vindex[i]] = stored[i];
	}
	return dst;
}

// Scatters src to dst's indexed locations for all lanes, ignoring the exec mask.
CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store_all(const float_vref& dst, const vfloat& src)
{
	CPPSPMD_ALIGN(16) int vindex[4];
	_mm_store_si128((__m128i*)vindex, dst.m_vindex);

	CPPSPMD_ALIGN(16) float stored[4];
	_mm_store_ps(stored, src.m_value);

	for (int i = 0; i < 4; i++)
		dst.m_pValue[vindex[i]] = stored[i];
	return dst;
}

CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store(const float_vref&& dst, const vfloat& src)
{
	CPPSPMD_ALIGN(16) int vindex[4];
	_mm_store_si128((__m128i*)vindex, dst.m_vindex);

	CPPSPMD_ALIGN(16) float stored[4];
	_mm_store_ps(stored, src.m_value);

	int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
	for (int i = 0; i < 4; i++)
	{
		if (mask & (1 << i))
			dst.m_pValue[vindex[i]] = stored[i];
	}
	return dst;
}

CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store_all(const float_vref&& dst, const vfloat& src)
{
	CPPSPMD_ALIGN(16) int vindex[4];
	_mm_store_si128((__m128i*)vindex, dst.m_vindex);

	CPPSPMD_ALIGN(16) float stored[4];
	_mm_store_ps(stored, src.m_value);

	for (int i = 0; i < 4; i++)
		dst.m_pValue[vindex[i]] = stored[i];
	return dst;
}

#include "cppspmd_flow.h"
#include "cppspmd_math.h"

} // namespace CPPSPMD (cppspmd_sse2 or cppspmd_sse41)