CoCalc -- vfloat16

GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/embree/common/simd/vfloat16_avx512.h
21770 views
1
// Copyright 2009-2021 Intel Corporation
2
// SPDX-License-Identifier: Apache-2.0
3

4
#pragma once
5

6
/* Map the generic SIMD type names onto their *_impl spellings while this
 * header is processed; the matching #undef lines at the end of the file
 * remove the mappings again. */
#define vboolf vboolf_impl
7
#define vboold vboold_impl
8
#define vint vint_impl
9
#define vuint vuint_impl
10
#define vllong vllong_impl
11
#define vfloat vfloat_impl
12
#define vdouble vdouble_impl
13

14
namespace embree
15
{
16
  /* 16-wide AVX-512 float type */
17
  template<>
18
  struct vfloat<16>
19
  {
20
    ALIGNED_STRUCT_(64);
21
    
22
    typedef vboolf16 Bool;
23
    typedef vint16   Int;
24
    typedef vfloat16 Float;
25

26
    enum  { size = 16 }; // number of SIMD elements
27
    union {              // data
28
      __m512 v; 
29
      float f[16];
30
      int i[16];
31
    };
32

33
    ////////////////////////////////////////////////////////////////////////////////
34
    /// Constructors, Assignment & Cast Operators
35
    ////////////////////////////////////////////////////////////////////////////////
36
        
37
    __forceinline vfloat() {}
38
    __forceinline vfloat(const vfloat16& t) { v = t; }
39
    __forceinline vfloat16& operator =(const vfloat16& f) { v = f.v; return *this; }
40

41
    __forceinline vfloat(const __m512& t) { v = t; }
42
    __forceinline operator __m512() const { return v; }
43
    __forceinline operator __m256() const { return _mm512_castps512_ps256(v); }
44
    __forceinline operator __m128() const { return _mm512_castps512_ps128(v); }
45

46
    __forceinline vfloat(float f) {
47
      v = _mm512_set1_ps(f);
48
    }
49

50
    __forceinline vfloat(float a, float b, float c, float d) {
51
      v = _mm512_set4_ps(a, b, c, d);
52
    }
53

54
    __forceinline vfloat(const vfloat4& i) {
55
      v = _mm512_broadcast_f32x4(i);
56
    }
57

58
    __forceinline vfloat(const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d) {
59
      v = _mm512_castps128_ps512(a);
60
      v = _mm512_insertf32x4(v, b, 1);
61
      v = _mm512_insertf32x4(v, c, 2);
62
      v = _mm512_insertf32x4(v, d, 3);
63
    }
64

65
    __forceinline vfloat(const vboolf16& mask, const vfloat4& a, const vfloat4& b) {
66
      v = _mm512_broadcast_f32x4(a);
67
      v = _mm512_mask_broadcast_f32x4(v,mask,b);
68
    }
69

70
    __forceinline vfloat(const vfloat8& i) {
71
      v = _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castps_pd(i)));
72
    }
73

74
    __forceinline vfloat(const vfloat8& a, const vfloat8& b) {
75
      v = _mm512_castps256_ps512(a);
76
#if defined(__AVX512DQ__)
77
      v = _mm512_insertf32x8(v, b, 1);
78
#else
79
      v = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(v), _mm256_castps_pd(b), 1));
80
#endif
81
    }
82

83
    /* WARNING: due to f64x4 the mask is considered as an 8bit mask */
84
    /*__forceinline vfloat(const vboolf16& mask, const vfloat8& a, const vfloat8& b) {
85
      __m512d aa = _mm512_broadcast_f64x4(_mm256_castps_pd(a));
86
      aa = _mm512_mask_broadcast_f64x4(aa,mask,_mm256_castps_pd(b));
87
      v = _mm512_castpd_ps(aa);
88
      }*/
89
    
90
    __forceinline explicit vfloat(const vint16& a) {
91
      v = _mm512_cvtepi32_ps(a);
92
    }
93

94
    __forceinline explicit vfloat(const vuint16& a) {
95
      v = _mm512_cvtepu32_ps(a);
96
    }
97

98
    ////////////////////////////////////////////////////////////////////////////////
99
    /// Constants
100
    ////////////////////////////////////////////////////////////////////////////////
101

102
    __forceinline vfloat(ZeroTy)   : v(_mm512_setzero_ps()) {}
103
    __forceinline vfloat(OneTy)    : v(_mm512_set1_ps(1.0f)) {}
104
    __forceinline vfloat(PosInfTy) : v(_mm512_set1_ps(pos_inf)) {}
105
    __forceinline vfloat(NegInfTy) : v(_mm512_set1_ps(neg_inf)) {}
106
    __forceinline vfloat(StepTy)   : v(_mm512_set_ps(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {}
107
    __forceinline vfloat(NaNTy)    : v(_mm512_set1_ps(nan)) {}
108
    __forceinline vfloat(UndefinedTy) : v(_mm512_undefined_ps()) {}
109

110
    ////////////////////////////////////////////////////////////////////////////////
111
    /// Loads and Stores
112
    ////////////////////////////////////////////////////////////////////////////////
113

114
    static __forceinline vfloat16 load (const void* ptr) { return _mm512_load_ps((float*)ptr);  }
115
    static __forceinline vfloat16 loadu(const void* ptr) { return _mm512_loadu_ps((float*)ptr); }
116

117
    static __forceinline vfloat16 load (const vboolf16& mask, const void* ptr) { return _mm512_mask_load_ps (_mm512_setzero_ps(),mask,(float*)ptr); }
118
    static __forceinline vfloat16 loadu(const vboolf16& mask, const void* ptr) { return _mm512_mask_loadu_ps(_mm512_setzero_ps(),mask,(float*)ptr); }
119

120
    static __forceinline void store (void* ptr, const vfloat16& v) { _mm512_store_ps ((float*)ptr,v); }
121
    static __forceinline void storeu(void* ptr, const vfloat16& v) { _mm512_storeu_ps((float*)ptr,v); }
122

123
    static __forceinline void store (const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_store_ps ((float*)ptr,mask,v); }
124
    static __forceinline void storeu(const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_storeu_ps((float*)ptr,mask,v); }
125

126
    static __forceinline void store_nt(void* __restrict__ ptr, const vfloat16& a) {
127
      _mm512_stream_ps((float*)ptr,a);
128
    }
129

130
    static __forceinline vfloat16 broadcast(const float* f) {
131
      return _mm512_set1_ps(*f);
132
    }
133

134
    template<int scale = 4>
135
    static __forceinline vfloat16 gather(const float* ptr, const vint16& index) {
136
      return _mm512_i32gather_ps(index, ptr, scale);
137
    }
138

139
    template<int scale = 4>
140
    static __forceinline vfloat16 gather(const vboolf16& mask, const float* ptr, const vint16& index) {
141
      vfloat16 r = zero;
142
      return _mm512_mask_i32gather_ps(r, mask, index, ptr, scale);
143
    }
144

145
    template<int scale = 4>
146
    static __forceinline void scatter(float* ptr, const vint16& index, const vfloat16& v) {
147
      _mm512_i32scatter_ps(ptr, index, v, scale);
148
    }
149

150
    template<int scale = 4>
151
    static __forceinline void scatter(const vboolf16& mask, float* ptr, const vint16& index, const vfloat16& v) {
152
      _mm512_mask_i32scatter_ps(ptr, mask, index, v, scale);
153
    }
154

155
    ////////////////////////////////////////////////////////////////////////////////
156
    /// Array Access
157
    ////////////////////////////////////////////////////////////////////////////////
158
    
159
    __forceinline       float& operator [](size_t index)       { assert(index < 16); return f[index]; }
160
    __forceinline const float& operator [](size_t index) const { assert(index < 16); return f[index]; }
161
  };
162

163
  ////////////////////////////////////////////////////////////////////////////////
164
  /// Unary Operators
165
  ////////////////////////////////////////////////////////////////////////////////
166

167
  /* Bit-level reinterpretation between float and integer vectors; the bits
   * are passed through unchanged, no numeric conversion happens. */
  __forceinline vfloat16 asFloat(const vint16& a)
  {
    return _mm512_castsi512_ps(a);
  }

  __forceinline vint16 asInt(const vfloat16& a)
  {
    return _mm512_castps_si512(a);
  }

  __forceinline vuint16 asUInt(const vfloat16& a)
  {
    return _mm512_castps_si512(a);
  }
170

171
  /* Numeric conversions, delegated to the explicit converting constructors
   * of the target vector type. */
  __forceinline vint16 toInt(const vfloat16& a)
  {
    const vint16 converted(a);
    return converted;
  }

  __forceinline vfloat16 toFloat(const vint16& a)
  {
    const vfloat16 converted(a);
    return converted;
  }
173

174
  /* Unary plus: returns the operand unchanged. */
  __forceinline vfloat16 operator +(const vfloat16& a)
  {
    return a;
  }

  /* Unary minus: implemented as a multiplication by -1 in every lane. */
  __forceinline vfloat16 operator -(const vfloat16& a)
  {
    const __m512 minus_one = _mm512_set1_ps(-1.0f);
    return _mm512_mul_ps(a, minus_one);
  }
176

177
  /* abs: clears the sign bit of every lane (mask 0x7FFFFFFF). */
  __forceinline vfloat16 abs(const vfloat16& a)
  {
    const __m512i bits         = _mm512_castps_si512(a);
    const __m512i without_sign = _mm512_and_epi32(bits, _mm512_set1_epi32(0x7FFFFFFF));
    return _mm512_castsi512_ps(without_sign);
  }

  /* signmsk: keeps only the sign bit of every lane (mask 0x80000000). */
  __forceinline vfloat16 signmsk(const vfloat16& a)
  {
    const __m512i bits      = _mm512_castps_si512(a);
    const __m512i sign_only = _mm512_and_epi32(bits, _mm512_set1_epi32(0x80000000));
    return _mm512_castsi512_ps(sign_only);
  }
179

180
  /* Reciprocal: starts from the rcp14 estimate r and applies one refinement
   * step, r + r * (1 - a*r), using fused multiply-adds. */
  __forceinline vfloat16 rcp(const vfloat16& a)
  {
    const __m512 estimate = _mm512_rcp14_ps(a);
    const __m512 residual = _mm512_fnmadd_ps(a, estimate, _mm512_set1_ps(1.0f)); // 1 - a*estimate
    return _mm512_fmadd_ps(estimate, residual, estimate);                        // estimate + estimate*residual
  }
185

186
  /* Per-lane square (a*a). */
  __forceinline vfloat16 sqr(const vfloat16& a)
  {
    return _mm512_mul_ps(a, a);
  }

  /* Per-lane square root. */
  __forceinline vfloat16 sqrt(const vfloat16& a)
  {
    return _mm512_sqrt_ps(a);
  }
188

189
  /* Reciprocal square root: starts from the rsqrt14 estimate r and refines it
   * as 1.5*r + (-0.5*a*r) * (r*r). The multiplication order matches the
   * original expression so rounding is unchanged. */
  __forceinline vfloat16 rsqrt(const vfloat16& a)
  {
    const __m512 r            = _mm512_rsqrt14_ps(a);
    const __m512 neg_half_a   = _mm512_mul_ps(a, _mm512_set1_ps(-0.5f));
    const __m512 neg_half_a_r = _mm512_mul_ps(neg_half_a, r);
    const __m512 r_squared    = _mm512_mul_ps(r, r);
    const __m512 correction   = _mm512_mul_ps(neg_half_a_r, r_squared);
    return _mm512_fmadd_ps(_mm512_set1_ps(1.5f), r, correction);
  }
195

196
  ////////////////////////////////////////////////////////////////////////////////
197
  /// Binary Operators
198
  ////////////////////////////////////////////////////////////////////////////////
199

200
  /* Per-lane addition; a scalar operand is broadcast to all lanes first. */
  __forceinline vfloat16 operator +(const vfloat16& a, const vfloat16& b)
  {
    return _mm512_add_ps(a, b);
  }

  __forceinline vfloat16 operator +(const vfloat16& a, float b)
  {
    return _mm512_add_ps(a, _mm512_set1_ps(b));
  }

  __forceinline vfloat16 operator +(float a, const vfloat16& b)
  {
    return _mm512_add_ps(_mm512_set1_ps(a), b);
  }
203

204
  /* Per-lane subtraction; a scalar operand is broadcast to all lanes first. */
  __forceinline vfloat16 operator -(const vfloat16& a, const vfloat16& b)
  {
    return _mm512_sub_ps(a, b);
  }

  __forceinline vfloat16 operator -(const vfloat16& a, float b)
  {
    return _mm512_sub_ps(a, _mm512_set1_ps(b));
  }

  __forceinline vfloat16 operator -(float a, const vfloat16& b)
  {
    return _mm512_sub_ps(_mm512_set1_ps(a), b);
  }
207

208
  /* Per-lane multiplication; a scalar operand is broadcast to all lanes first. */
  __forceinline vfloat16 operator *(const vfloat16& a, const vfloat16& b)
  {
    return _mm512_mul_ps(a, b);
  }

  __forceinline vfloat16 operator *(const vfloat16& a, float b)
  {
    return _mm512_mul_ps(a, _mm512_set1_ps(b));
  }

  __forceinline vfloat16 operator *(float a, const vfloat16& b)
  {
    return _mm512_mul_ps(_mm512_set1_ps(a), b);
  }
211

212
  /* Per-lane division; a scalar operand is broadcast to all lanes first. */
  __forceinline vfloat16 operator /(const vfloat16& a, const vfloat16& b)
  {
    return _mm512_div_ps(a, b);
  }

  __forceinline vfloat16 operator /(const vfloat16& a, float b)
  {
    return _mm512_div_ps(a, _mm512_set1_ps(b));
  }

  __forceinline vfloat16 operator /(float a, const vfloat16& b)
  {
    return _mm512_div_ps(_mm512_set1_ps(a), b);
  }
215
  
216
  /* Bitwise AND / OR / XOR on the raw lane bits.
   *
   * All three go through the 32-bit integer forms (_mm512_*_epi32) with
   * casts, the same way abs() and signmsk() in this file do. The float-typed
   * _mm512_and_ps / _mm512_or_ps need AVX512DQ, whereas the integer forms
   * only need AVX512F; the resulting bits are identical. */
  __forceinline vfloat16 operator &(const vfloat16& a, const vfloat16& b) {
    return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b)));
  }
  __forceinline vfloat16 operator |(const vfloat16& a, const vfloat16& b) {
    return _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b)));
  }
  __forceinline vfloat16 operator ^(const vfloat16& a, const vfloat16& b) {
    return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b)));
  }
221
  
222
  /* Per-lane minimum via _mm512_min_ps; operand order is preserved because
   * the intrinsic is not symmetric for special values. A scalar operand is
   * broadcast to all lanes first. */
  __forceinline vfloat16 min(const vfloat16& a, const vfloat16& b)
  {
    return _mm512_min_ps(a, b);
  }

  __forceinline vfloat16 min(const vfloat16& a, float b)
  {
    return _mm512_min_ps(a, _mm512_set1_ps(b));
  }

  __forceinline vfloat16 min(const float& a, const vfloat16& b)
  {
    return _mm512_min_ps(_mm512_set1_ps(a), b);
  }
225

226
  /* Per-lane maximum via _mm512_max_ps; operand order is preserved because
   * the intrinsic is not symmetric for special values. A scalar operand is
   * broadcast to all lanes first. */
  __forceinline vfloat16 max(const vfloat16& a, const vfloat16& b)
  {
    return _mm512_max_ps(a, b);
  }

  __forceinline vfloat16 max(const vfloat16& a, float b)
  {
    return _mm512_max_ps(a, _mm512_set1_ps(b));
  }

  __forceinline vfloat16 max(const float& a, const vfloat16& b)
  {
    return _mm512_max_ps(_mm512_set1_ps(a), b);
  }
229

230
  /* Minimum computed on the lane bits interpreted as signed 32-bit integers,
   * then reinterpreted back as floats. */
  __forceinline vfloat16 mini(const vfloat16& a, const vfloat16& b)
  {
    const __m512i lhs_bits = _mm512_castps_si512(a);
    const __m512i rhs_bits = _mm512_castps_si512(b);
    return _mm512_castsi512_ps(_mm512_min_epi32(lhs_bits, rhs_bits));
  }
236

237
  /* Maximum computed on the lane bits interpreted as signed 32-bit integers,
   * then reinterpreted back as floats. */
  __forceinline vfloat16 maxi(const vfloat16& a, const vfloat16& b)
  {
    const __m512i lhs_bits = _mm512_castps_si512(a);
    const __m512i rhs_bits = _mm512_castps_si512(b);
    return _mm512_castsi512_ps(_mm512_max_epi32(lhs_bits, rhs_bits));
  }
243

244
  ////////////////////////////////////////////////////////////////////////////////
245
  /// Ternary Operators
246
  ////////////////////////////////////////////////////////////////////////////////
247

248
  /* Fused multiply-add family, one intrinsic each:
   *   madd  :  a*b + c
   *   msub  :  a*b - c
   *   nmadd : -a*b + c
   *   nmsub : -a*b - c */
  __forceinline vfloat16 madd(const vfloat16& a, const vfloat16& b, const vfloat16& c)
  {
    return _mm512_fmadd_ps(a, b, c);
  }

  __forceinline vfloat16 msub(const vfloat16& a, const vfloat16& b, const vfloat16& c)
  {
    return _mm512_fmsub_ps(a, b, c);
  }

  __forceinline vfloat16 nmadd(const vfloat16& a, const vfloat16& b, const vfloat16& c)
  {
    return _mm512_fnmadd_ps(a, b, c);
  }

  __forceinline vfloat16 nmsub(const vfloat16& a, const vfloat16& b, const vfloat16& c)
  {
    return _mm512_fnmsub_ps(a, b, c);
  }
252
  
253
  ////////////////////////////////////////////////////////////////////////////////
254
  /// Assignment Operators
255
  ////////////////////////////////////////////////////////////////////////////////
256

257
  /* In-place addition; returns a reference to the updated left operand. */
  __forceinline vfloat16& operator +=(vfloat16& a, const vfloat16& b)
  {
    a = a + b;
    return a;
  }

  __forceinline vfloat16& operator +=(vfloat16& a, float b)
  {
    a = a + b;
    return a;
  }
259
  
260
  /* In-place subtraction; returns a reference to the updated left operand. */
  __forceinline vfloat16& operator -=(vfloat16& a, const vfloat16& b)
  {
    a = a - b;
    return a;
  }

  __forceinline vfloat16& operator -=(vfloat16& a, float b)
  {
    a = a - b;
    return a;
  }
262
  
263
  /* In-place multiplication; returns a reference to the updated left operand. */
  __forceinline vfloat16& operator *=(vfloat16& a, const vfloat16& b)
  {
    a = a * b;
    return a;
  }

  __forceinline vfloat16& operator *=(vfloat16& a, float b)
  {
    a = a * b;
    return a;
  }
265

266
  /* In-place division; returns a reference to the updated left operand. */
  __forceinline vfloat16& operator /=(vfloat16& a, const vfloat16& b)
  {
    a = a / b;
    return a;
  }

  __forceinline vfloat16& operator /=(vfloat16& a, float b)
  {
    a = a / b;
    return a;
  }
268

269
  ////////////////////////////////////////////////////////////////////////////////
270
  /// Comparison Operators + Select
271
  ////////////////////////////////////////////////////////////////////////////////
272

273
  /* Per-lane equality test producing a 16-bit lane mask; a scalar operand is
   * broadcast to all lanes first. */
  __forceinline vboolf16 operator ==(const vfloat16& a, const vfloat16& b)
  {
    return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_EQ);
  }

  __forceinline vboolf16 operator ==(const vfloat16& a, float b)
  {
    return _mm512_cmp_ps_mask(a, _mm512_set1_ps(b), _MM_CMPINT_EQ);
  }

  __forceinline vboolf16 operator ==(float a, const vfloat16& b)
  {
    return _mm512_cmp_ps_mask(_mm512_set1_ps(a), b, _MM_CMPINT_EQ);
  }
276

277
  /* Per-lane inequality test producing a 16-bit lane mask; a scalar operand
   * is broadcast to all lanes first. */
  __forceinline vboolf16 operator !=(const vfloat16& a, const vfloat16& b)
  {
    return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_NE);
  }

  __forceinline vboolf16 operator !=(const vfloat16& a, float b)
  {
    return _mm512_cmp_ps_mask(a, _mm512_set1_ps(b), _MM_CMPINT_NE);
  }

  __forceinline vboolf16 operator !=(float a, const vfloat16& b)
  {
    return _mm512_cmp_ps_mask(_mm512_set1_ps(a), b, _MM_CMPINT_NE);
  }
280

281
  /* Per-lane less-than test producing a 16-bit lane mask; a scalar operand is
   * broadcast to all lanes first. */
  __forceinline vboolf16 operator < (const vfloat16& a, const vfloat16& b)
  {
    return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_LT);
  }

  __forceinline vboolf16 operator < (const vfloat16& a, float b)
  {
    return _mm512_cmp_ps_mask(a, _mm512_set1_ps(b), _MM_CMPINT_LT);
  }

  __forceinline vboolf16 operator < (float a, const vfloat16& b)
  {
    return _mm512_cmp_ps_mask(_mm512_set1_ps(a), b, _MM_CMPINT_LT);
  }
284

285
  /* Per-lane greater-or-equal test producing a 16-bit lane mask; a scalar
   * operand is broadcast to all lanes first.
   * NOTE(review): the predicate is the integer-compare constant
   * _MM_CMPINT_GE passed to a float compare; its encoding appears to select
   * the "not less than" float predicate, which would report true for NaN
   * operands - confirm before relying on NaN behaviour. */
  __forceinline vboolf16 operator >=(const vfloat16& a, const vfloat16& b)
  {
    return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_GE);
  }

  __forceinline vboolf16 operator >=(const vfloat16& a, float b)
  {
    return _mm512_cmp_ps_mask(a, _mm512_set1_ps(b), _MM_CMPINT_GE);
  }

  __forceinline vboolf16 operator >=(float a, const vfloat16& b)
  {
    return _mm512_cmp_ps_mask(_mm512_set1_ps(a), b, _MM_CMPINT_GE);
  }
288

289
  /* Per-lane greater-than test producing a 16-bit lane mask; a scalar operand
   * is broadcast to all lanes first.
   * NOTE(review): the predicate is the integer-compare constant
   * _MM_CMPINT_GT passed to a float compare; its encoding appears to select
   * the "not less-or-equal" float predicate, which would report true for NaN
   * operands - confirm before relying on NaN behaviour. */
  __forceinline vboolf16 operator > (const vfloat16& a, const vfloat16& b)
  {
    return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_GT);
  }

  __forceinline vboolf16 operator > (const vfloat16& a, float b)
  {
    return _mm512_cmp_ps_mask(a, _mm512_set1_ps(b), _MM_CMPINT_GT);
  }

  __forceinline vboolf16 operator > (float a, const vfloat16& b)
  {
    return _mm512_cmp_ps_mask(_mm512_set1_ps(a), b, _MM_CMPINT_GT);
  }
292

293
  /* Per-lane less-or-equal test producing a 16-bit lane mask; a scalar
   * operand is broadcast to all lanes first. */
  __forceinline vboolf16 operator <=(const vfloat16& a, const vfloat16& b)
  {
    return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_LE);
  }

  __forceinline vboolf16 operator <=(const vfloat16& a, float b)
  {
    return _mm512_cmp_ps_mask(a, _mm512_set1_ps(b), _MM_CMPINT_LE);
  }

  __forceinline vboolf16 operator <=(float a, const vfloat16& b)
  {
    return _mm512_cmp_ps_mask(_mm512_set1_ps(a), b, _MM_CMPINT_LE);
  }
296

297
  /* Named spellings of the six per-lane comparisons. Each issues the same
   * compare intrinsic, with the same predicate constant, as the matching
   * operator. */
  __forceinline vboolf16 eq(const vfloat16& a, const vfloat16& b)
  {
    return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_EQ);
  }

  __forceinline vboolf16 ne(const vfloat16& a, const vfloat16& b)
  {
    return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_NE);
  }

  __forceinline vboolf16 lt(const vfloat16& a, const vfloat16& b)
  {
    return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_LT);
  }

  __forceinline vboolf16 ge(const vfloat16& a, const vfloat16& b)
  {
    return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_GE);
  }

  __forceinline vboolf16 gt(const vfloat16& a, const vfloat16& b)
  {
    return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_GT);
  }

  __forceinline vboolf16 le(const vfloat16& a, const vfloat16& b)
  {
    return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_LE);
  }
303

304
  /* Masked comparisons: the compare is issued through the masked intrinsic,
   * so result bits for lanes whose `mask` bit is clear come out as zero. */
  __forceinline vboolf16 eq(const vboolf16& mask, const vfloat16& a, const vfloat16& b)
  {
    return _mm512_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ);
  }

  __forceinline vboolf16 ne(const vboolf16& mask, const vfloat16& a, const vfloat16& b)
  {
    return _mm512_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE);
  }

  __forceinline vboolf16 lt(const vboolf16& mask, const vfloat16& a, const vfloat16& b)
  {
    return _mm512_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT);
  }

  __forceinline vboolf16 ge(const vboolf16& mask, const vfloat16& a, const vfloat16& b)
  {
    return _mm512_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE);
  }

  __forceinline vboolf16 gt(const vboolf16& mask, const vfloat16& a, const vfloat16& b)
  {
    return _mm512_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT);
  }

  __forceinline vboolf16 le(const vboolf16& mask, const vfloat16& a, const vfloat16& b)
  {
    return _mm512_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE);
  }
310
  
311
  /* Per-lane choice: lanes whose bit in s is set take t, the others take f.
   * (The blend intrinsic takes the "mask clear" operand first, hence f, t.) */
  __forceinline vfloat16 select(const vboolf16& s, const vfloat16& t, const vfloat16& f)
  {
    const __m512 blended = _mm512_mask_blend_ps(s, f, t);
    return blended;
  }
314

315
  /* Linear interpolation a + t*(b-a), evaluated with one fused multiply-add. */
  __forceinline vfloat16 lerp(const vfloat16& a, const vfloat16& b, const vfloat16& t)
  {
    const vfloat16 delta = b - a;
    return madd(t, delta, a);
  }
318

319
  /* True when every lane lies strictly between -FLT_LARGE and +FLT_LARGE. */
  __forceinline bool isvalid (const vfloat16& v)
  {
    const vboolf16 above_lower = v > vfloat16(-FLT_LARGE);
    const vboolf16 below_upper = v < vfloat16(+FLT_LARGE);
    return all(above_lower & below_upper);
  }
322

323
  /* Conditional swap: in lanes where m is set, a and b exchange their values;
   * the remaining lanes are left as they were. */
  __forceinline void xchg(vboolf16 m, vfloat16& a, vfloat16& b)
  {
    const vfloat16 previous_a = a;   // keep the old a, it is overwritten next
    a = select(m, b, previous_a);
    b = select(m, previous_a, b);
  }
329

330
  ////////////////////////////////////////////////////////////////////////////////
331
  /// Rounding Functions
332
  ////////////////////////////////////////////////////////////////////////////////
333
  
334
  /* Rounds every lane towards negative infinity (result stays float). */
  __forceinline vfloat16 floor(const vfloat16& a)
  {
    return _mm512_floor_ps(a);
  }

  /* Rounds every lane towards positive infinity (result stays float). */
  __forceinline vfloat16 ceil (const vfloat16& a)
  {
    return _mm512_ceil_ps(a);
  }

  /* Rounds every lane to the nearest integer value, exceptions suppressed. */
  __forceinline vfloat16 round (const vfloat16& a)
  {
    return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  }

  /* Converts every lane to a 32-bit integer, rounding towards negative
   * infinity, exceptions suppressed. */
  __forceinline vint16 floori (const vfloat16& a)
  {
    return _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
  }
346

347
  ////////////////////////////////////////////////////////////////////////////////
348
  /// Movement/Shifting/Shuffling Functions
349
  ////////////////////////////////////////////////////////////////////////////////
350

351
  /* Thin wrappers over the unpack intrinsics, which interleave elements of a
   * and b taken from the low (unpacklo) or high (unpackhi) half of each
   * 128-bit lane. */
  __forceinline vfloat16 unpacklo(const vfloat16& a, const vfloat16& b)
  {
    return _mm512_unpacklo_ps(a, b);
  }

  __forceinline vfloat16 unpackhi(const vfloat16& a, const vfloat16& b)
  {
    return _mm512_unpackhi_ps(a, b);
  }
353

354
  template<int i>
355
  __forceinline vfloat16 shuffle(const vfloat16& v) {
356
    return _mm512_permute_ps(v, _MM_SHUFFLE(i, i, i, i));
357
  }
358

359
  template<int i0, int i1, int i2, int i3>
360
  __forceinline vfloat16 shuffle(const vfloat16& v) {
361
    return _mm512_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0));
362
  }
363

364
  template<int i>
365
  __forceinline vfloat16 shuffle4(const vfloat16& v) {
366
    return _mm512_shuffle_f32x4(v, v ,_MM_SHUFFLE(i, i, i, i));
367
  }
368

369
  template<int i0, int i1, int i2, int i3>
370
  __forceinline vfloat16 shuffle4(const vfloat16& v) {
371
    return _mm512_shuffle_f32x4(v, v, _MM_SHUFFLE(i3, i2, i1, i0));
372
  }
373

374
  /* Merges a with a 64-bit-element permutation of b (control 0x4e): the
   * 64-bit elements selected by mask 0xcc come from the permuted b, the
   * others are kept from a. Used by the 8-row transpose. */
  __forceinline vfloat16 interleave4_even(const vfloat16& a, const vfloat16& b)
  {
    const __m512i kept_bits     = _mm512_castps_si512(a);
    const __m512i permuted_bits = _mm512_castps_si512(b);
    return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(kept_bits, mm512_int2mask(0xcc), permuted_bits, (_MM_PERM_ENUM)0x4e));
  }
377

378
  /* Merges b with a 64-bit-element permutation of a (control 0x4e): the
   * 64-bit elements selected by mask 0x33 come from the permuted a, the
   * others are kept from b. Used by the 8-row transpose. */
  __forceinline vfloat16 interleave4_odd(const vfloat16& a, const vfloat16& b)
  {
    const __m512i kept_bits     = _mm512_castps_si512(b);
    const __m512i permuted_bits = _mm512_castps_si512(a);
    return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(kept_bits, mm512_int2mask(0x33), permuted_bits, (_MM_PERM_ENUM)0x4e));
  }
381

382
  /* Full cross-lane permutation: output lane k receives v[index[k]]. The
   * shuffle is done on the integer view of the bits. */
  __forceinline vfloat16 permute(vfloat16 v, __m512i index)
  {
    const __m512i source_bits = _mm512_castps_si512(v);
    const __m512i shuffled    = _mm512_permutexvar_epi32(index, source_bits);
    return _mm512_castsi512_ps(shuffled);
  }
385

386
  /* Reverses the lane order: output lane k receives input lane 15-k. */
  __forceinline vfloat16 reverse(const vfloat16& v)
  {
    const __m512i descending = _mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
    return permute(v, descending);
  }
389

390
  template<int i>
391
  __forceinline vfloat16 align_shift_right(const vfloat16& a, const vfloat16& b) {
392
    return _mm512_castsi512_ps(_mm512_alignr_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b),i)); 
393
  };
394

395
  template<int i>
396
  __forceinline vfloat16 mask_align_shift_right(const vboolf16& mask, vfloat16& c, const vfloat16& a, const vfloat16& b) {
397
    return _mm512_castsi512_ps(_mm512_mask_alignr_epi32(_mm512_castps_si512(c),mask,_mm512_castps_si512(a),_mm512_castps_si512(b),i)); 
398
  };
399
 
400
  /* Moves every element one lane up; lane 0 is filled with zero (mask 0xfffe
   * leaves lane 0 at the value of the zero vector). */
  __forceinline vfloat16 shift_left_1(const vfloat16& a)
  {
    vfloat16 zeros = zero;   // must be an lvalue: passed by non-const reference
    return mask_align_shift_right<15>(0xfffe, zeros, a, a);
  }
404

405
  /* Moves every element one lane down; lane 15 is filled with zero. */
  __forceinline vfloat16 shift_right_1(const vfloat16& x)
  {
    const vfloat16 zeros(zero);
    return align_shift_right<1>(zeros, x);
  }
408

409
  /* Extracts a single float from the vector via the mm512_cvtss_f32 helper
   * (presumably lane 0 - the helper is defined outside this file). */
  __forceinline float toScalar(const vfloat16& v)
  {
    return mm512_cvtss_f32(v);
  }
410

411

412
  /* Returns a copy of a whose i-th 128-bit lane is replaced by b. */
  template<int i>
  __forceinline vfloat16 insert4(const vfloat16& a, const vfloat4& b)
  {
    return _mm512_insertf32x4(a, b, i);
  }
413

414
  template<int N, int i>
415
  vfloat<N> extractN(const vfloat16& v);
416

417
  template<> __forceinline vfloat4 extractN<4,0>(const vfloat16& v) { return _mm512_castps512_ps128(v);    }
418
  template<> __forceinline vfloat4 extractN<4,1>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 1); }
419
  template<> __forceinline vfloat4 extractN<4,2>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 2); }
420
  template<> __forceinline vfloat4 extractN<4,3>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 3); }
421

422
  template<> __forceinline vfloat8 extractN<8,0>(const vfloat16& v) { return _mm512_castps512_ps256(v);    }
423
  template<> __forceinline vfloat8 extractN<8,1>(const vfloat16& v) { return _mm512_extractf32x8_ps(v, 1); }
424

425
  /* extract4<i>: returns the i-th 128-bit lane of v; lane 0 is a plain cast. */
  template<int i> __forceinline vfloat4 extract4   (const vfloat16& v) { return _mm512_extractf32x4_ps(v, i); }
  template<>      __forceinline vfloat4 extract4<0>(const vfloat16& v) { return _mm512_castps512_ps128(v);    }

  /* extract8<i>: returns the i-th 256-bit half of v; half 0 is a plain cast.
   * The f32x8 extract needs AVX512DQ, so without it the f64x4 extract
   * (AVX512F) is used on the same bits. */
  template<int i> __forceinline vfloat8 extract8   (const vfloat16& v) {
#if defined(__AVX512DQ__)
    return _mm512_extractf32x8_ps(v, i);
#else
    return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), i));
#endif
  }
  template<>      __forceinline vfloat8 extract8<0>(const vfloat16& v) { return _mm512_castps512_ps256(v);    }
430

431
  ////////////////////////////////////////////////////////////////////////////////
432
  /// Transpose
433
  ////////////////////////////////////////////////////////////////////////////////
434

435
  /* Transposes four rows into four columns using two rounds of
   * unpacklo/unpackhi. Because those operate inside each 128-bit lane, the
   * result is a 4x4 transpose performed separately in every lane. */
  __forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3,
                               vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3)
  {
    // first round: pair up rows 0/2 and rows 1/3
    const vfloat16 lo02 = unpacklo(r0, r2);
    const vfloat16 hi02 = unpackhi(r0, r2);
    const vfloat16 lo13 = unpacklo(r1, r3);
    const vfloat16 hi13 = unpackhi(r1, r3);

    // second round: interleave the pairs into the final columns
    c0 = unpacklo(lo02, lo13);
    c1 = unpackhi(lo02, lo13);
    c2 = unpacklo(hi02, hi13);
    c3 = unpackhi(hi02, hi13);
  }
448

449
  /* Transposes sixteen 4-wide rows: rows k, k+4, k+8 and k+12 are packed into
   * one 16-wide vector each, then the four-row transpose is applied. */
  __forceinline void transpose(const vfloat4& r0,  const vfloat4& r1,  const vfloat4& r2,  const vfloat4& r3,
                               const vfloat4& r4,  const vfloat4& r5,  const vfloat4& r6,  const vfloat4& r7,
                               const vfloat4& r8,  const vfloat4& r9,  const vfloat4& r10, const vfloat4& r11,
                               const vfloat4& r12, const vfloat4& r13, const vfloat4& r14, const vfloat4& r15,
                               vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3)
  {
    const vfloat16 packed0(r0, r4, r8,  r12);
    const vfloat16 packed1(r1, r5, r9,  r13);
    const vfloat16 packed2(r2, r6, r10, r14);
    const vfloat16 packed3(r3, r7, r11, r15);
    transpose(packed0, packed1, packed2, packed3, c0, c1, c2, c3);
  }
458

459
  /* Transposes eight rows into eight columns: rows 0..3 and rows 4..7 are
   * each run through the four-row transpose, then the two partial results
   * are merged with interleave4_even (columns 0..3) and interleave4_odd
   * (columns 4..7). */
  __forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3,
                               const vfloat16& r4, const vfloat16& r5, const vfloat16& r6, const vfloat16& r7,
                               vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3,
                               vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7)
  {
    // partial transpose of rows 0..3
    vfloat16 top0, top1, top2, top3;
    transpose(r0, r1, r2, r3, top0, top1, top2, top3);

    // partial transpose of rows 4..7
    vfloat16 bottom0, bottom1, bottom2, bottom3;
    transpose(r4, r5, r6, r7, bottom0, bottom1, bottom2, bottom3);

    c0 = interleave4_even(top0, bottom0);
    c1 = interleave4_even(top1, bottom1);
    c2 = interleave4_even(top2, bottom2);
    c3 = interleave4_even(top3, bottom3);
    c4 = interleave4_odd (top0, bottom0);
    c5 = interleave4_odd (top1, bottom1);
    c6 = interleave4_odd (top2, bottom2);
    c7 = interleave4_odd (top3, bottom3);
  }
479

480
  /* Transposes sixteen 8-wide rows: rows k and k+8 are packed into one
   * 16-wide vector each (k in the low half), then the eight-row transpose is
   * applied. */
  __forceinline void transpose(const vfloat8& r0,  const vfloat8& r1,  const vfloat8& r2,  const vfloat8& r3,
                               const vfloat8& r4,  const vfloat8& r5,  const vfloat8& r6,  const vfloat8& r7,
                               const vfloat8& r8,  const vfloat8& r9,  const vfloat8& r10, const vfloat8& r11,
                               const vfloat8& r12, const vfloat8& r13, const vfloat8& r14, const vfloat8& r15,
                               vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3,
                               vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7)
  {
    const vfloat16 packed0(r0, r8);
    const vfloat16 packed1(r1, r9);
    const vfloat16 packed2(r2, r10);
    const vfloat16 packed3(r3, r11);
    const vfloat16 packed4(r4, r12);
    const vfloat16 packed5(r5, r13);
    const vfloat16 packed6(r6, r14);
    const vfloat16 packed7(r7, r15);
    transpose(packed0, packed1, packed2, packed3, packed4, packed5, packed6, packed7,
              c0, c1, c2, c3, c4, c5, c6, c7);
  }
491

492
  ////////////////////////////////////////////////////////////////////////////////
493
  /// Reductions
494
  ////////////////////////////////////////////////////////////////////////////////
495

496
  /* Sum reductions built up in stages; each stage adds the previous result to
   * a shuffled copy of itself, doubling the group of lanes that share one
   * sum (2, 4, 8 and finally all 16 lanes). */
  __forceinline vfloat16 vreduce_add2(vfloat16 x)
  {
    return x + shuffle<1,0,3,2>(x);
  }

  __forceinline vfloat16 vreduce_add4(vfloat16 x)
  {
    const vfloat16 pairs = vreduce_add2(x);
    return pairs + shuffle<2,3,0,1>(pairs);
  }

  __forceinline vfloat16 vreduce_add8(vfloat16 x)
  {
    const vfloat16 quads = vreduce_add4(x);
    return quads + shuffle4<1,0,3,2>(quads);
  }

  __forceinline vfloat16 vreduce_add (vfloat16 x)
  {
    const vfloat16 octets = vreduce_add8(x);
    return octets + shuffle4<2,3,0,1>(octets);
  }
500

501
  /* Minimum reductions built up in stages; each stage takes the minimum of
   * the previous result and a shuffled copy of itself, doubling the group of
   * lanes that share one minimum (2, 4, 8 and finally all 16 lanes). */
  __forceinline vfloat16 vreduce_min2(vfloat16 x)
  {
    return min(x, shuffle<1,0,3,2>(x));
  }

  __forceinline vfloat16 vreduce_min4(vfloat16 x)
  {
    const vfloat16 pairs = vreduce_min2(x);
    return min(pairs, shuffle<2,3,0,1>(pairs));
  }

  __forceinline vfloat16 vreduce_min8(vfloat16 x)
  {
    const vfloat16 quads = vreduce_min4(x);
    return min(quads, shuffle4<1,0,3,2>(quads));
  }

  __forceinline vfloat16 vreduce_min (vfloat16 x)
  {
    const vfloat16 octets = vreduce_min8(x);
    return min(octets, shuffle4<2,3,0,1>(octets));
  }
505

506
  /* Maximum reductions built up in stages; each stage takes the maximum of
   * the previous result and a shuffled copy of itself, doubling the group of
   * lanes that share one maximum (2, 4, 8 and finally all 16 lanes). */
  __forceinline vfloat16 vreduce_max2(vfloat16 x)
  {
    return max(x, shuffle<1,0,3,2>(x));
  }

  __forceinline vfloat16 vreduce_max4(vfloat16 x)
  {
    const vfloat16 pairs = vreduce_max2(x);
    return max(pairs, shuffle<2,3,0,1>(pairs));
  }

  __forceinline vfloat16 vreduce_max8(vfloat16 x)
  {
    const vfloat16 quads = vreduce_max4(x);
    return max(quads, shuffle4<1,0,3,2>(quads));
  }

  __forceinline vfloat16 vreduce_max (vfloat16 x)
  {
    const vfloat16 octets = vreduce_max8(x);
    return max(octets, shuffle4<2,3,0,1>(octets));
  }
510

511
  /* Scalar reductions: run the full vector reduction, then read one scalar
   * out of the result with toScalar. */
  __forceinline float reduce_add(const vfloat16& v)
  {
    const vfloat16 total = vreduce_add(v);
    return toScalar(total);
  }

  __forceinline float reduce_min(const vfloat16& v)
  {
    const vfloat16 smallest = vreduce_min(v);
    return toScalar(smallest);
  }

  __forceinline float reduce_max(const vfloat16& v)
  {
    const vfloat16 largest = vreduce_max(v);
    return toScalar(largest);
  }
514
 
515
  /* Index of the lowest-numbered lane whose bits equal those of the minimum.
   * The equality test is done on the integer view of the lanes. */
  __forceinline size_t select_min(const vfloat16& v)
  {
    const __m512i lane_bits = _mm512_castps_si512(v);
    const __m512i best_bits = _mm512_castps_si512(vreduce_min(v));
    return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(lane_bits, best_bits, _MM_CMPINT_EQ)));
  }
518

519
  /* Index of the lowest-numbered lane whose bits equal those of the maximum.
   * The equality test is done on the integer view of the lanes. */
  __forceinline size_t select_max(const vfloat16& v)
  {
    const __m512i lane_bits = _mm512_castps_si512(v);
    const __m512i best_bits = _mm512_castps_si512(vreduce_max(v));
    return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(lane_bits, best_bits, _MM_CMPINT_EQ)));
  }
522

523
  /* Index of a valid lane holding the minimum over the valid lanes.
   * Invalid lanes are replaced by +inf before reducing. If no valid lane
   * compares equal to the reduced minimum, the first valid lane is returned
   * instead. The mask local now uses vboolf16, the mask type spelled
   * everywhere else in this file. */
  __forceinline size_t select_min(const vboolf16& valid, const vfloat16& v)
  {
    const vfloat16 a = select(valid,v,vfloat16(pos_inf));
    const vboolf16 valid_min = valid & (a == vreduce_min(a));
    return bsf(movemask(any(valid_min) ? valid_min : valid));
  }
529

530
  /* Index of a valid lane holding the maximum over the valid lanes.
   * Invalid lanes are replaced by -inf before reducing. If no valid lane
   * compares equal to the reduced maximum, the first valid lane is returned
   * instead. The mask local now uses vboolf16, the mask type spelled
   * everywhere else in this file. */
  __forceinline size_t select_max(const vboolf16& valid, const vfloat16& v)
  {
    const vfloat16 a = select(valid,v,vfloat16(neg_inf));
    const vboolf16 valid_max = valid & (a == vreduce_max(a));
    return bsf(movemask(any(valid_max) ? valid_max : valid));
  }
536
  
537
  /* Inclusive prefix sum over the lanes (lane k = a[0] + ... + a[k]).
   * Four doubling steps: each adds a copy of the running result moved up by
   * 1, 2, 4 and 8 lanes, with zeros entering at the bottom. */
  __forceinline vfloat16 prefix_sum(const vfloat16& a)
  {
    const vfloat16 fill(zero);
    vfloat16 running = a;
    running = running + align_shift_right<15>(running, fill);
    running = running + align_shift_right<14>(running, fill);
    running = running + align_shift_right<12>(running, fill);
    running = running + align_shift_right<8>(running, fill);
    return running;
  }
547

548
  /* Inclusive suffix sum over the lanes (lane k = a[k] + ... + a[15]).
   * Four doubling steps: each adds a copy of the running result moved down
   * by 1, 2, 4 and 8 lanes, with zeros entering at the top. */
  __forceinline vfloat16 reverse_prefix_sum(const vfloat16& a)
  {
    const vfloat16 fill(zero);
    vfloat16 running = a;
    running = running + align_shift_right<1>(fill, running);
    running = running + align_shift_right<2>(fill, running);
    running = running + align_shift_right<4>(fill, running);
    running = running + align_shift_right<8>(fill, running);
    return running;
  }
558

559
  /* Inclusive prefix minimum (lane k = min(a[0], ..., a[k])).
   * Same doubling scheme as prefix_sum, with +inf entering at the bottom. */
  __forceinline vfloat16 prefix_min(const vfloat16& a)
  {
    const vfloat16 fill(pos_inf);
    vfloat16 running = a;
    running = min(running, align_shift_right<15>(running, fill));
    running = min(running, align_shift_right<14>(running, fill));
    running = min(running, align_shift_right<12>(running, fill));
    running = min(running, align_shift_right<8>(running, fill));
    return running;
  }
569

570
  /* Inclusive prefix maximum (lane k = max(a[0], ..., a[k])).
   * Same doubling scheme as prefix_sum, with -inf entering at the bottom. */
  __forceinline vfloat16 prefix_max(const vfloat16& a)
  {
    const vfloat16 fill(neg_inf);
    vfloat16 running = a;
    running = max(running, align_shift_right<15>(running, fill));
    running = max(running, align_shift_right<14>(running, fill));
    running = max(running, align_shift_right<12>(running, fill));
    running = max(running, align_shift_right<8>(running, fill));
    return running;
  }
580

581

582
  /* Inclusive suffix minimum (lane k = min(a[k], ..., a[15])).
   * Same doubling scheme as reverse_prefix_sum, with +inf entering at the top. */
  __forceinline vfloat16 reverse_prefix_min(const vfloat16& a)
  {
    const vfloat16 fill(pos_inf);
    vfloat16 running = a;
    running = min(running, align_shift_right<1>(fill, running));
    running = min(running, align_shift_right<2>(fill, running));
    running = min(running, align_shift_right<4>(fill, running));
    running = min(running, align_shift_right<8>(fill, running));
    return running;
  }
592

593
  /* Inclusive suffix maximum (lane k = max(a[k], ..., a[15])).
   * Same doubling scheme as reverse_prefix_sum, with -inf entering at the top. */
  __forceinline vfloat16 reverse_prefix_max(const vfloat16& a)
  {
    const vfloat16 fill(neg_inf);
    vfloat16 running = a;
    running = max(running, align_shift_right<1>(fill, running));
    running = max(running, align_shift_right<2>(fill, running));
    running = max(running, align_shift_right<4>(fill, running));
    running = max(running, align_shift_right<8>(fill, running));
    return running;
  }
603

604
  /* Reciprocal that avoids dividing by zero: lanes equal to zero are replaced
   * by min_rcp_input before rcp is applied. */
  __forceinline vfloat16 rcp_safe(const vfloat16& a)
  {
    const vboolf16 is_nonzero = a != vfloat16(zero);
    const vfloat16 guarded    = select(is_nonzero, a, vfloat16(min_rcp_input));
    return rcp(guarded);
  }
607

608
  ////////////////////////////////////////////////////////////////////////////////
609
  /// Output Operators
610
  ////////////////////////////////////////////////////////////////////////////////
611
  
612
  /* Prints the sixteen lanes as "<v0, v1, ..., v15>" and returns the stream. */
  __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat16& v)
  {
    cout << "<";
    cout << v[0];
    for (size_t lane = 1; lane < 16; ++lane) {
      cout << ", ";
      cout << v[lane];
    }
    cout << ">";
    return cout;
  }
619
}
620

621
/* Remove the *_impl name mappings that were installed by the #define lines
 * at the top of this header. */
#undef vboolf
622
#undef vboold
623
#undef vint
624
#undef vuint
625
#undef vllong
626
#undef vfloat
627
#undef vdouble
628

629
Product

Resources

Company