CoCalc -- simdlib_interface.hpp

GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp
⁴⁵⁷⁴ views
1
/****************************************************************************
2
 * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
3
 *
4
 * Permission is hereby granted, free of charge, to any person obtaining a
5
 * copy of this software and associated documentation files (the "Software"),
6
 * to deal in the Software without restriction, including without limitation
7
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
 * and/or sell copies of the Software, and to permit persons to whom the
9
 * Software is furnished to do so, subject to the following conditions:
10
 *
11
 * The above copyright notice and this permission notice (including the next
12
 * paragraph) shall be included in all copies or substantial portions of the
13
 * Software.
14
 *
15
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21
 * IN THE SOFTWARE.
22
 ****************************************************************************/
23
#pragma once
24
#if 0
25
//===========================================================================
26
// Placeholder name representing either SIMD4, SIMD256, or SIMD16 structures.
27
//===========================================================================
28
struct SIMD256 // or SIMD4 or SIMD16
{
    //=======================================================================
    // Reference description of the SIMD library interface.
    //
    // Every operation is declared as a static member function; this struct
    // contains declarations only (no definitions) and lives inside an
    // `#if 0` region, so it serves purely as documentation of the names,
    // parameter types and return types that the interface exposes.
    //
    // Naming follows the suffix of each operation:
    //   _ps    packed single-precision float   (Float)
    //   _pd    packed double-precision float   (Double)
    //   _si    whole integer vector            (Integer)
    //   _epiN  packed signed N-bit integers    (Integer)
    //   _epuN  packed unsigned N-bit integers  (Integer)
    //=======================================================================

    //=======================================================================
    // SIMD Types
    //
    // These typedefs are examples. The SIMD256 and SIMD16 implementations will
    // use different base types with this same naming.
    using Float     = __m256;  // Packed single-precision float vector
    using Double    = __m256d; // Packed double-precision float vector
    using Integer   = __m256i; // Packed integer vector (mutable element widths)
    using Mask      = uint8_t; // Integer representing mask bits

    //=======================================================================
    // Standard interface
    // (available in both SIMD256 and SIMD16 widths)
    //=======================================================================

    //-----------------------------------------------------------------------
    // Single precision floating point arithmetic operations
    //-----------------------------------------------------------------------
    static Float    add_ps(Float a, Float b);               // return a + b
    static Float    div_ps(Float a, Float b);               // return a / b
    static Float    fmadd_ps(Float a, Float b, Float c);    // return (a * b) + c
    static Float    fmsub_ps(Float a, Float b, Float c);    // return (a * b) - c
    static Float    max_ps(Float a, Float b);               // return (a > b) ? a : b
    static Float    min_ps(Float a, Float b);               // return (a < b) ? a : b
    static Float    mul_ps(Float a, Float b);               // return a * b
    static Float    rcp_ps(Float a);                        // return 1.0f / a
    static Float    rsqrt_ps(Float a);                      // return 1.0f / sqrt(a)
    static Float    sub_ps(Float a, Float b);               // return a - b

    // Rounding behavior selector for round_ps.
    //
    // The first group of enumerators selects the rounding direction, the
    // second group selects exception behavior, and the remaining enumerators
    // are the bitwise OR of one value from each group.
    enum class RoundMode
    {
        // NOTE(review): the values below match the x86 _MM_FROUND_* constants,
        // whose "nearest" mode breaks ties to even rather than away from
        // zero -- confirm before relying on the TRUNCATE formula.
        TO_NEAREST_INT  = 0x00, // Round to nearest integer == TRUNCATE(value + (signof(value))0.5)
        TO_NEG_INF      = 0x01, // Round to negative infinity
        TO_POS_INF      = 0x02, // Round to positive infinity
        TO_ZERO         = 0x03, // Round to 0 a.k.a. truncate
        CUR_DIRECTION   = 0x04, // Round in direction set in MXCSR register

        RAISE_EXC       = 0x00, // Raise exception on overflow
        NO_EXC          = 0x08, // Suppress exceptions

        NINT            = static_cast<int>(TO_NEAREST_INT)  | static_cast<int>(RAISE_EXC),
        NINT_NOEXC      = static_cast<int>(TO_NEAREST_INT)  | static_cast<int>(NO_EXC),
        FLOOR           = static_cast<int>(TO_NEG_INF)      | static_cast<int>(RAISE_EXC),
        FLOOR_NOEXC     = static_cast<int>(TO_NEG_INF)      | static_cast<int>(NO_EXC),
        CEIL            = static_cast<int>(TO_POS_INF)      | static_cast<int>(RAISE_EXC),
        CEIL_NOEXC      = static_cast<int>(TO_POS_INF)      | static_cast<int>(NO_EXC),
        TRUNC           = static_cast<int>(TO_ZERO)         | static_cast<int>(RAISE_EXC),
        TRUNC_NOEXC     = static_cast<int>(TO_ZERO)         | static_cast<int>(NO_EXC),
        RINT            = static_cast<int>(CUR_DIRECTION)   | static_cast<int>(RAISE_EXC),
        NEARBYINT       = static_cast<int>(CUR_DIRECTION)   | static_cast<int>(NO_EXC),
    };

    // return round_func(a)
    //
    // round_func is chosen on the RMT template parameter.  See the documentation
    // for the RoundMode enumeration above.
    template <RoundMode RMT>
    static Float    round_ps(Float a);                  // return round(a)

    //-----------------------------------------------------------------------
    // Integer (various width) arithmetic operations
    //-----------------------------------------------------------------------
    static Integer  abs_epi32(Integer a);               // return absolute_value(a) (int32)
    static Integer  add_epi32(Integer a, Integer b);    // return a + b (int32)
    static Integer  add_epi8(Integer a, Integer b);     // return a + b (int8)
    static Integer  adds_epu8(Integer a, Integer b);    // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
    static Integer  max_epi32(Integer a, Integer b);    // return (a > b) ? a : b (int32)
    static Integer  max_epu32(Integer a, Integer b);    // return (a > b) ? a : b (uint32)
    static Integer  min_epi32(Integer a, Integer b);    // return (a < b) ? a : b (int32)
    static Integer  min_epu32(Integer a, Integer b);    // return (a < b) ? a : b (uint32)

    // NOTE(review): if this mirrors the x86 _mm*_mul_epi32 intrinsic, only the
    // low 32 bits of each 64-bit element are multiplied and the products are
    // 64 bits wide -- confirm against the implementation.
    static Integer  mul_epi32(Integer a, Integer b);    // return a * b (int32)

    // return (a * b) & 0xFFFFFFFF
    //
    // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
    // and store the low 32 bits of the intermediate integers in dst.
    //
    // The result is an integer vector, so the return type is Integer like
    // every other *_epi32 arithmetic operation in this section.
    static Integer  mullo_epi32(Integer a, Integer b);

    static Integer  sub_epi32(Integer a, Integer b);    // return a - b (int32)
    static Integer  sub_epi64(Integer a, Integer b);    // return a - b (int64)
    static Integer  subs_epu8(Integer a, Integer b);    // return (b > a) ? 0 : (a - b) (uint8)

    //-----------------------------------------------------------------------
    // Logical operations
    //
    // The *_ps forms take and return Float; the *_si forms take and return
    // Integer.
    //-----------------------------------------------------------------------
    static Float    and_ps(Float a, Float b);           // return a & b       (float treated as int)
    static Integer  and_si(Integer a, Integer b);       // return a & b       (int)
    static Float    andnot_ps(Float a, Float b);        // return (~a) & b    (float treated as int)
    static Integer  andnot_si(Integer a, Integer b);    // return (~a) & b    (int)
    static Float    or_ps(Float a, Float b);            // return a | b       (float treated as int)
    static Integer  or_si(Integer a, Integer b);        // return a | b       (int)
    static Float    xor_ps(Float a, Float b);           // return a ^ b       (float treated as int)
    static Integer  xor_si(Integer a, Integer b);       // return a ^ b       (int)

    //-----------------------------------------------------------------------
    // Shift operations
    //
    // ImmT is a compile-time shift count; the *v_* forms take a per-element
    // shift count in b.
    //-----------------------------------------------------------------------
    template<int ImmT>
    static Integer  slli_epi32(Integer a);              // return a << ImmT
    static Integer  sllv_epi32(Integer a, Integer b);   // return a << b
    template<int ImmT>
    static Integer  srai_epi32(Integer a);              // return a >> ImmT   (int32)
    template<int ImmT>
    static Integer  srli_epi32(Integer a);              // return a >> ImmT   (uint32)
    template<int ImmT>                                  // for each 128-bit lane:
    static Integer  srli_si(Integer a);                 //  return a >> (ImmT*8) (uint)
    template<int ImmT>
    static Float    srlisi_ps(Float a);                 // same as srli_si, but with Float cast to int
    static Integer  srlv_epi32(Integer a, Integer b);   // return a >> b      (uint32)

    //-----------------------------------------------------------------------
    // Conversion operations
    //
    // cast* reinterpret the bits of the vector; cvt* convert element values.
    //-----------------------------------------------------------------------
    static Float    castpd_ps(Double a);                // return *(Float*)(&a)
    static Integer  castps_si(Float a);                 // return *(Integer*)(&a)
    static Double   castsi_pd(Integer a);               // return *(Double*)(&a)
    static Double   castps_pd(Float a);                 // return *(Double*)(&a)
    static Float    castsi_ps(Integer a);               // return *(Float*)(&a)
    static Float    cvtepi32_ps(Integer a);             // return (float)a    (int32 --> float)
    static Integer  cvtepu8_epi16(Integer a);           // return (int16)a    (uint8 --> int16)
    static Integer  cvtepu8_epi32(Integer a);           // return (int32)a    (uint8 --> int32)
    static Integer  cvtepu16_epi32(Integer a);          // return (int32)a    (uint16 --> int32)
    static Integer  cvtepu16_epi64(Integer a);          // return (int64)a    (uint16 --> int64)
    static Integer  cvtepu32_epi64(Integer a);          // return (int64)a    (uint32 --> int64)
    static Integer  cvtps_epi32(Float a);               // return (int32)a    (float --> int32)
    static Integer  cvttps_epi32(Float a);              // return (int32)a    (rnd_to_zero(float) --> int32)

    //-----------------------------------------------------------------------
    // Comparison operations
    //-----------------------------------------------------------------------

    // Comparison types used with cmp_ps:
    //   - ordered comparisons are always false if either operand is NaN
    //   - unordered comparisons are always true if either operand is NaN
    //   - signaling comparisons raise an exception if either operand is NaN
    //   - non-signaling comparisons will never raise an exception
    //
    // Ordered:     return (a != NaN) && (b != NaN) && (a cmp b)
    // Unordered:   return (a == NaN) || (b == NaN) || (a cmp b)
    enum class CompareType
    {
        EQ_OQ      = 0x00, // Equal (ordered, nonsignaling)
        LT_OS      = 0x01, // Less-than (ordered, signaling)
        LE_OS      = 0x02, // Less-than-or-equal (ordered, signaling)
        UNORD_Q    = 0x03, // Unordered (nonsignaling)
        NEQ_UQ     = 0x04, // Not-equal (unordered, nonsignaling)
        NLT_US     = 0x05, // Not-less-than (unordered, signaling)
        NLE_US     = 0x06, // Not-less-than-or-equal (unordered, signaling)
        ORD_Q      = 0x07, // Ordered (nonsignaling)
        EQ_UQ      = 0x08, // Equal (unordered, non-signaling)
        NGE_US     = 0x09, // Not-greater-than-or-equal (unordered, signaling)
        NGT_US     = 0x0A, // Not-greater-than (unordered, signaling)
        FALSE_OQ   = 0x0B, // False (ordered, nonsignaling)
        NEQ_OQ     = 0x0C, // Not-equal (ordered, non-signaling)
        GE_OS      = 0x0D, // Greater-than-or-equal (ordered, signaling)
        GT_OS      = 0x0E, // Greater-than (ordered, signaling)
        TRUE_UQ    = 0x0F, // True (unordered, non-signaling)
        EQ_OS      = 0x10, // Equal (ordered, signaling)
        LT_OQ      = 0x11, // Less-than (ordered, nonsignaling)
        LE_OQ      = 0x12, // Less-than-or-equal (ordered, nonsignaling)
        UNORD_S    = 0x13, // Unordered (signaling)
        NEQ_US     = 0x14, // Not-equal (unordered, signaling)
        NLT_UQ     = 0x15, // Not-less-than (unordered, nonsignaling)
        NLE_UQ     = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
        ORD_S      = 0x17, // Ordered (signaling)
        EQ_US      = 0x18, // Equal (unordered, signaling)
        NGE_UQ     = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
        NGT_UQ     = 0x1A, // Not-greater-than (unordered, nonsignaling)
        FALSE_OS   = 0x1B, // False (ordered, signaling)
        NEQ_OS     = 0x1C, // Not-equal (ordered, signaling)
        GE_OQ      = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
        GT_OQ      = 0x1E, // Greater-than (ordered, nonsignaling)
        TRUE_US    = 0x1F, // True (unordered, signaling)
    };

    // return a (CmpTypeT) b (float)
    //
    // See documentation for CompareType above for valid values for CmpTypeT.
    template<CompareType CmpTypeT>
    static Float    cmp_ps(Float a, Float b);           // return a (CmpTypeT) b (see above)
    static Float    cmpgt_ps(Float a, Float b);         // return cmp_ps<CompareType::GT_OQ>(a, b)
    static Float    cmple_ps(Float a, Float b);         // return cmp_ps<CompareType::LE_OQ>(a, b)
    static Float    cmplt_ps(Float a, Float b);         // return cmp_ps<CompareType::LT_OQ>(a, b)
    static Float    cmpneq_ps(Float a, Float b);        // return cmp_ps<CompareType::NEQ_OQ>(a, b)
    static Float    cmpeq_ps(Float a, Float b);         // return cmp_ps<CompareType::EQ_OQ>(a, b)
    static Float    cmpge_ps(Float a, Float b);         // return cmp_ps<CompareType::GE_OQ>(a, b)
    static Integer  cmpeq_epi8(Integer a, Integer b);   // return a == b (int8)
    static Integer  cmpeq_epi16(Integer a, Integer b);  // return a == b (int16)
    static Integer  cmpeq_epi32(Integer a, Integer b);  // return a == b (int32)
    static Integer  cmpeq_epi64(Integer a, Integer b);  // return a == b (int64)
    static Integer  cmpgt_epi8(Integer a, Integer b);   // return a > b (int8)
    static Integer  cmpgt_epi16(Integer a, Integer b);  // return a > b (int16)
    static Integer  cmpgt_epi32(Integer a, Integer b);  // return a > b (int32)
    static Integer  cmpgt_epi64(Integer a, Integer b);  // return a > b (int64)
    static Integer  cmplt_epi32(Integer a, Integer b);  // return a < b (int32)
    static bool     testz_ps(Float a, Float b);         // return all_lanes_zero(a & b) ? 1 : 0 (float)
    static bool     testz_si(Integer a, Integer b);     // return all_lanes_zero(a & b) ? 1 : 0 (int)

    //-----------------------------------------------------------------------
    // Blend / shuffle / permute operations
    //
    // Return types follow the operand element type: *_epiN and *_si return
    // Integer, *_pd return Double, *_ps return Float.
    //-----------------------------------------------------------------------
    template<int ImmT>
    static Float    blend_ps(Float a, Float b);                     // return ImmT ? b : a  (float)
    static Integer  blendv_epi32(Integer a, Integer b, Float mask); // return mask ? b : a (int)
    static Float    blendv_ps(Float a, Float b, Float mask);        // return mask ? b : a (float)
    static Float    broadcast_ss(float const *p);                   // return *p (all elements in vector get same value)
    static Integer  packs_epi16(Integer a, Integer b);              // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
    static Integer  packs_epi32(Integer a, Integer b);              // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
    static Integer  packus_epi16(Integer a, Integer b);             // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
    static Integer  packus_epi32(Integer a, Integer b);             // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
    static Integer  permute_epi32(Integer a, Integer swiz);         // return a[swiz[i]] for each 32-bit lane i (int32)
    static Float    permute_ps(Float a, Integer swiz);              // return a[swiz[i]] for each 32-bit lane i (float)

    // NOTE(review): the shuffle_* and unpack* operations below carry no
    // description in this file; presumably they mirror the like-named
    // _mm256_* / _mm512_* intrinsics, as the pack* operations above do --
    // confirm against the implementation.
    template<int SwizT>
    static Integer  shuffle_epi32(Integer a, Integer b);
    template<int SwizT>
    static Integer  shuffle_epi64(Integer a, Integer b);
    static Integer  shuffle_epi8(Integer a, Integer b);
    template<int SwizT>
    static Double   shuffle_pd(Double a, Double b);
    template<int SwizT>
    static Float    shuffle_ps(Float a, Float b);
    static Integer  unpackhi_epi16(Integer a, Integer b);
    static Integer  unpackhi_epi32(Integer a, Integer b);
    static Integer  unpackhi_epi64(Integer a, Integer b);
    static Integer  unpackhi_epi8(Integer a, Integer b);
    static Double   unpackhi_pd(Double a, Double b);
    static Float    unpackhi_ps(Float a, Float b);
    static Integer  unpacklo_epi16(Integer a, Integer b);
    static Integer  unpacklo_epi32(Integer a, Integer b);
    static Integer  unpacklo_epi64(Integer a, Integer b);
    static Integer  unpacklo_epi8(Integer a, Integer b);
    static Double   unpacklo_pd(Double a, Double b);
    static Float    unpacklo_ps(Float a, Float b);

    //-----------------------------------------------------------------------
    // Load / store operations
    //-----------------------------------------------------------------------

    // Multiplier applied to gather indices (see i32gather_ps).
    enum class ScaleFactor
    {
        SF_1,   // No scaling
        SF_2,   // Scale offset by 2
        SF_4,   // Scale offset by 4
        SF_8,   // Scale offset by 8
    };

    template<ScaleFactor ScaleT = ScaleFactor::SF_1>
    static Float    i32gather_ps(float const* p, Integer idx);  // return *(float*)(((int8*)p) + (idx * ScaleT))
    static Float    load1_ps(float const *p);                   // return *p    (broadcast 1 value to all elements)
    static Float    load_ps(float const *p);                    // return *p    (loads SIMD width elements from memory)
    static Integer  load_si(Integer const *p);                  // return *p
    static Float    loadu_ps(float const *p);                   // return *p    (same as load_ps but allows for unaligned mem)
    static Integer  loadu_si(Integer const *p);                 // return *p    (same as load_si but allows for unaligned mem)

    // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
    //
    // ScaleT is a ScaleFactor (same parameter and default as i32gather_ps) so
    // that it can be forwarded to i32gather_ps<ScaleT> as described above.
    template<ScaleFactor ScaleT = ScaleFactor::SF_1>
    static Float    mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask);

    // NOTE(review): maskstore_ps and the movemask_* operations carry no
    // description in this file; presumably they mirror the like-named
    // _mm256_* intrinsics -- confirm against the implementation.
    static void     maskstore_ps(float *p, Integer mask, Float src);
    static int      movemask_epi8(Integer a);
    static int      movemask_pd(Double a);
    static int      movemask_ps(Float a);
    static Integer  set1_epi32(int i);                          // return i (all elements are same value)
    static Integer  set1_epi8(char i);                          // return i (all elements are same value)
    static Float    set1_ps(float f);                           // return f (all elements are same value)
    static Float    setzero_ps();                               // return 0 (float)
    static Integer  setzero_si();                               // return 0 (integer)
    static void     store_ps(float *p, Float a);                // *p = a   (stores all elements contiguously in memory)
    static void     store_si(Integer *p, Integer a);            // *p = a
    static void     stream_ps(float *p, Float a);               // *p = a   (same as store_ps, but doesn't keep memory in cache)

    //=======================================================================
    // Legacy interface (available only in SIMD256 width)
    //
    // These operations work with 128-bit halves (__m128, __m128d, __m128i)
    // of the 256-bit vector types. ImmT is a compile-time immediate.
    //=======================================================================

    static Float    broadcast_ps(__m128 const *p);
    template<int ImmT>
    static __m128d  extractf128_pd(Double a);
    template<int ImmT>
    static __m128   extractf128_ps(Float a);
    template<int ImmT>
    static __m128i  extractf128_si(Integer a);
    template<int ImmT>
    static Double   insertf128_pd(Double a, __m128d b);
    template<int ImmT>
    static Float    insertf128_ps(Float a, __m128 b);
    template<int ImmT>
    static Integer  insertf128_si(Integer a, __m128i b);
    // Takes 128-bit integer halves, matching storeu2_si below.
    static Integer  loadu2_si(__m128i const* phi, __m128i const* plo);
    template<int ImmT>
    static Double   permute2f128_pd(Double a, Double b);
    template<int ImmT>
    static Float    permute2f128_ps(Float a, Float b);
    template<int ImmT>
    static Integer  permute2f128_si(Integer a, Integer b);
    static Integer  set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0);
    static void     storeu2_si(__m128i *phi, __m128i *plo, Integer src);

    //=======================================================================
    // Advanced masking interface (currently available only in SIMD16 width)
    //=======================================================================
};
332
#endif // #if 0
333

334
Product

Resources

Company