CoCalc -- ConvectionKernels

GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/cvtt/ConvectionKernels_ParallelMath.h
⁹⁸⁹⁸ views
1
/*
2
Convection Texture Tools
3
Copyright (c) 2018-2019 Eric Lasota
4

5
Permission is hereby granted, free of charge, to any person obtaining
6
a copy of this software and associated documentation files (the
7
"Software"), to deal in the Software without restriction, including
8
without limitation the rights to use, copy, modify, merge, publish,
9
distribute, sublicense, and/or sell copies of the Software, and to
10
permit persons to whom the Software is furnished to do so, subject
11
to the following conditions:
12

13
The above copyright notice and this permission notice shall be included
14
in all copies or substantial portions of the Software.
15

16
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23

24
*/
25
#pragma once
26
#ifndef __CVTT_PARALLELMATH_H__
27
#define __CVTT_PARALLELMATH_H__
28

29
#include "ConvectionKernels.h"
30
#include "ConvectionKernels_Config.h"
31

32
#ifdef CVTT_USE_SSE2
33
#include <emmintrin.h>
34
#endif
35

36
#include <float.h>
37
#include <assert.h>
38
#include <string.h>
39
#include <algorithm>
40
#include <math.h>
41

42
// Marks a parameter as intentionally unused.  The argument is parenthesized so
// that the cast to void applies to the whole expression that was passed in.
#define UNREFERENCED_PARAMETER(n) ((void)(n))
43

44
// Parallel math implementation
45
//
46
// After preprocessor defs are handled, what this should do is expose the following types:
47
// SInt16 - Signed 16-bit integer
48
// UInt16 - Unsigned 16-bit integer
49
// UInt15 - Unsigned 15-bit integer
50
// SInt32 - Signed 32-bit integer
51
// UInt31 - Unsigned 31-bit integer
52
// AInt16 - 16-bit integer of unknown signedness (only used for storage)
53
// Int16CompFlag - Comparison flags from comparing 16-bit integers
54
// Int32CompFlag - Comparison flags from comparing 32-bit integers
55
// FloatCompFlag - Comparison flags from comparing 32-bit floats
56
//
57
// The reason for these distinctions is that, depending on the instruction set, signed or unsigned versions of certain ops
58
// (particularly max, min, compares, and right shift) may not be available.  In cases where ops are not available, it's
59
// necessary to do high bit manipulations to accomplish the operation with 16-bit numbers.  The 15-bit and 31-bit uint types
60
// can elide the bit flips if unsigned versions are not available.
61

62
namespace cvtt
63
{
64
#ifdef CVTT_USE_SSE2
65
    // SSE2 version
66
    struct ParallelMath
67
    {
68
        typedef uint16_t ScalarUInt16;
69
        typedef int16_t ScalarSInt16;
70

71
        // Scope guard for the SSE rounding mode: construction saves the current MXCSR
        // value and installs TRoundingMode in its rounding-control bits; destruction
        // writes the saved MXCSR value back.
        template<unsigned int TRoundingMode>
        struct RoundForScope
        {
            unsigned int m_oldCSR;  // MXCSR contents captured by the constructor

            RoundForScope()
                : m_oldCSR(_mm_getcsr())
            {
                // Clear the rounding-control field, then OR in the requested mode.
                const unsigned int withoutRounding = m_oldCSR & ~_MM_ROUND_MASK;
                _mm_setcsr(withoutRounding | (TRoundingMode));
            }

            ~RoundForScope()
            {
                _mm_setcsr(m_oldCSR);
            }
        };
87

88
        // Rounding-mode scope guard that selects _MM_ROUND_TOWARD_ZERO.
        struct RoundTowardZeroForScope : public RoundForScope<_MM_ROUND_TOWARD_ZERO>
        {
        };
91

92
        // Rounding-mode scope guard that selects _MM_ROUND_NEAREST.
        struct RoundTowardNearestForScope : public RoundForScope<_MM_ROUND_NEAREST>
        {
        };
95

96
        // Rounding-mode scope guard that selects _MM_ROUND_UP.
        struct RoundUpForScope : public RoundForScope<_MM_ROUND_UP>
        {
        };
99

100
        // Rounding-mode scope guard that selects _MM_ROUND_DOWN.
        struct RoundDownForScope : public RoundForScope<_MM_ROUND_DOWN>
        {
        };
103

104
        // Number of lanes carried by every parallel value in this implementation.
        static const int ParallelSize = 8;

        // Tags passed as the TSubtype argument of VInt16/VInt32 so that integers
        // with different signedness conventions become distinct C++ types.
        enum Int16Subtype
        {
            IntSubtype_Signed = 0,
            IntSubtype_UnsignedFull = 1,
            IntSubtype_UnsignedTruncated = 2,
            IntSubtype_Abstract = 3
        };
113

114
        template<int TSubtype>
115
        struct VInt16
116
        {
117
            __m128i m_value;
118

119
            inline VInt16 operator+(int16_t other) const
120
            {
121
                VInt16 result;
122
                result.m_value = _mm_add_epi16(m_value, _mm_set1_epi16(static_cast<int16_t>(other)));
123
                return result;
124
            }
125

126
            inline VInt16 operator+(const VInt16 &other) const
127
            {
128
                VInt16 result;
129
                result.m_value = _mm_add_epi16(m_value, other.m_value);
130
                return result;
131
            }
132

133
            inline VInt16 operator|(const VInt16 &other) const
134
            {
135
                VInt16 result;
136
                result.m_value = _mm_or_si128(m_value, other.m_value);
137
                return result;
138
            }
139

140
            inline VInt16 operator&(const VInt16 &other) const
141
            {
142
                VInt16 result;
143
                result.m_value = _mm_and_si128(m_value, other.m_value);
144
                return result;
145
            }
146

147
            inline VInt16 operator-(const VInt16 &other) const
148
            {
149
                VInt16 result;
150
                result.m_value = _mm_sub_epi16(m_value, other.m_value);
151
                return result;
152
            }
153

154
            inline VInt16 operator<<(int bits) const
155
            {
156
                VInt16 result;
157
                result.m_value = _mm_slli_epi16(m_value, bits);
158
                return result;
159
            }
160

161
            inline VInt16 operator^(const VInt16 &other) const
162
            {
163
                VInt16 result;
164
                result.m_value = _mm_xor_si128(m_value, other.m_value);
165
                return result;
166
            }
167
        };
168

169
        typedef VInt16<IntSubtype_Signed> SInt16;
170
        typedef VInt16<IntSubtype_UnsignedFull> UInt16;
171
        typedef VInt16<IntSubtype_UnsignedTruncated> UInt15;
172
        typedef VInt16<IntSubtype_Abstract> AInt16;
173

174
        template<int TSubtype>
175
        struct VInt32
176
        {
177
            __m128i m_values[2];
178

179
            inline VInt32 operator+(const VInt32& other) const
180
            {
181
                VInt32 result;
182
                result.m_values[0] = _mm_add_epi32(m_values[0], other.m_values[0]);
183
                result.m_values[1] = _mm_add_epi32(m_values[1], other.m_values[1]);
184
                return result;
185
            }
186

187
            inline VInt32 operator-(const VInt32& other) const
188
            {
189
                VInt32 result;
190
                result.m_values[0] = _mm_sub_epi32(m_values[0], other.m_values[0]);
191
                result.m_values[1] = _mm_sub_epi32(m_values[1], other.m_values[1]);
192
                return result;
193
            }
194

195
            inline VInt32 operator<<(const int other) const
196
            {
197
                VInt32 result;
198
                result.m_values[0] = _mm_slli_epi32(m_values[0], other);
199
                result.m_values[1] = _mm_slli_epi32(m_values[1], other);
200
                return result;
201
            }
202

203
            inline VInt32 operator|(const VInt32& other) const
204
            {
205
                VInt32 result;
206
                result.m_values[0] = _mm_or_si128(m_values[0], other.m_values[0]);
207
                result.m_values[1] = _mm_or_si128(m_values[1], other.m_values[1]);
208
                return result;
209
            }
210
        };
211

212
        typedef VInt32<IntSubtype_Signed> SInt32;
213
        typedef VInt32<IntSubtype_UnsignedTruncated> UInt31;
214
        typedef VInt32<IntSubtype_UnsignedFull> UInt32;
215
        typedef VInt32<IntSubtype_Abstract> AInt32;
216

217
        template<class TTargetType>
218
        struct LosslessCast
219
        {
220
#ifdef CVTT_PERMIT_ALIASING
221
            template<int TSrcSubtype>
222
            static const TTargetType& Cast(const VInt32<TSrcSubtype> &src)
223
            {
224
                return reinterpret_cast<VInt32<TSubtype>&>(src);
225
            }
226

227
            template<int TSrcSubtype>
228
            static const TTargetType& Cast(const VInt16<TSrcSubtype> &src)
229
            {
230
                return reinterpret_cast<VInt16<TSubtype>&>(src);
231
            }
232
#else
233
            template<int TSrcSubtype>
234
            static TTargetType Cast(const VInt32<TSrcSubtype> &src)
235
            {
236
                TTargetType result;
237
                result.m_values[0] = src.m_values[0];
238
                result.m_values[1] = src.m_values[1];
239
                return result;
240
            }
241

242
            template<int TSrcSubtype>
243
            static TTargetType Cast(const VInt16<TSrcSubtype> &src)
244
            {
245
                TTargetType result;
246
                result.m_value = src.m_value;
247
                return result;
248
            }
249
#endif
250
        };
251

252
        // Storage for 64-bit integer lanes: four __m128i registers.  No operations
        // are defined on this type within the struct itself.
        struct Int64
        {
            __m128i m_values[4];
        };
256

257
        // Eight single-precision lanes held in two __m128 registers (four lanes
        // each).  Every operator works lane by lane; the overloads that take a
        // plain float apply that one scalar to all lanes.
        struct Float
        {
            __m128 m_values[2];

            // Lane-wise sum of two vectors.
            inline Float operator+(const Float &other) const
            {
                Float sum;
                for (int half = 0; half < 2; ++half)
                    sum.m_values[half] = _mm_add_ps(m_values[half], other.m_values[half]);
                return sum;
            }

            // Adds one scalar to every lane.
            inline Float operator+(float other) const
            {
                const __m128 splat = _mm_set1_ps(other);

                Float sum;
                for (int half = 0; half < 2; ++half)
                    sum.m_values[half] = _mm_add_ps(m_values[half], splat);
                return sum;
            }

            // Lane-wise difference of two vectors.
            inline Float operator-(const Float& other) const
            {
                Float difference;
                for (int half = 0; half < 2; ++half)
                    difference.m_values[half] = _mm_sub_ps(m_values[half], other.m_values[half]);
                return difference;
            }

            // Negation, computed as (0 - x) in every lane.
            inline Float operator-() const
            {
                Float negated;
                for (int half = 0; half < 2; ++half)
                    negated.m_values[half] = _mm_sub_ps(_mm_setzero_ps(), m_values[half]);
                return negated;
            }

            // Lane-wise product of two vectors.
            inline Float operator*(const Float& other) const
            {
                Float product;
                for (int half = 0; half < 2; ++half)
                    product.m_values[half] = _mm_mul_ps(m_values[half], other.m_values[half]);
                return product;
            }

            // Multiplies every lane by one scalar.
            inline Float operator*(float other) const
            {
                const __m128 splat = _mm_set1_ps(other);

                Float product;
                for (int half = 0; half < 2; ++half)
                    product.m_values[half] = _mm_mul_ps(m_values[half], splat);
                return product;
            }

            // Lane-wise quotient of two vectors.
            inline Float operator/(const Float &other) const
            {
                Float quotient;
                for (int half = 0; half < 2; ++half)
                    quotient.m_values[half] = _mm_div_ps(m_values[half], other.m_values[half]);
                return quotient;
            }

            // Divides every lane by one scalar.
            inline Float operator/(float other) const
            {
                const __m128 splat = _mm_set1_ps(other);

                Float quotient;
                for (int half = 0; half < 2; ++half)
                    quotient.m_values[half] = _mm_div_ps(m_values[half], splat);
                return quotient;
            }
        };
325

326
        // Result of comparing 16-bit lanes: one 16-bit mask per lane in a single
        // __m128i.  The operators combine two masks bit by bit.
        struct Int16CompFlag
        {
            __m128i m_value;

            // Bitwise AND of the two masks.
            inline Int16CompFlag operator&(const Int16CompFlag &other) const
            {
                Int16CompFlag both;
                both.m_value = _mm_and_si128(m_value, other.m_value);
                return both;
            }

            // Bitwise OR of the two masks.
            inline Int16CompFlag operator|(const Int16CompFlag &other) const
            {
                Int16CompFlag either;
                either.m_value = _mm_or_si128(m_value, other.m_value);
                return either;
            }
        };
344

345
        // Result of comparing 32-bit lanes: one 32-bit mask per lane, spread over
        // two __m128i registers.  The operators combine two masks bit by bit.
        struct Int32CompFlag
        {
            __m128i m_values[2];

            // Bitwise AND of the two masks.
            inline Int32CompFlag operator&(const Int32CompFlag &other) const
            {
                Int32CompFlag both;
                for (int half = 0; half < 2; ++half)
                    both.m_values[half] = _mm_and_si128(m_values[half], other.m_values[half]);
                return both;
            }

            // Bitwise OR of the two masks.
            inline Int32CompFlag operator|(const Int32CompFlag &other) const
            {
                Int32CompFlag either;
                for (int half = 0; half < 2; ++half)
                    either.m_values[half] = _mm_or_si128(m_values[half], other.m_values[half]);
                return either;
            }
        };
365

366
        // Result of comparing float lanes: one 32-bit mask per lane, kept in two
        // __m128 registers.  The operators combine two masks bit by bit.
        struct FloatCompFlag
        {
            __m128 m_values[2];

            // Bitwise AND of the two masks.
            inline FloatCompFlag operator&(const FloatCompFlag &other) const
            {
                FloatCompFlag both;
                for (int half = 0; half < 2; ++half)
                    both.m_values[half] = _mm_and_ps(m_values[half], other.m_values[half]);
                return both;
            }

            // Bitwise OR of the two masks.
            inline FloatCompFlag operator|(const FloatCompFlag &other) const
            {
                FloatCompFlag either;
                for (int half = 0; half < 2; ++half)
                    either.m_values[half] = _mm_or_ps(m_values[half], other.m_values[half]);
                return either;
            }
        };
386

387
        template<int TSubtype>
388
        static VInt16<TSubtype> AbstractAdd(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
389
        {
390
            VInt16<TSubtype> result;
391
            result.m_value = _mm_add_epi16(a.m_value, b.m_value);
392
            return result;
393
        }
394

395
        template<int TSubtype>
396
        static VInt16<TSubtype> AbstractSubtract(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
397
        {
398
            VInt16<TSubtype> result;
399
            result.m_value = _mm_sub_epi16(a.m_value, b.m_value);
400
            return result;
401
        }
402

403
        // Bitwise blend: takes the bits of a where flag is set and the bits of b
        // where it is clear.
        static Float Select(const FloatCompFlag &flag, const Float &a, const Float &b)
        {
            Float picked;
            for (int half = 0; half < 2; ++half)
            {
                const __m128 mask = flag.m_values[half];
                const __m128 fromA = _mm_and_ps(mask, a.m_values[half]);
                const __m128 fromB = _mm_andnot_ps(mask, b.m_values[half]);
                picked.m_values[half] = _mm_or_ps(fromA, fromB);
            }
            return picked;
        }
410

411
        template<int TSubtype>
412
        static VInt16<TSubtype> Select(const Int16CompFlag &flag, const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
413
        {
414
            VInt16<TSubtype> result;
415
            result.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, a.m_value), _mm_andnot_si128(flag.m_value, b.m_value));
416
            return result;
417
        }
418

419
        template<int TSubtype>
420
        static VInt16<TSubtype> SelectOrZero(const Int16CompFlag &flag, const VInt16<TSubtype> &a)
421
        {
422
            VInt16<TSubtype> result;
423
            result.m_value = _mm_and_si128(flag.m_value, a.m_value);
424
            return result;
425
        }
426

427
        template<int TSubtype>
428
        static void ConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src)
429
        {
430
            dest.m_value = _mm_or_si128(_mm_andnot_si128(flag.m_value, dest.m_value), _mm_and_si128(flag.m_value, src.m_value));
431
        }
432

433
        template<int TSubtype>
434
        static void ConditionalSet(VInt32<TSubtype> &dest, const Int16CompFlag &flag, const VInt32<TSubtype> &src)
435
        {
436
            __m128i lowFlags = _mm_unpacklo_epi16(flag.m_value, flag.m_value);
437
            __m128i highFlags = _mm_unpackhi_epi16(flag.m_value, flag.m_value);
438
            dest.m_values[0] = _mm_or_si128(_mm_andnot_si128(lowFlags, dest.m_values[0]), _mm_and_si128(lowFlags, src.m_values[0]));
439
            dest.m_values[1] = _mm_or_si128(_mm_andnot_si128(highFlags, dest.m_values[1]), _mm_and_si128(highFlags, src.m_values[1]));
440
        }
441

442
        // Flag-on-flag variant: dest takes the bits of src where flag is set and
        // keeps its own bits elsewhere.
        static void ConditionalSet(ParallelMath::Int16CompFlag &dest, const Int16CompFlag &flag, const ParallelMath::Int16CompFlag &src)
        {
            const __m128i retained = _mm_andnot_si128(flag.m_value, dest.m_value);
            const __m128i incoming = _mm_and_si128(flag.m_value, src.m_value);
            dest.m_value = _mm_or_si128(retained, incoming);
        }
446

447
        // Computes (v XOR flag) + (flag >> 15) in every 16-bit lane.  For a lane whose
        // flag is all ones this is ~v + 1, i.e. the two's-complement negation of v;
        // for a lane whose flag is zero it is v unchanged.
        // NOTE(review): assumes each flag lane is either all ones or all zeros.
        static SInt16 ConditionalNegate(const Int16CompFlag &flag, const SInt16 &v)
        {
            const __m128i flipped = _mm_xor_si128(flag.m_value, v.m_value);
            const __m128i plusOne = _mm_srli_epi16(flag.m_value, 15);

            SInt16 negated;
            negated.m_value = _mm_add_epi16(flipped, plusOne);
            return negated;
        }
453

454
        template<int TSubtype>
455
        static void NotConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src)
456
        {
457
            dest.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, dest.m_value), _mm_andnot_si128(flag.m_value, src.m_value));
458
        }
459

460
        // Float variant: dest takes the bits of src where flag is set and keeps its
        // own bits elsewhere.
        static void ConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src)
        {
            for (int half = 0; half < 2; ++half)
            {
                const __m128 mask = flag.m_values[half];
                const __m128 retained = _mm_andnot_ps(mask, dest.m_values[half]);
                const __m128 incoming = _mm_and_ps(mask, src.m_values[half]);
                dest.m_values[half] = _mm_or_ps(retained, incoming);
            }
        }
465

466
        // Float variant of NotConditionalSet: dest keeps its bits where flag is set
        // and takes the bits of src where flag is clear.
        static void NotConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src)
        {
            for (int half = 0; half < 2; ++half)
            {
                const __m128 mask = flag.m_values[half];
                const __m128 retained = _mm_and_ps(mask, dest.m_values[half]);
                const __m128 incoming = _mm_andnot_ps(mask, src.m_values[half]);
                dest.m_values[half] = _mm_or_ps(retained, incoming);
            }
        }
471

472
        // Replaces every lane of v that compares equal to zero with 1.0f, in place,
        // so the value can be used as a divisor.
        static void MakeSafeDenominator(Float& v)
        {
            const FloatCompFlag isZero = Equal(v, MakeFloatZero());
            const Float one = MakeFloat(1.0f);
            ConditionalSet(v, isZero, one);
        }
476

477
        // Keeps the low `precision` bits of each lane and sign-extends from the top
        // kept bit: shift left by (16 - precision), then arithmetic shift right by
        // the same amount.  Returns v untouched when precision is 16.
        static SInt16 TruncateToPrecisionSigned(const SInt16 &v, int precision)
        {
            const int discarded = 16 - precision;
            if (discarded == 0)
                return v;

            const __m128i shiftedUp = _mm_slli_epi16(v.m_value, discarded);

            SInt16 truncated;
            truncated.m_value = _mm_srai_epi16(shiftedUp, discarded);
            return truncated;
        }
487

488
        // Keeps the low `precision` bits of each lane and clears the rest: shift left
        // by (16 - precision), then logical shift right by the same amount.  Returns
        // v untouched when precision is 16.
        static UInt16 TruncateToPrecisionUnsigned(const UInt16 &v, int precision)
        {
            const int discarded = 16 - precision;
            if (discarded == 0)
                return v;

            const __m128i shiftedUp = _mm_slli_epi16(v.m_value, discarded);

            UInt16 truncated;
            truncated.m_value = _mm_srli_epi16(shiftedUp, discarded);
            return truncated;
        }
498

499
        // Unsigned 16-bit minimum built from the signed minimum: flipping the top bit
        // of both inputs maps unsigned order onto signed order; the result is flipped
        // back afterwards.
        static UInt16 Min(const UInt16 &a, const UInt16 &b)
        {
            const __m128i signBit = _mm_set1_epi16(-32768);
            const __m128i biasedA = _mm_xor_si128(a.m_value, signBit);
            const __m128i biasedB = _mm_xor_si128(b.m_value, signBit);
            const __m128i biasedMin = _mm_min_epi16(biasedA, biasedB);

            UInt16 smallest;
            smallest.m_value = _mm_xor_si128(biasedMin, signBit);
            return smallest;
        }
507

508
        // Lane-wise signed 16-bit minimum.
        static SInt16 Min(const SInt16 &a, const SInt16 &b)
        {
            SInt16 smallest;
            smallest.m_value = _mm_min_epi16(a.m_value, b.m_value);
            return smallest;
        }
514

515
        // Lane-wise minimum of 15-bit unsigned values, using the signed 16-bit
        // minimum directly (no top-bit flip is applied for this type).
        static UInt15 Min(const UInt15 &a, const UInt15 &b)
        {
            UInt15 smallest;
            smallest.m_value = _mm_min_epi16(a.m_value, b.m_value);
            return smallest;
        }
521

522
        // Lane-wise float minimum via _mm_min_ps.
        static Float Min(const Float &a, const Float &b)
        {
            Float smallest;
            smallest.m_values[0] = _mm_min_ps(a.m_values[0], b.m_values[0]);
            smallest.m_values[1] = _mm_min_ps(a.m_values[1], b.m_values[1]);
            return smallest;
        }
529

530
        // Unsigned 16-bit maximum built from the signed maximum: flipping the top bit
        // of both inputs maps unsigned order onto signed order; the result is flipped
        // back afterwards.
        static UInt16 Max(const UInt16 &a, const UInt16 &b)
        {
            const __m128i signBit = _mm_set1_epi16(-32768);
            const __m128i biasedA = _mm_xor_si128(a.m_value, signBit);
            const __m128i biasedB = _mm_xor_si128(b.m_value, signBit);
            const __m128i biasedMax = _mm_max_epi16(biasedA, biasedB);

            UInt16 largest;
            largest.m_value = _mm_xor_si128(biasedMax, signBit);
            return largest;
        }
538

539
        // Lane-wise signed 16-bit maximum.
        static SInt16 Max(const SInt16 &a, const SInt16 &b)
        {
            SInt16 largest;
            largest.m_value = _mm_max_epi16(a.m_value, b.m_value);
            return largest;
        }
545

546
        // Lane-wise maximum of 15-bit unsigned values, using the signed 16-bit
        // maximum directly (no top-bit flip is applied for this type).
        static UInt15 Max(const UInt15 &a, const UInt15 &b)
        {
            UInt15 largest;
            largest.m_value = _mm_max_epi16(a.m_value, b.m_value);
            return largest;
        }
552

553
        // Lane-wise float maximum via _mm_max_ps.
        static Float Max(const Float &a, const Float &b)
        {
            Float largest;
            largest.m_values[0] = _mm_max_ps(a.m_values[0], b.m_values[0]);
            largest.m_values[1] = _mm_max_ps(a.m_values[1], b.m_values[1]);
            return largest;
        }
560

561
        // Limits every lane of v to [min, max]: the upper bound is applied first
        // (_mm_min_ps against max), then the lower bound (_mm_max_ps against min).
        static Float Clamp(const Float &v, float min, float max)
        {
            const __m128 upperBound = _mm_set1_ps(max);
            const __m128 lowerBound = _mm_set1_ps(min);

            Float clamped;
            for (int half = 0; half < 2; ++half)
            {
                const __m128 capped = _mm_min_ps(v.m_values[half], upperBound);
                clamped.m_values[half] = _mm_max_ps(capped, lowerBound);
            }
            return clamped;
        }
568

569
        // Lane-wise reciprocal computed with _mm_rcp_ps, which returns a
        // reduced-precision approximation of 1/x rather than an exact division.
        static Float Reciprocal(const Float &v)
        {
            Float inverse;
            inverse.m_values[0] = _mm_rcp_ps(v.m_values[0]);
            inverse.m_values[1] = _mm_rcp_ps(v.m_values[1]);
            return inverse;
        }
576

577
        static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, UInt15 &chOut)
578
        {
579
            int16_t values[8];
580
            for (int i = 0; i < 8; i++)
581
                values[i] = inputBlocks[i].m_pixels[pxOffset][channel];
582

583
            chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]);
584
        }
585

586
        static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, SInt16 &chOut)
587
        {
588
            int16_t values[8];
589
            for (int i = 0; i < 8; i++)
590
                values[i] = inputBlocks[i].m_pixels[pxOffset][channel];
591

592
            chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]);
593
        }
594

595
        // Builds a Float with every lane set to v.
        static Float MakeFloat(float v)
        {
            const __m128 splat = _mm_set1_ps(v);

            Float filled;
            filled.m_values[1] = splat;
            filled.m_values[0] = splat;
            return filled;
        }
601

602
        // Builds a Float with every lane set to +0.0f.
        static Float MakeFloatZero()
        {
            const __m128 zero = _mm_setzero_ps();

            Float filled;
            filled.m_values[1] = zero;
            filled.m_values[0] = zero;
            return filled;
        }
608

609
        // Builds a UInt16 with every lane holding the bit pattern of v.
        static UInt16 MakeUInt16(uint16_t v)
        {
            const short lane = static_cast<short>(v);

            UInt16 filled;
            filled.m_value = _mm_set1_epi16(lane);
            return filled;
        }
615

616
        // Builds an SInt16 with every lane set to v.
        static SInt16 MakeSInt16(int16_t v)
        {
            const short lane = static_cast<short>(v);

            SInt16 filled;
            filled.m_value = _mm_set1_epi16(lane);
            return filled;
        }
622

623
        // Builds an AInt16 with every lane set to v.
        static AInt16 MakeAInt16(int16_t v)
        {
            const short lane = static_cast<short>(v);

            AInt16 filled;
            filled.m_value = _mm_set1_epi16(lane);
            return filled;
        }
629

630
        // Builds a UInt15 with every lane holding the bit pattern of v.  The value is
        // not range-checked here.
        static UInt15 MakeUInt15(uint16_t v)
        {
            const short lane = static_cast<short>(v);

            UInt15 filled;
            filled.m_value = _mm_set1_epi16(lane);
            return filled;
        }
636

637
        // Builds an SInt32 with every lane set to v.
        static SInt32 MakeSInt32(int32_t v)
        {
            SInt32 filled;
            for (int half = 0; half < 2; ++half)
                filled.m_values[half] = _mm_set1_epi32(v);
            return filled;
        }
644

645
        // Builds a UInt31 with every lane holding the bit pattern of v.  The value is
        // not range-checked here.
        static UInt31 MakeUInt31(uint32_t v)
        {
            UInt31 filled;
            for (int half = 0; half < 2; ++half)
                filled.m_values[half] = _mm_set1_epi32(v);
            return filled;
        }
652

653
        static uint16_t Extract(const UInt16 &v, int offset)
654
        {
655
            return reinterpret_cast<const uint16_t*>(&v.m_value)[offset];
656
        }
657

658
        static int16_t Extract(const SInt16 &v, int offset)
659
        {
660
            return reinterpret_cast<const int16_t*>(&v.m_value)[offset];
661
        }
662

663
        static uint16_t Extract(const UInt15 &v, int offset)
664
        {
665
            return reinterpret_cast<const uint16_t*>(&v.m_value)[offset];
666
        }
667

668
        static int16_t Extract(const AInt16 &v, int offset)
669
        {
670
            return reinterpret_cast<const int16_t*>(&v.m_value)[offset];
671
        }
672

673
        static int32_t Extract(const SInt32 &v, int offset)
674
        {
675
            return reinterpret_cast<const int32_t*>(&v.m_values[offset >> 2])[offset & 3];
676
        }
677

678
        static float Extract(const Float &v, int offset)
679
        {
680
            return reinterpret_cast<const float*>(&v.m_values[offset >> 2])[offset & 3];
681
        }
682

683
        // Returns true when any bit of 16-bit lane `offset` of the flag is set.
        static bool Extract(const ParallelMath::Int16CompFlag &v, int offset)
        {
            const int16_t *lanes = reinterpret_cast<const int16_t*>(&v.m_value);
            const int16_t lane = lanes[offset];
            return lane != 0;
        }
687

688
        // Stores v into lane `offset` of dest by viewing dest as an array of uint16_t.
        static void PutUInt16(UInt16 &dest, int offset, uint16_t v)
        {
            uint16_t *lanes = reinterpret_cast<uint16_t*>(&dest);
            lanes[offset] = v;
        }
692

693
        // Stores v into lane `offset` of dest by viewing dest as an array of uint16_t.
        static void PutUInt15(UInt15 &dest, int offset, uint16_t v)
        {
            uint16_t *lanes = reinterpret_cast<uint16_t*>(&dest);
            lanes[offset] = v;
        }
697

698
        // Stores v into lane `offset` of dest by viewing dest as an array of int16_t.
        static void PutSInt16(SInt16 &dest, int offset, int16_t v)
        {
            int16_t *lanes = reinterpret_cast<int16_t*>(&dest);
            lanes[offset] = v;
        }
702

703
        // Reads lane `offset` of v by viewing the whole Float object as a flat array
        // of floats.
        static float ExtractFloat(const Float& v, int offset)
        {
            const float *lanes = reinterpret_cast<const float*>(&v);
            return lanes[offset];
        }
707

708
        // Stores v into lane `offset` of dest by viewing the whole Float object as a
        // flat array of floats.
        static void PutFloat(Float &dest, int offset, float v)
        {
            float *lanes = reinterpret_cast<float*>(&dest);
            lanes[offset] = v;
        }
712

713
        // Writes lane `offset` of the flag: all ones (-1) for true, zero for false.
        static void PutBoolInt16(Int16CompFlag &dest, int offset, bool v)
        {
            int16_t *lanes = reinterpret_cast<int16_t*>(&dest);
            if (v)
                lanes[offset] = -1;
            else
                lanes[offset] = 0;
        }
717

718
        // Lane-wise a < b using the signed 32-bit compare; no top-bit flip is applied
        // for the 31-bit unsigned type.
        static Int32CompFlag Less(const UInt31 &a, const UInt31 &b)
        {
            Int32CompFlag isLess;
            for (int half = 0; half < 2; ++half)
                isLess.m_values[half] = _mm_cmplt_epi32(a.m_values[half], b.m_values[half]);
            return isLess;
        }
725

726
        // Lane-wise signed a < b; each result lane is all ones when true, zero otherwise.
        static Int16CompFlag Less(const SInt16 &a, const SInt16 &b)
        {
            Int16CompFlag isLess;
            isLess.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
            return isLess;
        }
732

733
        // Lane-wise a < b using the signed 16-bit compare; no top-bit flip is applied
        // for the 15-bit unsigned type.
        static Int16CompFlag Less(const UInt15 &a, const UInt15 &b)
        {
            Int16CompFlag isLess;
            isLess.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
            return isLess;
        }
739

740
        // Lane-wise a <= b for 15-bit unsigned values.  Each result lane is all ones
        // when the comparison holds and zero otherwise.
        static Int16CompFlag LessOrEqual(const UInt15 &a, const UInt15 &b)
        {
            // SSE2 has no integer "less or equal" compare, so the result is the OR of
            // the "less than" and "equal" masks.  A lone _mm_cmplt_epi16 would report
            // false for equal lanes.
            const __m128i isLess = _mm_cmplt_epi16(a.m_value, b.m_value);
            const __m128i isEqual = _mm_cmpeq_epi16(a.m_value, b.m_value);

            Int16CompFlag result;
            result.m_value = _mm_or_si128(isLess, isEqual);
            return result;
        }
746

747
        // Lane-wise float a < b via _mm_cmplt_ps.
        static FloatCompFlag Less(const Float &a, const Float &b)
        {
            FloatCompFlag isLess;
            isLess.m_values[0] = _mm_cmplt_ps(a.m_values[0], b.m_values[0]);
            isLess.m_values[1] = _mm_cmplt_ps(a.m_values[1], b.m_values[1]);
            return isLess;
        }
754

755
        // Lane-wise float a <= b via _mm_cmple_ps.
        static FloatCompFlag LessOrEqual(const Float &a, const Float &b)
        {
            FloatCompFlag isLessOrEqual;
            isLessOrEqual.m_values[0] = _mm_cmple_ps(a.m_values[0], b.m_values[0]);
            isLessOrEqual.m_values[1] = _mm_cmple_ps(a.m_values[1], b.m_values[1]);
            return isLessOrEqual;
        }
762

763
        template<int TSubtype>
764
        static Int16CompFlag Equal(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
765
        {
766
            Int16CompFlag result;
767
            result.m_value = _mm_cmpeq_epi16(a.m_value, b.m_value);
768
            return result;
769
        }
770

771
        // Lane-wise float equality via _mm_cmpeq_ps.
        static FloatCompFlag Equal(const Float &a, const Float &b)
        {
            FloatCompFlag isEqual;
            isEqual.m_values[0] = _mm_cmpeq_ps(a.m_values[0], b.m_values[0]);
            isEqual.m_values[1] = _mm_cmpeq_ps(a.m_values[1], b.m_values[1]);
            return isEqual;
        }
778

779
        // Equality of two flags: XOR marks the bits where they differ, and the result
        // of passing that mask to Not() is returned.
        static Int16CompFlag Equal(const Int16CompFlag &a, const Int16CompFlag &b)
        {
            Int16CompFlag differs;
            differs.m_value = _mm_xor_si128(a.m_value, b.m_value);
            return Not(differs);
        }
785

786
        // Converts unsigned 16-bit lanes to float: interleaving with zero widens each
        // lane to a non-negative 32-bit integer, which is then converted.
        static Float ToFloat(const UInt16 &v)
        {
            const __m128i zero = _mm_setzero_si128();
            const __m128i lowWide = _mm_unpacklo_epi16(v.m_value, zero);
            const __m128i highWide = _mm_unpackhi_epi16(v.m_value, zero);

            Float converted;
            converted.m_values[0] = _mm_cvtepi32_ps(lowWide);
            converted.m_values[1] = _mm_cvtepi32_ps(highWide);
            return converted;
        }
793

794
        // Zero-extends unsigned 16-bit lanes to 32 bits by interleaving with zero
        // (lanes 0-3 go to m_values[0], lanes 4-7 to m_values[1]).
        static UInt31 ToUInt31(const UInt16 &v)
        {
            const __m128i zero = _mm_setzero_si128();

            UInt31 widened;
            widened.m_values[0] = _mm_unpacklo_epi16(v.m_value, zero);
            widened.m_values[1] = _mm_unpackhi_epi16(v.m_value, zero);
            return widened;
        }
801

802
        // Zero-extends unsigned 16-bit lanes to signed 32-bit lanes by interleaving
        // with zero.
        static SInt32 ToInt32(const UInt16 &v)
        {
            const __m128i zero = _mm_setzero_si128();

            SInt32 widened;
            widened.m_values[0] = _mm_unpacklo_epi16(v.m_value, zero);
            widened.m_values[1] = _mm_unpackhi_epi16(v.m_value, zero);
            return widened;
        }
809

810
        // Zero-extends 15-bit unsigned lanes to signed 32-bit lanes by interleaving
        // with zero.
        static SInt32 ToInt32(const UInt15 &v)
        {
            const __m128i zero = _mm_setzero_si128();

            SInt32 widened;
            widened.m_values[0] = _mm_unpacklo_epi16(v.m_value, zero);
            widened.m_values[1] = _mm_unpackhi_epi16(v.m_value, zero);
            return widened;
        }
817

818
        // Sign-extends signed 16-bit lanes to 32 bits: each lane is placed in the
        // upper half of a 32-bit element (zero below it) and then shifted right
        // arithmetically by 16, which replicates the sign bit.
        static SInt32 ToInt32(const SInt16 &v)
        {
            const __m128i zero = _mm_setzero_si128();
            const __m128i lowInTop = _mm_unpacklo_epi16(zero, v.m_value);
            const __m128i highInTop = _mm_unpackhi_epi16(zero, v.m_value);

            SInt32 widened;
            widened.m_values[0] = _mm_srai_epi32(lowInTop, 16);
            widened.m_values[1] = _mm_srai_epi32(highInTop, 16);
            return widened;
        }
825

826
        // Converts signed 16-bit lanes to float: each lane is sign-extended to 32 bits
        // (placed in the upper half of the element, then arithmetic shift right by 16)
        // and the resulting integers are converted.
        static Float ToFloat(const SInt16 &v)
        {
            const __m128i zero = _mm_setzero_si128();
            const __m128i lowWide = _mm_srai_epi32(_mm_unpacklo_epi16(zero, v.m_value), 16);
            const __m128i highWide = _mm_srai_epi32(_mm_unpackhi_epi16(zero, v.m_value), 16);

            Float converted;
            converted.m_values[0] = _mm_cvtepi32_ps(lowWide);
            converted.m_values[1] = _mm_cvtepi32_ps(highWide);
            return converted;
        }
833

834
        // Widens each 16-bit lane of a UInt15 vector to 32 bits by interleaving with zeros,
        // then converts to float.
        static Float ToFloat(const UInt15 &v)
        {
            const __m128i zero = _mm_setzero_si128();
            const __m128i lowLanes = _mm_unpacklo_epi16(v.m_value, zero);
            const __m128i highLanes = _mm_unpackhi_epi16(v.m_value, zero);

            Float converted;
            converted.m_values[0] = _mm_cvtepi32_ps(lowLanes);
            converted.m_values[1] = _mm_cvtepi32_ps(highLanes);
            return converted;
        }
841

842
        // Converts both 4-lane halves of a UInt31 vector to float with the signed 32-bit conversion.
        static Float ToFloat(const UInt31 &v)
        {
            Float converted;
            for (int half = 0; half < 2; half++)
                converted.m_values[half] = _mm_cvtepi32_ps(v.m_values[half]);
            return converted;
        }
849

850
        // Narrows a float comparison mask to a 16-bit-lane mask: both halves are reinterpreted
        // as 32-bit integers and packed with signed saturation into one register.
        static Int16CompFlag FloatFlagToInt16(const FloatCompFlag &v)
        {
            const __m128i lowMask = _mm_castps_si128(v.m_values[0]);
            const __m128i highMask = _mm_castps_si128(v.m_values[1]);

            Int16CompFlag narrowed;
            narrowed.m_value = _mm_packs_epi32(lowMask, highMask);
            return narrowed;
        }
859

860
        // Widens a 16-bit-lane mask to a float mask by duplicating every 16-bit lane into both
        // halves of a 32-bit lane, then reinterpreting the bits as float.
        static FloatCompFlag Int16FlagToFloat(const Int16CompFlag &v)
        {
            const __m128i lowDoubled = _mm_unpacklo_epi16(v.m_value, v.m_value);
            const __m128i highDoubled = _mm_unpackhi_epi16(v.m_value, v.m_value);

            FloatCompFlag widened;
            widened.m_values[0] = _mm_castsi128_ps(lowDoubled);
            widened.m_values[1] = _mm_castsi128_ps(highDoubled);
            return widened;
        }
870

871
        // Narrows a 32-bit-lane mask to a 16-bit-lane mask by packing both halves with signed saturation.
        static Int16CompFlag Int32FlagToInt16(const Int32CompFlag &v)
        {
            Int16CompFlag narrowed;
            narrowed.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
            return narrowed;
        }
880

881
        // Broadcasts a scalar bool to a 16-bit-lane mask: all bits set when b is true, all clear otherwise.
        static Int16CompFlag MakeBoolInt16(bool b)
        {
            Int16CompFlag mask;
            mask.m_value = b ? _mm_set1_epi16(-1) : _mm_setzero_si128();
            return mask;
        }
890

891
        // Broadcasts a scalar bool to a float mask: all bits set in both halves when b is true,
        // all clear otherwise.
        static FloatCompFlag MakeBoolFloat(bool b)
        {
            const __m128 lanes = b ? _mm_castsi128_ps(_mm_set1_epi32(-1)) : _mm_setzero_ps();

            FloatCompFlag mask;
            mask.m_values[0] = lanes;
            mask.m_values[1] = lanes;
            return mask;
        }
900

901
        // Returns a AND (NOT b), bitwise. Note the intrinsic negates its FIRST operand, hence
        // the swapped argument order.
        static Int16CompFlag AndNot(const Int16CompFlag &a, const Int16CompFlag &b)
        {
            Int16CompFlag masked;
            masked.m_value = _mm_andnot_si128(b.m_value, a.m_value);
            return masked;
        }
907

908
        // Bitwise complement of a 16-bit-lane mask (XOR with an all-ones register).
        static Int16CompFlag Not(const Int16CompFlag &b)
        {
            const __m128i allOnes = _mm_set1_epi32(-1);

            Int16CompFlag inverted;
            inverted.m_value = _mm_xor_si128(b.m_value, allOnes);
            return inverted;
        }
914

915
        // Bitwise complement of both halves of a 32-bit-lane mask (XOR with an all-ones register).
        static Int32CompFlag Not(const Int32CompFlag &b)
        {
            const __m128i allOnes = _mm_set1_epi32(-1);

            Int32CompFlag inverted;
            for (int half = 0; half < 2; half++)
                inverted.m_values[half] = _mm_xor_si128(b.m_values[half], allOnes);
            return inverted;
        }
922

923
        // Converts floats to unsigned 16-bit lanes. SSE2 only has a signed-saturating pack, so the
        // input is biased by -32768 before conversion and the top bit of every packed lane is
        // flipped afterwards to undo the bias. The rounding-mode pointer is unnamed and unused here.
        static UInt16 RoundAndConvertToU16(const Float &v, const void* /*roundingMode*/)
        {
            const __m128 bias = _mm_set1_ps(-32768);
            const __m128i biasedLow = _mm_cvtps_epi32(_mm_add_ps(v.m_values[0], bias));
            const __m128i biasedHigh = _mm_cvtps_epi32(_mm_add_ps(v.m_values[1], bias));

            const __m128i biasedPacked = _mm_packs_epi32(biasedLow, biasedHigh);

            UInt16 converted;
            converted.m_value = _mm_xor_si128(biasedPacked, _mm_set1_epi16(-32768));
            return converted;
        }
934

935
        // Converts floats to 32-bit integers with _mm_cvtps_epi32 and packs both halves into
        // 16-bit lanes with signed saturation. The rounding-mode pointer is unnamed and unused here.
        static UInt15 RoundAndConvertToU15(const Float &v, const void* /*roundingMode*/)
        {
            __m128i lo = _mm_cvtps_epi32(v.m_values[0]);
            __m128i hi = _mm_cvtps_epi32(v.m_values[1]);

            // Pack once and reuse the result (the pack was previously computed twice, with the
            // first copy left unused).
            UInt15 result;
            result.m_value = _mm_packs_epi32(lo, hi);
            return result;
        }
946

947
        // Converts floats to 32-bit integers with _mm_cvtps_epi32 and packs both halves into
        // signed 16-bit lanes with saturation. The rounding-mode pointer is unnamed and unused here.
        static SInt16 RoundAndConvertToS16(const Float &v, const void* /*roundingMode*/)
        {
            __m128i lo = _mm_cvtps_epi32(v.m_values[0]);
            __m128i hi = _mm_cvtps_epi32(v.m_values[1]);

            // Pack once and reuse the result (the pack was previously computed twice, with the
            // first copy left unused).
            SInt16 result;
            result.m_value = _mm_packs_epi32(lo, hi);
            return result;
        }
958

959
        // Per-lane square root of both halves.
        static Float Sqrt(const Float &f)
        {
            Float roots;
            roots.m_values[0] = _mm_sqrt_ps(f.m_values[0]);
            roots.m_values[1] = _mm_sqrt_ps(f.m_values[1]);
            return roots;
        }
966

967
        // Per-lane absolute value using two's-complement negation: for negative lanes the bits are
        // inverted (XOR with the sign mask) and 1 is added; non-negative lanes pass through.
        static UInt16 Abs(const SInt16 &a)
        {
            const __m128i signMask = _mm_srai_epi16(a.m_value, 15); // 0xffff for negative lanes, else 0
            const __m128i signAsOne = _mm_srli_epi16(a.m_value, 15); // 1 for negative lanes, else 0
            const __m128i flipped = _mm_xor_si128(a.m_value, signMask);

            UInt16 magnitude;
            magnitude.m_value = _mm_add_epi16(flipped, signAsOne);
            return magnitude;
        }
976

977
        // Per-lane absolute value: clears the sign bit of every float (the -0.0f pattern has only
        // the sign bit set, and andnot removes exactly those bits).
        static Float Abs(const Float& a)
        {
            const __m128 signBit = _mm_set1_ps(-0.0f);

            Float magnitude;
            for (int half = 0; half < 2; half++)
                magnitude.m_values[half] = _mm_andnot_ps(signBit, a.m_values[half]);
            return magnitude;
        }
986

987
        // Per-lane (a - b)^2, keeping only the low 16 bits of each product.
        // NOTE(review): the name suggests inputs are 8-bit values, in which case the square
        // always fits in 16 bits - confirm against callers.
        static UInt16 SqDiffUInt8(const UInt15 &a, const UInt15 &b)
        {
            const __m128i delta = _mm_sub_epi16(a.m_value, b.m_value);

            UInt16 squared;
            squared.m_value = _mm_mullo_epi16(delta, delta);
            return squared;
        }
995

996
        // Per-lane (a - b)^2 as float.
        //
        // |a - b| is computed as max - min, which always fits in an unsigned 16-bit lane (0..65535).
        // The difference is widened and converted to float first and squared in floating point.
        // Squaring in integer and converting the 32-bit product with the signed _mm_cvtepi32_ps
        // (the previous approach) yields a negative result once the product reaches 2^31, i.e.
        // for differences of 46341 or more. For smaller differences both approaches produce the
        // same value: the difference converts exactly, and the float multiply rounds the exact
        // product just as the integer-to-float conversion would.
        static Float SqDiffSInt16(const SInt16 &a, const SInt16 &b)
        {
            __m128i diffU = _mm_sub_epi16(_mm_max_epi16(a.m_value, b.m_value), _mm_min_epi16(a.m_value, b.m_value));

            // Zero-extend to 32 bits; values are at most 65535 so the signed conversion is exact.
            const __m128i zero = _mm_setzero_si128();
            __m128 diffLo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(diffU, zero));
            __m128 diffHi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(diffU, zero));

            Float result;
            result.m_values[0] = _mm_mul_ps(diffLo, diffLo);
            result.m_values[1] = _mm_mul_ps(diffHi, diffHi);

            return result;
        }
1011

1012
        // Converts 16-bit lanes holding a two's-complement-encoded half-float (the magnitude bits
        // are a half-float exponent/mantissa pattern, negative values are the two's-complement
        // negation of that pattern) to 32-bit floats.
        //
        // Fixes relative to the previous version:
        //  - The mantissa and exponent are now taken from the absolute value. They were taken
        //    from the raw input while the computed absolute value was left unused, so negative
        //    inputs decoded the bits of their two's-complement encoding instead of their magnitude.
        //  - Denormals (half exponent field 0) have the value mantissa/1024 * 2^-14. They are
        //    built as a normal float with exponent 2^-14 and the implicit leading one is then
        //    subtracted. The exponent previously used for this was 2^-15, which halved every
        //    denormal.
        static Float TwosCLHalfToFloat(const SInt16 &v)
        {
            // Two's-complement absolute value: invert negative lanes and add 1.
            __m128i absV = _mm_add_epi16(_mm_xor_si128(v.m_value, _mm_srai_epi16(v.m_value, 15)), _mm_srli_epi16(v.m_value, 15));

            __m128i signBits = _mm_and_si128(v.m_value, _mm_set1_epi16(-32768));
            __m128i mantissa = _mm_and_si128(absV, _mm_set1_epi16(0x03ff));
            __m128i exponent = _mm_and_si128(absV, _mm_set1_epi16(0x7c00));

            __m128i isDenormal = _mm_cmpeq_epi16(exponent, _mm_setzero_si128());

            // Convert exponent to high-bits: move the 5-bit half exponent to the float exponent
            // position within the upper 16 bits and rebias it (14336 == 112 << 7, 112 == 127 - 15).
            exponent = _mm_add_epi16(_mm_srli_epi16(exponent, 3), _mm_set1_epi16(14336));

            // Denormals use the same scale as half exponent field 1 (128 == 1 << 7).
            exponent = _mm_add_epi16(exponent, _mm_and_si128(isDenormal, _mm_set1_epi16(128)));

            // For denormal lanes: the signed power of two contributed by the implicit leading one.
            __m128i denormalCorrectionHigh = _mm_and_si128(isDenormal, _mm_or_si128(signBits, exponent));

            // The 10-bit mantissa straddles the two 16-bit halves of the float: top 7 bits go in
            // the high half, the remaining 3 bits at the top of the low half.
            __m128i highBits = _mm_or_si128(signBits, _mm_or_si128(exponent, _mm_srli_epi16(mantissa, 3)));
            __m128i lowBits = _mm_slli_epi16(mantissa, 13);

            __m128i flow = _mm_unpacklo_epi16(lowBits, highBits);
            __m128i fhigh = _mm_unpackhi_epi16(lowBits, highBits);

            __m128i correctionLow = _mm_unpacklo_epi16(_mm_setzero_si128(), denormalCorrectionHigh);
            __m128i correctionHigh = _mm_unpackhi_epi16(_mm_setzero_si128(), denormalCorrectionHigh);

            Float result;
            result.m_values[0] = _mm_sub_ps(_mm_castsi128_ps(flow), _mm_castsi128_ps(correctionLow));
            result.m_values[1] = _mm_sub_ps(_mm_castsi128_ps(fhigh), _mm_castsi128_ps(correctionHigh));

            return result;
        }
1042

1043
        // Squared difference between a (decoded with TwosCLHalfToFloat) and the float vector b.
        static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)
        {
            const Float decodedA = TwosCLHalfToFloat(a);
            const Float delta = decodedA - b;

            return delta * delta;
        }
1050

1051
        // Squared difference between a and b after both are decoded with TwosCLHalfToFloat.
        static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)
        {
            const Float decodedA = TwosCLHalfToFloat(a);
            const Float decodedB = TwosCLHalfToFloat(b);
            const Float delta = decodedA - decodedB;

            return delta * delta;
        }
1059

1060
        // Squared difference between (a decoded with TwosCLHalfToFloat, scaled by aWeight) and b.
        static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)
        {
            const Float weightedA = TwosCLHalfToFloat(a) * aWeight;
            const Float delta = weightedA - b;

            return delta * delta;
        }
1067

1068
        // Logical (zero-filling) right shift of every 16-bit lane by `bits`.
        static UInt16 RightShift(const UInt16 &v, int bits)
        {
            UInt16 shifted;
            shifted.m_value = _mm_srli_epi16(v.m_value, bits);
            return shifted;
        }
1074

1075
        // Logical (zero-filling) right shift of every 32-bit lane in both halves by `bits`.
        static UInt31 RightShift(const UInt31 &v, int bits)
        {
            UInt31 shifted;
            for (int half = 0; half < 2; half++)
                shifted.m_values[half] = _mm_srli_epi32(v.m_values[half], bits);
            return shifted;
        }
1082

1083
        // Arithmetic (sign-preserving) right shift of every 16-bit lane by `bits`.
        static SInt16 RightShift(const SInt16 &v, int bits)
        {
            SInt16 shifted;
            shifted.m_value = _mm_srai_epi16(v.m_value, bits);
            return shifted;
        }
1089

1090
        // Logical (zero-filling) right shift of every 16-bit lane by `bits`.
        static UInt15 RightShift(const UInt15 &v, int bits)
        {
            UInt15 shifted;
            shifted.m_value = _mm_srli_epi16(v.m_value, bits);
            return shifted;
        }
1096

1097
        // Arithmetic (sign-preserving) right shift of every 32-bit lane in both halves by `bits`.
        static SInt32 RightShift(const SInt32 &v, int bits)
        {
            SInt32 shifted;
            for (int half = 0; half < 2; half++)
                shifted.m_values[half] = _mm_srai_epi32(v.m_values[half], bits);
            return shifted;
        }
1104

1105
        // Narrows 32-bit lanes to signed 16-bit lanes; out-of-range values saturate
        // (signed-saturating pack).
        static SInt16 ToSInt16(const SInt32 &v)
        {
            SInt16 narrowed;
            narrowed.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
            return narrowed;
        }
1111

1112
        // Reinterprets the lanes as signed 16-bit; the register contents are copied unchanged.
        static SInt16 ToSInt16(const UInt16 &v)
        {
            SInt16 reinterpreted;
            reinterpreted.m_value = v.m_value;
            return reinterpreted;
        }
1118

1119
        // Relabels a UInt15 vector as SInt16; the register contents are copied unchanged.
        static SInt16 ToSInt16(const UInt15 &v)
        {
            SInt16 reinterpreted;
            reinterpreted.m_value = v.m_value;
            return reinterpreted;
        }
1125

1126
        // Truncates each 32-bit lane to its low 16 bits. The shift-left/arithmetic-shift-right
        // pair sign-extends the low 16 bits so the signed-saturating pack passes them through
        // without clamping.
        static UInt16 ToUInt16(const UInt32 &v)
        {
            const __m128i lowTopAligned = _mm_slli_epi32(v.m_values[0], 16);
            const __m128i highTopAligned = _mm_slli_epi32(v.m_values[1], 16);
            const __m128i lowSignExtended = _mm_srai_epi32(lowTopAligned, 16);
            const __m128i highSignExtended = _mm_srai_epi32(highTopAligned, 16);

            UInt16 truncated;
            truncated.m_value = _mm_packs_epi32(lowSignExtended, highSignExtended);
            return truncated;
        }
1135

1136
        // Truncates each 32-bit lane to its low 16 bits. The shift-left/arithmetic-shift-right
        // pair sign-extends the low 16 bits so the signed-saturating pack passes them through
        // without clamping.
        static UInt16 ToUInt16(const UInt31 &v)
        {
            const __m128i lowTopAligned = _mm_slli_epi32(v.m_values[0], 16);
            const __m128i highTopAligned = _mm_slli_epi32(v.m_values[1], 16);
            const __m128i lowSignExtended = _mm_srai_epi32(lowTopAligned, 16);
            const __m128i highSignExtended = _mm_srai_epi32(highTopAligned, 16);

            UInt16 truncated;
            truncated.m_value = _mm_packs_epi32(lowSignExtended, highSignExtended);
            return truncated;
        }
1145

1146
        // Narrows 32-bit lanes to 16-bit lanes with a signed-saturating pack, so values above
        // 32767 clamp to 32767.
        static UInt15 ToUInt15(const UInt31 &v)
        {
            UInt15 narrowed;
            narrowed.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
            return narrowed;
        }
1152

1153
        // Relabels an SInt16 vector as UInt15; the register contents are copied unchanged.
        // No range check is performed here.
        static UInt15 ToUInt15(const SInt16 &v)
        {
            UInt15 reinterpreted;
            reinterpreted.m_value = v.m_value;
            return reinterpreted;
        }
1159

1160
        // Relabels a UInt16 vector as UInt15; the register contents are copied unchanged.
        // No range check is performed here.
        static UInt15 ToUInt15(const UInt16 &v)
        {
            UInt15 reinterpreted;
            reinterpreted.m_value = v.m_value;
            return reinterpreted;
        }
1166

1167
        // Widening signed multiply: computes the upper and lower 16 bits of each 16x16 product
        // separately and interleaves them into full 32-bit lanes.
        static SInt32 XMultiply(const SInt16 &a, const SInt16 &b)
        {
            const __m128i productLow16 = _mm_mullo_epi16(a.m_value, b.m_value);
            const __m128i productHigh16 = _mm_mulhi_epi16(a.m_value, b.m_value);

            SInt32 product;
            product.m_values[0] = _mm_unpacklo_epi16(productLow16, productHigh16);
            product.m_values[1] = _mm_unpackhi_epi16(productLow16, productHigh16);
            return product;
        }
1177

1178
        // Widening multiply of signed a by b using the signed high-half multiply, with the two
        // 16-bit halves of each product interleaved into 32-bit lanes.
        // NOTE(review): treating b as signed is only correct while its lanes stay below 32768,
        // which the UInt15 type name implies - confirm.
        static SInt32 XMultiply(const SInt16 &a, const UInt15 &b)
        {
            const __m128i productLow16 = _mm_mullo_epi16(a.m_value, b.m_value);
            const __m128i productHigh16 = _mm_mulhi_epi16(a.m_value, b.m_value);

            SInt32 product;
            product.m_values[0] = _mm_unpacklo_epi16(productLow16, productHigh16);
            product.m_values[1] = _mm_unpackhi_epi16(productLow16, productHigh16);
            return product;
        }
1188

1189
        // Argument-swapped convenience overload: forwards to XMultiply(SInt16, UInt15).
        static SInt32 XMultiply(const UInt15 &a, const SInt16 &b)
        {
            const SInt32 product = XMultiply(b, a);
            return product;
        }
1193

1194
        // Widening unsigned multiply: computes the upper (unsigned) and lower 16 bits of each
        // 16x16 product separately and interleaves them into full 32-bit lanes.
        static UInt32 XMultiply(const UInt16 &a, const UInt16 &b)
        {
            const __m128i productLow16 = _mm_mullo_epi16(a.m_value, b.m_value);
            const __m128i productHigh16 = _mm_mulhi_epu16(a.m_value, b.m_value);

            UInt32 product;
            product.m_values[0] = _mm_unpacklo_epi16(productLow16, productHigh16);
            product.m_values[1] = _mm_unpackhi_epi16(productLow16, productHigh16);
            return product;
        }
1204

1205
        // Per-lane multiply that keeps only the low 16 bits of each product.
        static UInt16 CompactMultiply(const UInt16 &a, const UInt15 &b)
        {
            UInt16 product;
            product.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
            return product;
        }
1211

1212
        // Per-lane multiply that keeps only the low 16 bits of each product.
        static UInt16 CompactMultiply(const UInt15 &a, const UInt15 &b)
        {
            UInt16 product;
            product.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
            return product;
        }
1218

1219
        // Per-lane multiply that keeps only the low 16 bits of each product.
        static SInt16 CompactMultiply(const SInt16 &a, const UInt15 &b)
        {
            SInt16 product;
            product.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
            return product;
        }
1225

1226
        // Per-lane multiply that keeps only the low 16 bits of each product.
        static SInt16 CompactMultiply(const SInt16 &a, const SInt16 &b)
        {
            SInt16 product;
            product.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
            return product;
        }
1232

1233
        // Widening unsigned multiply: computes the upper (unsigned) and lower 16 bits of each
        // 16x16 product separately and interleaves them into full 32-bit lanes.
        static UInt31 XMultiply(const UInt15 &a, const UInt15 &b)
        {
            const __m128i productLow16 = _mm_mullo_epi16(a.m_value, b.m_value);
            const __m128i productHigh16 = _mm_mulhi_epu16(a.m_value, b.m_value);

            UInt31 product;
            product.m_values[0] = _mm_unpacklo_epi16(productLow16, productHigh16);
            product.m_values[1] = _mm_unpackhi_epi16(productLow16, productHigh16);
            return product;
        }
1243

1244
        // Widening unsigned multiply: computes the upper (unsigned) and lower 16 bits of each
        // 16x16 product separately and interleaves them into full 32-bit lanes.
        static UInt31 XMultiply(const UInt16 &a, const UInt15 &b)
        {
            const __m128i productLow16 = _mm_mullo_epi16(a.m_value, b.m_value);
            const __m128i productHigh16 = _mm_mulhi_epu16(a.m_value, b.m_value);

            UInt31 product;
            product.m_values[0] = _mm_unpacklo_epi16(productLow16, productHigh16);
            product.m_values[1] = _mm_unpackhi_epi16(productLow16, productHigh16);
            return product;
        }
1254

1255
        // Argument-swapped convenience overload: forwards to XMultiply(UInt16, UInt15).
        static UInt31 XMultiply(const UInt15 &a, const UInt16 &b)
        {
            const UInt31 product = XMultiply(b, a);
            return product;
        }
1259

1260
        static bool AnySet(const Int16CompFlag &v)
1261
        {
1262
            return _mm_movemask_epi8(v.m_value) != 0;
1263
        }
1264

1265
        static bool AllSet(const Int16CompFlag &v)
1266
        {
1267
            return _mm_movemask_epi8(v.m_value) == 0xffff;
1268
        }
1269

1270
        static bool AnySet(const FloatCompFlag &v)
1271
        {
1272
            return _mm_movemask_ps(v.m_values[0]) != 0 || _mm_movemask_ps(v.m_values[1]) != 0;
1273
        }
1274

1275
        static bool AllSet(const FloatCompFlag &v)
1276
        {
1277
            return _mm_movemask_ps(v.m_values[0]) == 0xf && _mm_movemask_ps(v.m_values[1]) == 0xf;
1278
        }
1279
    };
1280

1281
#else
1282
    // Scalar version
1283
    struct ParallelMath
1284
    {
1285
        struct RoundTowardZeroForScope
1286
        {
1287
        };
1288

1289
        struct RoundTowardNearestForScope
1290
        {
1291
        };
1292

1293
        struct RoundUpForScope
1294
        {
1295
        };
1296

1297
        struct RoundDownForScope
1298
        {
1299
        };
1300

1301
        static const int ParallelSize = 1;
1302

1303
        enum Int16Subtype
1304
        {
1305
            IntSubtype_Signed,
1306
            IntSubtype_UnsignedFull,
1307
            IntSubtype_UnsignedTruncated,
1308
            IntSubtype_Abstract,
1309
        };
1310

1311
        typedef int32_t SInt16;
1312
        typedef int32_t UInt15;
1313
        typedef int32_t UInt16;
1314
        typedef int32_t AInt16;
1315

1316
        typedef int32_t SInt32;
1317
        typedef int32_t UInt31;
1318
        typedef int32_t UInt32;
1319
        typedef int32_t AInt32;
1320

1321
        typedef int32_t ScalarUInt16;
1322
        typedef int32_t ScalarSInt16;
1323

1324
        typedef float Float;
1325

1326
        template<class TTargetType>
1327
        struct LosslessCast
1328
        {
1329
            static const int32_t& Cast(const int32_t &src)
1330
            {
1331
                return src;
1332
            }
1333
        };
1334

1335
        typedef bool Int16CompFlag;
1336
        typedef bool FloatCompFlag;
1337

1338
        static int32_t AbstractAdd(const int32_t &a, const int32_t &b)
1339
        {
1340
            return a + b;
1341
        }
1342

1343
        static int32_t AbstractSubtract(const int32_t &a, const int32_t &b)
1344
        {
1345
            return a - b;
1346
        }
1347

1348
        static float Select(bool flag, float a, float b)
1349
        {
1350
            return flag ? a : b;
1351
        }
1352

1353
        static int32_t Select(bool flag, int32_t a, int32_t b)
1354
        {
1355
            return flag ? a : b;
1356
        }
1357

1358
        static int32_t SelectOrZero(bool flag, int32_t a)
1359
        {
1360
            return flag ? a : 0;
1361
        }
1362

1363
        static void ConditionalSet(int32_t& dest, bool flag, int32_t src)
1364
        {
1365
            if (flag)
1366
                dest = src;
1367
        }
1368

1369
        static void ConditionalSet(bool& dest, bool flag, bool src)
1370
        {
1371
            if (flag)
1372
                dest = src;
1373
        }
1374

1375
        static int32_t ConditionalNegate(bool flag, int32_t v)
1376
        {
1377
            return (flag) ? -v : v;
1378
        }
1379

1380
        static void NotConditionalSet(int32_t& dest, bool flag, int32_t src)
1381
        {
1382
            if (!flag)
1383
                dest = src;
1384
        }
1385

1386
        static void ConditionalSet(float& dest, bool flag, float src)
1387
        {
1388
            if (flag)
1389
                dest = src;
1390
        }
1391

1392
        static void NotConditionalSet(float& dest, bool flag, float src)
1393
        {
1394
            if (!flag)
1395
                dest = src;
1396
        }
1397

1398
        static void MakeSafeDenominator(float& v)
1399
        {
1400
            if (v == 0.0f)
1401
                v = 1.0f;
1402
        }
1403

1404
        static int32_t SignedRightShift(int32_t v, int bits)
1405
        {
1406
            return v >> bits;
1407
        }
1408

1409
        static int32_t TruncateToPrecisionSigned(int32_t v, int precision)
1410
        {
1411
            v = (v << (32 - precision)) & 0xffffffff;
1412
            return SignedRightShift(v, 32 - precision);
1413
        }
1414

1415
        static int32_t TruncateToPrecisionUnsigned(int32_t v, int precision)
1416
        {
1417
            return v & ((1 << precision) - 1);
1418
        }
1419

1420
        static int32_t Min(int32_t a, int32_t b)
1421
        {
1422
            if (a < b)
1423
                return a;
1424
            return b;
1425
        }
1426

1427
        static float Min(float a, float b)
1428
        {
1429
            if (a < b)
1430
                return a;
1431
            return b;
1432
        }
1433

1434
        static int32_t Max(int32_t a, int32_t b)
1435
        {
1436
            if (a > b)
1437
                return a;
1438
            return b;
1439
        }
1440

1441
        static float Max(float a, float b)
1442
        {
1443
            if (a > b)
1444
                return a;
1445
            return b;
1446
        }
1447

1448
        static float Abs(float a)
1449
        {
1450
            return fabsf(a);
1451
        }
1452

1453
        static int32_t Abs(int32_t a)
1454
        {
1455
            if (a < 0)
1456
                return -a;
1457
            return a;
1458
        }
1459

1460
        static float Clamp(float v, float min, float max)
1461
        {
1462
            if (v < min)
1463
                return min;
1464
            if (v > max)
1465
                return max;
1466
            return v;
1467
        }
1468

1469
        static float Reciprocal(float v)
1470
        {
1471
            return 1.0f / v;
1472
        }
1473

1474
        static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, int32_t& chOut)
1475
        {
1476
            chOut = inputBlocks[0].m_pixels[pxOffset][channel];
1477
        }
1478

1479
        static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, int32_t& chOut)
1480
        {
1481
            chOut = inputBlocks[0].m_pixels[pxOffset][channel];
1482
        }
1483

1484
        static float MakeFloat(float v)
1485
        {
1486
            return v;
1487
        }
1488

1489
        static float MakeFloatZero()
1490
        {
1491
            return 0.0f;
1492
        }
1493

1494
        static int32_t MakeUInt16(uint16_t v)
1495
        {
1496
            return v;
1497
        }
1498

1499
        static int32_t MakeSInt16(int16_t v)
1500
        {
1501
            return v;
1502
        }
1503

1504
        static int32_t MakeAInt16(int16_t v)
1505
        {
1506
            return v;
1507
        }
1508

1509
        static int32_t MakeUInt15(uint16_t v)
1510
        {
1511
            return v;
1512
        }
1513

1514
        static int32_t MakeSInt32(int32_t v)
1515
        {
1516
            return v;
1517
        }
1518

1519
        static int32_t MakeUInt31(int32_t v)
1520
        {
1521
            return v;
1522
        }
1523

1524
        static int32_t Extract(int32_t v, int offset)
1525
        {
1526
            UNREFERENCED_PARAMETER(offset);
1527
            return v;
1528
        }
1529

1530
        static bool Extract(bool v, int offset)
1531
        {
1532
            UNREFERENCED_PARAMETER(offset);
1533
            return v;
1534
        }
1535

1536
        static float Extract(float v, int offset)
1537
        {
1538
            UNREFERENCED_PARAMETER(offset);
1539
            return v;
1540
        }
1541

1542
        static void PutUInt16(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
1543
        {
1544
            UNREFERENCED_PARAMETER(offset);
1545
            dest = v;
1546
        }
1547

1548
        static void PutUInt15(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
1549
        {
1550
            UNREFERENCED_PARAMETER(offset);
1551
            dest = v;
1552
        }
1553

1554
        static void PutSInt16(int32_t &dest, int offset, ParallelMath::ScalarSInt16 v)
1555
        {
1556
            UNREFERENCED_PARAMETER(offset);
1557
            dest = v;
1558
        }
1559

1560
        static float ExtractFloat(float v, int offset)
1561
        {
1562
            UNREFERENCED_PARAMETER(offset);
1563
            return v;
1564
        }
1565

1566
        static void PutFloat(float &dest, int offset, float v)
1567
        {
1568
            UNREFERENCED_PARAMETER(offset);
1569
            dest = v;
1570
        }
1571

1572
        static void PutBoolInt16(bool &dest, int offset, bool v)
1573
        {
1574
            UNREFERENCED_PARAMETER(offset);
1575
            dest = v;
1576
        }
1577

1578
        static bool Less(int32_t a, int32_t b)
1579
        {
1580
            return a < b;
1581
        }
1582

1583
        static bool Less(float a, float b)
1584
        {
1585
            return a < b;
1586
        }
1587

1588
        static bool LessOrEqual(int32_t a, int32_t b)
1589
        {
1590
            return a < b;
1591
        }
1592

1593
        static bool LessOrEqual(float a, float b)
1594
        {
1595
            return a < b;
1596
        }
1597

1598
        static bool Equal(int32_t a, int32_t b)
1599
        {
1600
            return a == b;
1601
        }
1602

1603
        static bool Equal(float a, float b)
1604
        {
1605
            return a == b;
1606
        }
1607

1608
        static float ToFloat(int32_t v)
1609
        {
1610
            return static_cast<float>(v);
1611
        }
1612

1613
        static int32_t ToUInt31(int32_t v)
1614
        {
1615
            return v;
1616
        }
1617

1618
        static int32_t ToInt32(int32_t v)
1619
        {
1620
            return v;
1621
        }
1622

1623
        static bool FloatFlagToInt16(bool v)
1624
        {
1625
            return v;
1626
        }
1627

1628
        static bool Int32FlagToInt16(bool v)
1629
        {
1630
            return v;
1631
        }
1632

1633
        static bool Int16FlagToFloat(bool v)
1634
        {
1635
            return v;
1636
        }
1637

1638
        static bool MakeBoolInt16(bool b)
1639
        {
1640
            return b;
1641
        }
1642

1643
        static bool MakeBoolFloat(bool b)
1644
        {
1645
            return b;
1646
        }
1647

1648
        static bool AndNot(bool a, bool b)
1649
        {
1650
            return a && !b;
1651
        }
1652

1653
        static bool Not(bool b)
1654
        {
1655
            return !b;
1656
        }
1657

1658
        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardZeroForScope *rtz)
1659
        {
1660
            UNREFERENCED_PARAMETER(rtz);
1661
            return static_cast<int>(v);
1662
        }
1663

1664
        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundUpForScope *ru)
1665
        {
1666
            UNREFERENCED_PARAMETER(ru);
1667
            return static_cast<int>(ceilf(v));
1668
        }
1669

1670
        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundDownForScope *rd)
1671
        {
1672
            UNREFERENCED_PARAMETER(rd);
1673
            return static_cast<int>(floorf(v));
1674
        }
1675

1676
        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardNearestForScope *rtn)
1677
        {
1678
            UNREFERENCED_PARAMETER(rtn);
1679
            return static_cast<int>(floorf(v + 0.5f));
1680
        }
1681

1682
        template<class TRoundMode>
1683
        static int32_t RoundAndConvertToU16(float v, const TRoundMode *roundingMode)
1684
        {
1685
            return RoundAndConvertToInt(v, roundingMode);
1686
        }
1687

1688
        template<class TRoundMode>
1689
        static int32_t RoundAndConvertToU15(float v, const TRoundMode *roundingMode)
1690
        {
1691
            return RoundAndConvertToInt(v, roundingMode);
1692
        }
1693

1694
        template<class TRoundMode>
1695
        static int32_t RoundAndConvertToS16(float v, const TRoundMode *roundingMode)
1696
        {
1697
            return RoundAndConvertToInt(v, roundingMode);
1698
        }
1699

1700
        static float Sqrt(float f)
1701
        {
1702
            return sqrtf(f);
1703
        }
1704

1705
        static int32_t SqDiffUInt8(int32_t a, int32_t b)
1706
        {
1707
            int32_t delta = a - b;
1708
            return delta * delta;
1709
        }
1710

1711
        static int32_t SqDiffInt16(int32_t a, int32_t b)
1712
        {
1713
            int32_t delta = a - b;
1714
            return delta * delta;
1715
        }
1716

1717
        static int32_t SqDiffSInt16(int32_t a, int32_t b)
1718
        {
1719
            int32_t delta = a - b;
1720
            return delta * delta;
1721
        }
1722

1723
        static float TwosCLHalfToFloat(int32_t v)
1724
        {
1725
            int32_t absV = (v < 0) ? -v : v;
1726

1727
            int32_t signBits = (absV & -32768);
1728
            int32_t mantissa = (absV & 0x03ff);
1729
            int32_t exponent = (absV & 0x7c00);
1730

1731
            bool isDenormal = (exponent == 0);
1732

1733
            // Convert exponent to high-bits
1734
            exponent = (exponent >> 3) + 14336;
1735

1736
            int32_t denormalCorrection = (isDenormal ? (signBits | 14336) : 0) << 16;
1737

1738
            int32_t fBits = ((exponent | signBits) << 16) | (mantissa << 13);
1739

1740
            float f, correction;
1741
            memcpy(&f, &fBits, 4);
1742
            memcpy(&correction, &denormalCorrection, 4);
1743

1744
            return f - correction;
1745
        }
1746

1747
        static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)
1748
        {
1749
            Float fa = TwosCLHalfToFloat(a);
1750

1751
            Float diff = fa - b;
1752
            return diff * diff;
1753
        }
1754

1755
        static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)
1756
        {
1757
            Float fa = TwosCLHalfToFloat(a);
1758
            Float fb = TwosCLHalfToFloat(b);
1759

1760
            Float diff = fa - fb;
1761
            return diff * diff;
1762
        }
1763

1764
        static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)
1765
        {
1766
            Float fa = TwosCLHalfToFloat(a) * aWeight;
1767

1768
            Float diff = fa - b;
1769
            return diff * diff;
1770
        }
1771

1772
        static int32_t RightShift(int32_t v, int bits)
1773
        {
1774
            return SignedRightShift(v, bits);
1775
        }
1776

1777
        static int32_t ToSInt16(int32_t v)
1778
        {
1779
            return v;
1780
        }
1781

1782
        static int32_t ToUInt16(int32_t v)
1783
        {
1784
            return v;
1785
        }
1786

1787
        static int32_t ToUInt15(int32_t v)
1788
        {
1789
            return v;
1790
        }
1791

1792
        static int32_t XMultiply(int32_t a, int32_t b)
1793
        {
1794
            return a * b;
1795
        }
1796

1797
        static int32_t CompactMultiply(int32_t a, int32_t b)
1798
        {
1799
            return a * b;
1800
        }
1801

1802
        static bool AnySet(bool v)
1803
        {
1804
            return v;
1805
        }
1806

1807
        static bool AllSet(bool v)
1808
        {
1809
            return v;
1810
        }
1811
    };
1812

1813
#endif
1814
}
1815

1816
#endif
1817

1818
Product

Resources

Company