Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/cvtt/ConvectionKernels_ParallelMath.h
9898 views
1
/*
2
Convection Texture Tools
3
Copyright (c) 2018-2019 Eric Lasota
4
5
Permission is hereby granted, free of charge, to any person obtaining
6
a copy of this software and associated documentation files (the
7
"Software"), to deal in the Software without restriction, including
8
without limitation the rights to use, copy, modify, merge, publish,
9
distribute, sublicense, and/or sell copies of the Software, and to
10
permit persons to whom the Software is furnished to do so, subject
11
to the following conditions:
12
13
The above copyright notice and this permission notice shall be included
14
in all copies or substantial portions of the Software.
15
16
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24
*/
25
#pragma once
26
#ifndef __CVTT_PARALLELMATH_H__
27
#define __CVTT_PARALLELMATH_H__
28
29
#include "ConvectionKernels.h"
30
#include "ConvectionKernels_Config.h"
31
32
#ifdef CVTT_USE_SSE2
33
#include <emmintrin.h>
34
#endif
35
36
#include <float.h>
37
#include <assert.h>
38
#include <string.h>
39
#include <algorithm>
40
#include <math.h>
41
42
// Marks a parameter as intentionally unused. The argument is parenthesized so
// that an expression argument is cast to void as a whole.
#define UNREFERENCED_PARAMETER(n) ((void)(n))
43
44
// Parallel math implementation
45
//
46
// After preprocessor defs are handled, what this should do is expose the following types:
47
// SInt16 - Signed 16-bit integer
48
// UInt16 - Unsigned 16-bit integer
49
// UInt15 - Unsigned 15-bit integer
50
// SInt32 - Signed 32-bit integer
51
// UInt31 - Unsigned 31-bit integer
52
// AInt16 - 16-bit integer of unknown signedness (only used for storage)
53
// Int16CompFlag - Comparison flags from comparing 16-bit integers
54
// Int32CompFlag - Comparison flags from comparing 32-bit integers
55
// FloatCompFlag - Comparison flags from comparing 32-bit floats
56
//
57
// The reason for these distinctions is that depending on the instruction set, signed or unsigned versions of certain ops
58
// (particularly max, min, compares, and right shift) may not be available. In cases where ops are not available, it's
59
// necessary to do high bit manipulations to accomplish the operation with 16-bit numbers. The 15-bit and 31-bit uint types
60
// can elide the bit flips if unsigned versions are not available.
61
62
namespace cvtt
63
{
64
#ifdef CVTT_USE_SSE2
65
// SSE2 version
66
struct ParallelMath
67
{
68
typedef uint16_t ScalarUInt16;
69
typedef int16_t ScalarSInt16;
70
71
// Scoped guard for the SSE rounding mode. The constructor saves the current
// MXCSR value and installs TRoundingMode in its rounding-control bits; the
// destructor writes the saved MXCSR value back.
template<unsigned int TRoundingMode>
struct RoundForScope
{
    unsigned int m_oldCSR;

    RoundForScope()
        : m_oldCSR(_mm_getcsr())
    {
        const unsigned int withoutRounding = m_oldCSR & ~_MM_ROUND_MASK;
        _mm_setcsr(withoutRounding | (TRoundingMode));
    }

    ~RoundForScope()
    {
        _mm_setcsr(m_oldCSR);
    }
};
87
88
// Named scope guards, one per SSE rounding mode.

// Rounds toward zero while the guard is alive.
struct RoundTowardZeroForScope : RoundForScope<_MM_ROUND_TOWARD_ZERO> {};

// Rounds to nearest while the guard is alive.
struct RoundTowardNearestForScope : RoundForScope<_MM_ROUND_NEAREST> {};

// Rounds up while the guard is alive.
struct RoundUpForScope : RoundForScope<_MM_ROUND_UP> {};

// Rounds down while the guard is alive.
struct RoundDownForScope : RoundForScope<_MM_ROUND_DOWN> {};
103
104
// Lane count of the vector types in this implementation.
static const int ParallelSize = 8;

// Tags used as the TSubtype argument of VInt16 / VInt32.
enum Int16Subtype
{
    IntSubtype_Signed,              // signed
    IntSubtype_UnsignedFull,        // unsigned, full bit width
    IntSubtype_UnsignedTruncated,   // unsigned, one bit narrower (15 / 31 bits)
    IntSubtype_Abstract             // unknown signedness
};
113
114
template<int TSubtype>
115
struct VInt16
116
{
117
__m128i m_value;
118
119
inline VInt16 operator+(int16_t other) const
120
{
121
VInt16 result;
122
result.m_value = _mm_add_epi16(m_value, _mm_set1_epi16(static_cast<int16_t>(other)));
123
return result;
124
}
125
126
inline VInt16 operator+(const VInt16 &other) const
127
{
128
VInt16 result;
129
result.m_value = _mm_add_epi16(m_value, other.m_value);
130
return result;
131
}
132
133
inline VInt16 operator|(const VInt16 &other) const
134
{
135
VInt16 result;
136
result.m_value = _mm_or_si128(m_value, other.m_value);
137
return result;
138
}
139
140
inline VInt16 operator&(const VInt16 &other) const
141
{
142
VInt16 result;
143
result.m_value = _mm_and_si128(m_value, other.m_value);
144
return result;
145
}
146
147
inline VInt16 operator-(const VInt16 &other) const
148
{
149
VInt16 result;
150
result.m_value = _mm_sub_epi16(m_value, other.m_value);
151
return result;
152
}
153
154
inline VInt16 operator<<(int bits) const
155
{
156
VInt16 result;
157
result.m_value = _mm_slli_epi16(m_value, bits);
158
return result;
159
}
160
161
inline VInt16 operator^(const VInt16 &other) const
162
{
163
VInt16 result;
164
result.m_value = _mm_xor_si128(m_value, other.m_value);
165
return result;
166
}
167
};
168
169
typedef VInt16<IntSubtype_Signed> SInt16;
170
typedef VInt16<IntSubtype_UnsignedFull> UInt16;
171
typedef VInt16<IntSubtype_UnsignedTruncated> UInt15;
172
typedef VInt16<IntSubtype_Abstract> AInt16;
173
174
template<int TSubtype>
175
struct VInt32
176
{
177
__m128i m_values[2];
178
179
inline VInt32 operator+(const VInt32& other) const
180
{
181
VInt32 result;
182
result.m_values[0] = _mm_add_epi32(m_values[0], other.m_values[0]);
183
result.m_values[1] = _mm_add_epi32(m_values[1], other.m_values[1]);
184
return result;
185
}
186
187
inline VInt32 operator-(const VInt32& other) const
188
{
189
VInt32 result;
190
result.m_values[0] = _mm_sub_epi32(m_values[0], other.m_values[0]);
191
result.m_values[1] = _mm_sub_epi32(m_values[1], other.m_values[1]);
192
return result;
193
}
194
195
inline VInt32 operator<<(const int other) const
196
{
197
VInt32 result;
198
result.m_values[0] = _mm_slli_epi32(m_values[0], other);
199
result.m_values[1] = _mm_slli_epi32(m_values[1], other);
200
return result;
201
}
202
203
inline VInt32 operator|(const VInt32& other) const
204
{
205
VInt32 result;
206
result.m_values[0] = _mm_or_si128(m_values[0], other.m_values[0]);
207
result.m_values[1] = _mm_or_si128(m_values[1], other.m_values[1]);
208
return result;
209
}
210
};
211
212
typedef VInt32<IntSubtype_Signed> SInt32;
213
typedef VInt32<IntSubtype_UnsignedTruncated> UInt31;
214
typedef VInt32<IntSubtype_UnsignedFull> UInt32;
215
typedef VInt32<IntSubtype_Abstract> AInt32;
216
217
template<class TTargetType>
218
struct LosslessCast
219
{
220
#ifdef CVTT_PERMIT_ALIASING
221
template<int TSrcSubtype>
222
static const TTargetType& Cast(const VInt32<TSrcSubtype> &src)
223
{
224
return reinterpret_cast<VInt32<TSubtype>&>(src);
225
}
226
227
template<int TSrcSubtype>
228
static const TTargetType& Cast(const VInt16<TSrcSubtype> &src)
229
{
230
return reinterpret_cast<VInt16<TSubtype>&>(src);
231
}
232
#else
233
template<int TSrcSubtype>
234
static TTargetType Cast(const VInt32<TSrcSubtype> &src)
235
{
236
TTargetType result;
237
result.m_values[0] = src.m_values[0];
238
result.m_values[1] = src.m_values[1];
239
return result;
240
}
241
242
template<int TSrcSubtype>
243
static TTargetType Cast(const VInt16<TSrcSubtype> &src)
244
{
245
TTargetType result;
246
result.m_value = src.m_value;
247
return result;
248
}
249
#endif
250
};
251
252
// Storage for 64-bit integer lanes: four SSE2 registers.
struct Int64
{
    __m128i m_values[4];
};
256
257
// Packed single-precision lanes split across two SSE registers.
struct Float
{
    __m128 m_values[2];

    // Lane-wise addition.
    inline Float operator+(const Float &other) const
    {
        Float sum;
        for (int half = 0; half < 2; half++)
            sum.m_values[half] = _mm_add_ps(m_values[half], other.m_values[half]);
        return sum;
    }

    // Adds a scalar, broadcast to every lane.
    inline Float operator+(float other) const
    {
        const __m128 broadcast = _mm_set1_ps(other);
        Float sum;
        for (int half = 0; half < 2; half++)
            sum.m_values[half] = _mm_add_ps(m_values[half], broadcast);
        return sum;
    }

    // Lane-wise subtraction.
    inline Float operator-(const Float& other) const
    {
        Float difference;
        for (int half = 0; half < 2; half++)
            difference.m_values[half] = _mm_sub_ps(m_values[half], other.m_values[half]);
        return difference;
    }

    // Negation, computed as zero minus the value.
    inline Float operator-() const
    {
        const __m128 zero = _mm_setzero_ps();
        Float negated;
        for (int half = 0; half < 2; half++)
            negated.m_values[half] = _mm_sub_ps(zero, m_values[half]);
        return negated;
    }

    // Lane-wise multiplication.
    inline Float operator*(const Float& other) const
    {
        Float product;
        for (int half = 0; half < 2; half++)
            product.m_values[half] = _mm_mul_ps(m_values[half], other.m_values[half]);
        return product;
    }

    // Multiplies by a scalar, broadcast to every lane.
    inline Float operator*(float other) const
    {
        const __m128 broadcast = _mm_set1_ps(other);
        Float product;
        for (int half = 0; half < 2; half++)
            product.m_values[half] = _mm_mul_ps(m_values[half], broadcast);
        return product;
    }

    // Lane-wise division.
    inline Float operator/(const Float &other) const
    {
        Float quotient;
        for (int half = 0; half < 2; half++)
            quotient.m_values[half] = _mm_div_ps(m_values[half], other.m_values[half]);
        return quotient;
    }

    // Divides by a scalar, broadcast to every lane.
    inline Float operator/(float other) const
    {
        const __m128 broadcast = _mm_set1_ps(other);
        Float quotient;
        for (int half = 0; half < 2; half++)
            quotient.m_values[half] = _mm_div_ps(m_values[half], broadcast);
        return quotient;
    }
};
325
326
// Comparison mask for 16-bit lanes, held in one SSE2 register.
struct Int16CompFlag
{
    __m128i m_value;

    // Bitwise AND of two masks.
    inline Int16CompFlag operator&(const Int16CompFlag &other) const
    {
        Int16CompFlag both;
        both.m_value = _mm_and_si128(m_value, other.m_value);
        return both;
    }

    // Bitwise OR of two masks.
    inline Int16CompFlag operator|(const Int16CompFlag &other) const
    {
        Int16CompFlag either;
        either.m_value = _mm_or_si128(m_value, other.m_value);
        return either;
    }
};
344
345
// Comparison mask for 32-bit lanes, split across two SSE2 registers.
struct Int32CompFlag
{
    __m128i m_values[2];

    // Bitwise AND of two masks.
    inline Int32CompFlag operator&(const Int32CompFlag &other) const
    {
        Int32CompFlag both;
        for (int half = 0; half < 2; half++)
            both.m_values[half] = _mm_and_si128(m_values[half], other.m_values[half]);
        return both;
    }

    // Bitwise OR of two masks.
    inline Int32CompFlag operator|(const Int32CompFlag &other) const
    {
        Int32CompFlag either;
        for (int half = 0; half < 2; half++)
            either.m_values[half] = _mm_or_si128(m_values[half], other.m_values[half]);
        return either;
    }
};
365
366
// Comparison mask for float lanes, split across two SSE registers.
struct FloatCompFlag
{
    __m128 m_values[2];

    // Bitwise AND of two masks.
    inline FloatCompFlag operator&(const FloatCompFlag &other) const
    {
        FloatCompFlag both;
        for (int half = 0; half < 2; half++)
            both.m_values[half] = _mm_and_ps(m_values[half], other.m_values[half]);
        return both;
    }

    // Bitwise OR of two masks.
    inline FloatCompFlag operator|(const FloatCompFlag &other) const
    {
        FloatCompFlag either;
        for (int half = 0; half < 2; half++)
            either.m_values[half] = _mm_or_ps(m_values[half], other.m_values[half]);
        return either;
    }
};
386
387
template<int TSubtype>
388
static VInt16<TSubtype> AbstractAdd(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
389
{
390
VInt16<TSubtype> result;
391
result.m_value = _mm_add_epi16(a.m_value, b.m_value);
392
return result;
393
}
394
395
template<int TSubtype>
396
static VInt16<TSubtype> AbstractSubtract(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
397
{
398
VInt16<TSubtype> result;
399
result.m_value = _mm_sub_epi16(a.m_value, b.m_value);
400
return result;
401
}
402
403
static Float Select(const FloatCompFlag &flag, const Float &a, const Float &b)
404
{
405
Float result;
406
for (int i = 0; i < 2; i++)
407
result.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], a.m_values[i]), _mm_andnot_ps(flag.m_values[i], b.m_values[i]));
408
return result;
409
}
410
411
template<int TSubtype>
412
static VInt16<TSubtype> Select(const Int16CompFlag &flag, const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
413
{
414
VInt16<TSubtype> result;
415
result.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, a.m_value), _mm_andnot_si128(flag.m_value, b.m_value));
416
return result;
417
}
418
419
template<int TSubtype>
420
static VInt16<TSubtype> SelectOrZero(const Int16CompFlag &flag, const VInt16<TSubtype> &a)
421
{
422
VInt16<TSubtype> result;
423
result.m_value = _mm_and_si128(flag.m_value, a.m_value);
424
return result;
425
}
426
427
template<int TSubtype>
428
static void ConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src)
429
{
430
dest.m_value = _mm_or_si128(_mm_andnot_si128(flag.m_value, dest.m_value), _mm_and_si128(flag.m_value, src.m_value));
431
}
432
433
template<int TSubtype>
434
static void ConditionalSet(VInt32<TSubtype> &dest, const Int16CompFlag &flag, const VInt32<TSubtype> &src)
435
{
436
__m128i lowFlags = _mm_unpacklo_epi16(flag.m_value, flag.m_value);
437
__m128i highFlags = _mm_unpackhi_epi16(flag.m_value, flag.m_value);
438
dest.m_values[0] = _mm_or_si128(_mm_andnot_si128(lowFlags, dest.m_values[0]), _mm_and_si128(lowFlags, src.m_values[0]));
439
dest.m_values[1] = _mm_or_si128(_mm_andnot_si128(highFlags, dest.m_values[1]), _mm_and_si128(highFlags, src.m_values[1]));
440
}
441
442
static void ConditionalSet(ParallelMath::Int16CompFlag &dest, const Int16CompFlag &flag, const ParallelMath::Int16CompFlag &src)
443
{
444
dest.m_value = _mm_or_si128(_mm_andnot_si128(flag.m_value, dest.m_value), _mm_and_si128(flag.m_value, src.m_value));
445
}
446
447
// XORs v with the flag and adds the flag's top bit. For a lane whose flag is
// all ones this is (~v + 1); for a lane whose flag is zero v is unchanged.
static SInt16 ConditionalNegate(const Int16CompFlag &flag, const SInt16 &v)
{
    const __m128i flipped = _mm_xor_si128(flag.m_value, v.m_value);
    const __m128i plusOne = _mm_srli_epi16(flag.m_value, 15);

    SInt16 negated;
    negated.m_value = _mm_add_epi16(flipped, plusOne);
    return negated;
}
453
454
template<int TSubtype>
455
static void NotConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src)
456
{
457
dest.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, dest.m_value), _mm_andnot_si128(flag.m_value, src.m_value));
458
}
459
460
// Overwrites dest with src where flag is set; other bits of dest are kept.
static void ConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src)
{
    for (int half = 0; half < 2; half++)
    {
        const __m128 mask = flag.m_values[half];
        const __m128 kept = _mm_andnot_ps(mask, dest.m_values[half]);
        const __m128 taken = _mm_and_ps(mask, src.m_values[half]);
        dest.m_values[half] = _mm_or_ps(kept, taken);
    }
}
465
466
// Overwrites dest with src where flag is NOT set; dest is kept where it is set.
static void NotConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src)
{
    for (int half = 0; half < 2; half++)
    {
        const __m128 mask = flag.m_values[half];
        const __m128 kept = _mm_and_ps(mask, dest.m_values[half]);
        const __m128 taken = _mm_andnot_ps(mask, src.m_values[half]);
        dest.m_values[half] = _mm_or_ps(kept, taken);
    }
}
471
472
// Replaces every lane of v that compares equal to zero with 1.0f.
static void MakeSafeDenominator(Float& v)
{
    const FloatCompFlag isZero = Equal(v, MakeFloatZero());
    const Float one = MakeFloat(1.0f);
    ConditionalSet(v, isZero, one);
}
476
477
static SInt16 TruncateToPrecisionSigned(const SInt16 &v, int precision)
478
{
479
int lostBits = 16 - precision;
480
if (lostBits == 0)
481
return v;
482
483
SInt16 result;
484
result.m_value = _mm_srai_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits);
485
return result;
486
}
487
488
static UInt16 TruncateToPrecisionUnsigned(const UInt16 &v, int precision)
489
{
490
int lostBits = 16 - precision;
491
if (lostBits == 0)
492
return v;
493
494
UInt16 result;
495
result.m_value = _mm_srli_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits);
496
return result;
497
}
498
499
static UInt16 Min(const UInt16 &a, const UInt16 &b)
500
{
501
__m128i bitFlip = _mm_set1_epi16(-32768);
502
503
UInt16 result;
504
result.m_value = _mm_xor_si128(_mm_min_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip);
505
return result;
506
}
507
508
static SInt16 Min(const SInt16 &a, const SInt16 &b)
509
{
510
SInt16 result;
511
result.m_value = _mm_min_epi16(a.m_value, b.m_value);
512
return result;
513
}
514
515
static UInt15 Min(const UInt15 &a, const UInt15 &b)
516
{
517
UInt15 result;
518
result.m_value = _mm_min_epi16(a.m_value, b.m_value);
519
return result;
520
}
521
522
static Float Min(const Float &a, const Float &b)
523
{
524
Float result;
525
for (int i = 0; i < 2; i++)
526
result.m_values[i] = _mm_min_ps(a.m_values[i], b.m_values[i]);
527
return result;
528
}
529
530
static UInt16 Max(const UInt16 &a, const UInt16 &b)
531
{
532
__m128i bitFlip = _mm_set1_epi16(-32768);
533
534
UInt16 result;
535
result.m_value = _mm_xor_si128(_mm_max_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip);
536
return result;
537
}
538
539
static SInt16 Max(const SInt16 &a, const SInt16 &b)
540
{
541
SInt16 result;
542
result.m_value = _mm_max_epi16(a.m_value, b.m_value);
543
return result;
544
}
545
546
static UInt15 Max(const UInt15 &a, const UInt15 &b)
547
{
548
UInt15 result;
549
result.m_value = _mm_max_epi16(a.m_value, b.m_value);
550
return result;
551
}
552
553
static Float Max(const Float &a, const Float &b)
554
{
555
Float result;
556
for (int i = 0; i < 2; i++)
557
result.m_values[i] = _mm_max_ps(a.m_values[i], b.m_values[i]);
558
return result;
559
}
560
561
// Limits every lane from above by max, then from below by min.
static Float Clamp(const Float &v, float min, float max)
{
    Float clamped;
    for (int half = 0; half < 2; half++)
    {
        const __m128 capped = _mm_min_ps(v.m_values[half], _mm_set1_ps(max));
        clamped.m_values[half] = _mm_max_ps(capped, _mm_set1_ps(min));
    }
    return clamped;
}
568
569
// Lane-wise reciprocal using _mm_rcp_ps (an approximate reciprocal instruction).
static Float Reciprocal(const Float &v)
{
    Float inverse;
    inverse.m_values[0] = _mm_rcp_ps(v.m_values[0]);
    inverse.m_values[1] = _mm_rcp_ps(v.m_values[1]);
    return inverse;
}
576
577
static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, UInt15 &chOut)
578
{
579
int16_t values[8];
580
for (int i = 0; i < 8; i++)
581
values[i] = inputBlocks[i].m_pixels[pxOffset][channel];
582
583
chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]);
584
}
585
586
static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, SInt16 &chOut)
587
{
588
int16_t values[8];
589
for (int i = 0; i < 8; i++)
590
values[i] = inputBlocks[i].m_pixels[pxOffset][channel];
591
592
chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]);
593
}
594
595
// Builds a Float with v in every lane.
static Float MakeFloat(float v)
{
    const __m128 splat = _mm_set1_ps(v);

    Float f;
    f.m_values[0] = splat;
    f.m_values[1] = splat;
    return f;
}

// Builds a Float with every lane set to zero.
static Float MakeFloatZero()
{
    const __m128 zero = _mm_setzero_ps();

    Float f;
    f.m_values[0] = zero;
    f.m_values[1] = zero;
    return f;
}
608
609
static UInt16 MakeUInt16(uint16_t v)
610
{
611
UInt16 result;
612
result.m_value = _mm_set1_epi16(static_cast<short>(v));
613
return result;
614
}
615
616
static SInt16 MakeSInt16(int16_t v)
617
{
618
SInt16 result;
619
result.m_value = _mm_set1_epi16(static_cast<short>(v));
620
return result;
621
}
622
623
static AInt16 MakeAInt16(int16_t v)
624
{
625
AInt16 result;
626
result.m_value = _mm_set1_epi16(static_cast<short>(v));
627
return result;
628
}
629
630
static UInt15 MakeUInt15(uint16_t v)
631
{
632
UInt15 result;
633
result.m_value = _mm_set1_epi16(static_cast<short>(v));
634
return result;
635
}
636
637
static SInt32 MakeSInt32(int32_t v)
638
{
639
SInt32 result;
640
result.m_values[0] = _mm_set1_epi32(v);
641
result.m_values[1] = _mm_set1_epi32(v);
642
return result;
643
}
644
645
static UInt31 MakeUInt31(uint32_t v)
646
{
647
UInt31 result;
648
result.m_values[0] = _mm_set1_epi32(v);
649
result.m_values[1] = _mm_set1_epi32(v);
650
return result;
651
}
652
653
static uint16_t Extract(const UInt16 &v, int offset)
654
{
655
return reinterpret_cast<const uint16_t*>(&v.m_value)[offset];
656
}
657
658
static int16_t Extract(const SInt16 &v, int offset)
659
{
660
return reinterpret_cast<const int16_t*>(&v.m_value)[offset];
661
}
662
663
static uint16_t Extract(const UInt15 &v, int offset)
664
{
665
return reinterpret_cast<const uint16_t*>(&v.m_value)[offset];
666
}
667
668
static int16_t Extract(const AInt16 &v, int offset)
669
{
670
return reinterpret_cast<const int16_t*>(&v.m_value)[offset];
671
}
672
673
static int32_t Extract(const SInt32 &v, int offset)
674
{
675
return reinterpret_cast<const int32_t*>(&v.m_values[offset >> 2])[offset & 3];
676
}
677
678
static float Extract(const Float &v, int offset)
679
{
680
return reinterpret_cast<const float*>(&v.m_values[offset >> 2])[offset & 3];
681
}
682
683
static bool Extract(const ParallelMath::Int16CompFlag &v, int offset)
684
{
685
return reinterpret_cast<const int16_t*>(&v.m_value)[offset] != 0;
686
}
687
688
static void PutUInt16(UInt16 &dest, int offset, uint16_t v)
689
{
690
reinterpret_cast<uint16_t*>(&dest)[offset] = v;
691
}
692
693
static void PutUInt15(UInt15 &dest, int offset, uint16_t v)
694
{
695
reinterpret_cast<uint16_t*>(&dest)[offset] = v;
696
}
697
698
static void PutSInt16(SInt16 &dest, int offset, int16_t v)
699
{
700
reinterpret_cast<int16_t*>(&dest)[offset] = v;
701
}
702
703
static float ExtractFloat(const Float& v, int offset)
704
{
705
return reinterpret_cast<const float*>(&v)[offset];
706
}
707
708
static void PutFloat(Float &dest, int offset, float v)
709
{
710
reinterpret_cast<float*>(&dest)[offset] = v;
711
}
712
713
// Sets lane `offset` of a comparison mask: -1 (all bits set) for true, 0 for false.
static void PutBoolInt16(Int16CompFlag &dest, int offset, bool v)
{
    int16_t *lanes = reinterpret_cast<int16_t*>(&dest);
    if (v)
        lanes[offset] = -1;
    else
        lanes[offset] = 0;
}
717
718
static Int32CompFlag Less(const UInt31 &a, const UInt31 &b)
719
{
720
Int32CompFlag result;
721
result.m_values[0] = _mm_cmplt_epi32(a.m_values[0], b.m_values[0]);
722
result.m_values[1] = _mm_cmplt_epi32(a.m_values[1], b.m_values[1]);
723
return result;
724
}
725
726
static Int16CompFlag Less(const SInt16 &a, const SInt16 &b)
727
{
728
Int16CompFlag result;
729
result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
730
return result;
731
}
732
733
static Int16CompFlag Less(const UInt15 &a, const UInt15 &b)
734
{
735
Int16CompFlag result;
736
result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
737
return result;
738
}
739
740
// Lane-wise a <= b for 15-bit unsigned values.
// SSE2 has no 16-bit "less or equal" compare, so the result is computed as
// NOT (a > b): the signed greater-than mask is inverted with an all-ones XOR.
// A plain _mm_cmplt_epi16 would report false for equal lanes.
static Int16CompFlag LessOrEqual(const UInt15 &a, const UInt15 &b)
{
    const __m128i isGreater = _mm_cmpgt_epi16(a.m_value, b.m_value);

    Int16CompFlag result;
    result.m_value = _mm_xor_si128(isGreater, _mm_set1_epi16(-1));
    return result;
}
746
747
// Lane-wise a < b for floats.
static FloatCompFlag Less(const Float &a, const Float &b)
{
    FloatCompFlag isLess;
    isLess.m_values[0] = _mm_cmplt_ps(a.m_values[0], b.m_values[0]);
    isLess.m_values[1] = _mm_cmplt_ps(a.m_values[1], b.m_values[1]);
    return isLess;
}

// Lane-wise a <= b for floats.
static FloatCompFlag LessOrEqual(const Float &a, const Float &b)
{
    FloatCompFlag isLessOrEqual;
    isLessOrEqual.m_values[0] = _mm_cmple_ps(a.m_values[0], b.m_values[0]);
    isLessOrEqual.m_values[1] = _mm_cmple_ps(a.m_values[1], b.m_values[1]);
    return isLessOrEqual;
}
762
763
template<int TSubtype>
764
static Int16CompFlag Equal(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
765
{
766
Int16CompFlag result;
767
result.m_value = _mm_cmpeq_epi16(a.m_value, b.m_value);
768
return result;
769
}
770
771
static FloatCompFlag Equal(const Float &a, const Float &b)
772
{
773
FloatCompFlag result;
774
for (int i = 0; i < 2; i++)
775
result.m_values[i] = _mm_cmpeq_ps(a.m_values[i], b.m_values[i]);
776
return result;
777
}
778
779
static Int16CompFlag Equal(const Int16CompFlag &a, const Int16CompFlag &b)
780
{
781
Int16CompFlag notResult;
782
notResult.m_value = _mm_xor_si128(a.m_value, b.m_value);
783
return Not(notResult);
784
}
785
786
// Converts unsigned 16-bit lanes to float: zero-extend to 32 bits, then convert.
static Float ToFloat(const UInt16 &v)
{
    const __m128i zero = _mm_setzero_si128();
    const __m128i lowWide = _mm_unpacklo_epi16(v.m_value, zero);
    const __m128i highWide = _mm_unpackhi_epi16(v.m_value, zero);

    Float converted;
    converted.m_values[0] = _mm_cvtepi32_ps(lowWide);
    converted.m_values[1] = _mm_cvtepi32_ps(highWide);
    return converted;
}
793
794
// Zero-extends unsigned 16-bit lanes to 32-bit lanes.
static UInt31 ToUInt31(const UInt16 &v)
{
    const __m128i zero = _mm_setzero_si128();

    UInt31 widened;
    widened.m_values[0] = _mm_unpacklo_epi16(v.m_value, zero);
    widened.m_values[1] = _mm_unpackhi_epi16(v.m_value, zero);
    return widened;
}
801
802
// Zero-extends unsigned 16-bit lanes to 32-bit lanes.
static SInt32 ToInt32(const UInt16 &v)
{
    const __m128i zero = _mm_setzero_si128();

    SInt32 widened;
    widened.m_values[0] = _mm_unpacklo_epi16(v.m_value, zero);
    widened.m_values[1] = _mm_unpackhi_epi16(v.m_value, zero);
    return widened;
}

// Zero-extends 15-bit unsigned lanes to 32-bit lanes.
static SInt32 ToInt32(const UInt15 &v)
{
    const __m128i zero = _mm_setzero_si128();

    SInt32 widened;
    widened.m_values[0] = _mm_unpacklo_epi16(v.m_value, zero);
    widened.m_values[1] = _mm_unpackhi_epi16(v.m_value, zero);
    return widened;
}

// Sign-extends signed 16-bit lanes to 32-bit lanes: each value is placed in
// the upper half of a 32-bit lane, then shifted right arithmetically by 16.
static SInt32 ToInt32(const SInt16 &v)
{
    const __m128i zero = _mm_setzero_si128();
    const __m128i lowInUpperHalf = _mm_unpacklo_epi16(zero, v.m_value);
    const __m128i highInUpperHalf = _mm_unpackhi_epi16(zero, v.m_value);

    SInt32 widened;
    widened.m_values[0] = _mm_srai_epi32(lowInUpperHalf, 16);
    widened.m_values[1] = _mm_srai_epi32(highInUpperHalf, 16);
    return widened;
}
825
826
// Converts signed 16-bit lanes to float: sign-extend to 32 bits, then convert.
static Float ToFloat(const SInt16 &v)
{
    const __m128i zero = _mm_setzero_si128();
    const __m128i lowWide = _mm_srai_epi32(_mm_unpacklo_epi16(zero, v.m_value), 16);
    const __m128i highWide = _mm_srai_epi32(_mm_unpackhi_epi16(zero, v.m_value), 16);

    Float converted;
    converted.m_values[0] = _mm_cvtepi32_ps(lowWide);
    converted.m_values[1] = _mm_cvtepi32_ps(highWide);
    return converted;
}

// Converts 15-bit unsigned lanes to float: zero-extend to 32 bits, then convert.
static Float ToFloat(const UInt15 &v)
{
    const __m128i zero = _mm_setzero_si128();
    const __m128i lowWide = _mm_unpacklo_epi16(v.m_value, zero);
    const __m128i highWide = _mm_unpackhi_epi16(v.m_value, zero);

    Float converted;
    converted.m_values[0] = _mm_cvtepi32_ps(lowWide);
    converted.m_values[1] = _mm_cvtepi32_ps(highWide);
    return converted;
}

// Converts 31-bit unsigned lanes to float with the signed 32-bit conversion.
static Float ToFloat(const UInt31 &v)
{
    Float converted;
    for (int half = 0; half < 2; half++)
        converted.m_values[half] = _mm_cvtepi32_ps(v.m_values[half]);
    return converted;
}
849
850
static Int16CompFlag FloatFlagToInt16(const FloatCompFlag &v)
851
{
852
__m128i lo = _mm_castps_si128(v.m_values[0]);
853
__m128i hi = _mm_castps_si128(v.m_values[1]);
854
855
Int16CompFlag result;
856
result.m_value = _mm_packs_epi32(lo, hi);
857
return result;
858
}
859
860
static FloatCompFlag Int16FlagToFloat(const Int16CompFlag &v)
861
{
862
__m128i lo = _mm_unpacklo_epi16(v.m_value, v.m_value);
863
__m128i hi = _mm_unpackhi_epi16(v.m_value, v.m_value);
864
865
FloatCompFlag result;
866
result.m_values[0] = _mm_castsi128_ps(lo);
867
result.m_values[1] = _mm_castsi128_ps(hi);
868
return result;
869
}
870
871
static Int16CompFlag Int32FlagToInt16(const Int32CompFlag &v)
872
{
873
__m128i lo = v.m_values[0];
874
__m128i hi = v.m_values[1];
875
876
Int16CompFlag result;
877
result.m_value = _mm_packs_epi32(lo, hi);
878
return result;
879
}
880
881
static Int16CompFlag MakeBoolInt16(bool b)
882
{
883
Int16CompFlag result;
884
if (b)
885
result.m_value = _mm_set1_epi16(-1);
886
else
887
result.m_value = _mm_setzero_si128();
888
return result;
889
}
890
891
static FloatCompFlag MakeBoolFloat(bool b)
892
{
893
FloatCompFlag result;
894
if (b)
895
result.m_values[0] = result.m_values[1] = _mm_castsi128_ps(_mm_set1_epi32(-1));
896
else
897
result.m_values[0] = result.m_values[1] = _mm_setzero_ps();
898
return result;
899
}
900
901
static Int16CompFlag AndNot(const Int16CompFlag &a, const Int16CompFlag &b)
902
{
903
Int16CompFlag result;
904
result.m_value = _mm_andnot_si128(b.m_value, a.m_value);
905
return result;
906
}
907
908
static Int16CompFlag Not(const Int16CompFlag &b)
909
{
910
Int16CompFlag result;
911
result.m_value = _mm_xor_si128(b.m_value, _mm_set1_epi32(-1));
912
return result;
913
}
914
915
static Int32CompFlag Not(const Int32CompFlag &b)
916
{
917
Int32CompFlag result;
918
result.m_values[0] = _mm_xor_si128(b.m_values[0], _mm_set1_epi32(-1));
919
result.m_values[1] = _mm_xor_si128(b.m_values[1], _mm_set1_epi32(-1));
920
return result;
921
}
922
923
// Converts float lanes to unsigned 16-bit lanes. Each value is offset by
// -32768 so it can pass through the signed 32-to-16-bit pack, and the offset
// is undone afterwards by flipping the top bit. The second parameter is unused.
static UInt16 RoundAndConvertToU16(const Float &v, const void* /*roundingMode*/)
{
    const __m128 bias = _mm_set1_ps(-32768);
    const __m128i lowBiased = _mm_cvtps_epi32(_mm_add_ps(v.m_values[0], bias));
    const __m128i highBiased = _mm_cvtps_epi32(_mm_add_ps(v.m_values[1], bias));
    const __m128i narrowed = _mm_packs_epi32(lowBiased, highBiased);

    UInt16 converted;
    converted.m_value = _mm_xor_si128(narrowed, _mm_set1_epi16(-32768));
    return converted;
}
934
935
// Converts float lanes to 15-bit unsigned lanes: convert each register to
// 32-bit integers, then pack both into 16-bit lanes with signed saturation.
// The second parameter is unused.
static UInt15 RoundAndConvertToU15(const Float &v, const void* /*roundingMode*/)
{
    __m128i lo = _mm_cvtps_epi32(v.m_values[0]);
    __m128i hi = _mm_cvtps_epi32(v.m_values[1]);

    // Pack once and return that value directly.
    UInt15 result;
    result.m_value = _mm_packs_epi32(lo, hi);
    return result;
}
946
947
// Converts float lanes to signed 16-bit lanes: convert each register to
// 32-bit integers, then pack both into 16-bit lanes with signed saturation.
// The second parameter is unused.
static SInt16 RoundAndConvertToS16(const Float &v, const void* /*roundingMode*/)
{
    __m128i lo = _mm_cvtps_epi32(v.m_values[0]);
    __m128i hi = _mm_cvtps_epi32(v.m_values[1]);

    // Pack once and return that value directly.
    SInt16 result;
    result.m_value = _mm_packs_epi32(lo, hi);
    return result;
}
958
959
// Lane-wise square root.
static Float Sqrt(const Float &f)
{
    Float root;
    root.m_values[0] = _mm_sqrt_ps(f.m_values[0]);
    root.m_values[1] = _mm_sqrt_ps(f.m_values[1]);
    return root;
}
966
967
static UInt16 Abs(const SInt16 &a)
968
{
969
__m128i signBitsXor = _mm_srai_epi16(a.m_value, 15);
970
__m128i signBitsAdd = _mm_srli_epi16(a.m_value, 15);
971
972
UInt16 result;
973
result.m_value = _mm_add_epi16(_mm_xor_si128(a.m_value, signBitsXor), signBitsAdd);
974
return result;
975
}
976
977
static Float Abs(const Float& a)
978
{
979
__m128 invMask = _mm_set1_ps(-0.0f);
980
981
Float result;
982
result.m_values[0] = _mm_andnot_ps(invMask, a.m_values[0]);
983
result.m_values[1] = _mm_andnot_ps(invMask, a.m_values[1]);
984
return result;
985
}
986
987
// Squared difference of two lanes, keeping the low 16 bits of the product.
static UInt16 SqDiffUInt8(const UInt15 &a, const UInt15 &b)
{
    const __m128i delta = _mm_sub_epi16(a.m_value, b.m_value);

    UInt16 squared;
    squared.m_value = _mm_mullo_epi16(delta, delta);
    return squared;
}
995
996
// Squared difference of signed 16-bit lanes, returned as float. The distance
// is formed as max - min, squared to a full 32-bit product (high and low
// 16-bit halves interleaved), then converted with the signed 32-bit conversion.
static Float SqDiffSInt16(const SInt16 &a, const SInt16 &b)
{
    const __m128i larger = _mm_max_epi16(a.m_value, b.m_value);
    const __m128i smaller = _mm_min_epi16(a.m_value, b.m_value);
    const __m128i distance = _mm_sub_epi16(larger, smaller);

    const __m128i productHigh = _mm_mulhi_epu16(distance, distance);
    const __m128i productLow = _mm_mullo_epi16(distance, distance);
    const __m128i squaredUpper = _mm_unpackhi_epi16(productLow, productHigh);
    const __m128i squaredLower = _mm_unpacklo_epi16(productLow, productHigh);

    Float converted;
    converted.m_values[0] = _mm_cvtepi32_ps(squaredLower);
    converted.m_values[1] = _mm_cvtepi32_ps(squaredUpper);
    return converted;
}
1011
1012
// Expands 16-bit half-float bit patterns into 32-bit floats by rearranging
// the sign, exponent and mantissa fields into the float layout.
//
// NOTE(review): this function previously also computed the absolute value of
// v but never used it; that dead computation has been removed and results are
// unchanged. The mantissa and exponent fields are masked from the raw bits of
// v. Confirm whether the magnitude was meant to be used for negative inputs.
static Float TwosCLHalfToFloat(const SInt16 &v)
{
    __m128i signBits = _mm_and_si128(v.m_value, _mm_set1_epi16(-32768));
    __m128i mantissa = _mm_and_si128(v.m_value, _mm_set1_epi16(0x03ff));
    __m128i exponent = _mm_and_si128(v.m_value, _mm_set1_epi16(0x7c00));

    // Lanes whose exponent field is zero get a correction subtracted below.
    __m128i isDenormal = _mm_cmpeq_epi16(exponent, _mm_setzero_si128());

    // Convert exponent to high-bits: move the field into its position within
    // the upper 16 bits of a float and add the constant 14336 (112 << 7).
    exponent = _mm_add_epi16(_mm_srli_epi16(exponent, 3), _mm_set1_epi16(14336));

    // Upper 16 bits of the correction value, non-zero only in flagged lanes.
    __m128i denormalCorrectionHigh = _mm_and_si128(isDenormal, _mm_or_si128(signBits, _mm_set1_epi16(14336)));

    // Upper and lower 16-bit halves of the float bit pattern.
    __m128i highBits = _mm_or_si128(signBits, _mm_or_si128(exponent, _mm_srli_epi16(mantissa, 3)));
    __m128i lowBits = _mm_slli_epi16(mantissa, 13);

    // Interleave the halves into 32-bit lanes.
    __m128i flow = _mm_unpacklo_epi16(lowBits, highBits);
    __m128i fhigh = _mm_unpackhi_epi16(lowBits, highBits);

    __m128i correctionLow = _mm_unpacklo_epi16(_mm_setzero_si128(), denormalCorrectionHigh);
    __m128i correctionHigh = _mm_unpackhi_epi16(_mm_setzero_si128(), denormalCorrectionHigh);

    // The correction is zero in unflagged lanes, so subtracting it there
    // leaves the value unchanged.
    Float result;
    result.m_values[0] = _mm_sub_ps(_mm_castsi128_ps(flow), _mm_castsi128_ps(correctionLow));
    result.m_values[1] = _mm_sub_ps(_mm_castsi128_ps(fhigh), _mm_castsi128_ps(correctionHigh));

    return result;
}
1042
1043
// Squared difference between a (expanded with TwosCLHalfToFloat) and b.
static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)
{
    const Float delta = TwosCLHalfToFloat(a) - b;
    return delta * delta;
}

// Squared difference between a and b, both expanded with TwosCLHalfToFloat.
static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)
{
    const Float expandedA = TwosCLHalfToFloat(a);
    const Float expandedB = TwosCLHalfToFloat(b);

    const Float delta = expandedA - expandedB;
    return delta * delta;
}

// Squared difference between a (expanded, then scaled by aWeight) and b.
static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)
{
    const Float weightedA = TwosCLHalfToFloat(a) * aWeight;

    const Float delta = weightedA - b;
    return delta * delta;
}
1067
1068
// Logical (zero-filling) right shift of every 16-bit lane by `bits`.
static UInt16 RightShift(const UInt16 &v, int bits)
{
    UInt16 shifted;
    shifted.m_value = _mm_srli_epi16(v.m_value, bits);
    return shifted;
}
1074
1075
// Logical (zero-filling) right shift of every 32-bit lane by `bits`,
// applied to both register halves.
static UInt31 RightShift(const UInt31 &v, int bits)
{
    UInt31 shifted;
    for (int half = 0; half < 2; half++)
        shifted.m_values[half] = _mm_srli_epi32(v.m_values[half], bits);
    return shifted;
}
1082
1083
// Arithmetic (sign-preserving) right shift of every 16-bit lane by `bits`.
static SInt16 RightShift(const SInt16 &v, int bits)
{
    SInt16 shifted;
    shifted.m_value = _mm_srai_epi16(v.m_value, bits);
    return shifted;
}
1089
1090
// Logical (zero-filling) right shift of every 16-bit lane by `bits`.
static UInt15 RightShift(const UInt15 &v, int bits)
{
    UInt15 shifted;
    shifted.m_value = _mm_srli_epi16(v.m_value, bits);
    return shifted;
}
1096
1097
// Arithmetic (sign-preserving) right shift of every 32-bit lane by `bits`,
// applied to both register halves.
static SInt32 RightShift(const SInt32 &v, int bits)
{
    SInt32 shifted;
    for (int half = 0; half < 2; half++)
        shifted.m_values[half] = _mm_srai_epi32(v.m_values[half], bits);
    return shifted;
}
1104
1105
// Narrows the two 32-bit halves into one register of 16-bit lanes using a
// signed saturating pack.
static SInt16 ToSInt16(const SInt32 &v)
{
    SInt16 narrowed;
    narrowed.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
    return narrowed;
}
1111
1112
// Reinterprets the lanes as SInt16; the register contents are copied as-is.
static SInt16 ToSInt16(const UInt16 &v)
{
    SInt16 reinterpreted;
    reinterpreted.m_value = v.m_value;
    return reinterpreted;
}
1118
1119
// Reinterprets the lanes as SInt16; the register contents are copied as-is.
static SInt16 ToSInt16(const UInt15 &v)
{
    SInt16 reinterpreted;
    reinterpreted.m_value = v.m_value;
    return reinterpreted;
}
1125
1126
// Keeps the low 16 bits of every 32-bit lane.
static UInt16 ToUInt16(const UInt32 &v)
{
    // Sign-extend the low 16 bits of each lane first, so the signed
    // saturating pack below never clamps and passes those bits through.
    const __m128i extendedLow = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16);
    const __m128i extendedHigh = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16);

    UInt16 narrowed;
    narrowed.m_value = _mm_packs_epi32(extendedLow, extendedHigh);
    return narrowed;
}
1135
1136
// Keeps the low 16 bits of every 32-bit lane.
static UInt16 ToUInt16(const UInt31 &v)
{
    // Sign-extend the low 16 bits of each lane first, so the signed
    // saturating pack below never clamps and passes those bits through.
    const __m128i extendedLow = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16);
    const __m128i extendedHigh = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16);

    UInt16 narrowed;
    narrowed.m_value = _mm_packs_epi32(extendedLow, extendedHigh);
    return narrowed;
}
1145
1146
// Narrows the two 32-bit halves into 16-bit lanes with a signed saturating
// pack, so lanes above 32767 clamp to 32767.
static UInt15 ToUInt15(const UInt31 &v)
{
    UInt15 narrowed;
    narrowed.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
    return narrowed;
}
1152
1153
// Reinterprets the lanes as UInt15; the register contents are copied as-is
// (no range check is performed here).
static UInt15 ToUInt15(const SInt16 &v)
{
    UInt15 reinterpreted;
    reinterpreted.m_value = v.m_value;
    return reinterpreted;
}
1159
1160
// Reinterprets the lanes as UInt15; the register contents are copied as-is
// (no range check is performed here).
static UInt15 ToUInt15(const UInt16 &v)
{
    UInt15 reinterpreted;
    reinterpreted.m_value = v.m_value;
    return reinterpreted;
}
1166
1167
// Widening signed multiply: 16-bit lanes in, full 32-bit products out.
static SInt32 XMultiply(const SInt16 &a, const SInt16 &b)
{
    // Upper and lower 16 bits of each signed 16x16 product.
    const __m128i productHigh = _mm_mulhi_epi16(a.m_value, b.m_value);
    const __m128i productLow = _mm_mullo_epi16(a.m_value, b.m_value);

    // Interleave the halves to rebuild the 32-bit products.
    SInt32 product;
    product.m_values[0] = _mm_unpacklo_epi16(productLow, productHigh);
    product.m_values[1] = _mm_unpackhi_epi16(productLow, productHigh);
    return product;
}
1177
1178
// Widening multiply of signed 16-bit lanes by UInt15 lanes; both operands
// are treated as signed 16-bit values by the multiply instructions.
static SInt32 XMultiply(const SInt16 &a, const UInt15 &b)
{
    // Upper and lower 16 bits of each signed 16x16 product.
    const __m128i productHigh = _mm_mulhi_epi16(a.m_value, b.m_value);
    const __m128i productLow = _mm_mullo_epi16(a.m_value, b.m_value);

    // Interleave the halves to rebuild the 32-bit products.
    SInt32 product;
    product.m_values[0] = _mm_unpacklo_epi16(productLow, productHigh);
    product.m_values[1] = _mm_unpackhi_epi16(productLow, productHigh);
    return product;
}
1188
1189
// Argument-swapped convenience overload; forwards to
// XMultiply(SInt16, UInt15).
static SInt32 XMultiply(const UInt15 &a, const SInt16 &b)
{
    SInt32 product = XMultiply(b, a);
    return product;
}
1193
1194
// Widening unsigned multiply: 16-bit lanes in, full 32-bit products out.
static UInt32 XMultiply(const UInt16 &a, const UInt16 &b)
{
    // Upper half uses the unsigned multiply; the lower 16 bits are the
    // same for signed and unsigned products.
    const __m128i productHigh = _mm_mulhi_epu16(a.m_value, b.m_value);
    const __m128i productLow = _mm_mullo_epi16(a.m_value, b.m_value);

    // Interleave the halves to rebuild the 32-bit products.
    UInt32 product;
    product.m_values[0] = _mm_unpacklo_epi16(productLow, productHigh);
    product.m_values[1] = _mm_unpackhi_epi16(productLow, productHigh);
    return product;
}
1204
1205
// Lane-wise multiply that keeps only the low 16 bits of each product.
static UInt16 CompactMultiply(const UInt16 &a, const UInt15 &b)
{
    UInt16 product;
    product.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
    return product;
}
1211
1212
// Lane-wise multiply that keeps only the low 16 bits of each product.
static UInt16 CompactMultiply(const UInt15 &a, const UInt15 &b)
{
    UInt16 product;
    product.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
    return product;
}
1218
1219
// Lane-wise multiply that keeps only the low 16 bits of each product.
static SInt16 CompactMultiply(const SInt16 &a, const UInt15 &b)
{
    SInt16 product;
    product.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
    return product;
}
1225
1226
// Lane-wise multiply that keeps only the low 16 bits of each product.
static SInt16 CompactMultiply(const SInt16 &a, const SInt16 &b)
{
    SInt16 product;
    product.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
    return product;
}
1232
1233
// Widening unsigned multiply of two UInt15 operands into 32-bit lanes.
static UInt31 XMultiply(const UInt15 &a, const UInt15 &b)
{
    // Upper and lower 16 bits of each unsigned 16x16 product.
    const __m128i productHigh = _mm_mulhi_epu16(a.m_value, b.m_value);
    const __m128i productLow = _mm_mullo_epi16(a.m_value, b.m_value);

    // Interleave the halves to rebuild the 32-bit products.
    UInt31 product;
    product.m_values[0] = _mm_unpacklo_epi16(productLow, productHigh);
    product.m_values[1] = _mm_unpackhi_epi16(productLow, productHigh);
    return product;
}
1243
1244
// Widening unsigned multiply of a UInt16 operand by a UInt15 operand into
// 32-bit lanes.
static UInt31 XMultiply(const UInt16 &a, const UInt15 &b)
{
    // Upper and lower 16 bits of each unsigned 16x16 product.
    const __m128i productHigh = _mm_mulhi_epu16(a.m_value, b.m_value);
    const __m128i productLow = _mm_mullo_epi16(a.m_value, b.m_value);

    // Interleave the halves to rebuild the 32-bit products.
    UInt31 product;
    product.m_values[0] = _mm_unpacklo_epi16(productLow, productHigh);
    product.m_values[1] = _mm_unpackhi_epi16(productLow, productHigh);
    return product;
}
1254
1255
// Argument-swapped convenience overload; forwards to
// XMultiply(UInt16, UInt15).
static UInt31 XMultiply(const UInt15 &a, const UInt16 &b)
{
    UInt31 product = XMultiply(b, a);
    return product;
}
1259
1260
static bool AnySet(const Int16CompFlag &v)
1261
{
1262
return _mm_movemask_epi8(v.m_value) != 0;
1263
}
1264
1265
static bool AllSet(const Int16CompFlag &v)
1266
{
1267
return _mm_movemask_epi8(v.m_value) == 0xffff;
1268
}
1269
1270
static bool AnySet(const FloatCompFlag &v)
1271
{
1272
return _mm_movemask_ps(v.m_values[0]) != 0 || _mm_movemask_ps(v.m_values[1]) != 0;
1273
}
1274
1275
static bool AllSet(const FloatCompFlag &v)
1276
{
1277
return _mm_movemask_ps(v.m_values[0]) == 0xf && _mm_movemask_ps(v.m_values[1]) == 0xf;
1278
}
1279
};
1280
1281
#else
1282
// Scalar version
1283
struct ParallelMath
1284
{
1285
struct RoundTowardZeroForScope
1286
{
1287
};
1288
1289
struct RoundTowardNearestForScope
1290
{
1291
};
1292
1293
struct RoundUpForScope
1294
{
1295
};
1296
1297
struct RoundDownForScope
1298
{
1299
};
1300
1301
static const int ParallelSize = 1;
1302
1303
enum Int16Subtype
1304
{
1305
IntSubtype_Signed,
1306
IntSubtype_UnsignedFull,
1307
IntSubtype_UnsignedTruncated,
1308
IntSubtype_Abstract,
1309
};
1310
1311
typedef int32_t SInt16;
1312
typedef int32_t UInt15;
1313
typedef int32_t UInt16;
1314
typedef int32_t AInt16;
1315
1316
typedef int32_t SInt32;
1317
typedef int32_t UInt31;
1318
typedef int32_t UInt32;
1319
typedef int32_t AInt32;
1320
1321
typedef int32_t ScalarUInt16;
1322
typedef int32_t ScalarSInt16;
1323
1324
typedef float Float;
1325
1326
template<class TTargetType>
1327
struct LosslessCast
1328
{
1329
static const int32_t& Cast(const int32_t &src)
1330
{
1331
return src;
1332
}
1333
};
1334
1335
typedef bool Int16CompFlag;
1336
typedef bool FloatCompFlag;
1337
1338
static int32_t AbstractAdd(const int32_t &a, const int32_t &b)
1339
{
1340
return a + b;
1341
}
1342
1343
static int32_t AbstractSubtract(const int32_t &a, const int32_t &b)
1344
{
1345
return a - b;
1346
}
1347
1348
static float Select(bool flag, float a, float b)
1349
{
1350
return flag ? a : b;
1351
}
1352
1353
static int32_t Select(bool flag, int32_t a, int32_t b)
1354
{
1355
return flag ? a : b;
1356
}
1357
1358
static int32_t SelectOrZero(bool flag, int32_t a)
1359
{
1360
return flag ? a : 0;
1361
}
1362
1363
static void ConditionalSet(int32_t& dest, bool flag, int32_t src)
1364
{
1365
if (flag)
1366
dest = src;
1367
}
1368
1369
static void ConditionalSet(bool& dest, bool flag, bool src)
1370
{
1371
if (flag)
1372
dest = src;
1373
}
1374
1375
static int32_t ConditionalNegate(bool flag, int32_t v)
1376
{
1377
return (flag) ? -v : v;
1378
}
1379
1380
static void NotConditionalSet(int32_t& dest, bool flag, int32_t src)
1381
{
1382
if (!flag)
1383
dest = src;
1384
}
1385
1386
static void ConditionalSet(float& dest, bool flag, float src)
1387
{
1388
if (flag)
1389
dest = src;
1390
}
1391
1392
static void NotConditionalSet(float& dest, bool flag, float src)
1393
{
1394
if (!flag)
1395
dest = src;
1396
}
1397
1398
static void MakeSafeDenominator(float& v)
1399
{
1400
if (v == 0.0f)
1401
v = 1.0f;
1402
}
1403
1404
static int32_t SignedRightShift(int32_t v, int bits)
1405
{
1406
return v >> bits;
1407
}
1408
1409
static int32_t TruncateToPrecisionSigned(int32_t v, int precision)
1410
{
1411
v = (v << (32 - precision)) & 0xffffffff;
1412
return SignedRightShift(v, 32 - precision);
1413
}
1414
1415
static int32_t TruncateToPrecisionUnsigned(int32_t v, int precision)
1416
{
1417
return v & ((1 << precision) - 1);
1418
}
1419
1420
static int32_t Min(int32_t a, int32_t b)
1421
{
1422
if (a < b)
1423
return a;
1424
return b;
1425
}
1426
1427
static float Min(float a, float b)
1428
{
1429
if (a < b)
1430
return a;
1431
return b;
1432
}
1433
1434
static int32_t Max(int32_t a, int32_t b)
1435
{
1436
if (a > b)
1437
return a;
1438
return b;
1439
}
1440
1441
static float Max(float a, float b)
1442
{
1443
if (a > b)
1444
return a;
1445
return b;
1446
}
1447
1448
static float Abs(float a)
1449
{
1450
return fabsf(a);
1451
}
1452
1453
static int32_t Abs(int32_t a)
1454
{
1455
if (a < 0)
1456
return -a;
1457
return a;
1458
}
1459
1460
static float Clamp(float v, float min, float max)
1461
{
1462
if (v < min)
1463
return min;
1464
if (v > max)
1465
return max;
1466
return v;
1467
}
1468
1469
static float Reciprocal(float v)
1470
{
1471
return 1.0f / v;
1472
}
1473
1474
static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, int32_t& chOut)
1475
{
1476
chOut = inputBlocks[0].m_pixels[pxOffset][channel];
1477
}
1478
1479
static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, int32_t& chOut)
1480
{
1481
chOut = inputBlocks[0].m_pixels[pxOffset][channel];
1482
}
1483
1484
static float MakeFloat(float v)
1485
{
1486
return v;
1487
}
1488
1489
static float MakeFloatZero()
1490
{
1491
return 0.0f;
1492
}
1493
1494
static int32_t MakeUInt16(uint16_t v)
1495
{
1496
return v;
1497
}
1498
1499
static int32_t MakeSInt16(int16_t v)
1500
{
1501
return v;
1502
}
1503
1504
static int32_t MakeAInt16(int16_t v)
1505
{
1506
return v;
1507
}
1508
1509
static int32_t MakeUInt15(uint16_t v)
1510
{
1511
return v;
1512
}
1513
1514
static int32_t MakeSInt32(int32_t v)
1515
{
1516
return v;
1517
}
1518
1519
static int32_t MakeUInt31(int32_t v)
1520
{
1521
return v;
1522
}
1523
1524
static int32_t Extract(int32_t v, int offset)
1525
{
1526
UNREFERENCED_PARAMETER(offset);
1527
return v;
1528
}
1529
1530
static bool Extract(bool v, int offset)
1531
{
1532
UNREFERENCED_PARAMETER(offset);
1533
return v;
1534
}
1535
1536
static float Extract(float v, int offset)
1537
{
1538
UNREFERENCED_PARAMETER(offset);
1539
return v;
1540
}
1541
1542
static void PutUInt16(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
1543
{
1544
UNREFERENCED_PARAMETER(offset);
1545
dest = v;
1546
}
1547
1548
static void PutUInt15(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
1549
{
1550
UNREFERENCED_PARAMETER(offset);
1551
dest = v;
1552
}
1553
1554
static void PutSInt16(int32_t &dest, int offset, ParallelMath::ScalarSInt16 v)
1555
{
1556
UNREFERENCED_PARAMETER(offset);
1557
dest = v;
1558
}
1559
1560
static float ExtractFloat(float v, int offset)
1561
{
1562
UNREFERENCED_PARAMETER(offset);
1563
return v;
1564
}
1565
1566
static void PutFloat(float &dest, int offset, float v)
1567
{
1568
UNREFERENCED_PARAMETER(offset);
1569
dest = v;
1570
}
1571
1572
static void PutBoolInt16(bool &dest, int offset, bool v)
1573
{
1574
UNREFERENCED_PARAMETER(offset);
1575
dest = v;
1576
}
1577
1578
static bool Less(int32_t a, int32_t b)
1579
{
1580
return a < b;
1581
}
1582
1583
static bool Less(float a, float b)
1584
{
1585
return a < b;
1586
}
1587
1588
static bool LessOrEqual(int32_t a, int32_t b)
1589
{
1590
return a < b;
1591
}
1592
1593
static bool LessOrEqual(float a, float b)
1594
{
1595
return a < b;
1596
}
1597
1598
static bool Equal(int32_t a, int32_t b)
1599
{
1600
return a == b;
1601
}
1602
1603
static bool Equal(float a, float b)
1604
{
1605
return a == b;
1606
}
1607
1608
static float ToFloat(int32_t v)
1609
{
1610
return static_cast<float>(v);
1611
}
1612
1613
static int32_t ToUInt31(int32_t v)
1614
{
1615
return v;
1616
}
1617
1618
static int32_t ToInt32(int32_t v)
1619
{
1620
return v;
1621
}
1622
1623
static bool FloatFlagToInt16(bool v)
1624
{
1625
return v;
1626
}
1627
1628
static bool Int32FlagToInt16(bool v)
1629
{
1630
return v;
1631
}
1632
1633
static bool Int16FlagToFloat(bool v)
1634
{
1635
return v;
1636
}
1637
1638
static bool MakeBoolInt16(bool b)
1639
{
1640
return b;
1641
}
1642
1643
static bool MakeBoolFloat(bool b)
1644
{
1645
return b;
1646
}
1647
1648
static bool AndNot(bool a, bool b)
1649
{
1650
return a && !b;
1651
}
1652
1653
static bool Not(bool b)
1654
{
1655
return !b;
1656
}
1657
1658
static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardZeroForScope *rtz)
1659
{
1660
UNREFERENCED_PARAMETER(rtz);
1661
return static_cast<int>(v);
1662
}
1663
1664
static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundUpForScope *ru)
1665
{
1666
UNREFERENCED_PARAMETER(ru);
1667
return static_cast<int>(ceilf(v));
1668
}
1669
1670
static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundDownForScope *rd)
1671
{
1672
UNREFERENCED_PARAMETER(rd);
1673
return static_cast<int>(floorf(v));
1674
}
1675
1676
static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardNearestForScope *rtn)
1677
{
1678
UNREFERENCED_PARAMETER(rtn);
1679
return static_cast<int>(floorf(v + 0.5f));
1680
}
1681
1682
template<class TRoundMode>
1683
static int32_t RoundAndConvertToU16(float v, const TRoundMode *roundingMode)
1684
{
1685
return RoundAndConvertToInt(v, roundingMode);
1686
}
1687
1688
template<class TRoundMode>
1689
static int32_t RoundAndConvertToU15(float v, const TRoundMode *roundingMode)
1690
{
1691
return RoundAndConvertToInt(v, roundingMode);
1692
}
1693
1694
template<class TRoundMode>
1695
static int32_t RoundAndConvertToS16(float v, const TRoundMode *roundingMode)
1696
{
1697
return RoundAndConvertToInt(v, roundingMode);
1698
}
1699
1700
static float Sqrt(float f)
1701
{
1702
return sqrtf(f);
1703
}
1704
1705
static int32_t SqDiffUInt8(int32_t a, int32_t b)
1706
{
1707
int32_t delta = a - b;
1708
return delta * delta;
1709
}
1710
1711
static int32_t SqDiffInt16(int32_t a, int32_t b)
1712
{
1713
int32_t delta = a - b;
1714
return delta * delta;
1715
}
1716
1717
static int32_t SqDiffSInt16(int32_t a, int32_t b)
1718
{
1719
int32_t delta = a - b;
1720
return delta * delta;
1721
}
1722
1723
static float TwosCLHalfToFloat(int32_t v)
1724
{
1725
int32_t absV = (v < 0) ? -v : v;
1726
1727
int32_t signBits = (absV & -32768);
1728
int32_t mantissa = (absV & 0x03ff);
1729
int32_t exponent = (absV & 0x7c00);
1730
1731
bool isDenormal = (exponent == 0);
1732
1733
// Convert exponent to high-bits
1734
exponent = (exponent >> 3) + 14336;
1735
1736
int32_t denormalCorrection = (isDenormal ? (signBits | 14336) : 0) << 16;
1737
1738
int32_t fBits = ((exponent | signBits) << 16) | (mantissa << 13);
1739
1740
float f, correction;
1741
memcpy(&f, &fBits, 4);
1742
memcpy(&correction, &denormalCorrection, 4);
1743
1744
return f - correction;
1745
}
1746
1747
static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)
1748
{
1749
Float fa = TwosCLHalfToFloat(a);
1750
1751
Float diff = fa - b;
1752
return diff * diff;
1753
}
1754
1755
static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)
1756
{
1757
Float fa = TwosCLHalfToFloat(a);
1758
Float fb = TwosCLHalfToFloat(b);
1759
1760
Float diff = fa - fb;
1761
return diff * diff;
1762
}
1763
1764
static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)
1765
{
1766
Float fa = TwosCLHalfToFloat(a) * aWeight;
1767
1768
Float diff = fa - b;
1769
return diff * diff;
1770
}
1771
1772
static int32_t RightShift(int32_t v, int bits)
1773
{
1774
return SignedRightShift(v, bits);
1775
}
1776
1777
static int32_t ToSInt16(int32_t v)
1778
{
1779
return v;
1780
}
1781
1782
static int32_t ToUInt16(int32_t v)
1783
{
1784
return v;
1785
}
1786
1787
static int32_t ToUInt15(int32_t v)
1788
{
1789
return v;
1790
}
1791
1792
static int32_t XMultiply(int32_t a, int32_t b)
1793
{
1794
return a * b;
1795
}
1796
1797
static int32_t CompactMultiply(int32_t a, int32_t b)
1798
{
1799
return a * b;
1800
}
1801
1802
static bool AnySet(bool v)
1803
{
1804
return v;
1805
}
1806
1807
static bool AllSet(bool v)
1808
{
1809
return v;
1810
}
1811
};
1812
1813
#endif
1814
}
1815
1816
#endif
1817
1818