Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/embree/common/math/transcendental.h
9912 views
1
// Copyright 2009-2021 Intel Corporation
2
// SPDX-License-Identifier: Apache-2.0
3
4
#pragma once
5
6
// Transcendental functions from "ispc": https://github.com/ispc/ispc/
7
// Most of the transcendental implementations in ispc code come from
8
// Solomon Boulos's "syrah": https://github.com/boulos/syrah/
9
10
#include "../simd/simd.h"
11
12
namespace embree
13
{
14
15
namespace fastapprox
16
{
17
18
template <typename T>
19
__forceinline T sin(const T &v)
20
{
21
static const float piOverTwoVec = 1.57079637050628662109375;
22
static const float twoOverPiVec = 0.636619746685028076171875;
23
auto scaled = v * twoOverPiVec;
24
auto kReal = floor(scaled);
25
auto k = toInt(kReal);
26
27
// Reduced range version of x
28
auto x = v - kReal * piOverTwoVec;
29
auto kMod4 = k & 3;
30
auto sinUseCos = (kMod4 == 1) | (kMod4 == 3);
31
auto flipSign = (kMod4 > 1);
32
33
// These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2,
34
// 4, 6, 8, 10|], [|single...|], [0;Pi/2]);
35
static const float sinC2 = -0.16666667163372039794921875;
36
static const float sinC4 = +8.333347737789154052734375e-3;
37
static const float sinC6 = -1.9842604524455964565277099609375e-4;
38
static const float sinC8 = +2.760012648650445044040679931640625e-6;
39
static const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
40
41
static const float cosC2 = -0.5;
42
static const float cosC4 = +4.166664183139801025390625e-2;
43
static const float cosC6 = -1.388833043165504932403564453125e-3;
44
static const float cosC8 = +2.47562347794882953166961669921875e-5;
45
static const float cosC10 = -2.59630184018533327616751194000244140625e-7;
46
47
auto outside = select(sinUseCos, 1., x);
48
auto c2 = select(sinUseCos, T(cosC2), T(sinC2));
49
auto c4 = select(sinUseCos, T(cosC4), T(sinC4));
50
auto c6 = select(sinUseCos, T(cosC6), T(sinC6));
51
auto c8 = select(sinUseCos, T(cosC8), T(sinC8));
52
auto c10 = select(sinUseCos, T(cosC10), T(sinC10));
53
54
auto x2 = x * x;
55
auto formula = x2 * c10 + c8;
56
formula = x2 * formula + c6;
57
formula = x2 * formula + c4;
58
formula = x2 * formula + c2;
59
formula = x2 * formula + 1.;
60
formula *= outside;
61
62
formula = select(flipSign, -formula, formula);
63
return formula;
64
}
65
66
template <typename T>
67
__forceinline T cos(const T &v)
68
{
69
static const float piOverTwoVec = 1.57079637050628662109375;
70
static const float twoOverPiVec = 0.636619746685028076171875;
71
auto scaled = v * twoOverPiVec;
72
auto kReal = floor(scaled);
73
auto k = toInt(kReal);
74
75
// Reduced range version of x
76
auto x = v - kReal * piOverTwoVec;
77
78
auto kMod4 = k & 3;
79
auto cosUseCos = (kMod4 == 0) | (kMod4 == 2);
80
auto flipSign = (kMod4 == 1) | (kMod4 == 2);
81
82
const float sinC2 = -0.16666667163372039794921875;
83
const float sinC4 = +8.333347737789154052734375e-3;
84
const float sinC6 = -1.9842604524455964565277099609375e-4;
85
const float sinC8 = +2.760012648650445044040679931640625e-6;
86
const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
87
88
const float cosC2 = -0.5;
89
const float cosC4 = +4.166664183139801025390625e-2;
90
const float cosC6 = -1.388833043165504932403564453125e-3;
91
const float cosC8 = +2.47562347794882953166961669921875e-5;
92
const float cosC10 = -2.59630184018533327616751194000244140625e-7;
93
94
auto outside = select(cosUseCos, 1., x);
95
auto c2 = select(cosUseCos, T(cosC2), T(sinC2));
96
auto c4 = select(cosUseCos, T(cosC4), T(sinC4));
97
auto c6 = select(cosUseCos, T(cosC6), T(sinC6));
98
auto c8 = select(cosUseCos, T(cosC8), T(sinC8));
99
auto c10 = select(cosUseCos, T(cosC10), T(sinC10));
100
101
auto x2 = x * x;
102
auto formula = x2 * c10 + c8;
103
formula = x2 * formula + c6;
104
formula = x2 * formula + c4;
105
formula = x2 * formula + c2;
106
formula = x2 * formula + 1.;
107
formula *= outside;
108
109
formula = select(flipSign, -formula, formula);
110
return formula;
111
}
112
113
template <typename T>
114
__forceinline void sincos(const T &v, T &sinResult, T &cosResult)
115
{
116
const float piOverTwoVec = 1.57079637050628662109375;
117
const float twoOverPiVec = 0.636619746685028076171875;
118
auto scaled = v * twoOverPiVec;
119
auto kReal = floor(scaled);
120
auto k = toInt(kReal);
121
122
// Reduced range version of x
123
auto x = v - kReal * piOverTwoVec;
124
auto kMod4 = k & 3;
125
auto cosUseCos = ((kMod4 == 0) | (kMod4 == 2));
126
auto sinUseCos = ((kMod4 == 1) | (kMod4 == 3));
127
auto sinFlipSign = (kMod4 > 1);
128
auto cosFlipSign = ((kMod4 == 1) | (kMod4 == 2));
129
130
const float oneVec = +1.;
131
const float sinC2 = -0.16666667163372039794921875;
132
const float sinC4 = +8.333347737789154052734375e-3;
133
const float sinC6 = -1.9842604524455964565277099609375e-4;
134
const float sinC8 = +2.760012648650445044040679931640625e-6;
135
const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
136
137
const float cosC2 = -0.5;
138
const float cosC4 = +4.166664183139801025390625e-2;
139
const float cosC6 = -1.388833043165504932403564453125e-3;
140
const float cosC8 = +2.47562347794882953166961669921875e-5;
141
const float cosC10 = -2.59630184018533327616751194000244140625e-7;
142
143
auto x2 = x * x;
144
145
auto sinFormula = x2 * sinC10 + sinC8;
146
auto cosFormula = x2 * cosC10 + cosC8;
147
sinFormula = x2 * sinFormula + sinC6;
148
cosFormula = x2 * cosFormula + cosC6;
149
150
sinFormula = x2 * sinFormula + sinC4;
151
cosFormula = x2 * cosFormula + cosC4;
152
153
sinFormula = x2 * sinFormula + sinC2;
154
cosFormula = x2 * cosFormula + cosC2;
155
156
sinFormula = x2 * sinFormula + oneVec;
157
cosFormula = x2 * cosFormula + oneVec;
158
159
sinFormula *= x;
160
161
sinResult = select(sinUseCos, cosFormula, sinFormula);
162
cosResult = select(cosUseCos, cosFormula, sinFormula);
163
164
sinResult = select(sinFlipSign, -sinResult, sinResult);
165
cosResult = select(cosFlipSign, -cosResult, cosResult);
166
}
167
168
template <typename T>
169
__forceinline T tan(const T &v)
170
{
171
const float piOverFourVec = 0.785398185253143310546875;
172
const float fourOverPiVec = 1.27323949337005615234375;
173
174
auto xLt0 = v < 0.;
175
auto y = select(xLt0, -v, v);
176
auto scaled = y * fourOverPiVec;
177
178
auto kReal = floor(scaled);
179
auto k = toInt(kReal);
180
181
auto x = y - kReal * piOverFourVec;
182
183
// If k & 1, x -= Pi/4
184
auto needOffset = (k & 1) != 0;
185
x = select(needOffset, x - piOverFourVec, x);
186
187
// If k & 3 == (0 or 3) let z = tan_In...(y) otherwise z = -cot_In0To...
188
auto kMod4 = k & 3;
189
auto useCotan = (kMod4 == 1) | (kMod4 == 2);
190
191
const float oneVec = 1.0;
192
193
const float tanC2 = +0.33333075046539306640625;
194
const float tanC4 = +0.13339905440807342529296875;
195
const float tanC6 = +5.3348250687122344970703125e-2;
196
const float tanC8 = +2.46033705770969390869140625e-2;
197
const float tanC10 = +2.892402000725269317626953125e-3;
198
const float tanC12 = +9.500005282461643218994140625e-3;
199
200
const float cotC2 = -0.3333333432674407958984375;
201
const float cotC4 = -2.222204394638538360595703125e-2;
202
const float cotC6 = -2.11752182804048061370849609375e-3;
203
const float cotC8 = -2.0846328698098659515380859375e-4;
204
const float cotC10 = -2.548247357481159269809722900390625e-5;
205
const float cotC12 = -3.5257363606433500535786151885986328125e-7;
206
207
auto x2 = x * x;
208
T z;
209
if (any(useCotan))
210
{
211
auto cotVal = x2 * cotC12 + cotC10;
212
cotVal = x2 * cotVal + cotC8;
213
cotVal = x2 * cotVal + cotC6;
214
cotVal = x2 * cotVal + cotC4;
215
cotVal = x2 * cotVal + cotC2;
216
cotVal = x2 * cotVal + oneVec;
217
// The equation is for x * cot(x) but we need -x * cot(x) for the tan part.
218
cotVal /= -x;
219
z = cotVal;
220
}
221
auto useTan = !useCotan;
222
if (any(useTan))
223
{
224
auto tanVal = x2 * tanC12 + tanC10;
225
tanVal = x2 * tanVal + tanC8;
226
tanVal = x2 * tanVal + tanC6;
227
tanVal = x2 * tanVal + tanC4;
228
tanVal = x2 * tanVal + tanC2;
229
tanVal = x2 * tanVal + oneVec;
230
// Equation was for tan(x)/x
231
tanVal *= x;
232
z = select(useTan, tanVal, z);
233
}
234
return select(xLt0, -z, z);
235
}
236
237
template <typename T>
238
__forceinline T asin(const T &x0)
239
{
240
auto isneg = (x0 < 0.f);
241
auto x = abs(x0);
242
auto isnan = (x > 1.f);
243
244
// sollya
245
// fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5|],[|single...|],
246
// [1e-20;.9999999999999999]);
247
// avg error: 1.1105439e-06, max error 1.3187528e-06
248
auto v = 1.57079517841339111328125f +
249
x * (-0.21450997889041900634765625f +
250
x * (8.78556668758392333984375e-2f +
251
x * (-4.489909112453460693359375e-2f +
252
x * (1.928029954433441162109375e-2f +
253
x * (-4.3095736764371395111083984375e-3f)))));
254
255
v *= -sqrt(1.f - x);
256
v = v + 1.57079637050628662109375f;
257
258
v = select(v < 0.f, T(0.f), v);
259
v = select(isneg, -v, v);
260
v = select(isnan, T(cast_i2f(0x7fc00000)), v);
261
262
return v;
263
}
264
265
template <typename T>
266
__forceinline T acos(const T &v)
267
{
268
return 1.57079637050628662109375f - asin(v);
269
}
270
271
template <typename T>
272
__forceinline T atan(const T &v)
273
{
274
const float piOverTwoVec = 1.57079637050628662109375;
275
// atan(-x) = -atan(x) (so flip from negative to positive first)
276
// If x > 1 -> atan(x) = Pi/2 - atan(1/x)
277
auto xNeg = v < 0.f;
278
auto xFlipped = select(xNeg, -v, v);
279
280
auto xGt1 = xFlipped > 1.;
281
auto x = select(xGt1, rcpSafe(xFlipped), xFlipped);
282
283
// These coefficients approximate atan(x)/x
284
const float atanC0 = +0.99999988079071044921875;
285
const float atanC2 = -0.3333191573619842529296875;
286
const float atanC4 = +0.199689209461212158203125;
287
const float atanC6 = -0.14015688002109527587890625;
288
const float atanC8 = +9.905083477497100830078125e-2;
289
const float atanC10 = -5.93664981424808502197265625e-2;
290
const float atanC12 = +2.417283318936824798583984375e-2;
291
const float atanC14 = -4.6721356920897960662841796875e-3;
292
293
auto x2 = x * x;
294
auto result = x2 * atanC14 + atanC12;
295
result = x2 * result + atanC10;
296
result = x2 * result + atanC8;
297
result = x2 * result + atanC6;
298
result = x2 * result + atanC4;
299
result = x2 * result + atanC2;
300
result = x2 * result + atanC0;
301
result *= x;
302
303
result = select(xGt1, piOverTwoVec - result, result);
304
result = select(xNeg, -result, result);
305
return result;
306
}
307
308
template <typename T>
309
__forceinline T atan2(const T &y, const T &x)
310
{
311
const float piVec = 3.1415926536;
312
// atan2(y, x) =
313
//
314
// atan2(y > 0, x = +-0) -> Pi/2
315
// atan2(y < 0, x = +-0) -> -Pi/2
316
// atan2(y = +-0, x < +0) -> +-Pi
317
// atan2(y = +-0, x >= +0) -> +-0
318
//
319
// atan2(y >= 0, x < 0) -> Pi + atan(y/x)
320
// atan2(y < 0, x < 0) -> -Pi + atan(y/x)
321
// atan2(y, x > 0) -> atan(y/x)
322
//
323
// and then a bunch of code for dealing with infinities.
324
auto yOverX = y * rcpSafe(x);
325
auto atanArg = atan(yOverX);
326
auto xLt0 = x < 0.f;
327
auto yLt0 = y < 0.f;
328
auto offset = select(xLt0,
329
select(yLt0, T(-piVec), T(piVec)), 0.f);
330
return offset + atanArg;
331
}
332
333
template <typename T>
334
__forceinline T exp(const T &v)
335
{
336
const float ln2Part1 = 0.6931457519;
337
const float ln2Part2 = 1.4286067653e-6;
338
const float oneOverLn2 = 1.44269502162933349609375;
339
340
auto scaled = v * oneOverLn2;
341
auto kReal = floor(scaled);
342
auto k = toInt(kReal);
343
344
// Reduced range version of x
345
auto x = v - kReal * ln2Part1;
346
x -= kReal * ln2Part2;
347
348
// These coefficients are for e^x in [0, ln(2)]
349
const float one = 1.;
350
const float c2 = 0.4999999105930328369140625;
351
const float c3 = 0.166668415069580078125;
352
const float c4 = 4.16539050638675689697265625e-2;
353
const float c5 = 8.378830738365650177001953125e-3;
354
const float c6 = 1.304379315115511417388916015625e-3;
355
const float c7 = 2.7555381529964506626129150390625e-4;
356
357
auto result = x * c7 + c6;
358
result = x * result + c5;
359
result = x * result + c4;
360
result = x * result + c3;
361
result = x * result + c2;
362
result = x * result + one;
363
result = x * result + one;
364
365
// Compute 2^k (should differ for float and double, but I'll avoid
366
// it for now and just do floats)
367
const int fpbias = 127;
368
auto biasedN = k + fpbias;
369
auto overflow = kReal > fpbias;
370
// Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0)
371
// we've got underflow. -127 * ln(2) -> -88.02. So the most
372
// negative float input that doesn't result in zero is like -88.
373
auto underflow = kReal <= -fpbias;
374
const int infBits = 0x7f800000;
375
biasedN <<= 23;
376
// Reinterpret this thing as float
377
auto twoToTheN = asFloat(biasedN);
378
// Handle both doubles and floats (hopefully eliding the copy for float)
379
auto elemtype2n = twoToTheN;
380
result *= elemtype2n;
381
result = select(overflow, cast_i2f(infBits), result);
382
result = select(underflow, 0., result);
383
return result;
384
}
385
386
// Range reduction for logarithms takes log(x) -> log(2^n * y) -> n
387
// * log(2) + log(y) where y is the reduced range (usually in [1/2, 1)).
388
template <typename T, typename R>
389
__forceinline void __rangeReduceLog(const T &input,
390
T &reduced,
391
R &exponent)
392
{
393
auto intVersion = asInt(input);
394
// single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
395
// exponent mask = 0111 1111 1000 0000 0000 0000 0000 0000
396
// 0x7 0xF 0x8 0x0 0x0 0x0 0x0 0x0
397
// non-exponent = 1000 0000 0111 1111 1111 1111 1111 1111
398
// = 0x8 0x0 0x7 0xF 0xF 0xF 0xF 0xF
399
400
//const int exponentMask(0x7F800000)
401
static const int nonexponentMask = 0x807FFFFF;
402
403
// We want the reduced version to have an exponent of -1 which is
404
// -1 + 127 after biasing or 126
405
static const int exponentNeg1 = (126l << 23);
406
// NOTE(boulos): We don't need to mask anything out since we know
407
// the sign bit has to be 0. If it's 1, we need to return infinity/nan
408
// anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN).
409
auto biasedExponent = intVersion >> 23; // This number is [0, 255] but it means [-127, 128]
410
411
auto offsetExponent = biasedExponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2
412
exponent = offsetExponent - 127; // get the real value
413
414
// Blend the offset_exponent with the original input (do this in
415
// int for now, until I decide if float can have & and &not)
416
auto blended = (intVersion & nonexponentMask) | (exponentNeg1);
417
reduced = asFloat(blended);
418
}
419
420
template <typename T> struct ExponentType { };
421
template <int N> struct ExponentType<vfloat_impl<N>> { typedef vint<N> Ty; };
422
template <> struct ExponentType<float> { typedef int Ty; };
423
424
template <typename T>
425
__forceinline T log(const T &v)
426
{
427
T reduced;
428
typename ExponentType<T>::Ty exponent;
429
430
const int nanBits = 0x7fc00000;
431
const int negInfBits = 0xFF800000;
432
const float nan = cast_i2f(nanBits);
433
const float negInf = cast_i2f(negInfBits);
434
auto useNan = v < 0.;
435
auto useInf = v == 0.;
436
auto exceptional = useNan | useInf;
437
const float one = 1.0;
438
439
auto patched = select(exceptional, one, v);
440
__rangeReduceLog(patched, reduced, exponent);
441
442
const float ln2 = 0.693147182464599609375;
443
444
auto x1 = one - reduced;
445
const float c1 = +0.50000095367431640625;
446
const float c2 = +0.33326041698455810546875;
447
const float c3 = +0.2519190013408660888671875;
448
const float c4 = +0.17541764676570892333984375;
449
const float c5 = +0.3424419462680816650390625;
450
const float c6 = -0.599632322788238525390625;
451
const float c7 = +1.98442304134368896484375;
452
const float c8 = -2.4899270534515380859375;
453
const float c9 = +1.7491014003753662109375;
454
455
auto result = x1 * c9 + c8;
456
result = x1 * result + c7;
457
result = x1 * result + c6;
458
result = x1 * result + c5;
459
result = x1 * result + c4;
460
result = x1 * result + c3;
461
result = x1 * result + c2;
462
result = x1 * result + c1;
463
result = x1 * result + one;
464
465
// Equation was for -(ln(red)/(1-red))
466
result *= -x1;
467
result += toFloat(exponent) * ln2;
468
469
return select(exceptional,
470
select(useNan, T(nan), T(negInf)),
471
result);
472
}
473
474
template <typename T>
475
__forceinline T pow(const T &x, const T &y)
476
{
477
auto x1 = abs(x);
478
auto z = exp(y * log(x1));
479
480
// Handle special cases
481
const float twoOver23 = 8388608.0f;
482
auto yInt = y == round(y);
483
auto yOddInt = select(yInt, asInt(abs(y) + twoOver23) << 31, 0); // set sign bit
484
485
// x == 0
486
z = select(x == 0.0f,
487
select(y < 0.0f, T(inf) | signmsk(x),
488
select(y == 0.0f, T(1.0f), asFloat(yOddInt) & x)), z);
489
490
// x < 0
491
auto xNegative = x < 0.0f;
492
if (any(xNegative))
493
{
494
auto z1 = z | asFloat(yOddInt);
495
z1 = select(yInt, z1, std::numeric_limits<float>::quiet_NaN());
496
z = select(xNegative, z1, z);
497
}
498
499
auto xFinite = isfinite(x);
500
auto yFinite = isfinite(y);
501
if (all(xFinite & yFinite))
502
return z;
503
504
// x finite and y infinite
505
z = select(andn(xFinite, yFinite),
506
select(x1 == 1.0f, 1.0f,
507
select((x1 > 1.0f) ^ (y < 0.0f), inf, T(0.0f))), z);
508
509
// x infinite
510
z = select(xFinite, z,
511
select(y == 0.0f, 1.0f,
512
select(y < 0.0f, T(0.0f), inf) | (asFloat(yOddInt) & x)));
513
514
return z;
515
}
516
517
template <typename T>
518
__forceinline T pow(const T &x, float y)
519
{
520
return pow(x, T(y));
521
}
522
523
} // namespace fastapprox
524
525
} // namespace embree
526
527