Path: blob/master/thirdparty/amd-fsr2/shaders/ffx_core_cpu.h
9903 views
// This file is part of the FidelityFX SDK.1//2// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.3//4// Permission is hereby granted, free of charge, to any person obtaining a copy5// of this software and associated documentation files (the "Software"), to deal6// in the Software without restriction, including without limitation the rights7// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell8// copies of the Software, and to permit persons to whom the Software is9// furnished to do so, subject to the following conditions:10// The above copyright notice and this permission notice shall be included in11// all copies or substantial portions of the Software.12//13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN19// THE SOFTWARE.2021/// A define for a true value in a boolean expression.22///23/// @ingroup CPU24#define FFX_TRUE (1)2526/// A define for a false value in a boolean expression.27///28/// @ingroup CPU29#define FFX_FALSE (0)3031#if !defined(FFX_STATIC)32/// A define to abstract declaration of static variables and functions.33///34/// @ingroup CPU35#define FFX_STATIC static36#endif // #if !defined(FFX_STATIC)3738#ifdef __clang__39#pragma clang diagnostic ignored "-Wunused-variable"40#endif4142/// Interpret the bit layout of an IEEE-754 floating point value as an unsigned integer.43///44/// @param [in] x A 32bit floating value.45///46/// @returns47/// An unsigned 32bit integer value containing the bit pattern of <c><i>x</i></c>.48///49/// @ingroup CPU50FFX_STATIC FfxUInt32 ffxAsUInt32(FfxFloat32 x)51{52union53{54FfxFloat32 f;55FfxUInt32 u;56} bits;5758bits.f = x;59return bits.u;60}6162FFX_STATIC FfxFloat32 ffxDot2(FfxFloat32x2 a, FfxFloat32x2 b)63{64return a[0] * b[0] + a[1] * b[1];65}6667FFX_STATIC FfxFloat32 ffxDot3(FfxFloat32x3 a, FfxFloat32x3 b)68{69return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];70}7172FFX_STATIC FfxFloat32 ffxDot4(FfxFloat32x4 a, FfxFloat32x4 b)73{74return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];75}7677/// Compute the linear interopation between two values.78///79/// Implemented by calling the GLSL <c><i>mix</i></c> instrinsic function. Implements the80/// following math:81///82/// (1 - t) * x + t * y83///84/// @param [in] x The first value to lerp between.85/// @param [in] y The second value to lerp between.86/// @param [in] t The value to determine how much of <c><i>x</i></c> and how much of <c><i>y</i></c>.87///88/// @returns89/// A linearly interpolated value between <c><i>x</i></c> and <c><i>y</i></c> according to <c><i>t</i></c>.90///91/// @ingroup CPU92FFX_STATIC FfxFloat32 ffxLerp(FfxFloat32 x, FfxFloat32 y, FfxFloat32 t)93{94return y * t + (-x * t + x);95}9697/// Compute the reciprocal of a value.98///99/// @param [in] x The value to compute the reciprocal for.100///101/// @returns102/// The reciprocal value of <c><i>x</i></c>.103///104/// @ingroup CPU105FFX_STATIC FfxFloat32 ffxReciprocal(FfxFloat32 a)106{107return 1.0f / a;108}109110/// Compute the square root of a value.111///112/// @param [in] x The first value to compute the min of.113///114/// @returns115/// The the square root of <c><i>x</i></c>.116///117/// @ingroup CPU118FFX_STATIC FfxFloat32 ffxSqrt(FfxFloat32 x)119{120return sqrt(x);121}122123FFX_STATIC FfxUInt32 AShrSU1(FfxUInt32 a, FfxUInt32 b)124{125return FfxUInt32(FfxInt32(a) >> FfxInt32(b));126}127128/// Compute the factional part of a decimal value.129///130/// This function calculates <c><i>x - floor(x)</i></c>.131///132/// @param [in] x The value to compute the fractional part from.133///134/// @returns135/// The fractional part of <c><i>x</i></c>.136///137/// @ingroup CPU138FFX_STATIC FfxFloat32 ffxFract(FfxFloat32 a)139{140return a - floor(a);141}142143/// Compute the reciprocal square root of a value.144///145/// @param [in] x The value to compute the reciprocal for.146///147/// @returns148/// The reciprocal square root value of <c><i>x</i></c>.149///150/// @ingroup CPU151FFX_STATIC FfxFloat32 rsqrt(FfxFloat32 a)152{153return ffxReciprocal(ffxSqrt(a));154}155156FFX_STATIC FfxFloat32 ffxMin(FfxFloat32 x, FfxFloat32 y)157{158return x < y ? x : y;159}160161FFX_STATIC FfxUInt32 ffxMin(FfxUInt32 x, FfxUInt32 y)162{163return x < y ? x : y;164}165166FFX_STATIC FfxFloat32 ffxMax(FfxFloat32 x, FfxFloat32 y)167{168return x > y ? x : y;169}170171FFX_STATIC FfxUInt32 ffxMax(FfxUInt32 x, FfxUInt32 y)172{173return x > y ? x : y;174}175176/// Clamp a value to a [0..1] range.177///178/// @param [in] x The value to clamp to [0..1] range.179///180/// @returns181/// The clamped version of <c><i>x</i></c>.182///183/// @ingroup CPU184FFX_STATIC FfxFloat32 ffxSaturate(FfxFloat32 a)185{186return ffxMin(1.0f, ffxMax(0.0f, a));187}188189////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////190////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////191192FFX_STATIC void opAAddOneF3(FfxFloat32x3 d, FfxFloat32x3 a, FfxFloat32 b)193{194d[0] = a[0] + b;195d[1] = a[1] + b;196d[2] = a[2] + b;197return;198}199200FFX_STATIC void opACpyF3(FfxFloat32x3 d, FfxFloat32x3 a)201{202d[0] = a[0];203d[1] = a[1];204d[2] = a[2];205return;206}207208FFX_STATIC void opAMulF3(FfxFloat32x3 d, FfxFloat32x3 a, FfxFloat32x3 b)209{210d[0] = a[0] * b[0];211d[1] = a[1] * b[1];212d[2] = a[2] * b[2];213return;214}215216FFX_STATIC void opAMulOneF3(FfxFloat32x3 d, FfxFloat32x3 a, FfxFloat32 b)217{218d[0] = a[0] * b;219d[1] = a[1] * b;220d[2] = a[2] * b;221return;222}223224FFX_STATIC void opARcpF3(FfxFloat32x3 d, FfxFloat32x3 a)225{226d[0] = ffxReciprocal(a[0]);227d[1] = ffxReciprocal(a[1]);228d[2] = ffxReciprocal(a[2]);229return;230}231232/// Convert FfxFloat32 to half (in lower 16-bits of output).233///234/// This function implements the same fast technique that is documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf235///236/// The function supports denormals.237///238/// Some conversion rules are to make computations possibly "safer" on the GPU,239/// -INF & -NaN -> -65504240/// +INF & +NaN -> +65504241///242/// @param [in] f The 32bit floating point value to convert.243///244/// @returns245/// The closest 16bit floating point value to <c><i>f</i></c>.246///247/// @ingroup CPU248FFX_STATIC FfxUInt32 f32tof16(FfxFloat32 f)249{250static FfxUInt16 base[512] = {2510x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,2520x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,2530x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,2540x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,2550x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,2560x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400,2570x0800, 0x0c00, 0x1000, 0x1400, 0x1800, 0x1c00, 0x2000, 0x2400, 0x2800, 0x2c00, 0x3000, 0x3400, 0x3800, 0x3c00, 0x4000, 0x4400, 0x4800, 0x4c00, 0x5000,2580x5400, 0x5800, 0x5c00, 0x6000, 0x6400, 0x6800, 0x6c00, 0x7000, 0x7400, 0x7800, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff,2590x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff,2600x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff,2610x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff,2620x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff,2630x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff,2640x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,2650x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,2660x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,2670x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,2680x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,2690x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002,2700x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200, 0x8400, 0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00, 0xa000, 0xa400, 0xa800, 0xac00,2710xb000, 0xb400, 0xb800, 0xbc00, 0xc000, 0xc400, 0xc800, 0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00, 0xe000, 0xe400, 0xe800, 0xec00, 0xf000, 0xf400, 0xf800,2720xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff,2730xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff,2740xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff,2750xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff,2760xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff,2770xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff278};279280static FfxUInt8 shift[512] = {2810x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,2820x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,2830x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,2840x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,2850x18, 0x18, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,2860x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,2870x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,2880x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,2890x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,2900x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,2910x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,2920x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,2930x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,2940x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,2950x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,2960x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18,2970x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,2980x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,2990x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,3000x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,3010x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18302};303304union305{306FfxFloat32 f;307FfxUInt32 u;308} bits;309310bits.f = f;311FfxUInt32 u = bits.u;312FfxUInt32 i = u >> 23;313return (FfxUInt32)(base[i]) + ((u & 0x7fffff) >> shift[i]);314}315316/// Pack 2x32-bit floating point values in a single 32bit value.317///318/// This function first converts each component of <c><i>value</i></c> into their nearest 16-bit floating319/// point representation, and then stores the X and Y components in the lower and upper 16 bits of the320/// 32bit unsigned integer respectively.321///322/// @param [in] value A 2-dimensional floating point value to convert and pack.323///324/// @returns325/// A packed 32bit value containing 2 16bit floating point values.326///327/// @ingroup CPU328FFX_STATIC FfxUInt32 packHalf2x16(FfxFloat32x2 a)329{330return f32tof16(a[0]) + (f32tof16(a[1]) << 16);331}332333334