// Web-scrape page header (not part of the shader source):
// Path: blob/master/thirdparty/amd-fsr2/shaders/ffx_spd.h -- 9903 views
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#ifdef FFX_CPU
// Computes the dispatch parameters for a downsample pass over a sub-rectangle.
//
// The rectangle is covered by 64x64-texel tiles: workGroupOffset receives the
// index of the first tile touched, dispatchThreadGroupCountXY the number of
// tiles touched per axis, numWorkGroupsAndMips[0] the total tile count and
// numWorkGroupsAndMips[1] the mip count.
//
// mips >= 0 is used as given; otherwise the mip count is
// min(floor(log2(max(width, height))), 12).
//
// NOTE(review): the first three parameters are written through, so the
// FfxUInt32x2 type is presumably an array typedef on the CPU side (decaying to
// a pointer) -- confirm against the CPU type definitions.
// NOTE(review): for an empty rect (width or height of 0) the "- 1" below wraps
// if FfxUInt32 is unsigned, and the mips < 0 path evaluates log2(0). Callers
// presumably always pass a non-empty rect -- confirm.
FFX_STATIC void SpdSetup(FfxUInt32x2 dispatchThreadGroupCountXY,  // CPU side: dispatch thread group count xy
                         FfxUInt32x2 workGroupOffset,             // GPU side: pass in as constant
                         FfxUInt32x2 numWorkGroupsAndMips,        // GPU side: pass in as constant
                         FfxUInt32x4 rectInfo,                    // left, top, width, height
                         FfxInt32    mips)                        // optional: if -1, calculate based on rect width and height
{
    workGroupOffset[0] = rectInfo[0] / 64;  // rectInfo[0] = left
    workGroupOffset[1] = rectInfo[1] / 64;  // rectInfo[1] = top

    FfxUInt32 endIndexX = (rectInfo[0] + rectInfo[2] - 1) / 64;  // rectInfo[0] = left, rectInfo[2] = width
    FfxUInt32 endIndexY = (rectInfo[1] + rectInfo[3] - 1) / 64;  // rectInfo[1] = top, rectInfo[3] = height

    dispatchThreadGroupCountXY[0] = endIndexX + 1 - workGroupOffset[0];
    dispatchThreadGroupCountXY[1] = endIndexY + 1 - workGroupOffset[1];

    numWorkGroupsAndMips[0] = (dispatchThreadGroupCountXY[0]) * (dispatchThreadGroupCountXY[1]);

    if (mips >= 0)
    {
        numWorkGroupsAndMips[1] = FfxUInt32(mips);
    }
    else
    {
        // calculate based on rect width and height
        FfxUInt32 resolution    = ffxMax(rectInfo[2], rectInfo[3]);
        numWorkGroupsAndMips[1] = FfxUInt32((ffxMin(floor(log2(FfxFloat32(resolution))), FfxFloat32(12))));
    }
}

// Convenience overload: forwards with mips = -1 so the mip count is derived
// from the rect width and height.
FFX_STATIC void SpdSetup(FfxUInt32x2 dispatchThreadGroupCountXY,  // CPU side: dispatch thread group count xy
                         FfxUInt32x2 workGroupOffset,             // GPU side: pass in as constant
                         FfxUInt32x2 numWorkGroupsAndMips,        // GPU side: pass in as constant
                         FfxUInt32x4 rectInfo)                    // left, top, width, height
{
    SpdSetup(dispatchThreadGroupCountXY, workGroupOffset, numWorkGroupsAndMips, rectInfo, -1);
}
#endif  // #ifdef FFX_CPU


//==============================================================================================================================
//                                                     NON-PACKED VERSION
//==============================================================================================================================
#ifdef FFX_GPU
#ifdef SPD_PACKED_ONLY
// Avoid compiler error
// Zero-returning / empty stand-ins for the user callbacks referenced by the
// non-packed functions below, provided when only the packed path is in use.
FfxFloat32x4 SpdLoadSourceImage(FfxInt32x2 p, FfxUInt32 slice)
{
    return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
}

FfxFloat32x4 SpdLoad(FfxInt32x2 p, FfxUInt32 slice)
{
    return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
}
void SpdStore(FfxInt32x2 p, FfxFloat32x4 value, FfxUInt32 mip, FfxUInt32 slice)
{
}
FfxFloat32x4 SpdLoadIntermediate(FfxUInt32 x, FfxUInt32 y)
{
    return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
}
void SpdStoreIntermediate(FfxUInt32 x, FfxUInt32 y, FfxFloat32x4 value)
{
}
FfxFloat32x4 SpdReduce4(FfxFloat32x4 v0, FfxFloat32x4 v1, FfxFloat32x4 v2, FfxFloat32x4 v3)
{
    return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
}
#endif  // #ifdef SPD_PACKED_ONLY

//_____________________________________________________________/\_______________________________________________________________
#if defined(FFX_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
#extension GL_KHR_shader_subgroup_quad:require
#endif

// Workgroup barrier used between the reduction steps: barrier() under GLSL,
// GroupMemoryBarrierWithGroupSync() under HLSL.
void SpdWorkgroupShuffleBarrier()
{
#ifdef FFX_GLSL
    barrier();
#endif
#ifdef FFX_HLSL
    GroupMemoryBarrierWithGroupSync();
#endif
}

// Only last active workgroup should proceed
// Invocation 0 bumps the per-slice counter, the workgroup synchronizes, and
// the function returns true (caller should exit) unless the value reported by
// SpdGetAtomicCounter() equals numWorkGroups - 1.
bool SpdExitWorkgroup(FfxUInt32 numWorkGroups, FfxUInt32 localInvocationIndex, FfxUInt32 slice)
{
    // global atomic counter
    if (localInvocationIndex == 0)
    {
        SpdIncreaseAtomicCounter(slice);
    }

    SpdWorkgroupShuffleBarrier();
    return (SpdGetAtomicCounter() != (numWorkGroups - 1));
}

// User defined: FfxFloat32x4 SpdReduce4(FfxFloat32x4 v0, FfxFloat32x4 v1, FfxFloat32x4 v2, FfxFloat32x4 v3);
// Combines this invocation's value with values fetched from the other lanes of
// its quad via SpdReduce4. In the HLSL path v1..v3 always come from quad lanes
// 1..3, so the result covers all four lanes only on quad lane 0; callers in
// this file only consume it where localInvocationIndex % 4 == 0.
// When no wave-operation path is compiled in, v is returned unchanged.
FfxFloat32x4 SpdReduceQuad(FfxFloat32x4 v)
{
#if defined(FFX_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)

    FfxFloat32x4 v0 = v;
    FfxFloat32x4 v1 = subgroupQuadSwapHorizontal(v);
    FfxFloat32x4 v2 = subgroupQuadSwapVertical(v);
    FfxFloat32x4 v3 = subgroupQuadSwapDiagonal(v);
    return SpdReduce4(v0, v1, v2, v3);

#elif defined(FFX_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS)

    // requires SM6.0
    FfxUInt32    quad = WaveGetLaneIndex() & (~0x3);
    FfxFloat32x4 v0   = v;
    FfxFloat32x4 v1   = WaveReadLaneAt(v, quad | 1);
    FfxFloat32x4 v2   = WaveReadLaneAt(v, quad | 2);
    FfxFloat32x4 v3   = WaveReadLaneAt(v, quad | 3);
    return SpdReduce4(v0, v1, v2, v3);
    /*
    // if SM6.0 is not available, you can use the AMD shader intrinsics
    // the AMD shader intrinsics are available in AMD GPU Services (AGS) library:
    // https://gpuopen.com/amd-gpu-services-ags-library/
    // works for DX11
    FfxFloat32x4 v0 = v;
    FfxFloat32x4 v1;
    v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
    v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
    v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
    v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
    FfxFloat32x4 v2;
    v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
    v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
    v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
    v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
    FfxFloat32x4 v3;
    v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
    v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
    v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
    v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
    return SpdReduce4(v0, v1, v2, v3);
    */
#endif
    return v;
}

// Loads four values through SpdLoadIntermediate at the given coordinates and
// reduces them with SpdReduce4.
FfxFloat32x4 SpdReduceIntermediate(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3)
{
    FfxFloat32x4 v0 = SpdLoadIntermediate(i0.x, i0.y);
    FfxFloat32x4 v1 = SpdLoadIntermediate(i1.x, i1.y);
    FfxFloat32x4 v2 = SpdLoadIntermediate(i2.x, i2.y);
    FfxFloat32x4 v3 = SpdLoadIntermediate(i3.x, i3.y);
    return SpdReduce4(v0, v1, v2, v3);
}

// Loads four values through SpdLoad at the given coordinates and reduces them
// with SpdReduce4.
FfxFloat32x4 SpdReduceLoad4(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
{
    FfxFloat32x4 v0 = SpdLoad(FfxInt32x2(i0), slice);
    FfxFloat32x4 v1 = SpdLoad(FfxInt32x2(i1), slice);
    FfxFloat32x4 v2 = SpdLoad(FfxInt32x2(i2), slice);
    FfxFloat32x4 v3 = SpdLoad(FfxInt32x2(i3), slice);
    return SpdReduce4(v0, v1, v2, v3);
}

// Reduces the 2x2 footprint whose top-left coordinate is base.
FfxFloat32x4 SpdReduceLoad4(FfxUInt32x2 base, FfxUInt32 slice)
{
    return SpdReduceLoad4(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice);
}

// Loads four values through SpdLoadSourceImage at the given coordinates and
// reduces them with SpdReduce4.
FfxFloat32x4 SpdReduceLoadSourceImage4(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
{
    FfxFloat32x4 v0 = SpdLoadSourceImage(FfxInt32x2(i0), slice);
    FfxFloat32x4 v1 = SpdLoadSourceImage(FfxInt32x2(i1), slice);
    FfxFloat32x4 v2 = SpdLoadSourceImage(FfxInt32x2(i2), slice);
    FfxFloat32x4 v3 = SpdLoadSourceImage(FfxInt32x2(i3), slice);
    return SpdReduce4(v0, v1, v2, v3);
}

// Produces one reduced value for the 2x2 source footprint at base. With
// SPD_LINEAR_SAMPLER a single SpdLoadSourceImage call is issued at base
// (the callback is presumably expected to perform the filtering itself --
// confirm); otherwise the four texels are loaded and reduced explicitly.
FfxFloat32x4 SpdReduceLoadSourceImage(FfxUInt32x2 base, FfxUInt32 slice)
{
#ifdef SPD_LINEAR_SAMPLER
    return SpdLoadSourceImage(FfxInt32x2(base), slice);
#else
    return SpdReduceLoadSourceImage4(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice);
#endif
}

// First two reduction steps using quad operations.
// Each invocation reduces four source footprints (offset by 32 source texels
// in x and/or y inside the workgroup's 64x64 tile) and stores them with mip
// index 0; if more than one mip is requested the four values are reduced
// across the quad and every fourth invocation stores them with mip index 1 and
// to the intermediate storage.
// Note: the parameter named mip receives the total mip count (SpdDownsample
// passes its mips argument here).
void SpdDownsampleMips_0_1_Intrinsics(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
    FfxFloat32x4 v[4];

    FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
    FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
    v[0]           = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[0], 0, slice);

    tex  = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
    pix  = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
    v[1] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[1], 0, slice);

    tex  = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
    pix  = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
    v[2] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[2], 0, slice);

    tex  = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
    pix  = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
    v[3] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[3], 0, slice);

    if (mip <= 1)
        return;

    v[0] = SpdReduceQuad(v[0]);
    v[1] = SpdReduceQuad(v[1]);
    v[2] = SpdReduceQuad(v[2]);
    v[3] = SpdReduceQuad(v[3]);

    // one invocation per quad writes the reduced values
    if ((localInvocationIndex % 4) == 0)
    {
        SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2), v[0], 1, slice);
        SpdStoreIntermediate(x / 2, y / 2, v[0]);

        SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2), v[1], 1, slice);
        SpdStoreIntermediate(x / 2 + 8, y / 2, v[1]);

        SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2 + 8), v[2], 1, slice);
        SpdStoreIntermediate(x / 2, y / 2 + 8, v[2]);

        SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2 + 8), v[3], 1, slice);
        SpdStoreIntermediate(x / 2 + 8, y / 2 + 8, v[3]);
    }
}

// First two reduction steps without wave operations.
// The loads and mip-index-0 stores match the Intrinsics variant. The second
// step goes through the intermediate storage instead: for each of the four
// values, all invocations store, synchronize, and invocations with index < 64
// reduce a 2x2 neighbourhood and store it with mip index 1. Finally those
// invocations write their four results back to the intermediate storage.
// Note: the parameter named mip receives the total mip count.
void SpdDownsampleMips_0_1_LDS(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
    FfxFloat32x4 v[4];

    FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
    FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
    v[0]           = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[0], 0, slice);

    tex  = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
    pix  = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
    v[1] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[1], 0, slice);

    tex  = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
    pix  = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
    v[2] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[2], 0, slice);

    tex  = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
    pix  = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
    v[3] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[3], 0, slice);

    if (mip <= 1)
        return;

    for (FfxUInt32 i = 0; i < 4; i++)
    {
        SpdStoreIntermediate(x, y, v[i]);
        SpdWorkgroupShuffleBarrier();
        if (localInvocationIndex < 64)
        {
            v[i] = SpdReduceIntermediate(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
            SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice);
        }
        // keep the next iteration's stores from overtaking this iteration's loads
        SpdWorkgroupShuffleBarrier();
    }

    if (localInvocationIndex < 64)
    {
        SpdStoreIntermediate(x + 0, y + 0, v[0]);
        SpdStoreIntermediate(x + 8, y + 0, v[1]);
        SpdStoreIntermediate(x + 0, y + 8, v[2]);
        SpdStoreIntermediate(x + 8, y + 8, v[3]);
    }
}

// Selects the LDS or the wave-operation implementation of the first two steps
// depending on SPD_NO_WAVE_OPERATIONS.
void SpdDownsampleMips_0_1(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    SpdDownsampleMips_0_1_LDS(x, y, workGroupID, localInvocationIndex, mip, slice);
#else
    SpdDownsampleMips_0_1_Intrinsics(x, y, workGroupID, localInvocationIndex, mip, slice);
#endif
}


// Third reduction step: reduces the values left in intermediate storage by the
// previous step, stores the result with the given mip index and writes it back
// to intermediate storage in the staggered layout sketched below.
void SpdDownsampleMip_2(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 64)
    {
        FfxFloat32x4 v = SpdReduceIntermediate(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
        SpdStore(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x, y), v, mip, slice);
        // store to LDS, try to reduce bank conflicts
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        // ...
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        SpdStoreIntermediate(x * 2 + y % 2, y * 2, v);
    }
#else
    FfxFloat32x4 v = SpdLoadIntermediate(x, y);
    v              = SpdReduceQuad(v);
    // quad index 0 stores result
    if (localInvocationIndex % 4 == 0)
    {
        SpdStore(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
        SpdStoreIntermediate(x + (y / 2) % 2, y, v);
    }
#endif
}

// Fourth reduction step: reads the staggered layout written by
// SpdDownsampleMip_2, reduces, stores with the given mip index and writes the
// result back to intermediate storage.
void SpdDownsampleMip_3(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 16)
    {
        // x 0 x 0
        // 0 0 0 0
        // 0 x 0 x
        // 0 0 0 0
        FfxFloat32x4 v =
            SpdReduceIntermediate(FfxUInt32x2(x * 4 + 0 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 2 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 0 + 1, y * 4 + 2), FfxUInt32x2(x * 4 + 2 + 1, y * 4 + 2));
        SpdStore(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x, y), v, mip, slice);
        // store to LDS
        // x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0
        // ...
        // 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0
        // ...
        // 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x
        // ...
        SpdStoreIntermediate(x * 4 + y, y * 4, v);
    }
#else
    if (localInvocationIndex < 64)
    {
        FfxFloat32x4 v = SpdLoadIntermediate(x * 2 + y % 2, y * 2);
        v              = SpdReduceQuad(v);
        // quad index 0 stores result
        if (localInvocationIndex % 4 == 0)
        {
            SpdStore(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
            SpdStoreIntermediate(x * 2 + y / 2, y * 2, v);
        }
    }
#endif
}

// Fifth reduction step: reads the layout written by SpdDownsampleMip_3,
// reduces, stores with the given mip index and packs the results into row 0 of
// the intermediate storage.
void SpdDownsampleMip_4(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 4)
    {
        // x 0 0 0 x 0 0 0
        // ...
        // 0 x 0 0 0 x 0 0
        FfxFloat32x4 v = SpdReduceIntermediate(FfxUInt32x2(x * 8 + 0 + 0 + y * 2, y * 8 + 0),
                                               FfxUInt32x2(x * 8 + 4 + 0 + y * 2, y * 8 + 0),
                                               FfxUInt32x2(x * 8 + 0 + 1 + y * 2, y * 8 + 4),
                                               FfxUInt32x2(x * 8 + 4 + 1 + y * 2, y * 8 + 4));
        SpdStore(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x, y), v, mip, slice);
        // store to LDS
        // x x x x 0 ...
        // 0 ...
        SpdStoreIntermediate(x + y * 2, 0, v);
    }
#else
    if (localInvocationIndex < 16)
    {
        FfxFloat32x4 v = SpdLoadIntermediate(x * 4 + y, y * 4);
        v              = SpdReduceQuad(v);
        // quad index 0 stores result
        if (localInvocationIndex % 4 == 0)
        {
            SpdStore(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
            SpdStoreIntermediate(x / 2 + y, 0, v);
        }
    }
#endif
}

// Sixth reduction step: reduces the four values in row 0 of the intermediate
// storage to a single value and stores it at workGroupID with the given mip
// index.
void SpdDownsampleMip_5(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 1)
    {
        // x x x x 0 ...
        // 0 ...
        FfxFloat32x4 v = SpdReduceIntermediate(FfxUInt32x2(0, 0), FfxUInt32x2(1, 0), FfxUInt32x2(2, 0), FfxUInt32x2(3, 0));
        SpdStore(FfxInt32x2(workGroupID.xy), v, mip, slice);
    }
#else
    if (localInvocationIndex < 4)
    {
        FfxFloat32x4 v = SpdLoadIntermediate(localInvocationIndex, 0);
        v              = SpdReduceQuad(v);
        // quad index 0 stores result
        if (localInvocationIndex % 4 == 0)
        {
            SpdStore(FfxInt32x2(workGroupID.xy), v, mip, slice);
        }
    }
#endif
}

// Run by the last remaining workgroup: each invocation reduces four 2x2
// footprints read through SpdLoad and stores them with mip index 6; if more
// than seven mips are requested it reduces those four values again, stores the
// result with mip index 7 and seeds the intermediate storage with it.
void SpdDownsampleMips_6_7(FfxUInt32 x, FfxUInt32 y, FfxUInt32 mips, FfxUInt32 slice)
{
    FfxInt32x2   tex = FfxInt32x2(x * 4 + 0, y * 4 + 0);
    FfxInt32x2   pix = FfxInt32x2(x * 2 + 0, y * 2 + 0);
    FfxFloat32x4 v0  = SpdReduceLoad4(tex, slice);
    SpdStore(pix, v0, 6, slice);

    tex             = FfxInt32x2(x * 4 + 2, y * 4 + 0);
    pix             = FfxInt32x2(x * 2 + 1, y * 2 + 0);
    FfxFloat32x4 v1 = SpdReduceLoad4(tex, slice);
    SpdStore(pix, v1, 6, slice);

    tex             = FfxInt32x2(x * 4 + 0, y * 4 + 2);
    pix             = FfxInt32x2(x * 2 + 0, y * 2 + 1);
    FfxFloat32x4 v2 = SpdReduceLoad4(tex, slice);
    SpdStore(pix, v2, 6, slice);

    tex             = FfxInt32x2(x * 4 + 2, y * 4 + 2);
    pix             = FfxInt32x2(x * 2 + 1, y * 2 + 1);
    FfxFloat32x4 v3 = SpdReduceLoad4(tex, slice);
    SpdStore(pix, v3, 6, slice);

    if (mips <= 7)
        return;
    // no barrier needed, working on values only from the same thread

    FfxFloat32x4 v = SpdReduce4(v0, v1, v2, v3);
    SpdStore(FfxInt32x2(x, y), v, 7, slice);
    SpdStoreIntermediate(x, y, v);
}

// Runs up to four further reduction steps (SpdDownsampleMip_2 .. _5) writing
// mip indices baseMip .. baseMip + 3, stopping as soon as the requested mip
// count is reached. A workgroup barrier precedes every step.
void SpdDownsampleNextFour(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 baseMip, FfxUInt32 mips, FfxUInt32 slice)
{
    if (mips <= baseMip)
        return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_2(x, y, workGroupID, localInvocationIndex, baseMip, slice);

    if (mips <= baseMip + 1)
        return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_3(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice);

    if (mips <= baseMip + 2)
        return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_4(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice);

    if (mips <= baseMip + 3)
        return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_5(workGroupID, localInvocationIndex, baseMip + 3, slice);
}

// Entry point of the non-packed downsampler.
// Every workgroup produces mip indices 0..5 for its tile. If more than six
// mips are requested, all workgroups except the one for which
// SpdExitWorkgroup returns false exit; that workgroup resets the counter and
// produces the remaining mips (indices 6 and up).
void SpdDownsample(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice)
{
    // derive this invocation's x/y inside a 16x16 arrangement: the low 6 bits
    // are remapped by ffxRemapForWaveReduction, bits 6 and 7 select the 8x8 block
    FfxUInt32x2 sub_xy = ffxRemapForWaveReduction(localInvocationIndex % 64);
    FfxUInt32   x      = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2);
    FfxUInt32   y      = sub_xy.y + 8 * ((localInvocationIndex >> 7));
    SpdDownsampleMips_0_1(x, y, workGroupID, localInvocationIndex, mips, slice);

    SpdDownsampleNextFour(x, y, workGroupID, localInvocationIndex, 2, mips, slice);

    if (mips <= 6)
        return;

    if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice))
        return;

    SpdResetAtomicCounter(slice);

    // After mip 6 there is only a single workgroup left that downsamples the remaining up to 64x64 texels.
    SpdDownsampleMips_6_7(x, y, mips, slice);

    SpdDownsampleNextFour(x, y, FfxUInt32x2(0, 0), localInvocationIndex, 8, mips, slice);
}

// Overload that shifts workGroupID by workGroupOffset before downsampling.
void SpdDownsample(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice, FfxUInt32x2 workGroupOffset)
{
    SpdDownsample(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice);
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

//==============================================================================================================================
//                                                       PACKED VERSION
//==============================================================================================================================

#if FFX_HALF

#ifdef FFX_GLSL
#extension GL_EXT_shader_subgroup_extended_types_float16:require
#endif

// 16-bit counterpart of SpdReduceQuad, reducing through SpdReduce4H.
// Unlike SpdReduceQuad, the fallback taken when no wave-operation path is
// compiled in returns zero rather than v.
FfxFloat16x4 SpdReduceQuadH(FfxFloat16x4 v)
{
#if defined(FFX_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
    FfxFloat16x4 v0 = v;
    FfxFloat16x4 v1 = subgroupQuadSwapHorizontal(v);
    FfxFloat16x4 v2 = subgroupQuadSwapVertical(v);
    FfxFloat16x4 v3 = subgroupQuadSwapDiagonal(v);
    return SpdReduce4H(v0, v1, v2, v3);
#elif defined(FFX_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
    // requires SM6.0
    FfxUInt32    quad = WaveGetLaneIndex() & (~0x3);
    FfxFloat16x4 v0   = v;
    FfxFloat16x4 v1   = WaveReadLaneAt(v, quad | 1);
    FfxFloat16x4 v2   = WaveReadLaneAt(v, quad | 2);
    FfxFloat16x4 v3   = WaveReadLaneAt(v, quad | 3);
    return SpdReduce4H(v0, v1, v2, v3);
    /*
    // if SM6.0 is not available, you can use the AMD shader intrinsics
    // the AMD shader intrinsics are available in AMD GPU Services (AGS) library:
    // https://gpuopen.com/amd-gpu-services-ags-library/
    // works for DX11
    FfxFloat16x4 v0 = v;
    FfxFloat16x4 v1;
    v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
    v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
    v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
    v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
    FfxFloat16x4 v2;
    v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
    v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
    v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
    v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
    FfxFloat16x4 v3;
    v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
    v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
    v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
    v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
    return SpdReduce4H(v0, v1, v2, v3);
    */
#endif
    return FfxFloat16x4(0.0, 0.0, 0.0, 0.0);
}

// 16-bit counterpart of SpdReduceIntermediate.
FfxFloat16x4 SpdReduceIntermediateH(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3)
{
    FfxFloat16x4 v0 = SpdLoadIntermediateH(i0.x, i0.y);
    FfxFloat16x4 v1 = SpdLoadIntermediateH(i1.x, i1.y);
    FfxFloat16x4 v2 = SpdLoadIntermediateH(i2.x, i2.y);
    FfxFloat16x4 v3 = SpdLoadIntermediateH(i3.x, i3.y);
    return SpdReduce4H(v0, v1, v2, v3);
}

// 16-bit counterpart of the four-coordinate SpdReduceLoad4.
FfxFloat16x4 SpdReduceLoad4H(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
{
    FfxFloat16x4 v0 = SpdLoadH(FfxInt32x2(i0), slice);
    FfxFloat16x4 v1 = SpdLoadH(FfxInt32x2(i1), slice);
    FfxFloat16x4 v2 = SpdLoadH(FfxInt32x2(i2), slice);
    FfxFloat16x4 v3 = SpdLoadH(FfxInt32x2(i3), slice);
    return SpdReduce4H(v0, v1, v2, v3);
}

// 16-bit counterpart of the base-coordinate SpdReduceLoad4.
FfxFloat16x4 SpdReduceLoad4H(FfxUInt32x2 base, FfxUInt32 slice)
{
    return SpdReduceLoad4H(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice);
}

// 16-bit counterpart of SpdReduceLoadSourceImage4.
FfxFloat16x4 SpdReduceLoadSourceImage4H(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
{
    FfxFloat16x4 v0 = SpdLoadSourceImageH(FfxInt32x2(i0), slice);
    FfxFloat16x4 v1 = SpdLoadSourceImageH(FfxInt32x2(i1), slice);
    FfxFloat16x4 v2 = SpdLoadSourceImageH(FfxInt32x2(i2), slice);
    FfxFloat16x4 v3 = SpdLoadSourceImageH(FfxInt32x2(i3), slice);
    return SpdReduce4H(v0, v1, v2, v3);
}

// 16-bit counterpart of SpdReduceLoadSourceImage.
FfxFloat16x4 SpdReduceLoadSourceImageH(FfxUInt32x2 base, FfxUInt32 slice)
{
#ifdef SPD_LINEAR_SAMPLER
    return SpdLoadSourceImageH(FfxInt32x2(base), slice);
#else
    return SpdReduceLoadSourceImage4H(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice);
#endif
}

// 16-bit counterpart of SpdDownsampleMips_0_1_Intrinsics; mips is the total
// mip count.
void SpdDownsampleMips_0_1_IntrinsicsH(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice)
{
    FfxFloat16x4 v[4];

    FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
    FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
    v[0]           = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[0], 0, slice);

    tex  = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
    pix  = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
    v[1] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[1], 0, slice);

    tex  = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
    pix  = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
    v[2] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[2], 0, slice);

    tex  = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
    pix  = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
    v[3] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[3], 0, slice);

    if (mips <= 1)
        return;

    v[0] = SpdReduceQuadH(v[0]);
    v[1] = SpdReduceQuadH(v[1]);
    v[2] = SpdReduceQuadH(v[2]);
    v[3] = SpdReduceQuadH(v[3]);

    // one invocation per quad writes the reduced values
    if ((localInvocationIndex % 4) == 0)
    {
        SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2), v[0], 1, slice);
        SpdStoreIntermediateH(x / 2, y / 2, v[0]);

        SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2), v[1], 1, slice);
        SpdStoreIntermediateH(x / 2 + 8, y / 2, v[1]);

        SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2 + 8), v[2], 1, slice);
        SpdStoreIntermediateH(x / 2, y / 2 + 8, v[2]);

        SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2 + 8), v[3], 1, slice);
        SpdStoreIntermediateH(x / 2 + 8, y / 2 + 8, v[3]);
    }
}

// 16-bit counterpart of SpdDownsampleMips_0_1_LDS; mips is the total mip
// count. (The loop counter here is FfxInt32, whereas the non-packed variant
// uses FfxUInt32.)
void SpdDownsampleMips_0_1_LDSH(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice)
{
    FfxFloat16x4 v[4];

    FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
    FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
    v[0]           = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[0], 0, slice);

    tex  = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
    pix  = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
    v[1] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[1], 0, slice);

    tex  = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
    pix  = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
    v[2] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[2], 0, slice);

    tex  = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
    pix  = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
    v[3] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[3], 0, slice);

    if (mips <= 1)
        return;

    for (FfxInt32 i = 0; i < 4; i++)
    {
        SpdStoreIntermediateH(x, y, v[i]);
        SpdWorkgroupShuffleBarrier();
        if (localInvocationIndex < 64)
        {
            v[i] = SpdReduceIntermediateH(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
            SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice);
        }
        // keep the next iteration's stores from overtaking this iteration's loads
        SpdWorkgroupShuffleBarrier();
    }

    if (localInvocationIndex < 64)
    {
        SpdStoreIntermediateH(x + 0, y + 0, v[0]);
        SpdStoreIntermediateH(x + 8, y + 0, v[1]);
        SpdStoreIntermediateH(x + 0, y + 8, v[2]);
        SpdStoreIntermediateH(x + 8, y + 8, v[3]);
    }
}

// Selects the LDS or the wave-operation 16-bit implementation of the first two
// steps depending on SPD_NO_WAVE_OPERATIONS.
void SpdDownsampleMips_0_1H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    SpdDownsampleMips_0_1_LDSH(x, y, workGroupID, localInvocationIndex, mips, slice);
#else
    SpdDownsampleMips_0_1_IntrinsicsH(x, y, workGroupID, localInvocationIndex, mips, slice);
#endif
}


// 16-bit counterpart of SpdDownsampleMip_2.
void SpdDownsampleMip_2H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 64)
    {
        FfxFloat16x4 v = SpdReduceIntermediateH(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
        SpdStoreH(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x, y), v, mip, slice);
        // store to LDS, try to reduce bank conflicts
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        // ...
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        SpdStoreIntermediateH(x * 2 + y % 2, y * 2, v);
    }
#else
    FfxFloat16x4 v = SpdLoadIntermediateH(x, y);
    v              = SpdReduceQuadH(v);
    // quad index 0 stores result
    if (localInvocationIndex % 4 == 0)
    {
        SpdStoreH(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
        SpdStoreIntermediateH(x + (y / 2) % 2, y, v);
    }
#endif
}

// 16-bit counterpart of SpdDownsampleMip_3.
void SpdDownsampleMip_3H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 16)
    {
        // x 0 x 0
        // 0 0 0 0
        // 0 x 0 x
        // 0 0 0 0
        FfxFloat16x4 v =
            SpdReduceIntermediateH(FfxUInt32x2(x * 4 + 0 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 2 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 0 + 1, y * 4 + 2), FfxUInt32x2(x * 4 + 2 + 1, y * 4 + 2));
        SpdStoreH(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x, y), v, mip, slice);
        // store to LDS
        // x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0
        // ...
        // 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0
        // ...
        // 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x
        // ...
        SpdStoreIntermediateH(x * 4 + y, y * 4, v);
    }
#else
    if (localInvocationIndex < 64)
    {
        FfxFloat16x4 v = SpdLoadIntermediateH(x * 2 + y % 2, y * 2);
        v              = SpdReduceQuadH(v);
        // quad index 0 stores result
        if (localInvocationIndex % 4 == 0)
        {
            SpdStoreH(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
            SpdStoreIntermediateH(x * 2 + y / 2, y * 2, v);
        }
    }
#endif
}

// 16-bit counterpart of SpdDownsampleMip_4.
void SpdDownsampleMip_4H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 4)
    {
        // x 0 0 0 x 0 0 0
        // ...
        // 0 x 0 0 0 x 0 0
        FfxFloat16x4 v = SpdReduceIntermediateH(FfxUInt32x2(x * 8 + 0 + 0 + y * 2, y * 8 + 0),
                                                FfxUInt32x2(x * 8 + 4 + 0 + y * 2, y * 8 + 0),
                                                FfxUInt32x2(x * 8 + 0 + 1 + y * 2, y * 8 + 4),
                                                FfxUInt32x2(x * 8 + 4 + 1 + y * 2, y * 8 + 4));
        SpdStoreH(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x, y), v, mip, slice);
        // store to LDS
        // x x x x 0 ...
        // 0 ...
        SpdStoreIntermediateH(x + y * 2, 0, v);
    }
#else
    if (localInvocationIndex < 16)
    {
        FfxFloat16x4 v = SpdLoadIntermediateH(x * 4 + y, y * 4);
        v              = SpdReduceQuadH(v);
        // quad index 0 stores result
        if (localInvocationIndex % 4 == 0)
        {
            SpdStoreH(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
            SpdStoreIntermediateH(x / 2 + y, 0, v);
        }
    }
#endif
}

// 16-bit counterpart of SpdDownsampleMip_5.
void SpdDownsampleMip_5H(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 1)
    {
        // x x x x 0 ...
        // 0 ...
        FfxFloat16x4 v = SpdReduceIntermediateH(FfxUInt32x2(0, 0), FfxUInt32x2(1, 0), FfxUInt32x2(2, 0), FfxUInt32x2(3, 0));
        SpdStoreH(FfxInt32x2(workGroupID.xy), v, mip, slice);
    }
#else
    if (localInvocationIndex < 4)
    {
        FfxFloat16x4 v = SpdLoadIntermediateH(localInvocationIndex, 0);
        v              = SpdReduceQuadH(v);
        // quad index 0 stores result
        if (localInvocationIndex % 4 == 0)
        {
            SpdStoreH(FfxInt32x2(workGroupID.xy), v, mip, slice);
        }
    }
#endif
}

// 16-bit counterpart of SpdDownsampleMips_6_7. The early-out is spelled
// "mips < 8" here, which is equivalent to the non-packed "mips <= 7".
void SpdDownsampleMips_6_7H(FfxUInt32 x, FfxUInt32 y, FfxUInt32 mips, FfxUInt32 slice)
{
    FfxInt32x2   tex = FfxInt32x2(x * 4 + 0, y * 4 + 0);
    FfxInt32x2   pix = FfxInt32x2(x * 2 + 0, y * 2 + 0);
    FfxFloat16x4 v0  = SpdReduceLoad4H(tex, slice);
    SpdStoreH(pix, v0, 6, slice);

    tex             = FfxInt32x2(x * 4 + 2, y * 4 + 0);
    pix             = FfxInt32x2(x * 2 + 1, y * 2 + 0);
    FfxFloat16x4 v1 = SpdReduceLoad4H(tex, slice);
    SpdStoreH(pix, v1, 6, slice);

    tex             = FfxInt32x2(x * 4 + 0, y * 4 + 2);
    pix             = FfxInt32x2(x * 2 + 0, y * 2 + 1);
    FfxFloat16x4 v2 = SpdReduceLoad4H(tex, slice);
    SpdStoreH(pix, v2, 6, slice);

    tex             = FfxInt32x2(x * 4 + 2, y * 4 + 2);
    pix             = FfxInt32x2(x * 2 + 1, y * 2 + 1);
    FfxFloat16x4 v3 = SpdReduceLoad4H(tex, slice);
    SpdStoreH(pix, v3, 6, slice);

    if (mips < 8)
        return;
    // no barrier needed, working on values only from the same thread

    FfxFloat16x4 v = SpdReduce4H(v0, v1, v2, v3);
    SpdStoreH(FfxInt32x2(x, y), v, 7, slice);
    SpdStoreIntermediateH(x, y, v);
}

// 16-bit counterpart of SpdDownsampleNextFour.
void SpdDownsampleNextFourH(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 baseMip, FfxUInt32 mips, FfxUInt32 slice)
{
    if (mips <= baseMip)
        return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_2H(x, y, workGroupID, localInvocationIndex, baseMip, slice);

    if (mips <= baseMip + 1)
        return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_3H(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice);

    if (mips <= baseMip + 2)
        return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_4H(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice);

    if (mips <= baseMip + 3)
        return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_5H(workGroupID, localInvocationIndex, baseMip + 3, slice);
}

// Entry point of the packed (16-bit) downsampler; same flow as SpdDownsample.
// The early-out is spelled "mips < 7" here, equivalent to "mips <= 6".
void SpdDownsampleH(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice)
{
    FfxUInt32x2 sub_xy = ffxRemapForWaveReduction(localInvocationIndex % 64);
    FfxUInt32   x      = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2);
    FfxUInt32   y      = sub_xy.y + 8 * ((localInvocationIndex >> 7));

    SpdDownsampleMips_0_1H(x, y, workGroupID, localInvocationIndex, mips, slice);

    SpdDownsampleNextFourH(x, y, workGroupID, localInvocationIndex, 2, mips, slice);

    if (mips < 7)
        return;

    if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice))
        return;

    SpdResetAtomicCounter(slice);

    // After mip 6 there is only a single workgroup left that downsamples the remaining up to 64x64 texels.
    SpdDownsampleMips_6_7H(x, y, mips, slice);

    SpdDownsampleNextFourH(x, y, FfxUInt32x2(0, 0), localInvocationIndex, 8, mips, slice);
}

// Overload that shifts workGroupID by workGroupOffset before downsampling.
void SpdDownsampleH(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice, FfxUInt32x2 workGroupOffset)
{
    SpdDownsampleH(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice);
}

#endif  // #if FFX_HALF
#endif  // #ifdef FFX_GPU