Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/amd-fsr2/shaders/ffx_spd.h
9903 views
1
// This file is part of the FidelityFX SDK.
2
//
3
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
4
//
5
// Permission is hereby granted, free of charge, to any person obtaining a copy
6
// of this software and associated documentation files (the "Software"), to deal
7
// in the Software without restriction, including without limitation the rights
8
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
// copies of the Software, and to permit persons to whom the Software is
10
// furnished to do so, subject to the following conditions:
11
// The above copyright notice and this permission notice shall be included in
12
// all copies or substantial portions of the Software.
13
//
14
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20
// THE SOFTWARE.
21
22
#ifdef FFX_CPU
23
// CPU-side setup for SPD: derives the dispatch dimensions and the GPU-side
// constants (work group offset, work group count, mip count) from a source
// rectangle. Each SPD workgroup covers a 64x64 tile of the source image.
FFX_STATIC void SpdSetup(FfxUInt32x2 dispatchThreadGroupCountXY, // CPU side: dispatch thread group count xy
                         FfxUInt32x2 workGroupOffset,            // GPU side: pass in as constant
                         FfxUInt32x2 numWorkGroupsAndMips,       // GPU side: pass in as constant
                         FfxUInt32x4 rectInfo,                   // left, top, width, height
                         FfxInt32    mips)                       // optional: if -1, calculate based on rect width and height
{
    // First 64x64 tile touched by the rect (tile containing its top-left corner).
    workGroupOffset[0] = rectInfo[0] / 64; // rectInfo[0] = left
    workGroupOffset[1] = rectInfo[1] / 64; // rectInfo[1] = top

    // Last 64x64 tile touched by the rect (inclusive index).
    FfxUInt32 endIndexX = (rectInfo[0] + rectInfo[2] - 1) / 64; // rectInfo[0] = left, rectInfo[2] = width
    FfxUInt32 endIndexY = (rectInfo[1] + rectInfo[3] - 1) / 64; // rectInfo[1] = top, rectInfo[3] = height

    dispatchThreadGroupCountXY[0] = endIndexX + 1 - workGroupOffset[0];
    dispatchThreadGroupCountXY[1] = endIndexY + 1 - workGroupOffset[1];

    // Total workgroup count; consumed on the GPU by SpdExitWorkgroup's atomic counter test.
    numWorkGroupsAndMips[0] = (dispatchThreadGroupCountXY[0]) * (dispatchThreadGroupCountXY[1]);

    if (mips >= 0)
    {
        // Caller-specified mip count.
        numWorkGroupsAndMips[1] = FfxUInt32(mips);
    }
    else
    {
        // calculate based on rect width and height, capped at 12 mips
        FfxUInt32 resolution    = ffxMax(rectInfo[2], rectInfo[3]);
        numWorkGroupsAndMips[1] = FfxUInt32((ffxMin(floor(log2(FfxFloat32(resolution))), FfxFloat32(12))));
    }
}
51
52
// Convenience overload: same as above but derives the mip count from the rect
// dimensions (forwards with mips == -1).
FFX_STATIC void SpdSetup(FfxUInt32x2 dispatchThreadGroupCountXY, // CPU side: dispatch thread group count xy
                         FfxUInt32x2 workGroupOffset,            // GPU side: pass in as constant
                         FfxUInt32x2 numWorkGroupsAndMips,       // GPU side: pass in as constant
                         FfxUInt32x4 rectInfo)                   // left, top, width, height
{
    SpdSetup(dispatchThreadGroupCountXY, workGroupOffset, numWorkGroupsAndMips, rectInfo, -1);
}
59
#endif // #ifdef FFX_CPU
60
61
62
//==============================================================================================================================
63
// NON-PACKED VERSION
64
//==============================================================================================================================
65
#ifdef FFX_GPU
66
#ifdef SPD_PACKED_ONLY
67
// Avoid compiler error
68
// Stub for SPD_PACKED_ONLY builds: the non-packed path is unused but must
// still compile. Returns zero.
FfxFloat32x4 SpdLoadSourceImage(FfxInt32x2 p, FfxUInt32 slice)
{
    return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
}
72
73
// Stub for SPD_PACKED_ONLY builds (non-packed path unused). Returns zero.
FfxFloat32x4 SpdLoad(FfxInt32x2 p, FfxUInt32 slice)
{
    return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
}
77
// No-op stub for SPD_PACKED_ONLY builds (non-packed path unused).
void SpdStore(FfxInt32x2 p, FfxFloat32x4 value, FfxUInt32 mip, FfxUInt32 slice)
{
}
80
// Stub for SPD_PACKED_ONLY builds (non-packed path unused). Returns zero.
FfxFloat32x4 SpdLoadIntermediate(FfxUInt32 x, FfxUInt32 y)
{
    return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
}
84
// No-op stub for SPD_PACKED_ONLY builds (non-packed path unused).
void SpdStoreIntermediate(FfxUInt32 x, FfxUInt32 y, FfxFloat32x4 value)
{
}
87
// Stub for SPD_PACKED_ONLY builds (non-packed path unused). Returns zero.
FfxFloat32x4 SpdReduce4(FfxFloat32x4 v0, FfxFloat32x4 v1, FfxFloat32x4 v2, FfxFloat32x4 v3)
{
    return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
}
91
#endif // #ifdef SPD_PACKED_ONLY
92
93
//_____________________________________________________________/\_______________________________________________________________
94
#if defined(FFX_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
95
#extension GL_KHR_shader_subgroup_quad:require
96
#endif
97
98
// Workgroup-wide synchronization barrier, abstracted over the target shading
// language (GLSL barrier() / HLSL GroupMemoryBarrierWithGroupSync()). Used
// between downsample stages that communicate through LDS/groupshared memory.
void SpdWorkgroupShuffleBarrier()
{
#ifdef FFX_GLSL
    barrier();
#endif
#ifdef FFX_HLSL
    GroupMemoryBarrierWithGroupSync();
#endif
}
107
108
// Only last active workgroup should proceed
109
// Only last active workgroup should proceed.
// Each workgroup bumps a global atomic counter once (thread 0); the workgroup
// that observes the final value (numWorkGroups - 1, presumably the value the
// SpdIncreaseAtomicCounter callback caches for SpdGetAtomicCounter — see the
// user-provided implementations) is the last to finish and returns false so it
// can downsample the remaining mips.
bool SpdExitWorkgroup(FfxUInt32 numWorkGroups, FfxUInt32 localInvocationIndex, FfxUInt32 slice)
{
    // global atomic counter
    if (localInvocationIndex == 0)
    {
        SpdIncreaseAtomicCounter(slice);
    }

    // Make the value read by thread 0 visible to the whole workgroup.
    SpdWorkgroupShuffleBarrier();
    return (SpdGetAtomicCounter() != (numWorkGroups - 1));
}
120
121
// User defined: FfxFloat32x4 SpdReduce4(FfxFloat32x4 v0, FfxFloat32x4 v1, FfxFloat32x4 v2, FfxFloat32x4 v3);
122
FfxFloat32x4 SpdReduceQuad(FfxFloat32x4 v)
123
{
124
#if defined(FFX_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
125
126
FfxFloat32x4 v0 = v;
127
FfxFloat32x4 v1 = subgroupQuadSwapHorizontal(v);
128
FfxFloat32x4 v2 = subgroupQuadSwapVertical(v);
129
FfxFloat32x4 v3 = subgroupQuadSwapDiagonal(v);
130
return SpdReduce4(v0, v1, v2, v3);
131
132
#elif defined(FFX_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
133
134
// requires SM6.0
135
FfxUInt32 quad = WaveGetLaneIndex() & (~0x3);
136
FfxFloat32x4 v0 = v;
137
FfxFloat32x4 v1 = WaveReadLaneAt(v, quad | 1);
138
FfxFloat32x4 v2 = WaveReadLaneAt(v, quad | 2);
139
FfxFloat32x4 v3 = WaveReadLaneAt(v, quad | 3);
140
return SpdReduce4(v0, v1, v2, v3);
141
/*
142
// if SM6.0 is not available, you can use the AMD shader intrinsics
143
// the AMD shader intrinsics are available in AMD GPU Services (AGS) library:
144
// https://gpuopen.com/amd-gpu-services-ags-library/
145
// works for DX11
146
FfxFloat32x4 v0 = v;
147
FfxFloat32x4 v1;
148
v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
149
v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
150
v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
151
v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
152
FfxFloat32x4 v2;
153
v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
154
v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
155
v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
156
v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
157
FfxFloat32x4 v3;
158
v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
159
v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
160
v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
161
v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
162
return SpdReduce4(v0, v1, v2, v3);
163
*/
164
#endif
165
return v;
166
}
167
168
// Fetches four values from the LDS intermediate buffer at the given
// coordinates and reduces them to a single value with the user's SpdReduce4.
FfxFloat32x4 SpdReduceIntermediate(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3)
{
    FfxFloat32x4 a = SpdLoadIntermediate(i0.x, i0.y);
    FfxFloat32x4 b = SpdLoadIntermediate(i1.x, i1.y);
    FfxFloat32x4 c = SpdLoadIntermediate(i2.x, i2.y);
    FfxFloat32x4 d = SpdLoadIntermediate(i3.x, i3.y);
    return SpdReduce4(a, b, c, d);
}
176
177
// Loads four texels from the previously written mip (via the user's SpdLoad)
// and reduces them to a single value.
FfxFloat32x4 SpdReduceLoad4(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
{
    FfxFloat32x4 a = SpdLoad(FfxInt32x2(i0), slice);
    FfxFloat32x4 b = SpdLoad(FfxInt32x2(i1), slice);
    FfxFloat32x4 c = SpdLoad(FfxInt32x2(i2), slice);
    FfxFloat32x4 d = SpdLoad(FfxInt32x2(i3), slice);
    return SpdReduce4(a, b, c, d);
}
185
186
// Convenience overload: loads and reduces the 2x2 texel quad whose top-left
// corner is `base`.
FfxFloat32x4 SpdReduceLoad4(FfxUInt32x2 base, FfxUInt32 slice)
{
    FfxUInt32x2 p00 = base + FfxUInt32x2(0, 0);
    FfxUInt32x2 p01 = base + FfxUInt32x2(0, 1);
    FfxUInt32x2 p10 = base + FfxUInt32x2(1, 0);
    FfxUInt32x2 p11 = base + FfxUInt32x2(1, 1);
    return SpdReduceLoad4(p00, p01, p10, p11, slice);
}
190
191
// Loads four texels from the source image (via the user's SpdLoadSourceImage)
// and reduces them to a single value.
FfxFloat32x4 SpdReduceLoadSourceImage4(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
{
    FfxFloat32x4 a = SpdLoadSourceImage(FfxInt32x2(i0), slice);
    FfxFloat32x4 b = SpdLoadSourceImage(FfxInt32x2(i1), slice);
    FfxFloat32x4 c = SpdLoadSourceImage(FfxInt32x2(i2), slice);
    FfxFloat32x4 d = SpdLoadSourceImage(FfxInt32x2(i3), slice);
    return SpdReduce4(a, b, c, d);
}
199
200
// Reduces the 2x2 source-image quad at `base`. With SPD_LINEAR_SAMPLER a
// single fetch suffices (the user's SpdLoadSourceImage presumably samples
// with a linear filter between the four texels — see its implementation);
// otherwise the four texels are loaded and reduced explicitly.
FfxFloat32x4 SpdReduceLoadSourceImage(FfxUInt32x2 base, FfxUInt32 slice)
{
#ifdef SPD_LINEAR_SAMPLER
    return SpdLoadSourceImage(FfxInt32x2(base), slice);
#else
    FfxUInt32x2 p00 = base + FfxUInt32x2(0, 0);
    FfxUInt32x2 p01 = base + FfxUInt32x2(0, 1);
    FfxUInt32x2 p10 = base + FfxUInt32x2(1, 0);
    FfxUInt32x2 p11 = base + FfxUInt32x2(1, 1);
    return SpdReduceLoadSourceImage4(p00, p01, p10, p11, slice);
#endif
}
208
209
// Computes mips 0 and 1 using wave/subgroup quad operations. Each thread
// handles one 2x2 source quad in each of the four 32x32 quadrants of this
// workgroup's 64x64 source tile: it writes the reduced quad to mip 0, then
// reduces across its subgroup quad so one lane in four writes mip 1 and the
// LDS intermediate buffer for the following stages.
void SpdDownsampleMips_0_1_Intrinsics(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
    FfxFloat32x4 v[4];

    // Top-left 32x32 quadrant of the tile.
    FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
    FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
    v[0] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[0], 0, slice);

    // Top-right quadrant.
    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
    v[1] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[1], 0, slice);

    // Bottom-left quadrant.
    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
    v[2] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[2], 0, slice);

    // Bottom-right quadrant.
    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
    v[3] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[3], 0, slice);

    // Done if only mip 0 (or nothing beyond it) was requested.
    if (mip <= 1)
        return;

    // Reduce across the 2x2 subgroup quad each lane belongs to.
    v[0] = SpdReduceQuad(v[0]);
    v[1] = SpdReduceQuad(v[1]);
    v[2] = SpdReduceQuad(v[2]);
    v[3] = SpdReduceQuad(v[3]);

    // One lane per quad writes mip 1 and seeds the LDS intermediate buffer
    // used by the mip 2 stage.
    if ((localInvocationIndex % 4) == 0)
    {
        SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2), v[0], 1, slice);
        SpdStoreIntermediate(x / 2, y / 2, v[0]);

        SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2), v[1], 1, slice);
        SpdStoreIntermediate(x / 2 + 8, y / 2, v[1]);

        SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2 + 8), v[2], 1, slice);
        SpdStoreIntermediate(x / 2, y / 2 + 8, v[2]);

        SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2 + 8), v[3], 1, slice);
        SpdStoreIntermediate(x / 2 + 8, y / 2 + 8, v[3]);
    }
}
256
257
// Computes mips 0 and 1 without wave operations, using the LDS intermediate
// buffer and workgroup barriers instead of subgroup quad swaps. Mirrors the
// intrinsics variant: four 32x32 quadrants per thread for mip 0, then an
// LDS-based 2x2 reduction for mip 1.
void SpdDownsampleMips_0_1_LDS(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
    FfxFloat32x4 v[4];

    // Top-left 32x32 quadrant of the 64x64 tile.
    FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
    FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
    v[0] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[0], 0, slice);

    // Top-right quadrant.
    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
    v[1] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[1], 0, slice);

    // Bottom-left quadrant.
    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
    v[2] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[2], 0, slice);

    // Bottom-right quadrant.
    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
    v[3] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[3], 0, slice);

    // Done if only mip 0 (or nothing beyond it) was requested.
    if (mip <= 1)
        return;

    // For each quadrant: publish every thread's value to LDS, barrier, then
    // let the first 64 threads reduce 2x2 neighborhoods and write mip 1 into
    // the quadrant's 8x8 sub-tile. The trailing barrier protects the next
    // iteration's LDS overwrite.
    for (FfxUInt32 i = 0; i < 4; i++)
    {
        SpdStoreIntermediate(x, y, v[i]);
        SpdWorkgroupShuffleBarrier();
        if (localInvocationIndex < 64)
        {
            v[i] = SpdReduceIntermediate(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
            SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice);
        }
        SpdWorkgroupShuffleBarrier();
    }

    // Seed the LDS intermediate buffer with the mip 1 values for the mip 2 stage.
    if (localInvocationIndex < 64)
    {
        SpdStoreIntermediate(x + 0, y + 0, v[0]);
        SpdStoreIntermediate(x + 8, y + 0, v[1]);
        SpdStoreIntermediate(x + 0, y + 8, v[2]);
        SpdStoreIntermediate(x + 8, y + 8, v[3]);
    }
}
304
305
// Dispatches the mips 0/1 stage to the LDS fallback or the wave-intrinsics
// implementation, chosen at compile time via SPD_NO_WAVE_OPERATIONS.
void SpdDownsampleMips_0_1(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    SpdDownsampleMips_0_1_LDS(x, y, workGroupID, localInvocationIndex, mip, slice);
#else
    SpdDownsampleMips_0_1_Intrinsics(x, y, workGroupID, localInvocationIndex, mip, slice);
#endif
}
313
314
315
// Computes the mip level `mip` (mip 2 on the first pass) from the values the
// previous stage left in the LDS intermediate buffer, and writes the results
// back to LDS in a sparse layout the mip 3 stage expects.
void SpdDownsampleMip_2(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    // LDS path: the first 64 threads each reduce a 2x2 LDS neighborhood.
    if (localInvocationIndex < 64)
    {
        FfxFloat32x4 v = SpdReduceIntermediate(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
        SpdStore(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x, y), v, mip, slice);
        // store to LDS, try to reduce bank conflicts
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        // ...
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        SpdStoreIntermediate(x * 2 + y % 2, y * 2, v);
    }
#else
    // Wave path: reduce across subgroup quads.
    FfxFloat32x4 v = SpdLoadIntermediate(x, y);
    v = SpdReduceQuad(v);
    // quad index 0 stores result
    if (localInvocationIndex % 4 == 0)
    {
        SpdStore(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
        SpdStoreIntermediate(x + (y / 2) % 2, y, v);
    }
#endif
}
343
344
// Computes mip level `mip` (mip 3 on the first pass) from the sparse LDS
// layout produced by SpdDownsampleMip_2, writing the results back to LDS in
// the sparser layout the mip 4 stage expects.
void SpdDownsampleMip_3(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    // LDS path: 16 threads, each reducing four values from the sparse layout
    // left by the previous stage:
    // x 0 x 0
    // 0 0 0 0
    // 0 x 0 x
    // 0 0 0 0
    if (localInvocationIndex < 16)
    {
        FfxFloat32x4 v =
            SpdReduceIntermediate(FfxUInt32x2(x * 4 + 0 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 2 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 0 + 1, y * 4 + 2), FfxUInt32x2(x * 4 + 2 + 1, y * 4 + 2));
        SpdStore(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x, y), v, mip, slice);
        // store to LDS
        // x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0
        // ...
        // 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0
        // ...
        // 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x
        // ...
        SpdStoreIntermediate(x * 4 + y, y * 4, v);
    }
#else
    // Wave path: 64 threads load from the mip 2 layout and reduce across
    // subgroup quads.
    if (localInvocationIndex < 64)
    {
        FfxFloat32x4 v = SpdLoadIntermediate(x * 2 + y % 2, y * 2);
        v = SpdReduceQuad(v);
        // quad index 0 stores result
        if (localInvocationIndex % 4 == 0)
        {
            SpdStore(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
            SpdStoreIntermediate(x * 2 + y / 2, y * 2, v);
        }
    }
#endif
}
383
384
// Computes mip level `mip` (mip 4 on the first pass) from the sparse LDS
// layout produced by SpdDownsampleMip_3, packing the results into a single
// LDS row for the mip 5 stage.
void SpdDownsampleMip_4(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    // LDS path: 4 threads, each reducing four values from the sparse layout
    // left by the previous stage:
    // x 0 0 0 x 0 0 0
    // ...
    // 0 x 0 0 0 x 0 0
    if (localInvocationIndex < 4)
    {
        FfxFloat32x4 v = SpdReduceIntermediate(FfxUInt32x2(x * 8 + 0 + 0 + y * 2, y * 8 + 0),
                                               FfxUInt32x2(x * 8 + 4 + 0 + y * 2, y * 8 + 0),
                                               FfxUInt32x2(x * 8 + 0 + 1 + y * 2, y * 8 + 4),
                                               FfxUInt32x2(x * 8 + 4 + 1 + y * 2, y * 8 + 4));
        SpdStore(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x, y), v, mip, slice);
        // store to LDS
        // x x x x 0 ...
        // 0 ...
        SpdStoreIntermediate(x + y * 2, 0, v);
    }
#else
    // Wave path: 16 threads load from the mip 3 layout and reduce across
    // subgroup quads.
    if (localInvocationIndex < 16)
    {
        FfxFloat32x4 v = SpdLoadIntermediate(x * 4 + y, y * 4);
        v = SpdReduceQuad(v);
        // quad index 0 stores result
        if (localInvocationIndex % 4 == 0)
        {
            SpdStore(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
            SpdStoreIntermediate(x / 2 + y, 0, v);
        }
    }
#endif
}
416
417
// Computes mip level `mip` (mip 5 on the first pass): reduces the four values
// packed into LDS row 0 by SpdDownsampleMip_4 into the single texel this
// workgroup contributes to the mip.
void SpdDownsampleMip_5(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    // LDS path: a single thread reduces the packed row:
    // x x x x 0 ...
    // 0 ...
    if (localInvocationIndex < 1)
    {
        FfxFloat32x4 v = SpdReduceIntermediate(FfxUInt32x2(0, 0), FfxUInt32x2(1, 0), FfxUInt32x2(2, 0), FfxUInt32x2(3, 0));
        SpdStore(FfxInt32x2(workGroupID.xy), v, mip, slice);
    }
#else
    // Wave path: the first four threads form one subgroup quad.
    if (localInvocationIndex < 4)
    {
        FfxFloat32x4 v = SpdLoadIntermediate(localInvocationIndex, 0);
        v = SpdReduceQuad(v);
        // quad index 0 stores result
        if (localInvocationIndex % 4 == 0)
        {
            SpdStore(FfxInt32x2(workGroupID.xy), v, mip, slice);
        }
    }
#endif
}
440
441
// Runs only in the single surviving workgroup (after SpdExitWorkgroup).
// Computes mips 6 and 7 by re-reading mip 5 from the output texture via
// SpdLoad: each thread reduces four 2x2 quads for mip 6, then folds those
// four results into one mip 7 texel, seeding LDS for the remaining mips.
void SpdDownsampleMips_6_7(FfxUInt32 x, FfxUInt32 y, FfxUInt32 mips, FfxUInt32 slice)
{
    FfxInt32x2 tex = FfxInt32x2(x * 4 + 0, y * 4 + 0);
    FfxInt32x2 pix = FfxInt32x2(x * 2 + 0, y * 2 + 0);
    FfxFloat32x4 v0 = SpdReduceLoad4(tex, slice);
    SpdStore(pix, v0, 6, slice);

    tex = FfxInt32x2(x * 4 + 2, y * 4 + 0);
    pix = FfxInt32x2(x * 2 + 1, y * 2 + 0);
    FfxFloat32x4 v1 = SpdReduceLoad4(tex, slice);
    SpdStore(pix, v1, 6, slice);

    tex = FfxInt32x2(x * 4 + 0, y * 4 + 2);
    pix = FfxInt32x2(x * 2 + 0, y * 2 + 1);
    FfxFloat32x4 v2 = SpdReduceLoad4(tex, slice);
    SpdStore(pix, v2, 6, slice);

    tex = FfxInt32x2(x * 4 + 2, y * 4 + 2);
    pix = FfxInt32x2(x * 2 + 1, y * 2 + 1);
    FfxFloat32x4 v3 = SpdReduceLoad4(tex, slice);
    SpdStore(pix, v3, 6, slice);

    if (mips <= 7)
        return;
    // no barrier needed, working on values only from the same thread

    FfxFloat32x4 v = SpdReduce4(v0, v1, v2, v3);
    SpdStore(FfxInt32x2(x, y), v, 7, slice);
    SpdStoreIntermediate(x, y, v);
}
471
472
// Downsamples up to four consecutive mip levels starting at baseMip (mips 2-5
// on the first pass, 8-11 on the last-workgroup pass), stopping early once
// `mips` levels have been produced. Each stage reads LDS values written by
// the previous one, so a workgroup barrier precedes every stage.
void SpdDownsampleNextFour(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 baseMip, FfxUInt32 mips, FfxUInt32 slice)
{
    if (mips <= baseMip)
        return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_2(x, y, workGroupID, localInvocationIndex, baseMip, slice);

    if (mips <= baseMip + 1)
        return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_3(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice);

    if (mips <= baseMip + 2)
        return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_4(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice);

    if (mips <= baseMip + 3)
        return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_5(workGroupID, localInvocationIndex, baseMip + 3, slice);
}
494
495
// SPD entry point (non-packed, full precision). Each 256-thread workgroup
// downsamples its 64x64 source tile through mips 0-5; the last workgroup to
// finish then downsamples the combined result through mips 6-11.
// `mips` is the number of levels to produce, `numWorkGroups` the total
// dispatched group count (both from SpdSetup).
void SpdDownsample(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice)
{
    // Remap the flat thread index to swizzled 2D coordinates within the tile
    // so that subgroup quad lanes correspond to 2x2 texel quads.
    FfxUInt32x2 sub_xy = ffxRemapForWaveReduction(localInvocationIndex % 64);
    FfxUInt32 x = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2);
    FfxUInt32 y = sub_xy.y + 8 * ((localInvocationIndex >> 7));
    SpdDownsampleMips_0_1(x, y, workGroupID, localInvocationIndex, mips, slice);

    SpdDownsampleNextFour(x, y, workGroupID, localInvocationIndex, 2, mips, slice);

    if (mips <= 6)
        return;

    // All workgroups except the last to finish exit here.
    if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice))
        return;

    // Reset the counter so it is zero again for the next dispatch.
    SpdResetAtomicCounter(slice);

    // After mip 6 there is only a single workgroup left that downsamples the remaining up to 64x64 texels.
    SpdDownsampleMips_6_7(x, y, mips, slice);

    SpdDownsampleNextFour(x, y, FfxUInt32x2(0, 0), localInvocationIndex, 8, mips, slice);
}
517
518
// Overload applying the work group offset computed by SpdSetup before
// delegating to the main entry point.
void SpdDownsample(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice, FfxUInt32x2 workGroupOffset)
{
    FfxUInt32x2 offsetWorkGroupID = workGroupID + workGroupOffset;
    SpdDownsample(offsetWorkGroupID, localInvocationIndex, mips, numWorkGroups, slice);
}
522
523
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
524
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
525
526
//==============================================================================================================================
527
// PACKED VERSION
528
//==============================================================================================================================
529
530
#if FFX_HALF
531
532
#ifdef FFX_GLSL
533
#extension GL_EXT_shader_subgroup_extended_types_float16:require
534
#endif
535
536
// Half-precision counterpart of SpdReduceQuad: reduces the four values held
// by the lanes of a subgroup quad into one value using wave/subgroup
// operations. The trailing fallback return is only reached when no wave-op
// path is compiled in, in which case this function is never called (the LDS
// paths are used instead).
// Fix: the fallback previously returned FfxFloat16x4(0,0,0,0), inconsistent
// with the full-precision SpdReduceQuad which returns v unchanged; it now
// matches (both returns are unreachable in practice).
FfxFloat16x4 SpdReduceQuadH(FfxFloat16x4 v)
{
#if defined(FFX_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
    // Gather the values of the other three quad lanes via quad swaps.
    FfxFloat16x4 v0 = v;
    FfxFloat16x4 v1 = subgroupQuadSwapHorizontal(v);
    FfxFloat16x4 v2 = subgroupQuadSwapVertical(v);
    FfxFloat16x4 v3 = subgroupQuadSwapDiagonal(v);
    return SpdReduce4H(v0, v1, v2, v3);
#elif defined(FFX_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
    // requires SM6.0
    // Read the other three lanes of this quad explicitly by lane index.
    FfxUInt32 quad = WaveGetLaneIndex() & (~0x3);
    FfxFloat16x4 v0 = v;
    FfxFloat16x4 v1 = WaveReadLaneAt(v, quad | 1);
    FfxFloat16x4 v2 = WaveReadLaneAt(v, quad | 2);
    FfxFloat16x4 v3 = WaveReadLaneAt(v, quad | 3);
    return SpdReduce4H(v0, v1, v2, v3);
/*
    // if SM6.0 is not available, you can use the AMD shader intrinsics
    // the AMD shader intrinsics are available in AMD GPU Services (AGS) library:
    // https://gpuopen.com/amd-gpu-services-ags-library/
    // works for DX11
    FfxFloat16x4 v0 = v;
    FfxFloat16x4 v1;
    v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
    v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
    v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
    v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
    FfxFloat16x4 v2;
    v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
    v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
    v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
    v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
    FfxFloat16x4 v3;
    v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
    v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
    v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
    v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
    return SpdReduce4H(v0, v1, v2, v3);
*/
#endif
    return v;
}
578
579
// Half-precision: fetches four values from the LDS intermediate buffer and
// reduces them to a single value with the user's SpdReduce4H.
FfxFloat16x4 SpdReduceIntermediateH(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3)
{
    FfxFloat16x4 a = SpdLoadIntermediateH(i0.x, i0.y);
    FfxFloat16x4 b = SpdLoadIntermediateH(i1.x, i1.y);
    FfxFloat16x4 c = SpdLoadIntermediateH(i2.x, i2.y);
    FfxFloat16x4 d = SpdLoadIntermediateH(i3.x, i3.y);
    return SpdReduce4H(a, b, c, d);
}
587
588
// Half-precision: loads four texels from the previously written mip (via the
// user's SpdLoadH) and reduces them to a single value.
FfxFloat16x4 SpdReduceLoad4H(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
{
    FfxFloat16x4 a = SpdLoadH(FfxInt32x2(i0), slice);
    FfxFloat16x4 b = SpdLoadH(FfxInt32x2(i1), slice);
    FfxFloat16x4 c = SpdLoadH(FfxInt32x2(i2), slice);
    FfxFloat16x4 d = SpdLoadH(FfxInt32x2(i3), slice);
    return SpdReduce4H(a, b, c, d);
}
596
597
// Half-precision convenience overload: loads and reduces the 2x2 texel quad
// whose top-left corner is `base`.
FfxFloat16x4 SpdReduceLoad4H(FfxUInt32x2 base, FfxUInt32 slice)
{
    FfxUInt32x2 p00 = base + FfxUInt32x2(0, 0);
    FfxUInt32x2 p01 = base + FfxUInt32x2(0, 1);
    FfxUInt32x2 p10 = base + FfxUInt32x2(1, 0);
    FfxUInt32x2 p11 = base + FfxUInt32x2(1, 1);
    return SpdReduceLoad4H(p00, p01, p10, p11, slice);
}
601
602
// Half-precision: loads four texels from the source image (via the user's
// SpdLoadSourceImageH) and reduces them to a single value.
FfxFloat16x4 SpdReduceLoadSourceImage4H(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
{
    FfxFloat16x4 a = SpdLoadSourceImageH(FfxInt32x2(i0), slice);
    FfxFloat16x4 b = SpdLoadSourceImageH(FfxInt32x2(i1), slice);
    FfxFloat16x4 c = SpdLoadSourceImageH(FfxInt32x2(i2), slice);
    FfxFloat16x4 d = SpdLoadSourceImageH(FfxInt32x2(i3), slice);
    return SpdReduce4H(a, b, c, d);
}
610
611
// Half-precision: reduces the 2x2 source-image quad at `base`. With
// SPD_LINEAR_SAMPLER a single fetch suffices (the user's SpdLoadSourceImageH
// presumably samples with a linear filter between the four texels — see its
// implementation); otherwise the four texels are loaded and reduced explicitly.
FfxFloat16x4 SpdReduceLoadSourceImageH(FfxUInt32x2 base, FfxUInt32 slice)
{
#ifdef SPD_LINEAR_SAMPLER
    return SpdLoadSourceImageH(FfxInt32x2(base), slice);
#else
    FfxUInt32x2 p00 = base + FfxUInt32x2(0, 0);
    FfxUInt32x2 p01 = base + FfxUInt32x2(0, 1);
    FfxUInt32x2 p10 = base + FfxUInt32x2(1, 0);
    FfxUInt32x2 p11 = base + FfxUInt32x2(1, 1);
    return SpdReduceLoadSourceImage4H(p00, p01, p10, p11, slice);
#endif
}
619
620
// Half-precision counterpart of SpdDownsampleMips_0_1_Intrinsics: computes
// mips 0 and 1 using wave/subgroup quad operations. Each thread handles one
// 2x2 source quad in each of the four 32x32 quadrants of the workgroup's
// 64x64 tile, writes mip 0, then reduces across its subgroup quad so one
// lane in four writes mip 1 and the LDS intermediate buffer.
void SpdDownsampleMips_0_1_IntrinsicsH(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice)
{
    FfxFloat16x4 v[4];

    // Top-left 32x32 quadrant of the tile.
    FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
    FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
    v[0] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[0], 0, slice);

    // Top-right quadrant.
    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
    v[1] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[1], 0, slice);

    // Bottom-left quadrant.
    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
    v[2] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[2], 0, slice);

    // Bottom-right quadrant.
    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
    v[3] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[3], 0, slice);

    // Done if only mip 0 (or nothing beyond it) was requested.
    if (mips <= 1)
        return;

    // Reduce across the 2x2 subgroup quad each lane belongs to.
    v[0] = SpdReduceQuadH(v[0]);
    v[1] = SpdReduceQuadH(v[1]);
    v[2] = SpdReduceQuadH(v[2]);
    v[3] = SpdReduceQuadH(v[3]);

    // One lane per quad writes mip 1 and seeds the LDS intermediate buffer
    // used by the mip 2 stage.
    if ((localInvocationIndex % 4) == 0)
    {
        SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2), v[0], 1, slice);
        SpdStoreIntermediateH(x / 2, y / 2, v[0]);

        SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2), v[1], 1, slice);
        SpdStoreIntermediateH(x / 2 + 8, y / 2, v[1]);

        SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2 + 8), v[2], 1, slice);
        SpdStoreIntermediateH(x / 2, y / 2 + 8, v[2]);

        SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2 + 8), v[3], 1, slice);
        SpdStoreIntermediateH(x / 2 + 8, y / 2 + 8, v[3]);
    }
}
667
668
// Half-precision counterpart of SpdDownsampleMips_0_1_LDS: computes mips 0
// and 1 without wave operations, using the LDS intermediate buffer and
// workgroup barriers instead of subgroup quad swaps.
void SpdDownsampleMips_0_1_LDSH(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice)
{
    FfxFloat16x4 v[4];

    // Top-left 32x32 quadrant of the 64x64 tile.
    FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
    FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
    v[0] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[0], 0, slice);

    // Top-right quadrant.
    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
    v[1] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[1], 0, slice);

    // Bottom-left quadrant.
    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
    v[2] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[2], 0, slice);

    // Bottom-right quadrant.
    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
    v[3] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[3], 0, slice);

    // Done if only mip 0 (or nothing beyond it) was requested.
    if (mips <= 1)
        return;

    // For each quadrant: publish every thread's value to LDS, barrier, then
    // let the first 64 threads reduce 2x2 neighborhoods and write mip 1 into
    // the quadrant's 8x8 sub-tile. The trailing barrier protects the next
    // iteration's LDS overwrite.
    for (FfxInt32 i = 0; i < 4; i++)
    {
        SpdStoreIntermediateH(x, y, v[i]);
        SpdWorkgroupShuffleBarrier();
        if (localInvocationIndex < 64)
        {
            v[i] = SpdReduceIntermediateH(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
            SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice);
        }
        SpdWorkgroupShuffleBarrier();
    }

    // Seed the LDS intermediate buffer with the mip 1 values for the mip 2 stage.
    if (localInvocationIndex < 64)
    {
        SpdStoreIntermediateH(x + 0, y + 0, v[0]);
        SpdStoreIntermediateH(x + 8, y + 0, v[1]);
        SpdStoreIntermediateH(x + 0, y + 8, v[2]);
        SpdStoreIntermediateH(x + 8, y + 8, v[3]);
    }
}
715
716
// Half-precision dispatcher for the mips 0/1 stage: selects the LDS fallback
// or the wave-intrinsics implementation at compile time.
void SpdDownsampleMips_0_1H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    SpdDownsampleMips_0_1_LDSH(x, y, workGroupID, localInvocationIndex, mips, slice);
#else
    SpdDownsampleMips_0_1_IntrinsicsH(x, y, workGroupID, localInvocationIndex, mips, slice);
#endif
}
724
725
726
// Half-precision counterpart of SpdDownsampleMip_2: computes mip level `mip`
// from the LDS intermediate buffer and writes the results back to LDS in the
// sparse layout the mip 3 stage expects.
void SpdDownsampleMip_2H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    // LDS path: the first 64 threads each reduce a 2x2 LDS neighborhood.
    if (localInvocationIndex < 64)
    {
        FfxFloat16x4 v = SpdReduceIntermediateH(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
        SpdStoreH(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x, y), v, mip, slice);
        // store to LDS, try to reduce bank conflicts
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        // ...
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        SpdStoreIntermediateH(x * 2 + y % 2, y * 2, v);
    }
#else
    // Wave path: reduce across subgroup quads.
    FfxFloat16x4 v = SpdLoadIntermediateH(x, y);
    v = SpdReduceQuadH(v);
    // quad index 0 stores result
    if (localInvocationIndex % 4 == 0)
    {
        SpdStoreH(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
        SpdStoreIntermediateH(x + (y / 2) % 2, y, v);
    }
#endif
}
754
755
// Half-precision counterpart of SpdDownsampleMip_3: computes mip level `mip`
// from the sparse LDS layout produced by SpdDownsampleMip_2H, writing results
// back to LDS in the sparser layout the mip 4 stage expects.
void SpdDownsampleMip_3H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    // LDS path: 16 threads, each reducing four values from the sparse layout
    // left by the previous stage:
    // x 0 x 0
    // 0 0 0 0
    // 0 x 0 x
    // 0 0 0 0
    if (localInvocationIndex < 16)
    {
        FfxFloat16x4 v =
            SpdReduceIntermediateH(FfxUInt32x2(x * 4 + 0 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 2 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 0 + 1, y * 4 + 2), FfxUInt32x2(x * 4 + 2 + 1, y * 4 + 2));
        SpdStoreH(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x, y), v, mip, slice);
        // store to LDS
        // x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0
        // ...
        // 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0
        // ...
        // 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x
        // ...
        SpdStoreIntermediateH(x * 4 + y, y * 4, v);
    }
#else
    // Wave path: 64 threads load from the mip 2 layout and reduce across
    // subgroup quads.
    if (localInvocationIndex < 64)
    {
        FfxFloat16x4 v = SpdLoadIntermediateH(x * 2 + y % 2, y * 2);
        v = SpdReduceQuadH(v);
        // quad index 0 stores result
        if (localInvocationIndex % 4 == 0)
        {
            SpdStoreH(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
            SpdStoreIntermediateH(x * 2 + y / 2, y * 2, v);
        }
    }
#endif
}
794
795
void SpdDownsampleMip_4H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
    // FP16 path. Reduces the LDS values written by SpdDownsampleMip_3H into a
    // 2x2 output tile of level 'mip' (stored at workGroupID.xy * 2), packing the
    // four results into row 0 of LDS for the final stage.
#ifdef SPD_NO_WAVE_OPERATIONS
    // Fallback without wave/subgroup intrinsics: 4 threads, each reading the
    // diagonal pattern left by the previous stage:
    if (localInvocationIndex < 4)
    {
        // x 0 0 0 x 0 0 0
        // ...
        // 0 x 0 0 0 x 0 0
        FfxFloat16x4 v = SpdReduceIntermediateH(FfxUInt32x2(x * 8 + 0 + 0 + y * 2, y * 8 + 0),
                                                FfxUInt32x2(x * 8 + 4 + 0 + y * 2, y * 8 + 0),
                                                FfxUInt32x2(x * 8 + 0 + 1 + y * 2, y * 8 + 4),
                                                FfxUInt32x2(x * 8 + 4 + 1 + y * 2, y * 8 + 4));
        SpdStoreH(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x, y), v, mip, slice);
        // store to LDS
        // x x x x 0 ...
        // 0 ...
        SpdStoreIntermediateH(x + y * 2, 0, v);
    }
#else
    // Wave-op path: 16 threads load from the previous stage's layout, quad-reduce,
    // and quad index 0 packs the result into row 0 of LDS.
    if (localInvocationIndex < 16)
    {
        FfxFloat16x4 v = SpdLoadIntermediateH(x * 4 + y, y * 4);
        v = SpdReduceQuadH(v);
        // quad index 0 stores result
        if (localInvocationIndex % 4 == 0)
        {
            SpdStoreH(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
            SpdStoreIntermediateH(x / 2 + y, 0, v);
        }
    }
#endif
}

void SpdDownsampleMip_5H(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
    // FP16 path. Final per-workgroup reduction: collapses the four values packed
    // into row 0 of LDS by SpdDownsampleMip_4H into a single texel of level 'mip'
    // at workGroupID.xy. No LDS write-back — this ends the per-workgroup chain.
#ifdef SPD_NO_WAVE_OPERATIONS
    // Fallback: a single thread reduces all four values.
    if (localInvocationIndex < 1)
    {
        // x x x x 0 ...
        // 0 ...
        FfxFloat16x4 v = SpdReduceIntermediateH(FfxUInt32x2(0, 0), FfxUInt32x2(1, 0), FfxUInt32x2(2, 0), FfxUInt32x2(3, 0));
        SpdStoreH(FfxInt32x2(workGroupID.xy), v, mip, slice);
    }
#else
    // Wave-op path: 4 threads form one quad; quad-reduce and store from index 0.
    if (localInvocationIndex < 4)
    {
        FfxFloat16x4 v = SpdLoadIntermediateH(localInvocationIndex, 0);
        v = SpdReduceQuadH(v);
        // quad index 0 stores result
        if (localInvocationIndex % 4 == 0)
        {
            SpdStoreH(FfxInt32x2(workGroupID.xy), v, mip, slice);
        }
    }
#endif
}

void SpdDownsampleMips_6_7H(FfxUInt32 x, FfxUInt32 y, FfxUInt32 mips, FfxUInt32 slice)
{
    // FP16 path. Each thread produces a 2x2 block of mip 6 by load-reducing four
    // source quads via SpdReduceLoad4H, then (if mip 7 is requested) folds those
    // four results into one mip-7 texel and seeds LDS for the remaining stages.
    FfxInt32x2 srcCoord = FfxInt32x2(x * 4 + 0, y * 4 + 0);
    FfxInt32x2 dstCoord = FfxInt32x2(x * 2 + 0, y * 2 + 0);
    FfxFloat16x4 quad00 = SpdReduceLoad4H(srcCoord, slice);
    SpdStoreH(dstCoord, quad00, 6, slice);

    srcCoord = FfxInt32x2(x * 4 + 2, y * 4 + 0);
    dstCoord = FfxInt32x2(x * 2 + 1, y * 2 + 0);
    FfxFloat16x4 quad10 = SpdReduceLoad4H(srcCoord, slice);
    SpdStoreH(dstCoord, quad10, 6, slice);

    srcCoord = FfxInt32x2(x * 4 + 0, y * 4 + 2);
    dstCoord = FfxInt32x2(x * 2 + 0, y * 2 + 1);
    FfxFloat16x4 quad01 = SpdReduceLoad4H(srcCoord, slice);
    SpdStoreH(dstCoord, quad01, 6, slice);

    srcCoord = FfxInt32x2(x * 4 + 2, y * 4 + 2);
    dstCoord = FfxInt32x2(x * 2 + 1, y * 2 + 1);
    FfxFloat16x4 quad11 = SpdReduceLoad4H(srcCoord, slice);
    SpdStoreH(dstCoord, quad11, 6, slice);

    if (mips < 8)
    {
        return;
    }
    // no barrier needed, working on values only from the same thread
    FfxFloat16x4 reduced = SpdReduce4H(quad00, quad10, quad01, quad11);
    SpdStoreH(FfxInt32x2(x, y), reduced, 7, slice);
    SpdStoreIntermediateH(x, y, reduced);
}

void SpdDownsampleNextFourH(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 baseMip, FfxUInt32 mips, FfxUInt32 slice)
{
    // FP16 path. Runs up to four successive reduction stages (baseMip .. baseMip+3),
    // stopping as soon as the requested mip count is reached. A barrier precedes
    // every stage so the previous stage's LDS writes are visible to all threads.
    if (mips > baseMip)
    {
        SpdWorkgroupShuffleBarrier();
        SpdDownsampleMip_2H(x, y, workGroupID, localInvocationIndex, baseMip, slice);

        if (mips > baseMip + 1)
        {
            SpdWorkgroupShuffleBarrier();
            SpdDownsampleMip_3H(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice);

            if (mips > baseMip + 2)
            {
                SpdWorkgroupShuffleBarrier();
                SpdDownsampleMip_4H(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice);

                if (mips > baseMip + 3)
                {
                    SpdWorkgroupShuffleBarrier();
                    SpdDownsampleMip_5H(workGroupID, localInvocationIndex, baseMip + 3, slice);
                }
            }
        }
    }
}

void SpdDownsampleH(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice)
{
    // FP16 entry point of the SPD downsample chain for one workgroup.
    // Derive a 2D coordinate from the flat thread index: the low 6 bits are
    // remapped by ffxRemapForWaveReduction (per its name, a quad-friendly swizzle),
    // bit 6 selects the horizontal 8-texel half, bit 7 the vertical one.
    FfxUInt32x2 remapped = ffxRemapForWaveReduction(localInvocationIndex % 64);
    FfxUInt32 x = remapped.x + 8 * ((localInvocationIndex >> 6) % 2);
    FfxUInt32 y = remapped.y + 8 * (localInvocationIndex >> 7);

    // Mips 0 and 1, computed per workgroup from the source surface.
    SpdDownsampleMips_0_1H(x, y, workGroupID, localInvocationIndex, mips, slice);

    // Mips 2..5, fed from the intermediate LDS buffer.
    SpdDownsampleNextFourH(x, y, workGroupID, localInvocationIndex, 2, mips, slice);

    if (mips <= 6)
    {
        return;
    }

    // All workgroups but the last one to finish bail out here.
    if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice))
    {
        return;
    }

    SpdResetAtomicCounter(slice);

    // After mip 6 there is only a single workgroup left that downsamples the remaining up to 64x64 texels.
    SpdDownsampleMips_6_7H(x, y, mips, slice);

    SpdDownsampleNextFourH(x, y, FfxUInt32x2(0, 0), localInvocationIndex, 8, mips, slice);
}

void SpdDownsampleH(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice, FfxUInt32x2 workGroupOffset)
{
    // Convenience overload: shifts the workgroup ID into the downsample rect
    // (offset produced by SpdSetup) before running the core FP16 routine.
    FfxUInt32x2 shiftedGroupID = workGroupID + workGroupOffset;
    SpdDownsampleH(shiftedGroupID, localInvocationIndex, mips, numWorkGroups, slice);
}

#endif // #if FFX_HALF
#endif // #ifdef FFX_GPU