//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//
// [FFX SPD] Single Pass Downsampler 2.0
//
//==============================================================================================================================
// LICENSE
// =======
// Copyright (c) 2017-2020 Advanced Micro Devices, Inc. All rights reserved.
// -------
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
// -------
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
// Software.
// -------
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
//------------------------------------------------------------------------------------------------------------------------------
// CHANGELIST v2.0
// ===============
// - Added support for cube and array textures. The SpdDownsample and SpdDownsampleH shader functions now take the index of the
//   texture slice as an additional parameter; for a regular 2D texture, use 0.
// - Added support for downsampling only a sub-rectangle of the texture via the additional, optional workGroupOffset parameter
//   of the shader functions SpdDownsample and SpdDownsampleH.
// - Added the C function SpdSetup, which helps set up the constants to be passed in a constant buffer.
// - The global atomic counter is automatically reset to 0 by the shader at the end, so you do not need to clear it before every
//   use, just once after creation.
//
//------------------------------------------------------------------------------------------------------------------------------
// INTEGRATION SUMMARY FOR CPU
// ===========================
// // you need to provide as constants:
// // number of mip levels to be computed (maximum is 12)
// // number of total thread groups: ((widthInPixels+63)>>6) * ((heightInPixels+63)>>6)
// // workGroupOffset -> 0 by default; if you only downsample a rectangle within the source texture, use the SpdSetup function to calculate the correct offset
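// // (worked example of the thread group formula: a 1920x1080 source gives ((1920+63)>>6) * ((1080+63)>>6) = 30 * 17 = 510 thread groups per slice)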
// ...
// // Dispatch the shader such that each thread group works on a 64x64 sub-tile of the source image
// // for cube textures or a Texture2DArray, use the z dimension
// vkCmdDispatch(cmdBuf, (widthInPixels+63)>>6, (heightInPixels+63)>>6, slices);

// // you can also use the SpdSetup function:
// // on top of your cpp file:
// #define A_CPU
// #include "ffx_a.h"
// #include "ffx_spd.h"
// // before your dispatch call, use the SpdSetup function to get your constants
// varAU2(dispatchThreadGroupCountXY); // output variable
// varAU2(workGroupOffset); // output variable, these constants are required if Left and Top are not 0,0
// varAU2(numWorkGroupsAndMips); // output variable
// // input information about your source texture:
// // left and top of the rectangle within your texture you want to downsample
// // width and height of the rectangle you want to downsample
// // if the complete source texture should get downsampled: left = 0, top = 0, width = sourceTexture.width, height = sourceTexture.height
// varAU4(rectInfo) = initAU4(0, 0, m_Texture.GetWidth(), m_Texture.GetHeight()); // left, top, width, height
// SpdSetup(dispatchThreadGroupCountXY, workGroupOffset, numWorkGroupsAndMips, rectInfo);
// ...
// // constants:
// data.numWorkGroupsPerSlice = numWorkGroupsAndMips[0];
// data.mips = numWorkGroupsAndMips[1];
// data.workGroupOffset[0] = workGroupOffset[0];
// data.workGroupOffset[1] = workGroupOffset[1];
// ...
// uint32_t dispatchX = dispatchThreadGroupCountXY[0];
// uint32_t dispatchY = dispatchThreadGroupCountXY[1];
// uint32_t dispatchZ = m_CubeTexture.GetArraySize(); // slices - for a 2D texture this is 1, for a cube texture 6
// vkCmdDispatch(cmd_buf, dispatchX, dispatchY, dispatchZ);
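// // the D3D12 equivalent of the Vulkan dispatch above would be (a sketch; pCmdList is your ID3D12GraphicsCommandList):
// pCmdList->Dispatch(dispatchX, dispatchY, dispatchZ);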

//------------------------------------------------------------------------------------------------------------------------------
// INTEGRATION SUMMARY FOR GPU
// ===========================

// [SAMPLER] - if you want to use a sampler with linear filtering for loading the source image,
// additionally follow the instructions marked with [SAMPLER]
// add the following define:
// #define SPD_LINEAR_SAMPLER
// this is recommended, as using one sample() with a linear filter to reduce a 2x2 quad is faster
// than 4x load() plus manual averaging

// // Setup layout. Example below for VK_FORMAT_R16G16B16A16_SFLOAT.
// // Note: if you use an SRGB format for UAV load() and store() (if it's supported), you need to convert
// // to and from linear space when using UAV load() and store()
// // approximate conversion to linear (load function): x*x
// // approximate conversion from linear (store function): sqrt()
// // or use the more accurate functions from ffx_a.h: AFromSrgbF1(value) and AToSrgbF1(value)
// // Recommendation: use a UNORM format instead of SRGB for UAV access, and SRGB for SRV access
// // look in the sample app to see how it's done
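// // a minimal sketch of the approximate conversions above (the helper names are made up for illustration):
// AF4 SrgbToLinearApprox(AF4 c){c.rgb *= c.rgb; return c;}      // apply after UAV load()
// AF4 LinearToSrgbApprox(AF4 c){c.rgb = sqrt(c.rgb); return c;} // apply before UAV store()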

// // source image
// // if cube texture, use image2DArray / Texture2DArray and adapt your load/store/sample calls
// GLSL: layout(set=0,binding=0,rgba16f) uniform image2D imgSrc;
// [SAMPLER]: layout(set=0,binding=0) uniform texture2D imgSrc;
// HLSL: [[vk::binding(0)]] Texture2D<float4> imgSrc :register(t0);

// // destination -> 12 is the maximum number of mips supported by SPD
// GLSL: layout(set=0,binding=1,rgba16f) uniform coherent image2D imgDst[12];
// HLSL: [[vk::binding(1)]] globallycoherent RWTexture2D<float4> imgDst[12] :register(u1);

// // global atomic counter - MUST be initialized to 0
// // SPD resets the counter back to 0 after each run by calling SpdResetAtomicCounter(slice)
// // if you have more than 1 slice (i.e. if you downsample a cube texture or a Texture2DArray),
// // use an array of counters: counter[6] if you have 6 slices, for example
// // GLSL:
// layout(std430, set=0, binding=2) coherent buffer SpdGlobalAtomicBuffer
// {
//     uint counter;
// } spdGlobalAtomic;
// // HLSL:
// struct SpdGlobalAtomicBuffer
// {
//     uint counter;
// };
// [[vk::binding(2)]] globallycoherent RWStructuredBuffer<SpdGlobalAtomicBuffer> spdGlobalAtomic;

// // [SAMPLER] add sampler
// GLSL: layout(set=0, binding=3) uniform sampler srcSampler;
// HLSL: [[vk::binding(3)]] SamplerState srcSampler :register(s0);

// // constants - either push constant or constant buffer,
// // or calculate them within the shader
// // [SAMPLER] when using a sampler, also add the inverse source image size (see the sketch below)
// // GLSL:
// layout(push_constant) uniform SpdConstants {
//     uint mips; // needed to opt out early if fewer than 12 mips are requested
//     uint numWorkGroups; // total number of thread groups, so numWorkGroupsX * numWorkGroupsY * 1
//                         // it is important to NOT take the number of slices (z dimension)
//                         // into account here, as each slice has its own counter!
//     vec2 workGroupOffset; // optional - use the SpdSetup() function to calculate the correct workgroup offset
// } spdConstants;
// // HLSL:
// [[vk::push_constant]]
// cbuffer spdConstants {
//     uint mips;
//     uint numWorkGroups;
//     float2 workGroupOffset; // optional
// };
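// // [SAMPLER] sketch of the additional constant; the invInputSize name matches the SpdLoadSourceImage
// // examples further down, set it to 1.0 / source texture size:
// //     vec2 invInputSize; // HLSL: float2 invInputSize;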

// ...
// // Setup pre-portability-header defines (sets up the GLSL/HLSL path, etc.)
// #define A_GPU 1
// #define A_GLSL 1 // or: #define A_HLSL 1

// // if you want to use the PACKED version
// // recommended if bpc <= 16bit
// #define A_HALF

// ...
// // Include the portability header (or copy it in without an include).
// #include "ffx_a.h"
// ...

// // Define LDS variables
// shared AF4 spdIntermediate[16][16]; // HLSL: groupshared
// shared AU1 spdCounter; // HLSL: groupshared
// // PACKED version:
// shared AH4 spdIntermediate[16][16]; // HLSL: groupshared
// // Note: you can also use
// shared AF1 spdIntermediateR[16][16];
// shared AF1 spdIntermediateG[16][16];
// shared AF1 spdIntermediateB[16][16];
// shared AF1 spdIntermediateA[16][16];
// // or, for the PACKED version:
// shared AH2 spdIntermediateRG[16][16];
// shared AH2 spdIntermediateBA[16][16];
// // this is potentially faster; adapt your load and store functions accordingly

// // if subgroup operations are not supported / you can't use SM6.0:
// #define SPD_NO_WAVE_OPERATIONS

// // Define the fetch function(s) and the reduction function
// // for non-power-of-2 textures, add border controls to the load and store functions
// // to make sure the borders of each mip level look the way you want them (see the sketch below)
// // if you don't add border controls, you'll read zeros past the border
// // if you load with a sampler, this is obviously handled by your sampler :)
// // this is also the place to do a color space transformation if needed
// // e.g. if your texture format is SRGB/UNORM and you use the UAV load and store functions,
// // no automatic to/from linear conversions happen
// // (there are automatic conversions when using the sampler and render target approach)
// // approximate conversion to linear (load function): x*x
// // approximate conversion from linear (store function): sqrt()
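// // a minimal sketch of a border-controlled load, assuming a hypothetical constant srcSize holding the source dimensions:
// // AF4 SpdLoadSourceImage(ASU2 p, AU1 slice){
// //     p = clamp(p, ASU2(0, 0), ASU2(srcSize) - ASU2(1, 1)); // clamp-to-edge instead of reading zeros
// //     return imageLoad(imgSrc, p);
// // }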

// the AU1 slice parameter is for cube textures and Texture2DArray;
// if downsampling a Texture2D you can ignore this parameter, otherwise use it to access the correct slice
// // Load from the source image
// GLSL: AF4 SpdLoadSourceImage(ASU2 p, AU1 slice){return imageLoad(imgSrc, p);}
// HLSL: AF4 SpdLoadSourceImage(ASU2 tex, AU1 slice){return imgSrc[tex];}
// [SAMPLER] don't forget to add #define SPD_LINEAR_SAMPLER :)
// GLSL:
// AF4 SpdLoadSourceImage(ASU2 p, AU1 slice){
//     AF2 textureCoord = p * invInputSize + invInputSize;
//     return textureLod(sampler2D(imgSrc, srcSampler), textureCoord, 0); // textureLod: implicit-LOD texture() is undefined in compute
// }
// HLSL:
// AF4 SpdLoadSourceImage(ASU2 p, AU1 slice){
//     AF2 textureCoord = p * invInputSize + invInputSize;
//     return imgSrc.SampleLevel(srcSampler, textureCoord, 0);
// }

// // SpdLoad() takes a 32-bit signed integer 2D coordinate and loads color
// // from mip level 5; each of its values was computed by a different thread group, and the
// // last active thread group accesses all of its elements to compute the subsequent mips
// // reminder: for non-power-of-2 textures, add border controls if you do not want to read zeros past the border
// GLSL: AF4 SpdLoad(ASU2 p, AU1 slice){return imageLoad(imgDst[5], p);}
// HLSL: AF4 SpdLoad(ASU2 tex, AU1 slice){return imgDst[5][tex];}

// Define the store function
// GLSL: void SpdStore(ASU2 p, AF4 value, AU1 mip, AU1 slice){imageStore(imgDst[mip], p, value);}
// HLSL: void SpdStore(ASU2 pix, AF4 value, AU1 mip, AU1 slice){imgDst[mip][pix] = value;}

// // Define the atomic counter increase function
// // each slice only reads and stores to its own counter, so with several slices you index
// // an array of counters with slice (see the multi-slice sketch below)
// // the examples here match the single-counter declaration from above:
// // GLSL:
// void SpdIncreaseAtomicCounter(AU1 slice){spdCounter = atomicAdd(spdGlobalAtomic.counter, 1);}
// AU1 SpdGetAtomicCounter() {return spdCounter;}
// void SpdResetAtomicCounter(AU1 slice){spdGlobalAtomic.counter = 0;}
// // HLSL:
// void SpdIncreaseAtomicCounter(AU1 slice){InterlockedAdd(spdGlobalAtomic[0].counter, 1, spdCounter);}
// AU1 SpdGetAtomicCounter(){return spdCounter;}
// void SpdResetAtomicCounter(AU1 slice){spdGlobalAtomic[0].counter = 0;}
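// // multi-slice sketch, assuming the buffer declares uint counter[6] instead of a single uint:
// // GLSL: void SpdIncreaseAtomicCounter(AU1 slice){spdCounter = atomicAdd(spdGlobalAtomic.counter[slice], 1);}
// //       void SpdResetAtomicCounter(AU1 slice){spdGlobalAtomic.counter[slice] = 0;}
// // HLSL: void SpdIncreaseAtomicCounter(AU1 slice){InterlockedAdd(spdGlobalAtomic[0].counter[slice], 1, spdCounter);}
// //       void SpdResetAtomicCounter(AU1 slice){spdGlobalAtomic[0].counter[slice] = 0;}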

// // Define the LDS load and store functions
// // GLSL:
// AF4 SpdLoadIntermediate(AU1 x, AU1 y){return spdIntermediate[x][y];}
// void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value){spdIntermediate[x][y] = value;}
// // HLSL:
// AF4 SpdLoadIntermediate(AU1 x, AU1 y){return spdIntermediate[x][y];}
// void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value){spdIntermediate[x][y] = value;}

// // Define your reduction function: takes the four 2x2 values as input and returns 1 output value
// // Example below: computes the average value
// AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3){return (v0+v1+v2+v3)*0.25;}

// // PACKED VERSION
// // Load from the source image
// GLSL: AH4 SpdLoadSourceImageH(ASU2 p, AU1 slice){return AH4(imageLoad(imgSrc, p));}
// HLSL: AH4 SpdLoadSourceImageH(ASU2 tex, AU1 slice){return AH4(imgSrc[tex]);}
// [SAMPLER]
// GLSL:
// AH4 SpdLoadSourceImageH(ASU2 p, AU1 slice){
//     AF2 textureCoord = p * invInputSize + invInputSize;
//     return AH4(textureLod(sampler2D(imgSrc, srcSampler), textureCoord, 0));
// }
// HLSL:
// AH4 SpdLoadSourceImageH(ASU2 p, AU1 slice){
//     AF2 textureCoord = p * invInputSize + invInputSize;
//     return AH4(imgSrc.SampleLevel(srcSampler, textureCoord, 0));
// }

// // SpdLoadH() takes a 32-bit signed integer 2D coordinate and loads color
// // from mip level 5; each of its values was computed by a different thread group, and the
// // last active thread group accesses all of its elements to compute the subsequent mips
// GLSL: AH4 SpdLoadH(ASU2 p, AU1 slice){return AH4(imageLoad(imgDst[5], p));}
// HLSL: AH4 SpdLoadH(ASU2 tex, AU1 slice){return AH4(imgDst[5][tex]);}

// Define the store function
// GLSL: void SpdStoreH(ASU2 p, AH4 value, AU1 mip, AU1 slice){imageStore(imgDst[mip], p, AF4(value));}
// HLSL: void SpdStoreH(ASU2 pix, AH4 value, AU1 mip, AU1 slice){imgDst[mip][pix] = AF4(value);}

// // Define the atomic counter increase function
// // GLSL:
// void SpdIncreaseAtomicCounter(AU1 slice){spdCounter = atomicAdd(spdGlobalAtomic.counter, 1);}
// AU1 SpdGetAtomicCounter() {return spdCounter;}
// // HLSL:
// void SpdIncreaseAtomicCounter(AU1 slice){InterlockedAdd(spdGlobalAtomic[0].counter, 1, spdCounter);}
// AU1 SpdGetAtomicCounter(){return spdCounter;}

// // Define the LDS load and store functions
// // GLSL:
// AH4 SpdLoadIntermediateH(AU1 x, AU1 y){return spdIntermediate[x][y];}
// void SpdStoreIntermediateH(AU1 x, AU1 y, AH4 value){spdIntermediate[x][y] = value;}
// // HLSL:
// AH4 SpdLoadIntermediateH(AU1 x, AU1 y){return spdIntermediate[x][y];}
// void SpdStoreIntermediateH(AU1 x, AU1 y, AH4 value){spdIntermediate[x][y] = value;}

// // Define your reduction function: takes the four 2x2 values as input and returns 1 output value
// // Example below: computes the average value
// AH4 SpdReduce4H(AH4 v0, AH4 v1, AH4 v2, AH4 v3){return (v0+v1+v2+v3)*AH1(0.25);}

// //

// // If you only use the PACKED version:
// #define SPD_PACKED_ONLY

// // Include this SPD (single pass downsampler) header file (or copy it in without an include).
// #include "ffx_spd.h"
// ...

// // Example shader integration
// // GLSL:
// layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
// void main(){
//     // Call the downsampling function
//     // gl_WorkGroupID.z is always 0 if you only downsample a Texture2D!
//     SpdDownsample(AU2(gl_WorkGroupID.xy), AU1(gl_LocalInvocationIndex),
//         AU1(spdConstants.mips), AU1(spdConstants.numWorkGroups), AU1(gl_WorkGroupID.z));
//
//     // PACKED:
//     SpdDownsampleH(AU2(gl_WorkGroupID.xy), AU1(gl_LocalInvocationIndex),
//         AU1(spdConstants.mips), AU1(spdConstants.numWorkGroups), AU1(gl_WorkGroupID.z));
// ...
// // HLSL:
// [numthreads(256,1,1)]
// void main(uint3 WorkGroupId : SV_GroupID, uint LocalThreadIndex : SV_GroupIndex) {
//     SpdDownsample(AU2(WorkGroupId.xy), AU1(LocalThreadIndex),
//         AU1(mips), AU1(numWorkGroups), AU1(WorkGroupId.z));
//
//     // PACKED:
//     SpdDownsampleH(AU2(WorkGroupId.xy), AU1(LocalThreadIndex),
//         AU1(mips), AU1(numWorkGroups), AU1(WorkGroupId.z));
// ...

//
//------------------------------------------------------------------------------------------------------------------------------

//==============================================================================================================================
// SPD Setup
//==============================================================================================================================
#ifdef A_CPU
A_STATIC void SpdSetup(
    outAU2 dispatchThreadGroupCountXY, // CPU side: dispatch thread group count xy
    outAU2 workGroupOffset, // GPU side: pass in as constant
    outAU2 numWorkGroupsAndMips, // GPU side: pass in as constant
    inAU4 rectInfo, // left, top, width, height
    ASU1 mips // optional: if -1, calculate based on rect width and height
){
    workGroupOffset[0] = rectInfo[0] / 64; // rectInfo[0] = left
    workGroupOffset[1] = rectInfo[1] / 64; // rectInfo[1] = top

    AU1 endIndexX = (rectInfo[0] + rectInfo[2] - 1) / 64; // rectInfo[0] = left, rectInfo[2] = width
    AU1 endIndexY = (rectInfo[1] + rectInfo[3] - 1) / 64; // rectInfo[1] = top, rectInfo[3] = height

    dispatchThreadGroupCountXY[0] = endIndexX + 1 - workGroupOffset[0];
    dispatchThreadGroupCountXY[1] = endIndexY + 1 - workGroupOffset[1];

    numWorkGroupsAndMips[0] = (dispatchThreadGroupCountXY[0]) * (dispatchThreadGroupCountXY[1]);

    if (mips >= 0) {
        numWorkGroupsAndMips[1] = AU1(mips);
    } else { // calculate based on rect width and height
        AU1 resolution = AMaxU1(rectInfo[2], rectInfo[3]);
        numWorkGroupsAndMips[1] = AU1((AMinF1(AFloorF1(ALog2F1(AF1(resolution))), AF1(12))));
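        // e.g. a 1920x1080 rect: resolution = 1920, floor(log2(1920)) = 10 -> 10 mips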
    }
}

A_STATIC void SpdSetup(
    outAU2 dispatchThreadGroupCountXY, // CPU side: dispatch thread group count xy
    outAU2 workGroupOffset, // GPU side: pass in as constant
    outAU2 numWorkGroupsAndMips, // GPU side: pass in as constant
    inAU4 rectInfo // left, top, width, height
) {
    SpdSetup(dispatchThreadGroupCountXY, workGroupOffset, numWorkGroupsAndMips, rectInfo, -1);
}
#endif // #ifdef A_CPU
//==============================================================================================================================
// NON-PACKED VERSION
//==============================================================================================================================
#ifdef A_GPU
#ifdef SPD_PACKED_ONLY
    // Avoid compiler error
    AF4 SpdLoadSourceImage(ASU2 p, AU1 slice){return AF4(0.0,0.0,0.0,0.0);}
    AF4 SpdLoad(ASU2 p, AU1 slice){return AF4(0.0,0.0,0.0,0.0);}
    void SpdStore(ASU2 p, AF4 value, AU1 mip, AU1 slice){}
    AF4 SpdLoadIntermediate(AU1 x, AU1 y){return AF4(0.0,0.0,0.0,0.0);}
    void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value){}
    AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3){return AF4(0.0,0.0,0.0,0.0);}
#endif // #ifdef SPD_PACKED_ONLY

//_____________________________________________________________/\_______________________________________________________________
#if defined(A_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
#extension GL_KHR_shader_subgroup_quad:require
#endif

void SpdWorkgroupShuffleBarrier() {
#ifdef A_GLSL
    barrier();
#endif
#ifdef A_HLSL
    GroupMemoryBarrierWithGroupSync();
#endif
}

// Only the last active workgroup should proceed
bool SpdExitWorkgroup(AU1 numWorkGroups, AU1 localInvocationIndex, AU1 slice)
{
    // global atomic counter
    if (localInvocationIndex == 0)
    {
        SpdIncreaseAtomicCounter(slice);
    }
    SpdWorkgroupShuffleBarrier();
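    // the atomic add returns the pre-increment value, so only the last workgroup to arrive reads numWorkGroups - 1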
    return (SpdGetAtomicCounter() != (numWorkGroups - 1));
}
401
404
405// User defined: AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3);
406
407AF4 SpdReduceQuad(AF4 v)
408{
409 #if defined(A_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
410 AF4 v0 = v;
411 AF4 v1 = subgroupQuadSwapHorizontal(v);
412 AF4 v2 = subgroupQuadSwapVertical(v);
413 AF4 v3 = subgroupQuadSwapDiagonal(v);
414 return SpdReduce4(v0, v1, v2, v3);
415 #elif defined(A_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
416 // requires SM6.0
417 AU1 quad = WaveGetLaneIndex() & (~0x3);
418 AF4 v0 = v;
419 AF4 v1 = WaveReadLaneAt(v, quad | 1);
420 AF4 v2 = WaveReadLaneAt(v, quad | 2);
421 AF4 v3 = WaveReadLaneAt(v, quad | 3);
422 return SpdReduce4(v0, v1, v2, v3);
423 /*
424 // if SM6.0 is not available, you can use the AMD shader intrinsics
425 // the AMD shader intrinsics are available in AMD GPU Services (AGS) library:
426 // https://gpuopen.com/amd-gpu-services-ags-library/
427 // works for DX11
428 AF4 v0 = v;
429 AF4 v1;
430 v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
431 v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
432 v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
433 v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
434 AF4 v2;
435 v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
436 v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
437 v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
438 v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
439 AF4 v3;
440 v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
441 v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
442 v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
443 v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
444 return SpdReduce4(v0, v1, v2, v3);
445 */
446 #endif
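    // fallback when wave operations are compiled out; callers use the LDS code paths in that case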
    return v;
}

AF4 SpdReduceIntermediate(AU2 i0, AU2 i1, AU2 i2, AU2 i3)
{
    AF4 v0 = SpdLoadIntermediate(i0.x, i0.y);
    AF4 v1 = SpdLoadIntermediate(i1.x, i1.y);
    AF4 v2 = SpdLoadIntermediate(i2.x, i2.y);
    AF4 v3 = SpdLoadIntermediate(i3.x, i3.y);
    return SpdReduce4(v0, v1, v2, v3);
}

AF4 SpdReduceLoad4(AU2 i0, AU2 i1, AU2 i2, AU2 i3, AU1 slice)
{
    AF4 v0 = SpdLoad(ASU2(i0), slice);
    AF4 v1 = SpdLoad(ASU2(i1), slice);
    AF4 v2 = SpdLoad(ASU2(i2), slice);
    AF4 v3 = SpdLoad(ASU2(i3), slice);
    return SpdReduce4(v0, v1, v2, v3);
}

AF4 SpdReduceLoad4(AU2 base, AU1 slice)
{
    return SpdReduceLoad4(
        AU2(base + AU2(0, 0)),
        AU2(base + AU2(0, 1)),
        AU2(base + AU2(1, 0)),
        AU2(base + AU2(1, 1)),
        slice);
}

AF4 SpdReduceLoadSourceImage4(AU2 i0, AU2 i1, AU2 i2, AU2 i3, AU1 slice)
{
    AF4 v0 = SpdLoadSourceImage(ASU2(i0), slice);
    AF4 v1 = SpdLoadSourceImage(ASU2(i1), slice);
    AF4 v2 = SpdLoadSourceImage(ASU2(i2), slice);
    AF4 v3 = SpdLoadSourceImage(ASU2(i3), slice);
    return SpdReduce4(v0, v1, v2, v3);
}

AF4 SpdReduceLoadSourceImage(AU2 base, AU1 slice)
{
#ifdef SPD_LINEAR_SAMPLER
    return SpdLoadSourceImage(ASU2(base), slice);
#else
    return SpdReduceLoadSourceImage4(
        AU2(base + AU2(0, 0)),
        AU2(base + AU2(0, 1)),
        AU2(base + AU2(1, 0)),
        AU2(base + AU2(1, 1)),
        slice);
#endif
}

void SpdDownsampleMips_0_1_Intrinsics(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 slice)
{
    AF4 v[4];

    ASU2 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2);
    ASU2 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y);
    v[0] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[0], 0, slice);

    tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2);
    pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y);
    v[1] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[1], 0, slice);

    tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2 + 32);
    pix = ASU2(workGroupID.xy * 32) + ASU2(x, y + 16);
    v[2] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[2], 0, slice);

    tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2 + 32);
    pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y + 16);
    v[3] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[3], 0, slice);

    if (mips <= 1)
        return;

    v[0] = SpdReduceQuad(v[0]);
    v[1] = SpdReduceQuad(v[1]);
    v[2] = SpdReduceQuad(v[2]);
    v[3] = SpdReduceQuad(v[3]);

    if ((localInvocationIndex % 4) == 0)
    {
        SpdStore(ASU2(workGroupID.xy * 16) + ASU2(x/2, y/2), v[0], 1, slice);
        SpdStoreIntermediate(x/2, y/2, v[0]);

        SpdStore(ASU2(workGroupID.xy * 16) + ASU2(x/2 + 8, y/2), v[1], 1, slice);
        SpdStoreIntermediate(x/2 + 8, y/2, v[1]);

        SpdStore(ASU2(workGroupID.xy * 16) + ASU2(x/2, y/2 + 8), v[2], 1, slice);
        SpdStoreIntermediate(x/2, y/2 + 8, v[2]);

        SpdStore(ASU2(workGroupID.xy * 16) + ASU2(x/2 + 8, y/2 + 8), v[3], 1, slice);
        SpdStoreIntermediate(x/2 + 8, y/2 + 8, v[3]);
    }
}

void SpdDownsampleMips_0_1_LDS(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 slice)
{
    AF4 v[4];

    ASU2 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2);
    ASU2 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y);
    v[0] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[0], 0, slice);

    tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2);
    pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y);
    v[1] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[1], 0, slice);

    tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2 + 32);
    pix = ASU2(workGroupID.xy * 32) + ASU2(x, y + 16);
    v[2] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[2], 0, slice);

    tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2 + 32);
    pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y + 16);
    v[3] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[3], 0, slice);

    if (mips <= 1)
        return;

    for (int i = 0; i < 4; i++)
    {
        SpdStoreIntermediate(x, y, v[i]);
        SpdWorkgroupShuffleBarrier();
        if (localInvocationIndex < 64)
        {
            v[i] = SpdReduceIntermediate(
                AU2(x * 2 + 0, y * 2 + 0),
                AU2(x * 2 + 1, y * 2 + 0),
                AU2(x * 2 + 0, y * 2 + 1),
                AU2(x * 2 + 1, y * 2 + 1)
            );
            SpdStore(ASU2(workGroupID.xy * 16) + ASU2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice);
        }
        SpdWorkgroupShuffleBarrier();
    }

    if (localInvocationIndex < 64)
    {
        SpdStoreIntermediate(x + 0, y + 0, v[0]);
        SpdStoreIntermediate(x + 8, y + 0, v[1]);
        SpdStoreIntermediate(x + 0, y + 8, v[2]);
        SpdStoreIntermediate(x + 8, y + 8, v[3]);
    }
}

void SpdDownsampleMips_0_1(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    SpdDownsampleMips_0_1_LDS(x, y, workGroupID, localInvocationIndex, mips, slice);
#else
    SpdDownsampleMips_0_1_Intrinsics(x, y, workGroupID, localInvocationIndex, mips, slice);
#endif
}

void SpdDownsampleMip_2(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 64)
    {
        AF4 v = SpdReduceIntermediate(
            AU2(x * 2 + 0, y * 2 + 0),
            AU2(x * 2 + 1, y * 2 + 0),
            AU2(x * 2 + 0, y * 2 + 1),
            AU2(x * 2 + 1, y * 2 + 1)
        );
        SpdStore(ASU2(workGroupID.xy * 8) + ASU2(x, y), v, mip, slice);
        // store to LDS, try to reduce bank conflicts
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        // ...
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        SpdStoreIntermediate(x * 2 + y % 2, y * 2, v);
    }
#else
    AF4 v = SpdLoadIntermediate(x, y);
    v = SpdReduceQuad(v);
    // quad index 0 stores result
    if (localInvocationIndex % 4 == 0)
    {
        SpdStore(ASU2(workGroupID.xy * 8) + ASU2(x/2, y/2), v, mip, slice);
        SpdStoreIntermediate(x + (y/2) % 2, y, v);
    }
#endif
}

void SpdDownsampleMip_3(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 16)
    {
        // x 0 x 0
        // 0 0 0 0
        // 0 x 0 x
        // 0 0 0 0
        AF4 v = SpdReduceIntermediate(
            AU2(x * 4 + 0 + 0, y * 4 + 0),
            AU2(x * 4 + 2 + 0, y * 4 + 0),
            AU2(x * 4 + 0 + 1, y * 4 + 2),
            AU2(x * 4 + 2 + 1, y * 4 + 2)
        );
        SpdStore(ASU2(workGroupID.xy * 4) + ASU2(x, y), v, mip, slice);
        // store to LDS
        // x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0
        // ...
        // 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0
        // ...
        // 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x
        // ...
        SpdStoreIntermediate(x * 4 + y, y * 4, v);
    }
#else
    if (localInvocationIndex < 64)
    {
        AF4 v = SpdLoadIntermediate(x * 2 + y % 2, y * 2);
        v = SpdReduceQuad(v);
        // quad index 0 stores result
        if (localInvocationIndex % 4 == 0)
        {
            SpdStore(ASU2(workGroupID.xy * 4) + ASU2(x/2, y/2), v, mip, slice);
            SpdStoreIntermediate(x * 2 + y/2, y * 2, v);
        }
    }
#endif
}

void SpdDownsampleMip_4(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 4)
    {
        // x 0 0 0 x 0 0 0
        // ...
        // 0 x 0 0 0 x 0 0
        AF4 v = SpdReduceIntermediate(
            AU2(x * 8 + 0 + 0 + y * 2, y * 8 + 0),
            AU2(x * 8 + 4 + 0 + y * 2, y * 8 + 0),
            AU2(x * 8 + 0 + 1 + y * 2, y * 8 + 4),
            AU2(x * 8 + 4 + 1 + y * 2, y * 8 + 4)
        );
        SpdStore(ASU2(workGroupID.xy * 2) + ASU2(x, y), v, mip, slice);
        // store to LDS
        // x x x x 0 ...
        // 0 ...
        SpdStoreIntermediate(x + y * 2, 0, v);
    }
#else
    if (localInvocationIndex < 16)
    {
        AF4 v = SpdLoadIntermediate(x * 4 + y, y * 4);
        v = SpdReduceQuad(v);
        // quad index 0 stores result
        if (localInvocationIndex % 4 == 0)
        {
            SpdStore(ASU2(workGroupID.xy * 2) + ASU2(x/2, y/2), v, mip, slice);
            SpdStoreIntermediate(x / 2 + y, 0, v);
        }
    }
#endif
}

void SpdDownsampleMip_5(AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 1)
    {
        // x x x x 0 ...
        // 0 ...
        AF4 v = SpdReduceIntermediate(
            AU2(0, 0),
            AU2(1, 0),
            AU2(2, 0),
            AU2(3, 0)
        );
        SpdStore(ASU2(workGroupID.xy), v, mip, slice);
    }
#else
    if (localInvocationIndex < 4)
    {
        AF4 v = SpdLoadIntermediate(localInvocationIndex, 0);
        v = SpdReduceQuad(v);
        // quad index 0 stores result
        if (localInvocationIndex % 4 == 0)
        {
            SpdStore(ASU2(workGroupID.xy), v, mip, slice);
        }
    }
#endif
}

void SpdDownsampleMips_6_7(AU1 x, AU1 y, AU1 mips, AU1 slice)
{
    ASU2 tex = ASU2(x * 4 + 0, y * 4 + 0);
    ASU2 pix = ASU2(x * 2 + 0, y * 2 + 0);
    AF4 v0 = SpdReduceLoad4(tex, slice);
    SpdStore(pix, v0, 6, slice);

    tex = ASU2(x * 4 + 2, y * 4 + 0);
    pix = ASU2(x * 2 + 1, y * 2 + 0);
    AF4 v1 = SpdReduceLoad4(tex, slice);
    SpdStore(pix, v1, 6, slice);

    tex = ASU2(x * 4 + 0, y * 4 + 2);
    pix = ASU2(x * 2 + 0, y * 2 + 1);
    AF4 v2 = SpdReduceLoad4(tex, slice);
    SpdStore(pix, v2, 6, slice);

    tex = ASU2(x * 4 + 2, y * 4 + 2);
    pix = ASU2(x * 2 + 1, y * 2 + 1);
    AF4 v3 = SpdReduceLoad4(tex, slice);
    SpdStore(pix, v3, 6, slice);

    if (mips <= 7) return;
    // no barrier needed, working on values only from the same thread

    AF4 v = SpdReduce4(v0, v1, v2, v3);
    SpdStore(ASU2(x, y), v, 7, slice);
    SpdStoreIntermediate(x, y, v);
}

void SpdDownsampleNextFour(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 baseMip, AU1 mips, AU1 slice)
{
    if (mips <= baseMip) return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_2(x, y, workGroupID, localInvocationIndex, baseMip, slice);

    if (mips <= baseMip + 1) return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_3(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice);

    if (mips <= baseMip + 2) return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_4(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice);

    if (mips <= baseMip + 3) return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_5(workGroupID, localInvocationIndex, baseMip + 3, slice);
}

void SpdDownsample(
    AU2 workGroupID,
    AU1 localInvocationIndex,
    AU1 mips,
    AU1 numWorkGroups,
    AU1 slice
) {
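    // remap the 256 threads onto a 16x16 layout: ARmpRed8x8 maps the low 6 bits to an 8x8 tile
    // in quad-linear order (each 2x2 quad sits on 4 consecutive lanes, which the quad wave ops
    // rely on); bit 6 then selects the left/right 8x8 tile, bit 7 the top/bottom half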
    AU2 sub_xy = ARmpRed8x8(localInvocationIndex % 64);
    AU1 x = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2);
    AU1 y = sub_xy.y + 8 * ((localInvocationIndex >> 7));
    SpdDownsampleMips_0_1(x, y, workGroupID, localInvocationIndex, mips, slice);

    SpdDownsampleNextFour(x, y, workGroupID, localInvocationIndex, 2, mips, slice);

    if (mips <= 6) return;

    if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice)) return;

    SpdResetAtomicCounter(slice);

    // After mip 6 there is only a single workgroup left that downsamples the remaining up to 64x64 texels.
    SpdDownsampleMips_6_7(x, y, mips, slice);

    SpdDownsampleNextFour(x, y, AU2(0, 0), localInvocationIndex, 8, mips, slice);
}

void SpdDownsample(
    AU2 workGroupID,
    AU1 localInvocationIndex,
    AU1 mips,
    AU1 numWorkGroups,
    AU1 slice,
    AU2 workGroupOffset
) {
    SpdDownsample(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice);
}

//==============================================================================================================================
// PACKED VERSION
//==============================================================================================================================

#ifdef A_HALF

#ifdef A_GLSL
#extension GL_EXT_shader_subgroup_extended_types_float16:require
#endif

AH4 SpdReduceQuadH(AH4 v)
{
    #if defined(A_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
    AH4 v0 = v;
    AH4 v1 = subgroupQuadSwapHorizontal(v);
    AH4 v2 = subgroupQuadSwapVertical(v);
    AH4 v3 = subgroupQuadSwapDiagonal(v);
    return SpdReduce4H(v0, v1, v2, v3);
    #elif defined(A_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
    // requires SM6.0
    AU1 quad = WaveGetLaneIndex() & (~0x3);
    AH4 v0 = v;
    AH4 v1 = WaveReadLaneAt(v, quad | 1);
    AH4 v2 = WaveReadLaneAt(v, quad | 2);
    AH4 v3 = WaveReadLaneAt(v, quad | 3);
    return SpdReduce4H(v0, v1, v2, v3);
    /*
    // if SM6.0 is not available, you can use the AMD shader intrinsics
    // the AMD shader intrinsics are available in the AMD GPU Services (AGS) library:
    // https://gpuopen.com/amd-gpu-services-ags-library/
    // works for DX11
    AH4 v0 = v;
    AH4 v1;
    v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
    v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
    v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
    v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
    AH4 v2;
    v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
    v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
    v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
    v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
    AH4 v3;
    v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
    v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
    v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
    v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
    return SpdReduce4H(v0, v1, v2, v3);
    */
    #endif
    return v;
}

AH4 SpdReduceIntermediateH(AU2 i0, AU2 i1, AU2 i2, AU2 i3)
{
    AH4 v0 = SpdLoadIntermediateH(i0.x, i0.y);
    AH4 v1 = SpdLoadIntermediateH(i1.x, i1.y);
    AH4 v2 = SpdLoadIntermediateH(i2.x, i2.y);
    AH4 v3 = SpdLoadIntermediateH(i3.x, i3.y);
    return SpdReduce4H(v0, v1, v2, v3);
}

AH4 SpdReduceLoad4H(AU2 i0, AU2 i1, AU2 i2, AU2 i3, AU1 slice)
{
    AH4 v0 = SpdLoadH(ASU2(i0), slice);
    AH4 v1 = SpdLoadH(ASU2(i1), slice);
    AH4 v2 = SpdLoadH(ASU2(i2), slice);
    AH4 v3 = SpdLoadH(ASU2(i3), slice);
    return SpdReduce4H(v0, v1, v2, v3);
}

AH4 SpdReduceLoad4H(AU2 base, AU1 slice)
{
    return SpdReduceLoad4H(
        AU2(base + AU2(0, 0)),
        AU2(base + AU2(0, 1)),
        AU2(base + AU2(1, 0)),
        AU2(base + AU2(1, 1)),
        slice);
}

AH4 SpdReduceLoadSourceImage4H(AU2 i0, AU2 i1, AU2 i2, AU2 i3, AU1 slice)
{
    AH4 v0 = SpdLoadSourceImageH(ASU2(i0), slice);
    AH4 v1 = SpdLoadSourceImageH(ASU2(i1), slice);
    AH4 v2 = SpdLoadSourceImageH(ASU2(i2), slice);
    AH4 v3 = SpdLoadSourceImageH(ASU2(i3), slice);
    return SpdReduce4H(v0, v1, v2, v3);
}

AH4 SpdReduceLoadSourceImageH(AU2 base, AU1 slice)
{
#ifdef SPD_LINEAR_SAMPLER
    return SpdLoadSourceImageH(ASU2(base), slice);
#else
    return SpdReduceLoadSourceImage4H(
        AU2(base + AU2(0, 0)),
        AU2(base + AU2(0, 1)),
        AU2(base + AU2(1, 0)),
        AU2(base + AU2(1, 1)),
        slice);
#endif
}

void SpdDownsampleMips_0_1_IntrinsicsH(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 slice)
{
    AH4 v[4];

    ASU2 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2);
    ASU2 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y);
    v[0] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[0], 0, slice);

    tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2);
    pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y);
    v[1] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[1], 0, slice);

    tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2 + 32);
    pix = ASU2(workGroupID.xy * 32) + ASU2(x, y + 16);
    v[2] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[2], 0, slice);

    tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2 + 32);
    pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y + 16);
    v[3] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[3], 0, slice);

    if (mips <= 1)
        return;

    v[0] = SpdReduceQuadH(v[0]);
    v[1] = SpdReduceQuadH(v[1]);
    v[2] = SpdReduceQuadH(v[2]);
    v[3] = SpdReduceQuadH(v[3]);

    if ((localInvocationIndex % 4) == 0)
    {
        SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x/2, y/2), v[0], 1, slice);
        SpdStoreIntermediateH(x/2, y/2, v[0]);

        SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x/2 + 8, y/2), v[1], 1, slice);
        SpdStoreIntermediateH(x/2 + 8, y/2, v[1]);

        SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x/2, y/2 + 8), v[2], 1, slice);
        SpdStoreIntermediateH(x/2, y/2 + 8, v[2]);

        SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x/2 + 8, y/2 + 8), v[3], 1, slice);
        SpdStoreIntermediateH(x/2 + 8, y/2 + 8, v[3]);
    }
}

void SpdDownsampleMips_0_1_LDSH(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 slice)
{
    AH4 v[4];

    ASU2 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2);
    ASU2 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y);
    v[0] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[0], 0, slice);

    tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2);
    pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y);
    v[1] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[1], 0, slice);

    tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2 + 32);
    pix = ASU2(workGroupID.xy * 32) + ASU2(x, y + 16);
    v[2] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[2], 0, slice);

    tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2 + 32);
    pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y + 16);
    v[3] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[3], 0, slice);

    if (mips <= 1)
        return;

    for (int i = 0; i < 4; i++)
    {
        SpdStoreIntermediateH(x, y, v[i]);
        SpdWorkgroupShuffleBarrier();
        if (localInvocationIndex < 64)
        {
            v[i] = SpdReduceIntermediateH(
                AU2(x * 2 + 0, y * 2 + 0),
                AU2(x * 2 + 1, y * 2 + 0),
                AU2(x * 2 + 0, y * 2 + 1),
                AU2(x * 2 + 1, y * 2 + 1)
            );
            SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice);
        }
        SpdWorkgroupShuffleBarrier();
    }

    if (localInvocationIndex < 64)
    {
        SpdStoreIntermediateH(x + 0, y + 0, v[0]);
        SpdStoreIntermediateH(x + 8, y + 0, v[1]);
        SpdStoreIntermediateH(x + 0, y + 8, v[2]);
        SpdStoreIntermediateH(x + 8, y + 8, v[3]);
    }
}

void SpdDownsampleMips_0_1H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    SpdDownsampleMips_0_1_LDSH(x, y, workGroupID, localInvocationIndex, mips, slice);
#else
    SpdDownsampleMips_0_1_IntrinsicsH(x, y, workGroupID, localInvocationIndex, mips, slice);
#endif
}

void SpdDownsampleMip_2H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 64)
    {
        AH4 v = SpdReduceIntermediateH(
            AU2(x * 2 + 0, y * 2 + 0),
            AU2(x * 2 + 1, y * 2 + 0),
            AU2(x * 2 + 0, y * 2 + 1),
            AU2(x * 2 + 1, y * 2 + 1)
        );
        SpdStoreH(ASU2(workGroupID.xy * 8) + ASU2(x, y), v, mip, slice);
        // store to LDS, try to reduce bank conflicts
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        // ...
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        SpdStoreIntermediateH(x * 2 + y % 2, y * 2, v);
    }
#else
    AH4 v = SpdLoadIntermediateH(x, y);
    v = SpdReduceQuadH(v);
    // quad index 0 stores result
    if (localInvocationIndex % 4 == 0)
    {
        SpdStoreH(ASU2(workGroupID.xy * 8) + ASU2(x/2, y/2), v, mip, slice);
        SpdStoreIntermediateH(x + (y/2) % 2, y, v);
    }
#endif
}

void SpdDownsampleMip_3H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 16)
    {
        // x 0 x 0
        // 0 0 0 0
        // 0 x 0 x
        // 0 0 0 0
        AH4 v = SpdReduceIntermediateH(
            AU2(x * 4 + 0 + 0, y * 4 + 0),
            AU2(x * 4 + 2 + 0, y * 4 + 0),
            AU2(x * 4 + 0 + 1, y * 4 + 2),
            AU2(x * 4 + 2 + 1, y * 4 + 2)
        );
        SpdStoreH(ASU2(workGroupID.xy * 4) + ASU2(x, y), v, mip, slice);
        // store to LDS
        // x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0
        // ...
        // 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0
        // ...
        // 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x
        // ...
        SpdStoreIntermediateH(x * 4 + y, y * 4, v);
    }
#else
    if (localInvocationIndex < 64)
    {
        AH4 v = SpdLoadIntermediateH(x * 2 + y % 2, y * 2);
        v = SpdReduceQuadH(v);
        // quad index 0 stores result
        if (localInvocationIndex % 4 == 0)
        {
            SpdStoreH(ASU2(workGroupID.xy * 4) + ASU2(x/2, y/2), v, mip, slice);
            SpdStoreIntermediateH(x * 2 + y/2, y * 2, v);
        }
    }
#endif
}

void SpdDownsampleMip_4H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 4)
    {
        // x 0 0 0 x 0 0 0
        // ...
        // 0 x 0 0 0 x 0 0
        AH4 v = SpdReduceIntermediateH(
            AU2(x * 8 + 0 + 0 + y * 2, y * 8 + 0),
            AU2(x * 8 + 4 + 0 + y * 2, y * 8 + 0),
            AU2(x * 8 + 0 + 1 + y * 2, y * 8 + 4),
            AU2(x * 8 + 4 + 1 + y * 2, y * 8 + 4)
        );
        SpdStoreH(ASU2(workGroupID.xy * 2) + ASU2(x, y), v, mip, slice);
        // store to LDS
        // x x x x 0 ...
        // 0 ...
        SpdStoreIntermediateH(x + y * 2, 0, v);
    }
#else
    if (localInvocationIndex < 16)
    {
        AH4 v = SpdLoadIntermediateH(x * 4 + y, y * 4);
        v = SpdReduceQuadH(v);
        // quad index 0 stores result
        if (localInvocationIndex % 4 == 0)
        {
            SpdStoreH(ASU2(workGroupID.xy * 2) + ASU2(x/2, y/2), v, mip, slice);
            SpdStoreIntermediateH(x / 2 + y, 0, v);
        }
    }
#endif
}

void SpdDownsampleMip_5H(AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 1)
    {
        // x x x x 0 ...
        // 0 ...
        AH4 v = SpdReduceIntermediateH(
            AU2(0, 0),
            AU2(1, 0),
            AU2(2, 0),
            AU2(3, 0)
        );
        SpdStoreH(ASU2(workGroupID.xy), v, mip, slice);
    }
#else
    if (localInvocationIndex < 4)
    {
        AH4 v = SpdLoadIntermediateH(localInvocationIndex, 0);
        v = SpdReduceQuadH(v);
        // quad index 0 stores result
        if (localInvocationIndex % 4 == 0)
        {
            SpdStoreH(ASU2(workGroupID.xy), v, mip, slice);
        }
    }
#endif
}

void SpdDownsampleMips_6_7H(AU1 x, AU1 y, AU1 mips, AU1 slice)
{
    ASU2 tex = ASU2(x * 4 + 0, y * 4 + 0);
    ASU2 pix = ASU2(x * 2 + 0, y * 2 + 0);
    AH4 v0 = SpdReduceLoad4H(tex, slice);
    SpdStoreH(pix, v0, 6, slice);

    tex = ASU2(x * 4 + 2, y * 4 + 0);
    pix = ASU2(x * 2 + 1, y * 2 + 0);
    AH4 v1 = SpdReduceLoad4H(tex, slice);
    SpdStoreH(pix, v1, 6, slice);

    tex = ASU2(x * 4 + 0, y * 4 + 2);
    pix = ASU2(x * 2 + 0, y * 2 + 1);
    AH4 v2 = SpdReduceLoad4H(tex, slice);
    SpdStoreH(pix, v2, 6, slice);

    tex = ASU2(x * 4 + 2, y * 4 + 2);
    pix = ASU2(x * 2 + 1, y * 2 + 1);
    AH4 v3 = SpdReduceLoad4H(tex, slice);
    SpdStoreH(pix, v3, 6, slice);

    if (mips < 8) return;
    // no barrier needed, working on values only from the same thread

    AH4 v = SpdReduce4H(v0, v1, v2, v3);
    SpdStoreH(ASU2(x, y), v, 7, slice);
    SpdStoreIntermediateH(x, y, v);
}

void SpdDownsampleNextFourH(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 baseMip, AU1 mips, AU1 slice)
{
    if (mips <= baseMip) return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_2H(x, y, workGroupID, localInvocationIndex, baseMip, slice);

    if (mips <= baseMip + 1) return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_3H(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice);

    if (mips <= baseMip + 2) return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_4H(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice);

    if (mips <= baseMip + 3) return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_5H(workGroupID, localInvocationIndex, baseMip + 3, slice);
}

void SpdDownsampleH(
    AU2 workGroupID,
    AU1 localInvocationIndex,
    AU1 mips,
    AU1 numWorkGroups,
    AU1 slice
) {
    AU2 sub_xy = ARmpRed8x8(localInvocationIndex % 64);
    AU1 x = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2);
    AU1 y = sub_xy.y + 8 * ((localInvocationIndex >> 7));

    SpdDownsampleMips_0_1H(x, y, workGroupID, localInvocationIndex, mips, slice);

    SpdDownsampleNextFourH(x, y, workGroupID, localInvocationIndex, 2, mips, slice);

    if (mips < 7) return;

    if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice)) return;

    SpdResetAtomicCounter(slice);

    // After mip 6 there is only a single workgroup left that downsamples the remaining up to 64x64 texels.
    SpdDownsampleMips_6_7H(x, y, mips, slice);

    SpdDownsampleNextFourH(x, y, AU2(0, 0), localInvocationIndex, 8, mips, slice);
}

void SpdDownsampleH(
    AU2 workGroupID,
    AU1 localInvocationIndex,
    AU1 mips,
    AU1 numWorkGroups,
    AU1 slice,
    AU2 workGroupOffset
) {
    SpdDownsampleH(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice);
}

#endif // #ifdef A_HALF
#endif // #ifdef A_GPU