// SpdSetup (variant taking an explicit mip count): derives dispatch parameters
// from rectInfo.
//   workGroupOffset            <- rectInfo[0] / 64 and rectInfo[1] / 64.
//   dispatchThreadGroupCountXY <- number of 64-wide tiles between the first and
//                                 last tile index in X and in Y.
//   numWorkGroupsAndMips[0]    <- product of the two tile counts.
//   numWorkGroupsAndMips[1]    <- mip count (two assignments are visible below).
// NOTE(review): this listing has source line numbers fused onto each line and is
// missing lines: the remaining parameters (rectInfo and mips are used but their
// declarations are not visible), the braces, and whatever branch selects between
// the two numWorkGroupsAndMips[1] assignments. The other SpdSetup overload passes
// -1 as the last argument, so presumably a negative mips selects the computed
// value - confirm against the original header.
327A_STATIC
void SpdSetup(
328outAU2 dispatchThreadGroupCountXY,
329outAU2 workGroupOffset,
330outAU2 numWorkGroupsAndMips,
// Tile index containing (rectInfo[0], rectInfo[1]).
334 workGroupOffset[0] = rectInfo[0] / 64;
335 workGroupOffset[1] = rectInfo[1] / 64;
// Tile index containing the last covered coordinate: (start + extent - 1) / 64.
337 AU1 endIndexX = (rectInfo[0] + rectInfo[2] - 1) / 64;
338 AU1 endIndexY = (rectInfo[1] + rectInfo[3] - 1) / 64;
// Inclusive tile range -> tile count.
340 dispatchThreadGroupCountXY[0] = endIndexX + 1 - workGroupOffset[0];
341 dispatchThreadGroupCountXY[1] = endIndexY + 1 - workGroupOffset[1];
343 numWorkGroupsAndMips[0] = (dispatchThreadGroupCountXY[0]) * (dispatchThreadGroupCountXY[1]);
// Mip count taken directly from the caller-supplied value.
346 numWorkGroupsAndMips[1] = AU1(mips);
// Mip count computed as floor(log2(max(rectInfo[2], rectInfo[3]))), capped at 12.
348 AU1 resolution = AMaxU1(rectInfo[2], rectInfo[3]);
349 numWorkGroupsAndMips[1] = AU1((AMinF1(AFloorF1(ALog2F1(AF1(resolution))), AF1(12))));
// SpdSetup convenience overload: forwards its three outputs and rectInfo to the
// five-argument SpdSetup, passing -1 as the final (mips) argument.
// NOTE(review): the rectInfo parameter declaration and the braces are missing
// from this listing; line-number prefixes are fused onto each line.
353A_STATIC
void SpdSetup(
354 outAU2 dispatchThreadGroupCountXY,
355 outAU2 workGroupOffset,
356 outAU2 numWorkGroupsAndMips,
359 SpdSetup(dispatchThreadGroupCountXY, workGroupOffset, numWorkGroupsAndMips, rectInfo, -1);
// Placeholder definitions of the non-packed (AF4) callbacks, compiled only when
// SPD_PACKED_ONLY is defined. Loads and SpdReduce4 return a zero vector; stores
// have empty bodies.
// NOTE(review): the matching #endif is not visible in this listing, and each
// line carries a fused line-number prefix.
366#ifdef SPD_PACKED_ONLY
368 AF4 SpdLoadSourceImage(ASU2 p, AU1 slice){
return AF4(0.0,0.0,0.0,0.0);}
369 AF4 SpdLoad(ASU2 p, AU1 slice){
return AF4(0.0,0.0,0.0,0.0);}
370 void SpdStore(ASU2 p, AF4 value, AU1 mip, AU1 slice){}
371 AF4 SpdLoadIntermediate(AU1 x, AU1 y){
return AF4(0.0,0.0,0.0,0.0);}
372 void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value){}
373 AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3){
return AF4(0.0,0.0,0.0,0.0);}
377#if defined(A_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
378#extension GL_KHR_shader_subgroup_quad:require
// Workgroup-level barrier used between the downsampling stages in this file.
// The visible statement calls GroupMemoryBarrierWithGroupSync().
// NOTE(review): listing lines 382-385 and the closing lines are missing here;
// presumably they hold a per-shading-language selection of the barrier call -
// confirm against the original header.
381void SpdWorkgroupShuffleBarrier() {
386 GroupMemoryBarrierWithGroupSync();
// Decides whether the calling workgroup should stop after the first pass.
//
// Invocation 0 calls SpdIncreaseAtomicCounter(slice); the whole group then
// synchronizes via SpdWorkgroupShuffleBarrier().
//
// Returns true when the value reported by SpdGetAtomicCounter() differs from
// numWorkGroups - 1, false otherwise.
//
// (Restored from a damaged listing: fused line-number prefixes removed and the
// dropped brace lines reinstated; statements are unchanged.)
bool SpdExitWorkgroup(AU1 numWorkGroups, AU1 localInvocationIndex, AU1 slice)
{
    // Only one invocation per workgroup bumps the counter.
    if (localInvocationIndex == 0)
    {
        SpdIncreaseAtomicCounter(slice);
    }
    SpdWorkgroupShuffleBarrier();
    return (SpdGetAtomicCounter() != (numWorkGroups - 1));
}
// Combines the value v held by this invocation with the values held by the
// other three members of its quad, using SpdReduce4.
//   GLSL path (A_GLSL, wave ops enabled): neighbours are fetched with the
//   subgroupQuadSwapHorizontal / Vertical / Diagonal built-ins.
//   HLSL path (A_HLSL, wave ops enabled): the lane index is masked with ~0x3 and
//   lanes (quad | 1), (quad | 2), (quad | 3) are read with WaveReadLaneAt.
// NOTE(review): the declarations of v0 in both branches, the fallback branch,
// the closing #endif and the braces are missing from this listing.
407AF4 SpdReduceQuad(AF4 v)
409 #if defined(A_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
411 AF4 v1 = subgroupQuadSwapHorizontal(v);
412 AF4 v2 = subgroupQuadSwapVertical(v);
413 AF4 v3 = subgroupQuadSwapDiagonal(v);
414 return SpdReduce4(v0, v1, v2, v3);
415 #elif defined(A_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
// Lane index with the two low bits cleared.
417 AU1 quad = WaveGetLaneIndex() & (~0x3);
419 AF4 v1 = WaveReadLaneAt(v, quad | 1);
420 AF4 v2 = WaveReadLaneAt(v, quad | 2);
421 AF4 v3 = WaveReadLaneAt(v, quad | 3);
422 return SpdReduce4(v0, v1, v2, v3);
// Loads four values through SpdLoadIntermediate at coordinates i0..i3 and
// returns their SpdReduce4 combination.
// (Restored from a damaged listing: line-number prefixes removed, braces reinstated.)
AF4 SpdReduceIntermediate(AU2 i0, AU2 i1, AU2 i2, AU2 i3)
{
    AF4 v0 = SpdLoadIntermediate(i0.x, i0.y);
    AF4 v1 = SpdLoadIntermediate(i1.x, i1.y);
    AF4 v2 = SpdLoadIntermediate(i2.x, i2.y);
    AF4 v3 = SpdLoadIntermediate(i3.x, i3.y);
    return SpdReduce4(v0, v1, v2, v3);
}
// Loads four values through SpdLoad at coordinates i0..i3 of the given slice and
// returns their SpdReduce4 combination.
// (Restored from a damaged listing: line-number prefixes removed, braces reinstated.)
AF4 SpdReduceLoad4(AU2 i0, AU2 i1, AU2 i2, AU2 i3, AU1 slice)
{
    AF4 v0 = SpdLoad(ASU2(i0), slice);
    AF4 v1 = SpdLoad(ASU2(i1), slice);
    AF4 v2 = SpdLoad(ASU2(i2), slice);
    AF4 v3 = SpdLoad(ASU2(i3), slice);
    return SpdReduce4(v0, v1, v2, v3);
}
// Convenience overload: reduces the 2x2 block whose corner is `base`, i.e. the
// coordinates base+(0,0), base+(0,1), base+(1,0) and base+(1,1), by forwarding
// to the four-coordinate SpdReduceLoad4.
// (Restored from a damaged listing: line-number prefixes removed, braces and the
// dropped trailing `slice` argument reinstated - the five-parameter overload is
// the only callee that accepts four AU2 coordinates.)
AF4 SpdReduceLoad4(AU2 base, AU1 slice)
{
    return SpdReduceLoad4(
        AU2(base + AU2(0, 0)),
        AU2(base + AU2(0, 1)),
        AU2(base + AU2(1, 0)),
        AU2(base + AU2(1, 1)),
        slice);
}
// Loads four values through SpdLoadSourceImage at coordinates i0..i3 of the
// given slice and returns their SpdReduce4 combination.
// (Restored from a damaged listing: line-number prefixes removed, braces reinstated.)
AF4 SpdReduceLoadSourceImage4(AU2 i0, AU2 i1, AU2 i2, AU2 i3, AU1 slice)
{
    AF4 v0 = SpdLoadSourceImage(ASU2(i0), slice);
    AF4 v1 = SpdLoadSourceImage(ASU2(i1), slice);
    AF4 v2 = SpdLoadSourceImage(ASU2(i2), slice);
    AF4 v3 = SpdLoadSourceImage(ASU2(i3), slice);
    return SpdReduce4(v0, v1, v2, v3);
}
// Produces one reduced value for the 2x2 source block whose corner is `base`.
//   SPD_LINEAR_SAMPLER defined: a single SpdLoadSourceImage call at `base`.
//   otherwise: four loads at base+(0,0), (0,1), (1,0), (1,1) combined through
//   SpdReduceLoadSourceImage4.
// (Restored from a damaged listing: line-number prefixes removed; braces, the
// #else/#endif lines and the trailing `slice` argument reinstated.)
AF4 SpdReduceLoadSourceImage(AU2 base, AU1 slice)
{
#ifdef SPD_LINEAR_SAMPLER
    return SpdLoadSourceImage(ASU2(base), slice);
#else
    return SpdReduceLoadSourceImage4(
        AU2(base + AU2(0, 0)),
        AU2(base + AU2(0, 1)),
        AU2(base + AU2(1, 0)),
        AU2(base + AU2(1, 1)),
        slice);
#endif
}
// First two reduction levels using quad intrinsics.
// Stage 1: each invocation reduces four source positions inside the workgroup's
//   64x64 source tile (base, +32 in x, +32 in y, +32 in both) and stores each
//   result with mip index 0 at the matching position of the 32x32 output tile
//   (base, +16 in x, +16 in y, +16 in both).
// Stage 2: the four values are combined across the quad with SpdReduceQuad;
//   invocations whose index is a multiple of 4 store the results with mip index 1
//   into the 16x16 output tile and pass them to SpdStoreIntermediate.
// NOTE(review): this listing is missing the declaration of v, the braces,
// listing lines 524-527, and the argument lists of the first three
// SpdStoreIntermediate calls. The `mip` parameter is not referenced by any
// visible line.
501void SpdDownsampleMips_0_1_Intrinsics(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
505 ASU2 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2);
506 ASU2 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y);
507 v[0] = SpdReduceLoadSourceImage(tex, slice);
508 SpdStore(pix, v[0], 0, slice);
510 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2);
511 pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y);
512 v[1] = SpdReduceLoadSourceImage(tex, slice);
513 SpdStore(pix, v[1], 0, slice);
515 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2 + 32);
516 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y + 16);
517 v[2] = SpdReduceLoadSourceImage(tex, slice);
518 SpdStore(pix, v[2], 0, slice);
520 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2 + 32);
521 pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y + 16);
522 v[3] = SpdReduceLoadSourceImage(tex, slice);
523 SpdStore(pix, v[3], 0, slice);
// Combine each value with the other three members of the quad.
528 v[0] = SpdReduceQuad(v[0]);
529 v[1] = SpdReduceQuad(v[1]);
530 v[2] = SpdReduceQuad(v[2]);
531 v[3] = SpdReduceQuad(v[3]);
// One invocation out of every four writes the reduced values.
533 if ((localInvocationIndex % 4) == 0)
535 SpdStore(ASU2(workGroupID.xy * 16) +
536 ASU2(x/2, y/2), v[0], 1, slice);
537 SpdStoreIntermediate(
540 SpdStore(ASU2(workGroupID.xy * 16) +
541 ASU2(x/2 + 8, y/2), v[1], 1, slice);
542 SpdStoreIntermediate(
545 SpdStore(ASU2(workGroupID.xy * 16) +
546 ASU2(x/2, y/2 + 8), v[2], 1, slice);
547 SpdStoreIntermediate(
550 SpdStore(ASU2(workGroupID.xy * 16) +
551 ASU2(x/2 + 8, y/2 + 8), v[3], 1, slice);
552 SpdStoreIntermediate(
553 x/2 + 8, y/2 + 8, v[3]);
// First two reduction levels without wave intrinsics, exchanging data through
// SpdStoreIntermediate / SpdLoadIntermediate.
// Stage 1: identical to the intrinsics variant - four source positions are
//   reduced and stored with mip index 0.
// Stage 2: for each of the four values, every invocation publishes v[i] at
//   (x, y), the group synchronizes, and invocations with index < 64 reduce the
//   2x2 neighbourhood at (2x, 2y) and store it with mip index 1 at an offset of
//   ((i % 2) * 8, (i / 2) * 8); a second barrier follows.
// Finally invocations with index < 64 publish their four reduced values at
//   (x, y), (x+8, y), (x, y+8) and (x+8, y+8).
// NOTE(review): the declaration of v, the braces, listing lines 580-583 and the
// closing of the SpdReduceIntermediate call are missing from this listing, so
// the exact nesting of the barriers relative to the loop is inferred from the
// listing line numbers - confirm. The `mip` parameter is not referenced by any
// visible line.
557void SpdDownsampleMips_0_1_LDS(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
561 ASU2 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2);
562 ASU2 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y);
563 v[0] = SpdReduceLoadSourceImage(tex, slice);
564 SpdStore(pix, v[0], 0, slice);
566 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2);
567 pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y);
568 v[1] = SpdReduceLoadSourceImage(tex, slice);
569 SpdStore(pix, v[1], 0, slice);
571 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2 + 32);
572 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y + 16);
573 v[2] = SpdReduceLoadSourceImage(tex, slice);
574 SpdStore(pix, v[2], 0, slice);
576 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2 + 32);
577 pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y + 16);
578 v[3] = SpdReduceLoadSourceImage(tex, slice);
579 SpdStore(pix, v[3], 0, slice);
// One pass per stage-1 value.
584 for (
int i = 0; i < 4; i++)
586 SpdStoreIntermediate(x, y, v[i]);
587 SpdWorkgroupShuffleBarrier();
588 if (localInvocationIndex < 64)
590 v[i] = SpdReduceIntermediate(
591 AU2(x * 2 + 0, y * 2 + 0),
592 AU2(x * 2 + 1, y * 2 + 0),
593 AU2(x * 2 + 0, y * 2 + 1),
594 AU2(x * 2 + 1, y * 2 + 1)
596 SpdStore(ASU2(workGroupID.xy * 16) + ASU2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice);
598 SpdWorkgroupShuffleBarrier();
// Publish the four reduced values for the next stage.
601 if (localInvocationIndex < 64)
603 SpdStoreIntermediate(x + 0, y + 0, v[0]);
604 SpdStoreIntermediate(x + 8, y + 0, v[1]);
605 SpdStoreIntermediate(x + 0, y + 8, v[2]);
606 SpdStoreIntermediate(x + 8, y + 8, v[3]);
// Dispatches the first two reduction levels to the implementation selected at
// compile time: the LDS variant when SPD_NO_WAVE_OPERATIONS is defined, the
// intrinsics variant otherwise. All arguments are forwarded unchanged.
// (Restored from a damaged listing: line-number prefixes removed; braces and the
// #else/#endif lines reinstated.)
void SpdDownsampleMips_0_1(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    SpdDownsampleMips_0_1_LDS(x, y, workGroupID, localInvocationIndex, mip, slice);
#else
    SpdDownsampleMips_0_1_Intrinsics(x, y, workGroupID, localInvocationIndex, mip, slice);
#endif
}
// Third reduction level; results land in the workgroup's 8x8 output tile and
// are stored with the caller-supplied `mip` index.
//   SPD_NO_WAVE_OPERATIONS: invocations 0..63 reduce the 2x2 intermediate
//   neighbourhood at (2x, 2y), store it, and republish it at (2x + y % 2, 2y).
//   otherwise: every invocation loads its intermediate value, reduces across
//   the quad, and every fourth invocation stores the result and republishes it
//   at (x + (y/2) % 2, y).
// (Restored from a damaged listing: line-number prefixes removed; braces, the
// call terminator and the #else/#endif lines reinstated. Statements unchanged.)
void SpdDownsampleMip_2(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 64)
    {
        AF4 v = SpdReduceIntermediate(
            AU2(x * 2 + 0, y * 2 + 0),
            AU2(x * 2 + 1, y * 2 + 0),
            AU2(x * 2 + 0, y * 2 + 1),
            AU2(x * 2 + 1, y * 2 + 1)
        );
        SpdStore(ASU2(workGroupID.xy * 8) + ASU2(x, y), v, mip, slice);
        SpdStoreIntermediate(x * 2 + y % 2, y * 2, v);
    }
#else
    AF4 v = SpdLoadIntermediate(x, y);
    v = SpdReduceQuad(v);

    if (localInvocationIndex % 4 == 0)
    {
        SpdStore(ASU2(workGroupID.xy * 8) + ASU2(x/2, y/2), v, mip, slice);
        SpdStoreIntermediate(x + (y/2) % 2, y, v);
    }
#endif
}
// Fourth reduction level; results land in the workgroup's 4x4 output tile and
// are stored with the caller-supplied `mip` index.
//   SPD_NO_WAVE_OPERATIONS: invocations 0..15 reduce four intermediate values
//   read at the strided/skewed coordinates below, store the result, and
//   republish it at (4x + y, 4y).
//   otherwise: invocations 0..63 load from (2x + y % 2, 2y), reduce across the
//   quad, and every fourth invocation stores the result and republishes it at
//   (2x + y/2, 2y).
// (Restored from a damaged listing: line-number prefixes removed; braces, the
// call terminator and the #else/#endif lines reinstated. Statements unchanged.)
void SpdDownsampleMip_3(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 16)
    {
        AF4 v = SpdReduceIntermediate(
            AU2(x * 4 + 0 + 0, y * 4 + 0),
            AU2(x * 4 + 2 + 0, y * 4 + 0),
            AU2(x * 4 + 0 + 1, y * 4 + 2),
            AU2(x * 4 + 2 + 1, y * 4 + 2)
        );
        SpdStore(ASU2(workGroupID.xy * 4) + ASU2(x, y), v, mip, slice);
        SpdStoreIntermediate(x * 4 + y, y * 4, v);
    }
#else
    if (localInvocationIndex < 64)
    {
        AF4 v = SpdLoadIntermediate(x * 2 + y % 2,y * 2);
        v = SpdReduceQuad(v);

        if (localInvocationIndex % 4 == 0)
        {
            SpdStore(ASU2(workGroupID.xy * 4) + ASU2(x/2, y/2), v, mip, slice);
            SpdStoreIntermediate(x * 2 + y/2, y * 2, v);
        }
    }
#endif
}
// Fifth reduction level; results land in the workgroup's 2x2 output tile and
// are stored with the caller-supplied `mip` index.
//   SPD_NO_WAVE_OPERATIONS: invocations 0..3 reduce four intermediate values
//   read at the strided/skewed coordinates below, store the result, and
//   republish it at (x + 2y, 0).
//   otherwise: invocations 0..15 load from (4x + y, 4y), reduce across the quad,
//   and every fourth invocation stores the result and republishes it at
//   (x/2 + y, 0).
// (Restored from a damaged listing: line-number prefixes removed; braces, the
// call terminator and the #else/#endif lines reinstated. Statements unchanged.)
void SpdDownsampleMip_4(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 4)
    {
        AF4 v = SpdReduceIntermediate(
            AU2(x * 8 + 0 + 0 + y * 2, y * 8 + 0),
            AU2(x * 8 + 4 + 0 + y * 2, y * 8 + 0),
            AU2(x * 8 + 0 + 1 + y * 2, y * 8 + 4),
            AU2(x * 8 + 4 + 1 + y * 2, y * 8 + 4)
        );
        SpdStore(ASU2(workGroupID.xy * 2) + ASU2(x, y), v, mip, slice);
        SpdStoreIntermediate(x + y * 2, 0, v);
    }
#else
    if (localInvocationIndex < 16)
    {
        AF4 v = SpdLoadIntermediate(x * 4 + y,y * 4);
        v = SpdReduceQuad(v);

        if (localInvocationIndex % 4 == 0)
        {
            SpdStore(ASU2(workGroupID.xy * 2) + ASU2(x/2, y/2), v, mip, slice);
            SpdStoreIntermediate(x / 2 + y, 0, v);
        }
    }
#endif
}
// Final per-workgroup reduction level: produces a single value stored at
// workGroupID.xy with the caller-supplied `mip` index.
//   SPD_NO_WAVE_OPERATIONS: only invocation 0 runs SpdReduceIntermediate and
//   stores the result.
//   otherwise: invocations 0..3 load from (localInvocationIndex, 0), reduce
//   across the quad, and the invocation whose index is a multiple of 4 stores.
// NOTE(review): the four coordinate arguments of SpdReduceIntermediate, the
// braces and the #else/#endif lines are missing from this listing.
733void SpdDownsampleMip_5(AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
735#ifdef SPD_NO_WAVE_OPERATIONS
736 if (localInvocationIndex < 1)
740 AF4 v = SpdReduceIntermediate(
746 SpdStore(ASU2(workGroupID.xy), v, mip, slice);
749 if (localInvocationIndex < 4)
751 AF4 v = SpdLoadIntermediate(localInvocationIndex,0);
752 v = SpdReduceQuad(v);
754 if (localInvocationIndex % 4 == 0)
756 SpdStore(ASU2(workGroupID.xy), v, mip, slice);
// Reduction levels stored with mip indices 6 and 7.
// Each invocation reduces four 2x2 blocks read through SpdLoad inside the 4x4
// region starting at (4x, 4y) and stores the four results with mip index 6 at
// (2x, 2y), (2x+1, 2y), (2x, 2y+1) and (2x+1, 2y+1).
// When mips > 7, the four results are further combined with SpdReduce4, stored
// with mip index 7 at (x, y), and published via SpdStoreIntermediate.
// (Restored from a damaged listing: line-number prefixes removed and braces
// reinstated. Statements unchanged.)
void SpdDownsampleMips_6_7(AU1 x, AU1 y, AU1 mips, AU1 slice)
{
    ASU2 tex = ASU2(x * 4 + 0, y * 4 + 0);
    ASU2 pix = ASU2(x * 2 + 0, y * 2 + 0);
    AF4 v0 = SpdReduceLoad4(tex, slice);
    SpdStore(pix, v0, 6, slice);

    tex = ASU2(x * 4 + 2, y * 4 + 0);
    pix = ASU2(x * 2 + 1, y * 2 + 0);
    AF4 v1 = SpdReduceLoad4(tex, slice);
    SpdStore(pix, v1, 6, slice);

    tex = ASU2(x * 4 + 0, y * 4 + 2);
    pix = ASU2(x * 2 + 0, y * 2 + 1);
    AF4 v2 = SpdReduceLoad4(tex, slice);
    SpdStore(pix, v2, 6, slice);

    tex = ASU2(x * 4 + 2, y * 4 + 2);
    pix = ASU2(x * 2 + 1, y * 2 + 1);
    AF4 v3 = SpdReduceLoad4(tex, slice);
    SpdStore(pix, v3, 6, slice);

    // Nothing beyond mip index 6 was requested.
    if (mips <= 7)
    {
        return;
    }

    // The four inputs come from this invocation's own values.
    AF4 v = SpdReduce4(v0, v1, v2, v3);
    SpdStore(ASU2(x, y), v, 7, slice);
    SpdStoreIntermediate(x, y, v);
}
// Runs up to four consecutive reduction levels starting at baseMip.
// Before each level the function returns if `mips` does not exceed that level's
// index; otherwise it issues a workgroup barrier and then runs the level
// (SpdDownsampleMip_2 .. SpdDownsampleMip_5, with mip indices baseMip .. baseMip + 3).
// (Restored from a damaged listing: line-number prefixes removed and braces
// reinstated. Statements unchanged.)
void SpdDownsampleNextFour(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 baseMip, AU1 mips, AU1 slice)
{
    if (mips <= baseMip)
    {
        return;
    }
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_2(x, y, workGroupID, localInvocationIndex, baseMip, slice);

    if (mips <= baseMip + 1)
    {
        return;
    }
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_3(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice);

    if (mips <= baseMip + 2)
    {
        return;
    }
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_4(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice);

    if (mips <= baseMip + 3)
    {
        return;
    }
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_5(workGroupID, localInvocationIndex, baseMip + 3, slice);
}
// Top-level downsampling routine (AF4 path).
// NOTE(review): the opening of this definition (its name and all parameters
// other than localInvocationIndex) and the braces are missing from this listing.
// The call that follows this block in the file suggests the name is
// SpdDownsample with parameters (workGroupID, localInvocationIndex, mips,
// numWorkGroups, slice) - confirm.
// Visible flow:
//   1. Map localInvocationIndex to (x, y): ARmpRed8x8 on the low 6 bits, plus 8
//      in x when bit 6 is set and 8 * (index >> 7) in y.
//   2. Run the first two levels, then up to four more starting at mip index 2.
//   3. Return if mips <= 6, or if SpdExitWorkgroup reports true.
//   4. Otherwise reset the atomic counter, run the mip 6/7 stage, and run up to
//      four more levels starting at mip index 8 with a workgroup ID of (0, 0).
813 AU1 localInvocationIndex,
818 AU2 sub_xy = ARmpRed8x8(localInvocationIndex % 64);
819 AU1 x = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2);
820 AU1 y = sub_xy.y + 8 * ((localInvocationIndex >> 7));
821 SpdDownsampleMips_0_1(x, y, workGroupID, localInvocationIndex, mips, slice);
823 SpdDownsampleNextFour(x, y, workGroupID, localInvocationIndex, 2, mips, slice);
825 if (mips <= 6)
return;
827 if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice))
return;
829 SpdResetAtomicCounter(slice);
832 SpdDownsampleMips_6_7(x, y, mips, slice);
834 SpdDownsampleNextFour(x, y, AU2(0,0), localInvocationIndex, 8, mips, slice);
// Wrapper that adds workGroupOffset to workGroupID and forwards everything else
// to SpdDownsample.
// NOTE(review): the name, the remaining parameters and the braces of this
// definition are missing from this listing; presumably it is an overload of
// SpdDownsample taking an extra workGroupOffset - confirm.
839 AU1 localInvocationIndex,
845 SpdDownsample(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice);
858#extension GL_EXT_shader_subgroup_extended_types_float16:require
// Half-precision counterpart of the quad reduction: combines v with the values
// held by the other three members of the quad using SpdReduce4H.
//   GLSL path (A_GLSL, wave ops enabled): subgroupQuadSwapHorizontal / Vertical /
//   Diagonal.
//   HLSL path (A_HLSL, wave ops enabled): lane index masked with ~0x3, then
//   WaveReadLaneAt on lanes (quad | 1), (quad | 2), (quad | 3).
//   The last visible statement returns a zero vector.
// NOTE(review): the declarations of v0, listing lines 877-900 (presumably the
// fallback branch that ends in the zero return), the #endif and the braces are
// missing from this listing.
861AH4 SpdReduceQuadH(AH4 v)
863 #if defined(A_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
865 AH4 v1 = subgroupQuadSwapHorizontal(v);
866 AH4 v2 = subgroupQuadSwapVertical(v);
867 AH4 v3 = subgroupQuadSwapDiagonal(v);
868 return SpdReduce4H(v0, v1, v2, v3);
869 #elif defined(A_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
// Lane index with the two low bits cleared.
871 AU1 quad = WaveGetLaneIndex() & (~0x3);
873 AH4 v1 = WaveReadLaneAt(v, quad | 1);
874 AH4 v2 = WaveReadLaneAt(v, quad | 2);
875 AH4 v3 = WaveReadLaneAt(v, quad | 3);
876 return SpdReduce4H(v0, v1, v2, v3);
901 return AH4(0.0, 0.0, 0.0, 0.0);
// Loads four half-precision values through SpdLoadIntermediateH at coordinates
// i0..i3 and returns their SpdReduce4H combination.
// (Restored from a damaged listing: line-number prefixes removed, braces reinstated.)
AH4 SpdReduceIntermediateH(AU2 i0, AU2 i1, AU2 i2, AU2 i3)
{
    AH4 v0 = SpdLoadIntermediateH(i0.x, i0.y);
    AH4 v1 = SpdLoadIntermediateH(i1.x, i1.y);
    AH4 v2 = SpdLoadIntermediateH(i2.x, i2.y);
    AH4 v3 = SpdLoadIntermediateH(i3.x, i3.y);
    return SpdReduce4H(v0, v1, v2, v3);
}
// Loads four half-precision values through SpdLoadH at coordinates i0..i3 of the
// given slice and returns their SpdReduce4H combination.
// (Restored from a damaged listing: line-number prefixes removed, braces reinstated.)
AH4 SpdReduceLoad4H(AU2 i0, AU2 i1, AU2 i2, AU2 i3, AU1 slice)
{
    AH4 v0 = SpdLoadH(ASU2(i0), slice);
    AH4 v1 = SpdLoadH(ASU2(i1), slice);
    AH4 v2 = SpdLoadH(ASU2(i2), slice);
    AH4 v3 = SpdLoadH(ASU2(i3), slice);
    return SpdReduce4H(v0, v1, v2, v3);
}
// Convenience overload: reduces the 2x2 block whose corner is `base`, i.e. the
// coordinates base+(0,0), base+(0,1), base+(1,0) and base+(1,1), by forwarding
// to the four-coordinate SpdReduceLoad4H.
// (Restored from a damaged listing: line-number prefixes removed, braces and the
// dropped trailing `slice` argument reinstated - the five-parameter overload is
// the only callee that accepts four AU2 coordinates.)
AH4 SpdReduceLoad4H(AU2 base, AU1 slice)
{
    return SpdReduceLoad4H(
        AU2(base + AU2(0, 0)),
        AU2(base + AU2(0, 1)),
        AU2(base + AU2(1, 0)),
        AU2(base + AU2(1, 1)),
        slice);
}
// Loads four half-precision values through SpdLoadSourceImageH at coordinates
// i0..i3 of the given slice and returns their SpdReduce4H combination.
// (Restored from a damaged listing: line-number prefixes removed, braces reinstated.)
AH4 SpdReduceLoadSourceImage4H(AU2 i0, AU2 i1, AU2 i2, AU2 i3, AU1 slice)
{
    AH4 v0 = SpdLoadSourceImageH(ASU2(i0), slice);
    AH4 v1 = SpdLoadSourceImageH(ASU2(i1), slice);
    AH4 v2 = SpdLoadSourceImageH(ASU2(i2), slice);
    AH4 v3 = SpdLoadSourceImageH(ASU2(i3), slice);
    return SpdReduce4H(v0, v1, v2, v3);
}
// Produces one reduced half-precision value for the 2x2 source block whose
// corner is `base`.
//   SPD_LINEAR_SAMPLER defined: a single SpdLoadSourceImageH call at `base`.
//   otherwise: four loads at base+(0,0), (0,1), (1,0), (1,1) combined through
//   SpdReduceLoadSourceImage4H.
// (Restored from a damaged listing: line-number prefixes removed; braces, the
// #else/#endif lines and the trailing `slice` argument reinstated.)
AH4 SpdReduceLoadSourceImageH(AU2 base, AU1 slice)
{
#ifdef SPD_LINEAR_SAMPLER
    return SpdLoadSourceImageH(ASU2(base), slice);
#else
    return SpdReduceLoadSourceImage4H(
        AU2(base + AU2(0, 0)),
        AU2(base + AU2(0, 1)),
        AU2(base + AU2(1, 0)),
        AU2(base + AU2(1, 1)),
        slice);
#endif
}
// Half-precision first two reduction levels using quad intrinsics.
// Stage 1: each invocation reduces four source positions inside the workgroup's
//   64x64 source tile (base, +32 in x, +32 in y, +32 in both) and stores each
//   result with mip index 0 at the matching position of the 32x32 output tile.
// Stage 2: the four values are combined across the quad with SpdReduceQuadH;
//   invocations whose index is a multiple of 4 store the results with mip index 1
//   into the 16x16 output tile and publish them via SpdStoreIntermediateH at
//   (x/2, y/2) plus offsets of 8.
// NOTE(review): the declaration of v, the braces and listing lines 979-982 are
// missing from this listing. The `mips` parameter is not referenced by any
// visible line.
956void SpdDownsampleMips_0_1_IntrinsicsH(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 slice)
960 ASU2 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2);
961 ASU2 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y);
962 v[0] = SpdReduceLoadSourceImageH(tex, slice);
963 SpdStoreH(pix, v[0], 0, slice);
965 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2);
966 pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y);
967 v[1] = SpdReduceLoadSourceImageH(tex, slice);
968 SpdStoreH(pix, v[1], 0, slice);
970 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2 + 32);
971 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y + 16);
972 v[2] = SpdReduceLoadSourceImageH(tex, slice);
973 SpdStoreH(pix, v[2], 0, slice);
975 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2 + 32);
976 pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y + 16);
977 v[3] = SpdReduceLoadSourceImageH(tex, slice);
978 SpdStoreH(pix, v[3], 0, slice);
// Combine each value with the other three members of the quad.
983 v[0] = SpdReduceQuadH(v[0]);
984 v[1] = SpdReduceQuadH(v[1]);
985 v[2] = SpdReduceQuadH(v[2]);
986 v[3] = SpdReduceQuadH(v[3]);
// One invocation out of every four writes the reduced values.
988 if ((localInvocationIndex % 4) == 0)
990 SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x/2, y/2), v[0], 1, slice);
991 SpdStoreIntermediateH(x/2, y/2, v[0]);
993 SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x/2 + 8, y/2), v[1], 1, slice);
994 SpdStoreIntermediateH(x/2 + 8, y/2, v[1]);
996 SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x/2, y/2 + 8), v[2], 1, slice);
997 SpdStoreIntermediateH(x/2, y/2 + 8, v[2]);
999 SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x/2 + 8, y/2 + 8), v[3], 1, slice);
1000 SpdStoreIntermediateH(x/2 + 8, y/2 + 8, v[3]);
// Half-precision first two reduction levels without wave intrinsics, exchanging
// data through SpdStoreIntermediateH / SpdLoadIntermediateH.
// Stage 1: four source positions are reduced and stored with mip index 0.
// Stage 2: for each of the four values, every invocation publishes v[i] at
//   (x, y), the group synchronizes, and invocations with index < 64 reduce the
//   2x2 neighbourhood at (2x, 2y) and store it with mip index 1 at an offset of
//   ((i % 2) * 8, (i / 2) * 8); a second barrier follows.
// Finally invocations with index < 64 publish their four reduced values at
//   (x, y), (x+8, y), (x, y+8) and (x+8, y+8).
// NOTE(review): the declaration of v, the braces, listing lines 1027-1030 and
// the closing of the SpdReduceIntermediateH call are missing from this listing,
// so the nesting of the barriers relative to the loop is inferred from the
// listing line numbers - confirm. The `mips` parameter is not referenced by any
// visible line.
1004void SpdDownsampleMips_0_1_LDSH(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 slice)
1008 ASU2 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2);
1009 ASU2 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y);
1010 v[0] = SpdReduceLoadSourceImageH(tex, slice);
1011 SpdStoreH(pix, v[0], 0, slice);
1013 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2);
1014 pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y);
1015 v[1] = SpdReduceLoadSourceImageH(tex, slice);
1016 SpdStoreH(pix, v[1], 0, slice);
1018 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2 + 32);
1019 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y + 16);
1020 v[2] = SpdReduceLoadSourceImageH(tex, slice);
1021 SpdStoreH(pix, v[2], 0, slice);
1023 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2 + 32);
1024 pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y + 16);
1025 v[3] = SpdReduceLoadSourceImageH(tex, slice);
1026 SpdStoreH(pix, v[3], 0, slice);
// One pass per stage-1 value.
1031 for (
int i = 0; i < 4; i++)
1033 SpdStoreIntermediateH(x, y, v[i]);
1034 SpdWorkgroupShuffleBarrier();
1035 if (localInvocationIndex < 64)
1037 v[i] = SpdReduceIntermediateH(
1038 AU2(x * 2 + 0, y * 2 + 0),
1039 AU2(x * 2 + 1, y * 2 + 0),
1040 AU2(x * 2 + 0, y * 2 + 1),
1041 AU2(x * 2 + 1, y * 2 + 1)
1043 SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice);
1045 SpdWorkgroupShuffleBarrier();
// Publish the four reduced values for the next stage.
1048 if (localInvocationIndex < 64)
1050 SpdStoreIntermediateH(x + 0, y + 0, v[0]);
1051 SpdStoreIntermediateH(x + 8, y + 0, v[1]);
1052 SpdStoreIntermediateH(x + 0, y + 8, v[2]);
1053 SpdStoreIntermediateH(x + 8, y + 8, v[3]);
// Dispatches the half-precision first two reduction levels to the
// implementation selected at compile time: the LDS variant when
// SPD_NO_WAVE_OPERATIONS is defined, the intrinsics variant otherwise.
// All arguments are forwarded unchanged.
// (Restored from a damaged listing: line-number prefixes removed; braces and the
// #else/#endif lines reinstated.)
void SpdDownsampleMips_0_1H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    SpdDownsampleMips_0_1_LDSH(x, y, workGroupID, localInvocationIndex, mips, slice);
#else
    SpdDownsampleMips_0_1_IntrinsicsH(x, y, workGroupID, localInvocationIndex, mips, slice);
#endif
}
// Half-precision third reduction level; results land in the workgroup's 8x8
// output tile and are stored with the caller-supplied `mip` index.
//   SPD_NO_WAVE_OPERATIONS: invocations 0..63 reduce the 2x2 intermediate
//   neighbourhood at (2x, 2y), store it, and republish it at (2x + y % 2, 2y).
//   otherwise: every invocation loads its intermediate value, reduces across
//   the quad, and every fourth invocation stores the result and republishes it
//   at (x + (y/2) % 2, y).
// (Restored from a damaged listing: line-number prefixes removed; braces, the
// call terminator and the #else/#endif lines reinstated. Statements unchanged.)
void SpdDownsampleMip_2H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 64)
    {
        AH4 v = SpdReduceIntermediateH(
            AU2(x * 2 + 0, y * 2 + 0),
            AU2(x * 2 + 1, y * 2 + 0),
            AU2(x * 2 + 0, y * 2 + 1),
            AU2(x * 2 + 1, y * 2 + 1)
        );
        SpdStoreH(ASU2(workGroupID.xy * 8) + ASU2(x, y), v, mip, slice);
        SpdStoreIntermediateH(x * 2 + y % 2, y * 2, v);
    }
#else
    AH4 v = SpdLoadIntermediateH(x, y);
    v = SpdReduceQuadH(v);

    if (localInvocationIndex % 4 == 0)
    {
        SpdStoreH(ASU2(workGroupID.xy * 8) + ASU2(x/2, y/2), v, mip, slice);
        SpdStoreIntermediateH(x + (y/2) % 2, y, v);
    }
#endif
}
// Half-precision fourth reduction level; results land in the workgroup's 4x4
// output tile and are stored with the caller-supplied `mip` index.
//   SPD_NO_WAVE_OPERATIONS: invocations 0..15 reduce four intermediate values
//   read at the strided/skewed coordinates below, store the result, and
//   republish it at (4x + y, 4y).
//   otherwise: invocations 0..63 load from (2x + y % 2, 2y), reduce across the
//   quad, and every fourth invocation stores the result and republishes it at
//   (2x + y/2, 2y).
// (Restored from a damaged listing: line-number prefixes removed; braces, the
// call terminator and the #else/#endif lines reinstated. Statements unchanged.)
void SpdDownsampleMip_3H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 16)
    {
        AH4 v = SpdReduceIntermediateH(
            AU2(x * 4 + 0 + 0, y * 4 + 0),
            AU2(x * 4 + 2 + 0, y * 4 + 0),
            AU2(x * 4 + 0 + 1, y * 4 + 2),
            AU2(x * 4 + 2 + 1, y * 4 + 2)
        );
        SpdStoreH(ASU2(workGroupID.xy * 4) + ASU2(x, y), v, mip, slice);
        SpdStoreIntermediateH(x * 4 + y, y * 4, v);
    }
#else
    if (localInvocationIndex < 64)
    {
        AH4 v = SpdLoadIntermediateH(x * 2 + y % 2,y * 2);
        v = SpdReduceQuadH(v);

        if (localInvocationIndex % 4 == 0)
        {
            SpdStoreH(ASU2(workGroupID.xy * 4) + ASU2(x/2, y/2), v, mip, slice);
            SpdStoreIntermediateH(x * 2 + y/2, y * 2, v);
        }
    }
#endif
}
// Half-precision fifth reduction level; results land in the workgroup's 2x2
// output tile and are stored with the caller-supplied `mip` index.
//   SPD_NO_WAVE_OPERATIONS: invocations 0..3 reduce four intermediate values
//   read at the strided/skewed coordinates below, store the result, and
//   republish it at (x + 2y, 0).
//   otherwise: invocations 0..15 load from (4x + y, 4y), reduce across the quad,
//   and every fourth invocation stores the result and republishes it at
//   (x/2 + y, 0).
// (Restored from a damaged listing: line-number prefixes removed; braces, the
// call terminator and the #else/#endif lines reinstated. Statements unchanged.)
void SpdDownsampleMip_4H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 4)
    {
        AH4 v = SpdReduceIntermediateH(
            AU2(x * 8 + 0 + 0 + y * 2, y * 8 + 0),
            AU2(x * 8 + 4 + 0 + y * 2, y * 8 + 0),
            AU2(x * 8 + 0 + 1 + y * 2, y * 8 + 4),
            AU2(x * 8 + 4 + 1 + y * 2, y * 8 + 4)
        );
        SpdStoreH(ASU2(workGroupID.xy * 2) + ASU2(x, y), v, mip, slice);
        SpdStoreIntermediateH(x + y * 2, 0, v);
    }
#else
    if (localInvocationIndex < 16)
    {
        AH4 v = SpdLoadIntermediateH(x * 4 + y,y * 4);
        v = SpdReduceQuadH(v);

        if (localInvocationIndex % 4 == 0)
        {
            SpdStoreH(ASU2(workGroupID.xy * 2) + ASU2(x/2, y/2), v, mip, slice);
            SpdStoreIntermediateH(x / 2 + y, 0, v);
        }
    }
#endif
}
// Half-precision final per-workgroup reduction level: produces a single value
// stored at workGroupID.xy with the caller-supplied `mip` index.
//   SPD_NO_WAVE_OPERATIONS: only invocation 0 runs SpdReduceIntermediateH and
//   stores the result.
//   otherwise: invocations 0..3 load from (localInvocationIndex, 0), reduce
//   across the quad, and the invocation whose index is a multiple of 4 stores.
// NOTE(review): the four coordinate arguments of SpdReduceIntermediateH, the
// braces and the #else/#endif lines are missing from this listing.
1180void SpdDownsampleMip_5H(AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
1182#ifdef SPD_NO_WAVE_OPERATIONS
1183 if (localInvocationIndex < 1)
1187 AH4 v = SpdReduceIntermediateH(
1193 SpdStoreH(ASU2(workGroupID.xy), v, mip, slice);
1196 if (localInvocationIndex < 4)
1198 AH4 v = SpdLoadIntermediateH(localInvocationIndex,0);
1199 v = SpdReduceQuadH(v);
1201 if (localInvocationIndex % 4 == 0)
1203 SpdStoreH(ASU2(workGroupID.xy), v, mip, slice);
// Half-precision reduction levels stored with mip indices 6 and 7.
// Each invocation reduces four 2x2 blocks read through SpdLoadH inside the 4x4
// region starting at (4x, 4y) and stores the four results with mip index 6 at
// (2x, 2y), (2x+1, 2y), (2x, 2y+1) and (2x+1, 2y+1).
// When mips >= 8, the four results are further combined with SpdReduce4H,
// stored with mip index 7 at (x, y), and published via SpdStoreIntermediateH.
// (Restored from a damaged listing: line-number prefixes removed and braces
// reinstated. Statements unchanged.)
void SpdDownsampleMips_6_7H(AU1 x, AU1 y, AU1 mips, AU1 slice)
{
    ASU2 tex = ASU2(x * 4 + 0, y * 4 + 0);
    ASU2 pix = ASU2(x * 2 + 0, y * 2 + 0);
    AH4 v0 = SpdReduceLoad4H(tex, slice);
    SpdStoreH(pix, v0, 6, slice);

    tex = ASU2(x * 4 + 2, y * 4 + 0);
    pix = ASU2(x * 2 + 1, y * 2 + 0);
    AH4 v1 = SpdReduceLoad4H(tex, slice);
    SpdStoreH(pix, v1, 6, slice);

    tex = ASU2(x * 4 + 0, y * 4 + 2);
    pix = ASU2(x * 2 + 0, y * 2 + 1);
    AH4 v2 = SpdReduceLoad4H(tex, slice);
    SpdStoreH(pix, v2, 6, slice);

    tex = ASU2(x * 4 + 2, y * 4 + 2);
    pix = ASU2(x * 2 + 1, y * 2 + 1);
    AH4 v3 = SpdReduceLoad4H(tex, slice);
    SpdStoreH(pix, v3, 6, slice);

    // Nothing beyond mip index 6 was requested.
    if (mips < 8)
    {
        return;
    }

    // The four inputs come from this invocation's own values.
    AH4 v = SpdReduce4H(v0, v1, v2, v3);
    SpdStoreH(ASU2(x, y), v, 7, slice);
    SpdStoreIntermediateH(x, y, v);
}
// Runs up to four consecutive half-precision reduction levels starting at
// baseMip. Before each level the function returns if `mips` does not exceed
// that level's index; otherwise it issues a workgroup barrier and then runs the
// level (SpdDownsampleMip_2H .. SpdDownsampleMip_5H, with mip indices
// baseMip .. baseMip + 3).
// (Restored from a damaged listing: line-number prefixes removed and braces
// reinstated. Statements unchanged.)
void SpdDownsampleNextFourH(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 baseMip, AU1 mips, AU1 slice)
{
    if (mips <= baseMip)
    {
        return;
    }
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_2H(x, y, workGroupID, localInvocationIndex, baseMip, slice);

    if (mips <= baseMip + 1)
    {
        return;
    }
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_3H(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice);

    if (mips <= baseMip + 2)
    {
        return;
    }
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_4H(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice);

    if (mips <= baseMip + 3)
    {
        return;
    }
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_5H(workGroupID, localInvocationIndex, baseMip + 3, slice);
}
// Top-level downsampling routine (AH4 / half-precision path).
// NOTE(review): the opening of this definition (its name and all parameters
// other than localInvocationIndex) and the braces are missing from this listing.
// The call that follows this block in the file suggests the name is
// SpdDownsampleH with parameters (workGroupID, localInvocationIndex, mips,
// numWorkGroups, slice) - confirm.
// Visible flow:
//   1. Map localInvocationIndex to (x, y): ARmpRed8x8 on the low 6 bits, plus 8
//      in x when bit 6 is set and 8 * (index >> 7) in y.
//   2. Run the first two levels, then up to four more starting at mip index 2.
//   3. Return if mips < 7, or if SpdExitWorkgroup reports true.
//   4. Otherwise reset the atomic counter, run the mip 6/7 stage, and run up to
//      four more levels starting at mip index 8 with a workgroup ID of (0, 0).
1260 AU1 localInvocationIndex,
1265 AU2 sub_xy = ARmpRed8x8(localInvocationIndex % 64);
1266 AU1 x = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2);
1267 AU1 y = sub_xy.y + 8 * ((localInvocationIndex >> 7));
1269 SpdDownsampleMips_0_1H(x, y, workGroupID, localInvocationIndex, mips, slice);
1271 SpdDownsampleNextFourH(x, y, workGroupID, localInvocationIndex, 2, mips, slice);
1273 if (mips < 7)
return;
1275 if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice))
return;
1277 SpdResetAtomicCounter(slice);
1280 SpdDownsampleMips_6_7H(x, y, mips, slice);
1282 SpdDownsampleNextFourH(x, y, AU2(0,0), localInvocationIndex, 8, mips, slice);
// Wrapper that adds workGroupOffset to workGroupID and forwards everything else
// to SpdDownsampleH.
// NOTE(review): the name, the remaining parameters and the braces of this
// definition are missing from this listing; presumably it is an overload of
// SpdDownsampleH taking an extra workGroupOffset - confirm.
1287 AU1 localInvocationIndex,
1293 SpdDownsampleH(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice);