From 8d9d508dc78eb5225c99cb425fa484999f3c4305 Mon Sep 17 00:00:00 2001
From: riperiperi <rhy3756547@hotmail.com>
Date: Sat, 22 Apr 2023 22:02:39 +0100
Subject: [PATCH] Shader: Bias textureGather instructions on AMD/Intel (#4703)

* Experimental (GLSL, forced)

* SPIR-V attempt

* Add capability

* Fix pCount == 1 on GLSL

* Fix typo
---
 Ryujinx.Graphics.GAL/Capabilities.cs          |  6 +++-
 .../Shader/DiskCache/DiskCacheHostStorage.cs  |  2 +-
 .../Shader/GpuAccessorBase.cs                 |  2 ++
 Ryujinx.Graphics.OpenGL/OpenGLRenderer.cs     | 10 +++++--
 .../Glsl/Instructions/InstGenMemory.cs        | 23 +++++++++++++-
 .../CodeGen/Spirv/Instructions.cs             | 30 +++++++++++++++++++
 Ryujinx.Graphics.Shader/IGpuAccessor.cs       |  9 ++++++
 .../HardwareCapabilities.cs                   |  5 +++-
 Ryujinx.Graphics.Vulkan/VulkanRenderer.cs     |  6 ++--
 9 files changed, 84 insertions(+), 9 deletions(-)

diff --git a/Ryujinx.Graphics.GAL/Capabilities.cs b/Ryujinx.Graphics.GAL/Capabilities.cs
index 7822da2115..bc4a02c970 100644
--- a/Ryujinx.Graphics.GAL/Capabilities.cs
+++ b/Ryujinx.Graphics.GAL/Capabilities.cs
@@ -48,6 +48,8 @@ namespace Ryujinx.Graphics.GAL
         public readonly float MaximumSupportedAnisotropy;
         public readonly int StorageBufferOffsetAlignment;
 
+        public readonly int GatherBiasPrecision;
+
         public Capabilities(
             TargetApi api,
             string vendorName,
@@ -87,7 +89,8 @@ namespace Ryujinx.Graphics.GAL
             uint maximumImagesPerStage,
             int maximumComputeSharedMemorySize,
             float maximumSupportedAnisotropy,
-            int storageBufferOffsetAlignment)
+            int storageBufferOffsetAlignment,
+            int gatherBiasPrecision)
         {
             Api = api;
             VendorName = vendorName;
@@ -128,6 +131,7 @@ namespace Ryujinx.Graphics.GAL
             MaximumComputeSharedMemorySize = maximumComputeSharedMemorySize;
             MaximumSupportedAnisotropy = maximumSupportedAnisotropy;
             StorageBufferOffsetAlignment = storageBufferOffsetAlignment;
+            GatherBiasPrecision = gatherBiasPrecision;
         }
     }
 }
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheHostStorage.cs b/Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheHostStorage.cs
index 0b87cc9101..48464f8326 100644
--- a/Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheHostStorage.cs
+++ b/Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheHostStorage.cs
@@ -22,7 +22,7 @@ namespace Ryujinx.Graphics.Gpu.Shader.DiskCache
         private const ushort FileFormatVersionMajor = 1;
         private const ushort FileFormatVersionMinor = 2;
         private const uint FileFormatVersionPacked = ((uint)FileFormatVersionMajor << 16) | FileFormatVersionMinor;
-        private const uint CodeGenVersion = 4404;
+        private const uint CodeGenVersion = 4703;
 
         private const string SharedTocFileName = "shared.toc";
         private const string SharedDataFileName = "shared.data";
diff --git a/Ryujinx.Graphics.Gpu/Shader/GpuAccessorBase.cs b/Ryujinx.Graphics.Gpu/Shader/GpuAccessorBase.cs
index 1402f146bf..bbf2702e40 100644
--- a/Ryujinx.Graphics.Gpu/Shader/GpuAccessorBase.cs
+++ b/Ryujinx.Graphics.Gpu/Shader/GpuAccessorBase.cs
@@ -112,6 +112,8 @@ namespace Ryujinx.Graphics.Gpu.Shader
             };
         }
 
+        public int QueryHostGatherBiasPrecision() => _context.Capabilities.GatherBiasPrecision;
+
         public bool QueryHostReducedPrecision() => _context.Capabilities.ReduceShaderPrecision;
 
         public bool QueryHostHasFrontFacingBug() => _context.Capabilities.HasFrontFacingBug;
diff --git a/Ryujinx.Graphics.OpenGL/OpenGLRenderer.cs b/Ryujinx.Graphics.OpenGL/OpenGLRenderer.cs
index 91e52178fe..5a2e3fe4e6 100644
--- a/Ryujinx.Graphics.OpenGL/OpenGLRenderer.cs
+++ b/Ryujinx.Graphics.OpenGL/OpenGLRenderer.cs
@@ -103,11 +103,14 @@ namespace Ryujinx.Graphics.OpenGL
 
         public Capabilities GetCapabilities()
         {
+            bool intelWindows = HwCapabilities.Vendor == HwCapabilities.GpuVendor.IntelWindows;
+            bool amdWindows = HwCapabilities.Vendor == HwCapabilities.GpuVendor.AmdWindows;
+
             return new Capabilities(
                 api: TargetApi.OpenGL,
                 vendorName: GpuVendor,
-                hasFrontFacingBug: HwCapabilities.Vendor == HwCapabilities.GpuVendor.IntelWindows,
-                hasVectorIndexingBug: HwCapabilities.Vendor == HwCapabilities.GpuVendor.AmdWindows,
+                hasFrontFacingBug: intelWindows,
+                hasVectorIndexingBug: amdWindows,
                 needsFragmentOutputSpecialization: false,
                 reduceShaderPrecision: false,
                 supportsAstcCompression: HwCapabilities.SupportsAstcCompression,
@@ -142,7 +145,8 @@ namespace Ryujinx.Graphics.OpenGL
                 maximumImagesPerStage: 8,
                 maximumComputeSharedMemorySize: HwCapabilities.MaximumComputeSharedMemorySize,
                 maximumSupportedAnisotropy: HwCapabilities.MaximumSupportedAnisotropy,
-                storageBufferOffsetAlignment: HwCapabilities.StorageBufferOffsetAlignment);
+                storageBufferOffsetAlignment: HwCapabilities.StorageBufferOffsetAlignment,
+                gatherBiasPrecision: intelWindows || amdWindows ? 8 : 0); // Precision is 8 for these vendors on Vulkan.
         }
 
         public void SetBufferData(BufferHandle buffer, int offset, ReadOnlySpan<byte> data)
diff --git a/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenMemory.cs b/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenMemory.cs
index 263eada6f2..a5d2632ce2 100644
--- a/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenMemory.cs
+++ b/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenMemory.cs
@@ -677,7 +677,28 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions
                 return vector;
             }
 
-            Append(ApplyScaling(AssemblePVector(pCount)));
+            // Adds the gather coordinate bias required by hosts that report a non-zero gather bias precision.
+            string ApplyBias(string vector)
+            {
+                int gatherBiasPrecision = context.Config.GpuAccessor.QueryHostGatherBiasPrecision();
+                if (!isGather || gatherBiasPrecision == 0)
+                {
+                    return vector;
+                }
+
+                // GPU requires texture gather to be slightly offset to match NVIDIA behaviour when point is exactly between two texels.
+                // The bias is half of one sub-texel step: 1 / (textureSize * 2^(precision + 1)).
+                string steps = $"float({1 << (gatherBiasPrecision + 1)})";
+                string size = $"textureSize({samplerName}, 0)";
+
+                // A single coordinate takes the size as a scalar; otherwise the size is swizzled to pCount components.
+                // NOTE(review): "xyz".Substring(0, pCount) throws if pCount > 3 - confirm gather never has 4 coordinates here.
+                string sizeAsFloat = pCount == 1 ? $"float({size})" : $"vec{pCount}({size}.{"xyz".Substring(0, pCount)})";
+
+                return $"{vector} + (1.0 / ({sizeAsFloat} * {steps}))";
+            }
+
+            Append(ApplyBias(ApplyScaling(AssemblePVector(pCount))));
 
             string AssembleDerivativesVector(int count)
             {
diff --git a/Ryujinx.Graphics.Shader/CodeGen/Spirv/Instructions.cs b/Ryujinx.Graphics.Shader/CodeGen/Spirv/Instructions.cs
index 14d6ab52ab..b3db190518 100644
--- a/Ryujinx.Graphics.Shader/CodeGen/Spirv/Instructions.cs
+++ b/Ryujinx.Graphics.Shader/CodeGen/Spirv/Instructions.cs
@@ -4,6 +4,7 @@ using Ryujinx.Graphics.Shader.Translation;
 using System;
 using System.Collections.Generic;
 using System.Diagnostics;
+using System.Linq;
 using System.Numerics;
 using static Spv.Specification;
 
@@ -1556,6 +1557,33 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Spirv
                 }
             }
 
+            // Adds the gather coordinate bias required by hosts that report a non-zero gather bias precision.
+            SpvInstruction ApplyBias(SpvInstruction vector, SpvInstruction image)
+            {
+                int gatherBiasPrecision = context.Config.GpuAccessor.QueryHostGatherBiasPrecision();
+                if (!isGather || gatherBiasPrecision == 0)
+                {
+                    return vector;
+                }
+
+                // GPU requires texture gather to be slightly offset to match NVIDIA behaviour when point is exactly between two texels.
+                // The bias is half of one sub-texel step: 1 / (size * 2^(precision + 1)).
+                var sizeType = pCount == 1 ? context.TypeS32() : context.TypeVector(context.TypeS32(), pCount);
+                var pVectorType = pCount == 1 ? context.TypeFP32() : context.TypeVector(context.TypeFP32(), pCount);
+
+                // OpCompositeConstruct requires a composite result type, so a single coordinate uses the scalar constants as-is.
+                var bias = context.Constant(context.TypeFP32(), (float)(1 << (gatherBiasPrecision + 1)));
+                var biasVector = pCount == 1 ? bias : context.CompositeConstruct(pVectorType, Enumerable.Repeat(bias, pCount).ToArray());
+
+                var one = context.Constant(context.TypeFP32(), 1f);
+                var oneVector = pCount == 1 ? one : context.CompositeConstruct(pVectorType, Enumerable.Repeat(one, pCount).ToArray());
+
+                // NOTE(review): the SPIR-V spec limits OpImageQuerySize to multisampled or non-sampled images; OpImageQuerySizeLod (LOD 0) may be needed - confirm.
+                var imageSize = context.ConvertSToF(pVectorType, context.ImageQuerySize(sizeType, image));
+                var divisor = context.FMul(pVectorType, imageSize, biasVector);
+                return context.FAdd(pVectorType, vector, context.FDiv(pVectorType, oneVector, divisor));
+            }
+
             SpvInstruction pCoords = AssemblePVector(pCount);
             pCoords = ScalingHelpers.ApplyScaling(context, texOp, pCoords, intCoords, isBindless, isIndexed, isArray, pCount);
 
@@ -1716,6 +1744,8 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Spirv
                 image = context.Image(imageType, image);
             }
 
+            pCoords = ApplyBias(pCoords, image);
+
             var operands = operandsList.ToArray();
 
             SpvInstruction result;
diff --git a/Ryujinx.Graphics.Shader/IGpuAccessor.cs b/Ryujinx.Graphics.Shader/IGpuAccessor.cs
index ba5f2a92fe..bc5e67c357 100644
--- a/Ryujinx.Graphics.Shader/IGpuAccessor.cs
+++ b/Ryujinx.Graphics.Shader/IGpuAccessor.cs
@@ -196,6 +196,15 @@ namespace Ryujinx.Graphics.Shader
             return false;
         }
 
+        /// <summary>
+        /// Queries the host's texture gather precision, in bits, used to bias gather coordinates. Zero means no bias.
+        /// </summary>
+        /// <returns>Bits of gather operation precision to use for coordinate bias; zero by default</returns>
+        int QueryHostGatherBiasPrecision()
+        {
+            return 0;
+        }
+
         /// <summary>
         /// Queries host about whether to reduce precision to improve performance.
         /// </summary>
diff --git a/Ryujinx.Graphics.Vulkan/HardwareCapabilities.cs b/Ryujinx.Graphics.Vulkan/HardwareCapabilities.cs
index a45c2409bb..e206bb2992 100644
--- a/Ryujinx.Graphics.Vulkan/HardwareCapabilities.cs
+++ b/Ryujinx.Graphics.Vulkan/HardwareCapabilities.cs
@@ -46,6 +46,7 @@ namespace Ryujinx.Graphics.Vulkan
         public readonly SampleCountFlags SupportedSampleCounts;
         public readonly PortabilitySubsetFlags PortabilitySubset;
         public readonly uint VertexBufferAlignment;
+        public readonly uint SubTexelPrecisionBits;
 
         public HardwareCapabilities(
             bool supportsIndexTypeUint8,
@@ -77,7 +78,8 @@ namespace Ryujinx.Graphics.Vulkan
             ShaderStageFlags requiredSubgroupSizeStages,
             SampleCountFlags supportedSampleCounts,
             PortabilitySubsetFlags portabilitySubset,
-            uint vertexBufferAlignment)
+            uint vertexBufferAlignment,
+            uint subTexelPrecisionBits)
         {
             SupportsIndexTypeUint8 = supportsIndexTypeUint8;
             SupportsCustomBorderColor = supportsCustomBorderColor;
@@ -109,6 +111,7 @@ namespace Ryujinx.Graphics.Vulkan
             SupportedSampleCounts = supportedSampleCounts;
             PortabilitySubset = portabilitySubset;
             VertexBufferAlignment = vertexBufferAlignment;
+            SubTexelPrecisionBits = subTexelPrecisionBits;
         }
     }
 }
diff --git a/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs b/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs
index 92b453fb14..1c295d6ff8 100644
--- a/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs
+++ b/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs
@@ -311,7 +311,8 @@ namespace Ryujinx.Graphics.Vulkan
                 propertiesSubgroupSizeControl.RequiredSubgroupSizeStages,
                 supportedSampleCounts,
                 portabilityFlags,
-                vertexBufferAlignment);
+                vertexBufferAlignment,
+                properties.Limits.SubTexelPrecisionBits);
 
             IsSharedMemory = MemoryAllocator.IsDeviceMemoryShared(_physicalDevice);
 
@@ -576,7 +577,8 @@ namespace Ryujinx.Graphics.Vulkan
                 maximumImagesPerStage: Constants.MaxImagesPerStage,
                 maximumComputeSharedMemorySize: (int)limits.MaxComputeSharedMemorySize,
                 maximumSupportedAnisotropy: (int)limits.MaxSamplerAnisotropy,
-                storageBufferOffsetAlignment: (int)limits.MinStorageBufferOffsetAlignment);
+                storageBufferOffsetAlignment: (int)limits.MinStorageBufferOffsetAlignment,
+                gatherBiasPrecision: IsIntelWindows || IsAmdWindows ? (int)Capabilities.SubTexelPrecisionBits : 0);
         }
 
         public HardwareInfo GetHardwareInfo()