From d9d18439f6900fd9f05bde41998526281f7638c5 Mon Sep 17 00:00:00 2001
From: gdkchan <gab.dark.100@gmail.com>
Date: Wed, 11 Aug 2021 15:59:42 -0300
Subject: [PATCH] Use a new approach for shader BRX targets (#2532)

* Use a new approach for shader BRX targets

* Make shader cache actually work

* Improve the shader pattern matching a bit

* Extend LDC search to predecessor blocks, catches more cases

* Nit

* Only save the amount of constant buffer data actually used. Avoids crashes on partially mapped buffers

* Ignore Rd on predicate instructions, as they do not have a Rd register (catches more cases)
---
 .../Shader/Cache/CacheCollection.cs           |  51 ++-
 .../Shader/Cache/CacheHelper.cs               |  53 ++-
 .../Shader/Cache/CacheManager.cs              |  10 +
 .../Definition/GuestShaderCacheEntryHeader.cs |  10 +-
 .../Shader/CachedGpuAccessor.cs               |  20 +-
 Ryujinx.Graphics.Gpu/Shader/GpuAccessor.cs    |  21 ++
 Ryujinx.Graphics.Gpu/Shader/ShaderCache.cs    |  59 ++-
 .../Shader/ShaderCompileTask.cs               |   3 +-
 Ryujinx.Graphics.Shader/Decoders/Block.cs     |  36 +-
 Ryujinx.Graphics.Shader/Decoders/Decoder.cs   | 347 ++++++++++++------
 Ryujinx.Graphics.Shader/IGpuAccessor.cs       |   5 +
 .../Instructions/InstEmitFlow.cs              |   6 +
 12 files changed, 472 insertions(+), 149 deletions(-)

diff --git a/Ryujinx.Graphics.Gpu/Shader/Cache/CacheCollection.cs b/Ryujinx.Graphics.Gpu/Shader/Cache/CacheCollection.cs
index 2660e52869..316e027f74 100644
--- a/Ryujinx.Graphics.Gpu/Shader/Cache/CacheCollection.cs
+++ b/Ryujinx.Graphics.Gpu/Shader/Cache/CacheCollection.cs
@@ -38,6 +38,11 @@ namespace Ryujinx.Graphics.Gpu.Shader.Cache
             /// </summary>
             RemoveManifestEntries,
 
+            /// <summary>
+            /// Remove an entry from the hash manifest and save it, and also delete the entry's temporary file.
+            /// </summary>
+            RemoveManifestEntryAndTempFile,
+
             /// <summary>
             /// Flush temporary cache to archive.
             /// </summary>
@@ -116,6 +121,9 @@ namespace Ryujinx.Graphics.Gpu.Shader.Cache
         /// </summary>
         private ZipArchive _cacheArchive;
 
+        /// <summary>
+        /// Indicates if the cache collection is read-only, in which case it does not support modification.
+        /// </summary>
         public bool IsReadOnly { get; }
 
         /// <summary>
@@ -264,6 +272,21 @@ namespace Ryujinx.Graphics.Gpu.Shader.Cache
             }
         }
 
+        /// <summary>
+        /// Removes the given entry from the hash table and saves the manifest under the hash table lock, then deletes the entry's temporary file.
+        /// </summary>
+        /// <param name="entry">Hash of the entry to remove from the manifest, also used to locate the temporary file to delete</param>
+        private void RemoveManifestEntryAndTempFile(Hash128 entry)
+        {
+            lock (_hashTable)
+            {
+                _hashTable.Remove(entry);
+                SaveManifest();
+            }
+
+            File.Delete(GenCacheTempFilePath(entry)); // The file deletion happens after the lock is released.
+        }
+
         /// <summary>
         /// Queue a task to flush temporary files to the archive on the worker.
         /// </summary>
@@ -440,6 +463,9 @@ namespace Ryujinx.Graphics.Gpu.Shader.Cache
                 case CacheFileOperation.RemoveManifestEntries:
                     RemoveManifestEntries((HashSet<Hash128>)task.Data);
                     break;
+                case CacheFileOperation.RemoveManifestEntryAndTempFile:
+                    RemoveManifestEntryAndTempFile((Hash128)task.Data);
+                    break;
                 case CacheFileOperation.FlushToArchive:
                     FlushToArchive();
                     break;
@@ -472,7 +498,7 @@ namespace Ryujinx.Graphics.Gpu.Shader.Cache
         {
             if (IsReadOnly)
             {
-                Logger.Warning?.Print(LogClass.Gpu, "Trying to add {keyHash} on a read-only cache, ignoring.");
+                Logger.Warning?.Print(LogClass.Gpu, $"Trying to add {keyHash} on a read-only cache, ignoring.");
 
                 return;
             }
@@ -521,7 +547,7 @@ namespace Ryujinx.Graphics.Gpu.Shader.Cache
         {
             if (IsReadOnly)
             {
-                Logger.Warning?.Print(LogClass.Gpu, "Trying to replace {keyHash} on a read-only cache, ignoring.");
+                Logger.Warning?.Print(LogClass.Gpu, $"Trying to replace {keyHash} on a read-only cache, ignoring.");
 
                 return;
             }
@@ -540,6 +566,27 @@ namespace Ryujinx.Graphics.Gpu.Shader.Cache
             });
         }
 
+        /// <summary>
+        /// Queues the removal of the value at the given hash from the cache. On a read-only cache, a warning is logged and nothing is queued.
+        /// </summary>
+        /// <param name="keyHash">The hash of the value in the cache</param>
+        public void RemoveValue(ref Hash128 keyHash)
+        {
+            if (IsReadOnly)
+            {
+                Logger.Warning?.Print(LogClass.Gpu, $"Trying to remove {keyHash} on a read-only cache, ignoring.");
+
+                return;
+            }
+
+            // Only queue the file change operation, the manifest and temporary file are updated when the queued task is handled
+            _fileWriterWorkerQueue.Add(new CacheFileOperationTask
+            {
+                Type = CacheFileOperation.RemoveManifestEntryAndTempFile,
+                Data = keyHash
+            });
+        }
+
         public void Dispose()
         {
             Dispose(true);
diff --git a/Ryujinx.Graphics.Gpu/Shader/Cache/CacheHelper.cs b/Ryujinx.Graphics.Gpu/Shader/Cache/CacheHelper.cs
index f6caddef19..33da42db0b 100644
--- a/Ryujinx.Graphics.Gpu/Shader/Cache/CacheHelper.cs
+++ b/Ryujinx.Graphics.Gpu/Shader/Cache/CacheHelper.cs
@@ -371,11 +371,13 @@ namespace Ryujinx.Graphics.Gpu.Shader.Cache
         /// <summary>
         /// Create guest shader cache entries from the runtime contexts.
         /// </summary>
-        /// <param name="memoryManager">The GPU memory manager in use</param>
+        /// <param name="channel">The GPU channel in use</param>
         /// <param name="shaderContexts">The runtime contexts</param>
         /// <returns>Guest shader cahe entries from the runtime contexts</returns>
-        public static GuestShaderCacheEntry[] CreateShaderCacheEntries(MemoryManager memoryManager, ReadOnlySpan<TranslatorContext> shaderContexts)
+        public static GuestShaderCacheEntry[] CreateShaderCacheEntries(GpuChannel channel, ReadOnlySpan<TranslatorContext> shaderContexts)
         {
+            MemoryManager memoryManager = channel.MemoryManager;
+
             int startIndex = shaderContexts.Length > 1 ? 1 : 0;
 
             GuestShaderCacheEntry[] entries = new GuestShaderCacheEntry[shaderContexts.Length - startIndex];
@@ -389,31 +391,66 @@ namespace Ryujinx.Graphics.Gpu.Shader.Cache
                     continue;
                 }
 
+                GpuAccessor gpuAccessor = context.GpuAccessor as GpuAccessor;
+
+                ulong cb1DataAddress;
+                int cb1DataSize = gpuAccessor?.Cb1DataSize ?? 0;
+
+                if (context.Stage == ShaderStage.Compute)
+                {
+                    cb1DataAddress = channel.BufferManager.GetComputeUniformBufferAddress(1);
+                }
+                else
+                {
+                    int stageIndex = context.Stage switch
+                    {
+                        ShaderStage.TessellationControl => 1,
+                        ShaderStage.TessellationEvaluation => 2,
+                        ShaderStage.Geometry => 3,
+                        ShaderStage.Fragment => 4,
+                        _ => 0
+                    };
+
+                    cb1DataAddress = channel.BufferManager.GetGraphicsUniformBufferAddress(stageIndex, 1);
+                }
+
+                int size = context.Size;
+
                 TranslatorContext translatorContext2 = i == 1 ? shaderContexts[0] : null;
 
                 int sizeA = translatorContext2 != null ? translatorContext2.Size : 0;
 
-                byte[] code = new byte[context.Size + sizeA];
+                byte[] code = new byte[size + cb1DataSize + sizeA];
 
-                memoryManager.GetSpan(context.Address, context.Size).CopyTo(code);
+                memoryManager.GetSpan(context.Address, size).CopyTo(code);
+
+                if (cb1DataAddress != 0 && cb1DataSize != 0)
+                {
+                    memoryManager.Physical.GetSpan(cb1DataAddress, cb1DataSize).CopyTo(code.AsSpan().Slice(size, cb1DataSize));
+                }
 
                 if (translatorContext2 != null)
                 {
-                    memoryManager.GetSpan(translatorContext2.Address, sizeA).CopyTo(code.AsSpan().Slice(context.Size, sizeA));
+                    memoryManager.GetSpan(translatorContext2.Address, sizeA).CopyTo(code.AsSpan().Slice(size + cb1DataSize, sizeA));
                 }
 
                 GuestGpuAccessorHeader gpuAccessorHeader = CreateGuestGpuAccessorCache(context.GpuAccessor);
 
-                if (context.GpuAccessor is GpuAccessor)
+                if (gpuAccessor != null)
                 {
                     gpuAccessorHeader.TextureDescriptorCount = context.TextureHandlesForCache.Count;
                 }
 
-                GuestShaderCacheEntryHeader header = new GuestShaderCacheEntryHeader(context.Stage, context.Size, sizeA, gpuAccessorHeader);
+                GuestShaderCacheEntryHeader header = new GuestShaderCacheEntryHeader(
+                    context.Stage,
+                    size + cb1DataSize,
+                    sizeA,
+                    cb1DataSize,
+                    gpuAccessorHeader);
 
                 GuestShaderCacheEntry entry = new GuestShaderCacheEntry(header, code);
 
-                if (context.GpuAccessor is GpuAccessor gpuAccessor)
+                if (gpuAccessor != null)
                 {
                     foreach (int textureHandle in context.TextureHandlesForCache)
                     {
diff --git a/Ryujinx.Graphics.Gpu/Shader/Cache/CacheManager.cs b/Ryujinx.Graphics.Gpu/Shader/Cache/CacheManager.cs
index 1ac37704af..3fc11e822e 100644
--- a/Ryujinx.Graphics.Gpu/Shader/Cache/CacheManager.cs
+++ b/Ryujinx.Graphics.Gpu/Shader/Cache/CacheManager.cs
@@ -114,6 +114,16 @@ namespace Ryujinx.Graphics.Gpu.Shader.Cache
             _hostProgramCache.ReplaceValue(ref programCodeHash, data);
         }
 
+        /// <summary>
+        /// Removes a shader program from both the guest program cache and the host program cache.
+        /// </summary>
+        /// <param name="programCodeHash">Target program code hash, used as the key on both caches</param>
+        public void RemoveProgram(ref Hash128 programCodeHash)
+        {
+            _guestProgramCache.RemoveValue(ref programCodeHash);
+            _hostProgramCache.RemoveValue(ref programCodeHash);
+        }
+
         /// <summary>
         /// Get all guest program hashes.
         /// </summary>
diff --git a/Ryujinx.Graphics.Gpu/Shader/Cache/Definition/GuestShaderCacheEntryHeader.cs b/Ryujinx.Graphics.Gpu/Shader/Cache/Definition/GuestShaderCacheEntryHeader.cs
index 6d5bb28dce..9b22cac55d 100644
--- a/Ryujinx.Graphics.Gpu/Shader/Cache/Definition/GuestShaderCacheEntryHeader.cs
+++ b/Ryujinx.Graphics.Gpu/Shader/Cache/Definition/GuestShaderCacheEntryHeader.cs
@@ -40,9 +40,9 @@ namespace Ryujinx.Graphics.Gpu.Shader.Cache.Definition
         public int SizeA;
 
         /// <summary>
-        /// Unused/reserved.
+        /// Constant buffer 1 data size.
         /// </summary>
-        public int Reserved4;
+        public int Cb1DataSize;
 
         /// <summary>
         /// The header of the cached gpu accessor.
@@ -55,12 +55,14 @@ namespace Ryujinx.Graphics.Gpu.Shader.Cache.Definition
         /// <param name="stage">The stage of this shader</param>
         /// <param name="size">The size of the code section</param>
         /// <param name="sizeA">The size of the code2 section if present (Vertex A)</param>
+        /// <param name="cb1DataSize">Constant buffer 1 data size</param>
         /// <param name="gpuAccessorHeader">The header of the cached gpu accessor</param>
-        public GuestShaderCacheEntryHeader(ShaderStage stage, int size, int sizeA, GuestGpuAccessorHeader gpuAccessorHeader) : this()
+        public GuestShaderCacheEntryHeader(ShaderStage stage, int size, int sizeA, int cb1DataSize, GuestGpuAccessorHeader gpuAccessorHeader) : this()
         {
             Stage = stage;
-            Size  = size;
+            Size = size;
             SizeA = sizeA;
+            Cb1DataSize = cb1DataSize;
             GpuAccessorHeader = gpuAccessorHeader;
         }
     }
diff --git a/Ryujinx.Graphics.Gpu/Shader/CachedGpuAccessor.cs b/Ryujinx.Graphics.Gpu/Shader/CachedGpuAccessor.cs
index a7bd4edb96..452dfd837a 100644
--- a/Ryujinx.Graphics.Gpu/Shader/CachedGpuAccessor.cs
+++ b/Ryujinx.Graphics.Gpu/Shader/CachedGpuAccessor.cs
@@ -11,6 +11,7 @@ namespace Ryujinx.Graphics.Gpu.Shader
     {
         private readonly GpuContext _context;
         private readonly ReadOnlyMemory<byte> _data;
+        private readonly ReadOnlyMemory<byte> _cb1Data;
         private readonly GuestGpuAccessorHeader _header;
         private readonly Dictionary<int, GuestTextureDescriptor> _textureDescriptors;
 
@@ -19,12 +20,19 @@ namespace Ryujinx.Graphics.Gpu.Shader
         /// </summary>
         /// <param name="context">GPU context</param>
         /// <param name="data">The data of the shader</param>
+        /// <param name="cb1Data">The constant buffer 1 data of the shader</param>
         /// <param name="header">The cache of the GPU accessor</param>
         /// <param name="guestTextureDescriptors">The cache of the texture descriptors</param>
-        public CachedGpuAccessor(GpuContext context, ReadOnlyMemory<byte> data, GuestGpuAccessorHeader header, Dictionary<int, GuestTextureDescriptor> guestTextureDescriptors)
+        public CachedGpuAccessor(
+            GpuContext context,
+            ReadOnlyMemory<byte> data,
+            ReadOnlyMemory<byte> cb1Data,
+            GuestGpuAccessorHeader header,
+            Dictionary<int, GuestTextureDescriptor> guestTextureDescriptors)
         {
             _context = context;
             _data = data;
+            _cb1Data = cb1Data;
             _header = header;
             _textureDescriptors = new Dictionary<int, GuestTextureDescriptor>();
 
@@ -34,6 +42,16 @@ namespace Ryujinx.Graphics.Gpu.Shader
             }
         }
 
+        /// <summary>
+        /// Reads a 32-bit value from the constant buffer 1 data that was passed to the constructor.
+        /// </summary>
+        /// <param name="offset">Offset in bytes to read from</param>
+        /// <returns>Value at the given offset</returns>
+        public uint ConstantBuffer1Read(int offset)
+        {
+            return MemoryMarshal.Cast<byte, uint>(_cb1Data.Span.Slice(offset))[0]; // First uint starting at the byte offset.
+        }
+
         /// <summary>
         /// Prints a log message.
         /// </summary>
diff --git a/Ryujinx.Graphics.Gpu/Shader/GpuAccessor.cs b/Ryujinx.Graphics.Gpu/Shader/GpuAccessor.cs
index b7059b51d5..6254b1c2d8 100644
--- a/Ryujinx.Graphics.Gpu/Shader/GpuAccessor.cs
+++ b/Ryujinx.Graphics.Gpu/Shader/GpuAccessor.cs
@@ -20,6 +20,8 @@ namespace Ryujinx.Graphics.Gpu.Shader
         private readonly int _localMemorySize;
         private readonly int _sharedMemorySize;
 
+        public int Cb1DataSize { get; private set; } // Size in bytes of the constant buffer 1 range read so far, grown by ConstantBuffer1Read.
+
         /// <summary>
         /// Creates a new instance of the GPU state accessor for graphics shader translation.
         /// </summary>
@@ -67,6 +69,25 @@ namespace Ryujinx.Graphics.Gpu.Shader
             _sharedMemorySize = sharedMemorySize;
         }
 
+        /// <summary>
+        /// Reads a 32-bit value from the constant buffer 1, and grows <see cref="Cb1DataSize"/> to cover the bytes that were read.
+        /// </summary>
+        /// <param name="offset">Offset in bytes to read from</param>
+        /// <returns>Value at the given offset</returns>
+        public uint ConstantBuffer1Read(int offset)
+        {
+            if (Cb1DataSize < offset + 4) // Keep the tracked size large enough to include the 4 bytes read at this offset.
+            {
+                Cb1DataSize = offset + 4;
+            }
+
+            ulong baseAddress = _compute
+                ? _channel.BufferManager.GetComputeUniformBufferAddress(1)
+                : _channel.BufferManager.GetGraphicsUniformBufferAddress(_stageIndex, 1);
+
+            return _channel.MemoryManager.Physical.Read<uint>(baseAddress + (ulong)offset);
+        }
+
         /// <summary>
         /// Prints a log message.
         /// </summary>
diff --git a/Ryujinx.Graphics.Gpu/Shader/ShaderCache.cs b/Ryujinx.Graphics.Gpu/Shader/ShaderCache.cs
index a5712a14a8..754449fbc3 100644
--- a/Ryujinx.Graphics.Gpu/Shader/ShaderCache.cs
+++ b/Ryujinx.Graphics.Gpu/Shader/ShaderCache.cs
@@ -38,7 +38,7 @@ namespace Ryujinx.Graphics.Gpu.Shader
         /// <summary>
         /// Version of the codegen (to be changed when codegen or guest format change).
         /// </summary>
-        private const ulong ShaderCodeGenVersion = 2469;
+        private const ulong ShaderCodeGenVersion = 2530;
 
         // Progress reporting helpers
         private volatile int _shaderCount;
@@ -112,7 +112,7 @@ namespace Ryujinx.Graphics.Gpu.Shader
                 int programIndex = 0;
                 List<ShaderCompileTask> activeTasks = new List<ShaderCompileTask>();
 
-                AutoResetEvent taskDoneEvent = new AutoResetEvent(false);
+                using AutoResetEvent taskDoneEvent = new AutoResetEvent(false);
 
                 // This thread dispatches tasks to do shader translation, and creates programs that OpenGL will link in the background.
                 // The program link status is checked in a non-blocking manner so that multiple shaders can be compiled at once.
@@ -191,7 +191,14 @@ namespace Ryujinx.Graphics.Gpu.Shader
 
                                     Task compileTask = Task.Run(() =>
                                     {
-                                        IGpuAccessor gpuAccessor = new CachedGpuAccessor(_context, entry.Code, entry.Header.GpuAccessorHeader, entry.TextureDescriptors);
+                                        var binaryCode = new Memory<byte>(entry.Code);
+
+                                        var gpuAccessor = new CachedGpuAccessor(
+                                            _context,
+                                            binaryCode,
+                                            binaryCode.Slice(binaryCode.Length - entry.Header.Cb1DataSize),
+                                            entry.Header.GpuAccessorHeader,
+                                            entry.TextureDescriptors);
 
                                         var options = new TranslationOptions(TargetLanguage.Glsl, TargetApi.OpenGL, DefaultFlags | TranslationFlags.Compute);
                                         program = Translator.CreateContext(0, gpuAccessor, options).Translate(out shaderProgramInfo);
@@ -199,12 +206,20 @@ namespace Ryujinx.Graphics.Gpu.Shader
 
                                     task.OnTask(compileTask, (bool _, ShaderCompileTask task) =>
                                     {
+                                        if (task.IsFaulted)
+                                        {
+                                            Logger.Warning?.Print(LogClass.Gpu, $"Host shader {key} is corrupted or incompatible, discarding...");
+
+                                            _cacheManager.RemoveProgram(ref key);
+                                            return true; // Exit early, the decoding step failed.
+                                        }
+
                                         ShaderCodeHolder shader = new ShaderCodeHolder(program, shaderProgramInfo, entry.Code);
 
                                         Logger.Info?.Print(LogClass.Gpu, $"Host shader {key} got invalidated, rebuilding from guest...");
 
                                         // Compile shader and create program as the shader program binary got invalidated.
-                                        shader.HostShader = _context.Renderer.CompileShader(ShaderStage.Compute, shader.Program.Code);
+                                        shader.HostShader = _context.Renderer.CompileShader(ShaderStage.Compute, program.Code);
                                         hostProgram = _context.Renderer.CreateProgram(new IShader[] { shader.HostShader }, null);
 
                                         task.OnCompiled(hostProgram, (bool isNewProgramValid, ShaderCompileTask task) =>
@@ -298,7 +313,14 @@ namespace Ryujinx.Graphics.Gpu.Shader
                                             }
                                             else
                                             {
-                                                IGpuAccessor gpuAccessor = new CachedGpuAccessor(_context, entry.Code, entry.Header.GpuAccessorHeader, entry.TextureDescriptors);
+                                                var binaryCode = new Memory<byte>(entry.Code);
+                                                // Entry layout is [code][cb1 data][vertex A code]: the cb1 data ends at Header.Size, not at the end of the array.
+                                                var gpuAccessor = new CachedGpuAccessor(
+                                                    _context,
+                                                    binaryCode,
+                                                    binaryCode.Slice(entry.Header.Size - entry.Header.Cb1DataSize, entry.Header.Cb1DataSize),
+                                                    entry.Header.GpuAccessorHeader,
+                                                    entry.TextureDescriptors);
 
                                                 var options = new TranslationOptions(TargetLanguage.Glsl, TargetApi.OpenGL, flags);
                                                 var options2 = new TranslationOptions(TargetLanguage.Glsl, TargetApi.OpenGL, flags | TranslationFlags.VertexA);
@@ -310,7 +332,7 @@ namespace Ryujinx.Graphics.Gpu.Shader
                                             }
 
                                             // NOTE: Vertex B comes first in the shader cache.
-                                            byte[] code = entry.Code.AsSpan().Slice(0, entry.Header.Size).ToArray();
+                                            byte[] code = entry.Code.AsSpan().Slice(0, entry.Header.Size - entry.Header.Cb1DataSize).ToArray();
                                             byte[] code2 = entry.Code.AsSpan().Slice(entry.Header.Size, entry.Header.SizeA).ToArray();
 
                                             shaders[i] = new ShaderCodeHolder(program, shaderProgramInfo, code, code2);
@@ -326,13 +348,22 @@ namespace Ryujinx.Graphics.Gpu.Shader
                                             }
                                             else
                                             {
-                                                IGpuAccessor gpuAccessor = new CachedGpuAccessor(_context, entry.Code, entry.Header.GpuAccessorHeader, entry.TextureDescriptors);
+                                                var binaryCode = new Memory<byte>(entry.Code);
+
+                                                var gpuAccessor = new CachedGpuAccessor(
+                                                    _context,
+                                                    binaryCode,
+                                                    binaryCode.Slice(binaryCode.Length - entry.Header.Cb1DataSize),
+                                                    entry.Header.GpuAccessorHeader,
+                                                    entry.TextureDescriptors);
 
                                                 var options = new TranslationOptions(TargetLanguage.Glsl, TargetApi.OpenGL, flags);
                                                 program = Translator.CreateContext(0, gpuAccessor, options, counts).Translate(out shaderProgramInfo);
                                             }
 
-                                            shaders[i] = new ShaderCodeHolder(program, shaderProgramInfo, entry.Code);
+                                            byte[] code = entry.Code.AsSpan().Slice(0, entry.Header.Size - entry.Header.Cb1DataSize).ToArray();
+
+                                            shaders[i] = new ShaderCodeHolder(program, shaderProgramInfo, code);
                                         }
 
                                         shaderPrograms.Add(program);
@@ -341,6 +372,14 @@ namespace Ryujinx.Graphics.Gpu.Shader
 
                                 task.OnTask(compileTask, (bool _, ShaderCompileTask task) =>
                                 {
+                                    if (task.IsFaulted)
+                                    {
+                                        Logger.Warning?.Print(LogClass.Gpu, $"Host shader {key} is corrupted or incompatible, discarding...");
+
+                                        _cacheManager.RemoveProgram(ref key);
+                                        return true; // Exit early, the decoding step failed.
+                                    }
+
                                     // If the host program was rejected by the gpu driver or isn't in cache, try to build from program sources again.
                                     if (!isHostProgramValid)
                                     {
@@ -537,7 +576,7 @@ namespace Ryujinx.Graphics.Gpu.Shader
                 isShaderCacheReadOnly = _cacheManager.IsReadOnly;
 
                 // Compute hash and prepare data for shader disk cache comparison.
-                shaderCacheEntries = CacheHelper.CreateShaderCacheEntries(channel.MemoryManager, shaderContexts);
+                shaderCacheEntries = CacheHelper.CreateShaderCacheEntries(channel, shaderContexts);
                 programCodeHash = CacheHelper.ComputeGuestHashFromCache(shaderCacheEntries);
             }
 
@@ -659,7 +698,7 @@ namespace Ryujinx.Graphics.Gpu.Shader
                 isShaderCacheReadOnly = _cacheManager.IsReadOnly;
 
                 // Compute hash and prepare data for shader disk cache comparison.
-                shaderCacheEntries = CacheHelper.CreateShaderCacheEntries(channel.MemoryManager, shaderContexts);
+                shaderCacheEntries = CacheHelper.CreateShaderCacheEntries(channel, shaderContexts);
                 programCodeHash = CacheHelper.ComputeGuestHashFromCache(shaderCacheEntries, tfd);
             }
 
diff --git a/Ryujinx.Graphics.Gpu/Shader/ShaderCompileTask.cs b/Ryujinx.Graphics.Gpu/Shader/ShaderCompileTask.cs
index ff48fab001..a9283de23d 100644
--- a/Ryujinx.Graphics.Gpu/Shader/ShaderCompileTask.cs
+++ b/Ryujinx.Graphics.Gpu/Shader/ShaderCompileTask.cs
@@ -1,5 +1,4 @@
 using Ryujinx.Graphics.GAL;
-using System;
 using System.Threading;
 using System.Threading.Tasks;
 
@@ -20,6 +19,8 @@ namespace Ryujinx.Graphics.Gpu.Shader
         private ShaderCompileTaskCallback _action;
         private AutoResetEvent _taskDoneEvent;
 
+        public bool IsFaulted => _programsTask.IsFaulted; // Forwards the faulted state of the programs task.
+
         /// <summary>
         /// Create a new shader compile task, with an event to signal whenever a subtask completes.
         /// </summary>
diff --git a/Ryujinx.Graphics.Shader/Decoders/Block.cs b/Ryujinx.Graphics.Shader/Decoders/Block.cs
index e147023736..69cb55b951 100644
--- a/Ryujinx.Graphics.Shader/Decoders/Block.cs
+++ b/Ryujinx.Graphics.Shader/Decoders/Block.cs
@@ -8,10 +8,38 @@ namespace Ryujinx.Graphics.Shader.Decoders
         public ulong Address    { get; set; }
         public ulong EndAddress { get; set; }
 
-        public Block Next   { get; set; }
-        public Block Branch { get; set; }
+        private Block _next;
+        private Block _branch;
 
-        public OpCodeBranchIndir BrIndir { get; set; }
+        public Block Next
+        {
+            get
+            {
+                return _next;
+            }
+            set
+            {
+                _next?.Predecessors.Remove(this); // Unlink this block from the predecessors of the old target, if any.
+                value?.Predecessors.Add(this); // Register this block as a predecessor of the new target, if any.
+                _next = value;
+            }
+        }
+
+        public Block Branch
+        {
+            get
+            {
+                return _branch;
+            }
+            set
+            {
+                _branch?.Predecessors.Remove(this); // Unlink this block from the predecessors of the old target, if any.
+                value?.Predecessors.Add(this); // Register this block as a predecessor of the new target, if any.
+                _branch = value;
+            }
+        }
+
+        public HashSet<Block> Predecessors { get; } // Blocks that currently have this block as their Next or Branch target.
 
         public List<OpCode>     OpCodes     { get; }
         public List<OpCodePush> PushOpCodes { get; }
@@ -20,6 +48,8 @@ namespace Ryujinx.Graphics.Shader.Decoders
         {
             Address = address;
 
+            Predecessors = new HashSet<Block>();
+
             OpCodes     = new List<OpCode>();
             PushOpCodes = new List<OpCodePush>();
         }
diff --git a/Ryujinx.Graphics.Shader/Decoders/Decoder.cs b/Ryujinx.Graphics.Shader/Decoders/Decoder.cs
index 9ca581771a..c916935e71 100644
--- a/Ryujinx.Graphics.Shader/Decoders/Decoder.cs
+++ b/Ryujinx.Graphics.Shader/Decoders/Decoder.cs
@@ -9,8 +9,6 @@ namespace Ryujinx.Graphics.Shader.Decoders
 {
     static class Decoder
     {
-        public const ulong ShaderEndDelimiter = 0xe2400fffff87000f;
-
         public static Block[][] Decode(IGpuAccessor gpuAccessor, ulong startAddress, out bool hasBindless)
         {
             hasBindless = false;
@@ -51,130 +49,139 @@ namespace Ryujinx.Graphics.Shader.Decoders
 
                 GetBlock(funcAddress);
 
-                while (workQueue.TryDequeue(out Block currBlock))
+                bool hasNewTarget;
+
+                do
                 {
-                    // Check if the current block is inside another block.
-                    if (BinarySearch(blocks, currBlock.Address, out int nBlkIndex))
+                    while (workQueue.TryDequeue(out Block currBlock))
                     {
-                        Block nBlock = blocks[nBlkIndex];
-
-                        if (nBlock.Address == currBlock.Address)
+                        // Check if the current block is inside another block.
+                        if (BinarySearch(blocks, currBlock.Address, out int nBlkIndex))
                         {
-                            throw new InvalidOperationException("Found duplicate block address on the list.");
-                        }
+                            Block nBlock = blocks[nBlkIndex];
 
-                        nBlock.Split(currBlock);
-                        blocks.Insert(nBlkIndex + 1, currBlock);
-
-                        continue;
-                    }
-
-                    // If we have a block after the current one, set the limit address.
-                    ulong limitAddress = ulong.MaxValue;
-
-                    if (nBlkIndex != blocks.Count)
-                    {
-                        Block nBlock = blocks[nBlkIndex];
-
-                        int nextIndex = nBlkIndex + 1;
-
-                        if (nBlock.Address < currBlock.Address && nextIndex < blocks.Count)
-                        {
-                            limitAddress = blocks[nextIndex].Address;
-                        }
-                        else if (nBlock.Address > currBlock.Address)
-                        {
-                            limitAddress = blocks[nBlkIndex].Address;
-                        }
-                    }
-
-                    FillBlock(gpuAccessor, currBlock, limitAddress, startAddress, out bool blockHasBindless);
-                    hasBindless |= blockHasBindless;
-
-                    if (currBlock.OpCodes.Count != 0)
-                    {
-                        // We should have blocks for all possible branch targets,
-                        // including those from SSY/PBK instructions.
-                        foreach (OpCodePush pushOp in currBlock.PushOpCodes)
-                        {
-                            GetBlock(pushOp.GetAbsoluteAddress());
-                        }
-
-                        // Set child blocks. "Branch" is the block the branch instruction
-                        // points to (when taken), "Next" is the block at the next address,
-                        // executed when the branch is not taken. For Unconditional Branches
-                        // or end of program, Next is null.
-                        OpCode lastOp = currBlock.GetLastOp();
-
-                        if (lastOp is OpCodeBranch opBr)
-                        {
-                            if (lastOp.Emitter == InstEmit.Cal)
+                            if (nBlock.Address == currBlock.Address)
                             {
-                                EnqueueFunction(opBr.GetAbsoluteAddress());
+                                throw new InvalidOperationException("Found duplicate block address on the list.");
                             }
-                            else
+
+                            nBlock.Split(currBlock);
+                            blocks.Insert(nBlkIndex + 1, currBlock);
+
+                            continue;
+                        }
+
+                        // If we have a block after the current one, set the limit address.
+                        ulong limitAddress = ulong.MaxValue;
+
+                        if (nBlkIndex != blocks.Count)
+                        {
+                            Block nBlock = blocks[nBlkIndex];
+
+                            int nextIndex = nBlkIndex + 1;
+
+                            if (nBlock.Address < currBlock.Address && nextIndex < blocks.Count)
                             {
-                                currBlock.Branch = GetBlock(opBr.GetAbsoluteAddress());
+                                limitAddress = blocks[nextIndex].Address;
+                            }
+                            else if (nBlock.Address > currBlock.Address)
+                            {
+                                limitAddress = blocks[nBlkIndex].Address;
                             }
                         }
-                        else if (lastOp is OpCodeBranchIndir opBrIndir)
+
+                        FillBlock(gpuAccessor, currBlock, limitAddress, startAddress, out bool blockHasBindless);
+                        hasBindless |= blockHasBindless;
+
+                        if (currBlock.OpCodes.Count != 0)
                         {
-                            // An indirect branch could go anywhere, we don't know the target.
-                            // Those instructions are usually used on a switch to jump table
-                            // compiler optimization, and in those cases the possible targets
-                            // seems to be always right after the BRX itself. We can assume
-                            // that the possible targets are all the blocks in-between the
-                            // instruction right after the BRX, and the common target that
-                            // all the "cases" should eventually jump to, acting as the
-                            // switch break.
-                            Block firstTarget = GetBlock(currBlock.EndAddress);
+                            // We should have blocks for all possible branch targets,
+                            // including those from SSY/PBK instructions.
+                            foreach (OpCodePush pushOp in currBlock.PushOpCodes)
+                            {
+                                GetBlock(pushOp.GetAbsoluteAddress());
+                            }
 
-                            firstTarget.BrIndir = opBrIndir;
+                            // Set child blocks. "Branch" is the block the branch instruction
+                            // points to (when taken), "Next" is the block at the next address,
+                            // executed when the branch is not taken. For Unconditional Branches
+                            // or end of program, Next is null.
+                            OpCode lastOp = currBlock.GetLastOp();
 
-                            opBrIndir.PossibleTargets.Add(firstTarget);
+                            if (lastOp is OpCodeBranch opBr)
+                            {
+                                if (lastOp.Emitter == InstEmit.Cal)
+                                {
+                                    EnqueueFunction(opBr.GetAbsoluteAddress());
+                                }
+                                else
+                                {
+                                    currBlock.Branch = GetBlock(opBr.GetAbsoluteAddress());
+                                }
+                            }
+
+                            if (!IsUnconditionalBranch(lastOp))
+                            {
+                                currBlock.Next = GetBlock(currBlock.EndAddress);
+                            }
                         }
 
-                        if (!IsUnconditionalBranch(lastOp))
+                        // Insert the new block on the list (sorted by address).
+                        if (blocks.Count != 0)
                         {
-                            currBlock.Next = GetBlock(currBlock.EndAddress);
+                            Block nBlock = blocks[nBlkIndex];
+
+                            blocks.Insert(nBlkIndex + (nBlock.Address < currBlock.Address ? 1 : 0), currBlock);
+                        }
+                        else
+                        {
+                            blocks.Add(currBlock);
                         }
                     }
 
-                    // Insert the new block on the list (sorted by address).
-                    if (blocks.Count != 0)
+                    // Propagate SSY/PBK addresses into their uses (SYNC/BRK).
+                    foreach (Block block in blocks.Where(x => x.PushOpCodes.Count != 0))
                     {
-                        Block nBlock = blocks[nBlkIndex];
-
-                        blocks.Insert(nBlkIndex + (nBlock.Address < currBlock.Address ? 1 : 0), currBlock);
-                    }
-                    else
-                    {
-                        blocks.Add(currBlock);
-                    }
-
-                    // Do we have a block after the current one?
-                    if (currBlock.BrIndir != null && HasBlockAfter(gpuAccessor, currBlock, startAddress))
-                    {
-                        bool targetVisited = visited.ContainsKey(currBlock.EndAddress);
-
-                        Block possibleTarget = GetBlock(currBlock.EndAddress);
-
-                        currBlock.BrIndir.PossibleTargets.Add(possibleTarget);
-
-                        if (!targetVisited)
+                        for (int pushOpIndex = 0; pushOpIndex < block.PushOpCodes.Count; pushOpIndex++)
                         {
-                            possibleTarget.BrIndir = currBlock.BrIndir;
+                            PropagatePushOp(visited, block, pushOpIndex);
                         }
                     }
+
+                    // Try to find target for BRX (indirect branch) instructions.
+                    hasNewTarget = false;
+
+                    foreach (Block block in blocks)
+                    {
+                        if (block.GetLastOp() is OpCodeBranchIndir opBrIndir && opBrIndir.PossibleTargets.Count == 0)
+                        {
+                            ulong baseOffset = opBrIndir.Address + 8 + (ulong)opBrIndir.Offset;
+
+                            // An indirect branch could go anywhere,
+                            // try to get the possible target offsets from the constant buffer.
+                            (int cbBaseOffset, int cbOffsetsCount) = FindBrxTargetRange(block, opBrIndir.Ra.Index);
+
+                            if (cbOffsetsCount != 0)
+                            {
+                                hasNewTarget = true;
+                            }
+
+                            for (int i = 0; i < cbOffsetsCount; i++)
+                            {
+                                uint targetOffset = gpuAccessor.ConstantBuffer1Read(cbBaseOffset + i * 4);
+                                Block target = GetBlock(baseOffset + targetOffset);
+                                opBrIndir.PossibleTargets.Add(target);
+                                target.Predecessors.Add(block);
+                            }
+                        }
+                    }
+
+                    // If we discovered new branch targets from the BRX instruction,
+                    // we need another round of decoding to decode the new blocks.
+                    // Additionally, we may have more SSY/PBK targets to propagate,
+                    // and new BRX instructions.
                 }
-
-                foreach (Block block in blocks.Where(x => x.PushOpCodes.Count != 0))
-                {
-                    for (int pushOpIndex = 0; pushOpIndex < block.PushOpCodes.Count; pushOpIndex++)
-                    {
-                        PropagatePushOp(visited, block, pushOpIndex);
-                    }
-                }
+                while (hasNewTarget);
 
                 funcs.Add(blocks.ToArray());
             }
@@ -182,19 +189,6 @@ namespace Ryujinx.Graphics.Shader.Decoders
             return funcs.ToArray();
         }
 
-        private static bool HasBlockAfter(IGpuAccessor gpuAccessor, Block currBlock, ulong startAdddress)
-        {
-            if (!gpuAccessor.MemoryMapped(startAdddress + currBlock.EndAddress) ||
-                !gpuAccessor.MemoryMapped(startAdddress + currBlock.EndAddress + 7))
-            {
-                return false;
-            }
-
-            ulong inst = gpuAccessor.MemoryRead<ulong>(startAdddress + currBlock.EndAddress);
-
-            return inst != 0UL && inst != ShaderEndDelimiter;
-        }
-
         private static bool BinarySearch(List<Block> blocks, ulong address, out int index)
         {
             index = 0;
@@ -320,6 +314,115 @@ namespace Ryujinx.Graphics.Shader.Decoders
                     opCode is OpCodeExit;
         }
 
+        private static (int, int) FindBrxTargetRange(Block block, int brxReg)
+        {
+            // Walks backwards from the BRX, following the registers that feed its address,
+            // and tries to match this instruction sequence:
+            //
+            // IMNMX.U32 Rx, Rx, UpperBound, PT
+            // SHL Rx, Rx, 0x2
+            // LDC Rx, c[0x1][Rx+BaseOffset]
+            //
+            // Rx is an arbitrary register, "UpperBound" and "BaseOffset" are constants.
+            // The compiler is assumed to emit this sequence before a BRX that implements
+            // a jump table (switch statement optimization). On a match, "BaseOffset" is the
+            // byte offset of the jump offsets on the constant buffer and "UpperBound" is the
+            // number of offsets minus 1. Returns (BaseOffset, UpperBound + 1), or (0, 0)
+            // when the sequence could not be matched.
+
+            (int, int) noMatch = (0, 0);
+            HashSet<Block> visited = new HashSet<Block>();
+
+            // LDC: nearest write to the BRX register, must use slot 1 and the default index mode.
+            BlockLocation brxLocation = new BlockLocation(block, block.OpCodes.Count - 1);
+            BlockLocation ldcLocation = FindFirstRegWrite(visited, brxLocation, brxReg);
+            if (ldcLocation.Block == null ||
+                ldcLocation.Block.OpCodes[ldcLocation.Index] is not OpCodeLdc opLdc ||
+                opLdc.Slot != 1 ||
+                opLdc.IndexMode != CbIndexMode.Default)
+            {
+                return noMatch;
+            }
+
+            // SHL: nearest write to the LDC index register, must shift left by exactly 2.
+            BlockLocation shlLocation = FindFirstRegWrite(visited, ldcLocation, opLdc.Ra.Index);
+            if (shlLocation.Block == null ||
+                shlLocation.Block.OpCodes[shlLocation.Index] is not OpCodeAluImm opShl ||
+                opShl.Emitter != InstEmit.Shl ||
+                opShl.Immediate != 2)
+            {
+                return noMatch;
+            }
+
+            // IMNMX: nearest write to the SHL source, must be unsigned with a non-inverted PT operand.
+            BlockLocation imnmxLocation = FindFirstRegWrite(visited, shlLocation, opShl.Ra.Index);
+            if (imnmxLocation.Block == null ||
+                imnmxLocation.Block.OpCodes[imnmxLocation.Index] is not OpCodeAluImm opImnmx)
+            {
+                return noMatch;
+            }
+
+            bool isSigned = opImnmx.RawOpCode.Extract(48);
+            bool isMatch = opImnmx.Emitter == InstEmit.Imnmx && !isSigned && opImnmx.Predicate39.IsPT && !opImnmx.InvertP;
+
+            return isMatch ? (opLdc.Offset, opImnmx.Immediate + 1) : noMatch;
+        }
+
+        // A position on a block's OpCodes list (Index may equal the list count, meaning "past the end").
+        private struct BlockLocation
+        {
+            public Block Block { get; }
+            public int Index { get; }
+
+            public BlockLocation(Block block, int index)
+            {
+                (Block, Index) = (block, index);
+            }
+        }
+
+        private static BlockLocation FindFirstRegWrite(HashSet<Block> visited, BlockLocation location, int regIndex)
+        {
+            // Breadth-first search through predecessors; a null Block on the result means "not found".
+            Queue<BlockLocation> pending = new Queue<BlockLocation>();
+            pending.Enqueue(location);
+            visited.Add(location.Block);
+
+            while (pending.Count != 0)
+            {
+                BlockLocation current = pending.Dequeue();
+                int opIndex = current.Index;
+                while (--opIndex >= 0)
+                {
+                    if (WritesToRegister(current.Block.OpCodes[opIndex], regIndex))
+                    {
+                        return new BlockLocation(current.Block, opIndex);
+                    }
+                }
+
+                // No write found on this block: queue each predecessor that "visited" did not hold yet.
+                foreach (Block pred in current.Block.Predecessors.Where(visited.Add))
+                {
+                    pending.Enqueue(new BlockLocation(pred, pred.OpCodes.Count));
+                }
+            }
+
+            return new BlockLocation(null, 0);
+        }
+
+        private static bool WritesToRegister(OpCode opCode, int regIndex)
+        {
+            // These instructions only ever write predicates and have no Rd register,
+            // so their Rd value must never be reported as a write to a general register.
+            var emitter = opCode.Emitter;
+            bool onlyWritesPredicate =
+                emitter == InstEmit.Fsetp ||
+                emitter == InstEmit.Hsetp2 ||
+                emitter == InstEmit.Isetp ||
+                emitter == InstEmit.R2p;
+
+            return !onlyWritesPredicate && opCode is IOpCodeRd opRd && opRd.Rd.Index == regIndex;
+        }
+
         private enum MergeType
         {
             Brk = 0,
@@ -388,6 +491,8 @@ namespace Ryujinx.Graphics.Shader.Decoders
         {
             OpCodePush pushOp = currBlock.PushOpCodes[pushOpIndex];
 
+            Block target = blocks[pushOp.GetAbsoluteAddress()];
+
             Stack<PathBlockState> workQueue = new Stack<PathBlockState>();
 
             HashSet<Block> visited = new HashSet<Block>();
@@ -497,10 +602,12 @@ namespace Ryujinx.Graphics.Shader.Decoders
                         if (branchStack.Count == 0)
                         {
                             // If the entire stack was consumed, then the current pop instruction
-                            // just consumed the address from out push instruction.
-                            op.Targets.Add(pushOp, op.Targets.Count);
-
-                            pushOp.PopOps.TryAdd(op, Local());
+                            // just consumed the address from our push instruction.
+                            if (op.Targets.TryAdd(pushOp, op.Targets.Count))
+                            {
+                                pushOp.PopOps.Add(op, Local());
+                                target.Predecessors.Add(current);
+                            }
                         }
                         else
                         {
diff --git a/Ryujinx.Graphics.Shader/IGpuAccessor.cs b/Ryujinx.Graphics.Shader/IGpuAccessor.cs
index 26a8cafdb0..04f23061b9 100644
--- a/Ryujinx.Graphics.Shader/IGpuAccessor.cs
+++ b/Ryujinx.Graphics.Shader/IGpuAccessor.cs
@@ -7,6 +7,11 @@
             // No default log output.
         }
 
+        // Reads a value from constant buffer 1 at the given offset.
+        // The decoder calls this to fetch BRX jump table entries, advancing the offset by 4 per entry.
+        // Implementations that do not override it inherit this default, which always returns 0.
+        uint ConstantBuffer1Read(int offset) => 0;
+
         T MemoryRead<T>(ulong address) where T : unmanaged;
 
         bool MemoryMapped(ulong address)
diff --git a/Ryujinx.Graphics.Shader/Instructions/InstEmitFlow.cs b/Ryujinx.Graphics.Shader/Instructions/InstEmitFlow.cs
index d4ab5955f6..1f5bf35bc6 100644
--- a/Ryujinx.Graphics.Shader/Instructions/InstEmitFlow.cs
+++ b/Ryujinx.Graphics.Shader/Instructions/InstEmitFlow.cs
@@ -25,6 +25,12 @@ namespace Ryujinx.Graphics.Shader.Instructions
         {
             OpCodeBranchIndir op = (OpCodeBranchIndir)context.CurrOp;
 
+            if (op.PossibleTargets.Count == 0)
+            {
+                context.Config.GpuAccessor.Log($"Failed to find targets for BRX instruction at 0x{op.Address:X}.");
+                return;
+            }
+
             int offset = (int)op.Address + 8 + op.Offset;
 
             Operand address = context.IAdd(Register(op.Ra), Const(offset));