From 6eb85e846f25ae36a39685d6ac91025deaea306c Mon Sep 17 00:00:00 2001
From: Logan Stromberg <loganstromberg@gmail.com>
Date: Thu, 14 Jul 2022 11:45:56 -0700
Subject: [PATCH] Reduce some unnecessary allocations in DMA handler (#2886)

* experimental changes to try and reduce allocations in kernel threading and DMA handler

* Simplify the changes in this branch to just 1. Don't make unnecessary copies of data just for texture-texture transfers and 2. Add a fast path for 1bpp linear byte copies

* forgot to check src + dst linearity in 1bpp DMA fast path. Fixes the UE4 regression.

* removing dev log I left in

* Generalizing the DMA linear fast path to cases other than 1bpp copies

* revert kernel changes

* revert whitespace

* remove unneeded references

* PR feedback

Co-authored-by: Logan Stromberg <lostromb@microsoft.com>
Co-authored-by: gdk <gab.dark.100@gmail.com>
---
 Ryujinx.Graphics.Gpu/Engine/Dma/DmaClass.cs | 47 +++++++++++++++------
 1 file changed, 33 insertions(+), 14 deletions(-)

diff --git a/Ryujinx.Graphics.Gpu/Engine/Dma/DmaClass.cs b/Ryujinx.Graphics.Gpu/Engine/Dma/DmaClass.cs
index df7e55a11d..b1f766b830 100644
--- a/Ryujinx.Graphics.Gpu/Engine/Dma/DmaClass.cs
+++ b/Ryujinx.Graphics.Gpu/Engine/Dma/DmaClass.cs
@@ -208,7 +208,6 @@ namespace Ryujinx.Graphics.Gpu.Engine.Dma
                 }
 
                 ReadOnlySpan<byte> srcSpan = memoryManager.GetSpan(srcGpuVa + (ulong)srcBaseOffset, srcSize, true);
-                Span<byte> dstSpan = memoryManager.GetSpan(dstGpuVa + (ulong)dstBaseOffset, dstSize).ToArray();
 
                 bool completeSource = IsTextureCopyComplete(src, srcLinear, srcBpp, srcStride, xCount, yCount);
                 bool completeDest = IsTextureCopyComplete(dst, dstLinear, dstBpp, dstStride, xCount, yCount);
@@ -262,43 +261,63 @@ namespace Ryujinx.Graphics.Gpu.Engine.Dma
                         target.SynchronizeMemory();
                         target.SetData(data);
                         target.SignalModified();
-
                         return;
                     }
                     else if (srcCalculator.LayoutMatches(dstCalculator))
                     {
-                        srcSpan.CopyTo(dstSpan); // No layout conversion has to be performed, just copy the data entirely.
-
-                        memoryManager.Write(dstGpuVa + (ulong)dstBaseOffset, dstSpan);
-
+                        // No layout conversion has to be performed, just copy the data entirely.
+                        memoryManager.Write(dstGpuVa + (ulong)dstBaseOffset, srcSpan);
                         return;
                     }
                 }
 
                 unsafe bool Convert<T>(Span<byte> dstSpan, ReadOnlySpan<byte> srcSpan) where T : unmanaged
                 {
-                    fixed (byte* dstPtr = dstSpan, srcPtr = srcSpan)
+                    if (srcLinear && dstLinear && srcBpp == dstBpp)
                     {
-                        byte* dstBase = dstPtr - dstBaseOffset; // Layout offset is relative to the base, so we need to subtract the span's offset.
-                        byte* srcBase = srcPtr - srcBaseOffset;
-
+                        // Optimized path for purely linear copies - we don't need to calculate every single byte offset,
+                        // and we can make use of Span.CopyTo which is very very fast (even compared to pointers)
                         for (int y = 0; y < yCount; y++)
                         {
                             srcCalculator.SetY(srcRegionY + y);
                             dstCalculator.SetY(dstRegionY + y);
+                            int srcOffset = srcCalculator.GetOffset(srcRegionX);
+                            int dstOffset = dstCalculator.GetOffset(dstRegionX);
+                            srcSpan.Slice(srcOffset - srcBaseOffset, xCount * srcBpp)
+                                .CopyTo(dstSpan.Slice(dstOffset - dstBaseOffset, xCount * dstBpp));
+                        }
+                    }
+                    else
+                    {
+                        fixed (byte* dstPtr = dstSpan, srcPtr = srcSpan)
+                        {
+                            byte* dstBase = dstPtr - dstBaseOffset; // Layout offset is relative to the base, so we need to subtract the span's offset.
+                            byte* srcBase = srcPtr - srcBaseOffset;
 
-                            for (int x = 0; x < xCount; x++)
+                            for (int y = 0; y < yCount; y++)
                             {
-                                int srcOffset = srcCalculator.GetOffset(srcRegionX + x);
-                                int dstOffset = dstCalculator.GetOffset(dstRegionX + x);
+                                srcCalculator.SetY(srcRegionY + y);
+                                dstCalculator.SetY(dstRegionY + y);
 
-                                *(T*)(dstBase + dstOffset) = *(T*)(srcBase + srcOffset);
+                                for (int x = 0; x < xCount; x++)
+                                {
+                                    int srcOffset = srcCalculator.GetOffset(srcRegionX + x);
+                                    int dstOffset = dstCalculator.GetOffset(dstRegionX + x);
+
+                                    *(T*)(dstBase + dstOffset) = *(T*)(srcBase + srcOffset);
+                                }
                             }
                         }
                     }
+
                     return true;
                 }
 
+                // OPT: This allocates a (potentially) huge temporary array and then copies an existing
+                // region of memory into it, data that might get overwritten entirely anyways. Ideally this should
+                // all be rewritten to use pooled arrays, but that gets complicated with packed data and strides
+                Span<byte> dstSpan = memoryManager.GetSpan(dstGpuVa + (ulong)dstBaseOffset, dstSize).ToArray();
+
                 bool _ = srcBpp switch
                 {
                     1 => Convert<byte>(dstSpan, srcSpan),