diff --git a/Ryujinx.Graphics.Gpu/Engine/MethodCopyBuffer.cs b/Ryujinx.Graphics.Gpu/Engine/MethodCopyBuffer.cs
index 7244db3242..2e6fe0ab17 100644
--- a/Ryujinx.Graphics.Gpu/Engine/MethodCopyBuffer.cs
+++ b/Ryujinx.Graphics.Gpu/Engine/MethodCopyBuffer.cs
@@ -1,6 +1,7 @@
 using Ryujinx.Graphics.Gpu.State;
 using Ryujinx.Graphics.Texture;
 using System;
+using System.Runtime.Intrinsics;
 
 namespace Ryujinx.Graphics.Gpu.Engine
 {
@@ -56,19 +57,58 @@ namespace Ryujinx.Graphics.Gpu.Engine
                 ulong srcBaseAddress = _context.MemoryManager.Translate(cbp.SrcAddress.Pack());
                 ulong dstBaseAddress = _context.MemoryManager.Translate(cbp.DstAddress.Pack());
 
-                for (int y = 0; y < cbp.YCount; y++)
-                for (int x = 0; x < cbp.XCount; x++)
+                (int srcBaseOffset, int srcSize) = srcCalculator.GetRectangleRange(src.RegionX, src.RegionY, cbp.XCount, cbp.YCount);
+                (int dstBaseOffset, int dstSize) = dstCalculator.GetRectangleRange(dst.RegionX, dst.RegionY, cbp.XCount, cbp.YCount);
+
+                ReadOnlySpan<byte> srcSpan = _context.PhysicalMemory.GetSpan(srcBaseAddress + (ulong)srcBaseOffset, srcSize);
+                Span<byte> dstSpan = _context.PhysicalMemory.GetSpan(dstBaseAddress + (ulong)dstBaseOffset, dstSize).ToArray();
+
+                bool completeSource = src.RegionX == 0 && src.RegionY == 0 && src.Width == cbp.XCount && src.Height == cbp.YCount;
+                bool completeDest = dst.RegionX == 0 && dst.RegionY == 0 && dst.Width == cbp.XCount && dst.Height == cbp.YCount;
+
+                if (completeSource && completeDest && srcCalculator.LayoutMatches(dstCalculator))
                 {
-                    int srcOffset = srcCalculator.GetOffset(src.RegionX + x, src.RegionY + y);
-                    int dstOffset = dstCalculator.GetOffset(dst.RegionX + x, dst.RegionY + y);
-
-                    ulong srcAddress = srcBaseAddress + (ulong)srcOffset;
-                    ulong dstAddress = dstBaseAddress + (ulong)dstOffset;
-
-                    ReadOnlySpan<byte> pixel = _context.PhysicalMemory.GetSpan(srcAddress, srcBpp);
-
-                    _context.PhysicalMemory.Write(dstAddress, pixel);
+                    srcSpan.CopyTo(dstSpan); // No layout conversion has to be performed, just copy the data entirely.
                 }
+                else 
+                {
+                    unsafe bool Convert<T>(Span<byte> dstSpan, ReadOnlySpan<byte> srcSpan) where T : unmanaged // Copies the rectangle pixel by pixel, moving each pixel as one value of type T; the switch below picks T by srcBpp.
+                    {
+                        fixed (byte* dstPtr = dstSpan, srcPtr = srcSpan)
+                        {
+                            byte* dstBase = dstPtr - dstBaseOffset; // Layout offset is relative to the base, so we need to subtract the span's offset.
+                            byte* srcBase = srcPtr - srcBaseOffset;
+
+                            for (int y = 0; y < cbp.YCount; y++)
+                            {
+                                srcCalculator.SetY(src.RegionY + y); // Cache the row-dependent part of the offset once per line instead of once per pixel.
+                                dstCalculator.SetY(dst.RegionY + y);
+
+                                for (int x = 0; x < cbp.XCount; x++)
+                                {
+                                    int srcOffset = srcCalculator.GetOffset(src.RegionX + x);
+                                    int dstOffset = dstCalculator.GetOffset(dst.RegionX + x);
+
+                                    *(T*)(dstBase + dstOffset) = *(T*)(srcBase + srcOffset); // One load and one store of T moves the whole pixel.
+                                }
+                            }
+                        }
+                        return true; // The value carries no information; it only lets the call be used as a switch expression arm.
+                    }
+
+                    bool _ = srcBpp switch // Pick the copy loop whose element type is exactly srcBpp bytes wide; the result is discarded.
+                    {
+                        1 => Convert<byte>(dstSpan, srcSpan),
+                        2 => Convert<ushort>(dstSpan, srcSpan),
+                        4 => Convert<uint>(dstSpan, srcSpan),
+                        8 => Convert<ulong>(dstSpan, srcSpan),
+                        12 => Convert<Bpp12Pixel>(dstSpan, srcSpan),
+                        16 => Convert<Vector128<byte>>(dstSpan, srcSpan),
+                        _ => throw new NotSupportedException($"Unable to copy {srcBpp} bpp pixel format.") // C# interpolation is {expr}; the former "${srcBpp}" printed a stray '$'.
+                    };
+                }
+
+                _context.PhysicalMemory.Write(dstBaseAddress + (ulong)dstBaseOffset, dstSpan);
             }
             else
             {
diff --git a/Ryujinx.Graphics.Texture/BlockLinearLayout.cs b/Ryujinx.Graphics.Texture/BlockLinearLayout.cs
index b95db70290..0b1122421d 100644
--- a/Ryujinx.Graphics.Texture/BlockLinearLayout.cs
+++ b/Ryujinx.Graphics.Texture/BlockLinearLayout.cs
@@ -33,6 +33,11 @@ namespace Ryujinx.Graphics.Texture
         private int _robSize;
         private int _sliceSize;
 
+        // Variables for built-in iteration (cached offset parts updated by SetY/SetZ).
+        private int _yPart;
+        private int _yzPart;
+        private int _zPart;
+
         public BlockLinearLayout(
             int width,
             int height,
@@ -97,5 +102,94 @@ namespace Ryujinx.Graphics.Texture
 
             return offset;
         }
+
+        public (int offset, int size) GetRectangleRange(int x, int y, int width, int height) // Returns the offset of the rectangle's first pixel (at z = 0) and the size of the byte range running from there to just past its last pixel.
+        {
+            // Why the two opposite corners bound the whole rectangle (the original author's justification):
+            // the 2D offset is the sum of an x-only part and a y-only part,
+            // each part grows with its input and the two never overlap bits,
+            // so each part is lowest at the lowest input and highest at the highest input.
+            // The minimum offset is therefore at (x, y) and the maximum at (x + width - 1, y + height - 1).
+
+            int start = GetOffset(x, y, 0);
+            int end = GetOffset(x + width - 1, y + height - 1, 0) + _texBpp; // Cover the last pixel.
+            return (start, end - start); // NOTE(review): assumes width and height are at least 1 — confirm with callers.
+        }
+
+        public bool LayoutMatches(BlockLinearLayout other) // True when every layout field compared below is equal in both instances.
+        {
+            return _robSize == other._robSize &&
+                   _sliceSize == other._sliceSize &&
+                   _texBpp == other._texBpp &&
+                   _bhMask == other._bhMask &&
+                   _bdMask == other._bdMask;
+        }
+
+        // Functions for built-in iteration.
+        // Components of the offset can be updated separately and then combined, which saves recomputing the unchanged parts.
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public void SetY(int y) // Precomputes the part of the offset that depends only on y and caches it for the per-x lookups (GetOffset(int x), GetOffsetWithLineOffset16/64).
+        {
+            int yh = y / GobHeight;
+            int offset = (yh >> _bhShift) * _robSize;
+
+            offset += (yh & _bhMask) * GobSize;
+
+            offset += ((y & 0x07) >> 1) << 6; // Bits 1-2 of y land at offset bits 6-7.
+            offset += ((y & 0x01) >> 0) << 4; // Bit 0 of y lands at offset bit 4.
+
+            _yPart = offset;
+            _yzPart = offset + _zPart; // Keep the combined y + z part current so per-x lookups add a single cached value.
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public void SetZ(int z) // Precomputes the part of the offset that depends only on z and refreshes the combined y + z part.
+        {
+            int offset = (z >> _bdShift) * _sliceSize;
+
+            offset += ((z & _bdMask) * GobSize) << _bhShift;
+
+            _zPart = offset;
+            _yzPart = offset + _yPart; // Uses the y part cached by the last SetY call.
+        }
+
+        /// <summary>
+        /// Converts a byte offset within the current line to an absolute offset, adding the Y/Z part cached by SetY/SetZ. Input x must be divisible by 16, because the low four bits of x are not added.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public int GetOffsetWithLineOffset16(int x)
+        {
+            int offset = (x / GobStride) << _xShift;
+
+            offset += ((x & 0x3f) >> 5) << 8; // Bit 5 of x lands at offset bit 8.
+            offset += ((x & 0x1f) >> 4) << 5; // Bit 4 of x lands at offset bit 5.
+
+            return offset + _yzPart;
+        }
+
+        /// <summary>
+        /// Converts a byte offset within the current line to an absolute offset, adding the Y/Z part cached by SetY/SetZ. Input x must be divisible by 64, because only x / GobStride contributes to the result.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public int GetOffsetWithLineOffset64(int x)
+        {
+            int offset = (x / GobStride) << _xShift;
+
+            return offset + _yzPart;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public int GetOffset(int x) // Offset of pixel column x on the row/slice last selected with SetY/SetZ.
+        {
+            x <<= _bppShift; // Pixel index -> byte position within the line (presumably _bppShift is log2 of the bytes per pixel — TODO confirm).
+            int offset = (x / GobStride) << _xShift;
+
+            offset += ((x & 0x3f) >> 5) << 8;
+            offset += ((x & 0x1f) >> 4) << 5;
+            offset += (x & 0x0f); // Low four bits pass through unchanged; the 16/64-aligned variants omit this term.
+
+            return offset + _yzPart;
+        }
     }
 }
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Texture/Bpp12Pixel.cs b/Ryujinx.Graphics.Texture/Bpp12Pixel.cs
new file mode 100644
index 0000000000..5a38259e27
--- /dev/null
+++ b/Ryujinx.Graphics.Texture/Bpp12Pixel.cs
@@ -0,0 +1,11 @@
+using System.Runtime.InteropServices;
+
+namespace Ryujinx.Graphics.Texture
+{
+    [StructLayout(LayoutKind.Sequential, Pack = 1, Size = 12)] // Pack = 1 keeps the 8 + 4 bytes of fields unpadded, so the struct is exactly 12 bytes.
+    public struct Bpp12Pixel // Opaque 12-byte value; the copy/convert routines use it as the element type for 12-byte pixels.
+    {
+        private ulong _elem1; // Storage only: no member of this struct reads the fields.
+        private uint _elem2;
+    }
+}
diff --git a/Ryujinx.Graphics.Texture/LayoutConverter.cs b/Ryujinx.Graphics.Texture/LayoutConverter.cs
index ce2b37b54c..525271c4c3 100644
--- a/Ryujinx.Graphics.Texture/LayoutConverter.cs
+++ b/Ryujinx.Graphics.Texture/LayoutConverter.cs
@@ -1,6 +1,6 @@
 using Ryujinx.Common;
 using System;
-
+using System.Runtime.Intrinsics;
 using static Ryujinx.Graphics.Texture.BlockLinearConstants;
 
 namespace Ryujinx.Graphics.Texture
@@ -64,11 +64,14 @@ namespace Ryujinx.Graphics.Texture
                 }
 
                 int strideTrunc = BitUtils.AlignDown(w * bytesPerPixel, 16);
+                int strideTrunc64 = BitUtils.AlignDown(w * bytesPerPixel, 64);
 
                 int xStart = strideTrunc / bytesPerPixel;
 
                 int stride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment);
 
+                int outStrideGap = stride - w * bytesPerPixel;
+
                 int alignment = gobWidth;
 
                 if (d < gobBlocksInZ || w <= gobWidth || h <= gobHeight)
@@ -86,36 +89,74 @@ namespace Ryujinx.Graphics.Texture
                     mipGobBlocksInZ,
                     bytesPerPixel);
 
-                for (int layer = 0; layer < layers; layer++)
+                unsafe bool Convert<T>(Span<byte> output, ReadOnlySpan<byte> data) where T : unmanaged // Copies every layer, slice and row from block-linear 'data' into linear 'output'; T is only used for the per-pixel tail of each row.
                 {
-                    int inBaseOffset = layer * sizeInfo.LayerSize + sizeInfo.GetMipOffset(level);
-
-                    for (int z = 0; z < d; z++)
-                    for (int y = 0; y < h; y++)
+                    fixed (byte* outputPtr = output, dataPtr = data)
                     {
-                        for (int x = 0; x < strideTrunc; x += 16)
+                        byte* outPtr = outputPtr + outOffs;
+                        for (int layer = 0; layer < layers; layer++)
                         {
-                            int offset = inBaseOffset + layoutConverter.GetOffsetWithLineOffset(x, y, z);
+                            byte* inBaseOffset = dataPtr + (layer * sizeInfo.LayerSize + sizeInfo.GetMipOffset(level));
 
-                            Span<byte> dest = output.Slice(outOffs + x, 16);
+                            for (int z = 0; z < d; z++)
+                            {
+                                layoutConverter.SetZ(z);
+                                for (int y = 0; y < h; y++)
+                                {
+                                    layoutConverter.SetY(y);
 
-                            data.Slice(offset, 16).CopyTo(dest);
+                                    for (int x = 0; x < strideTrunc64; x += 64, outPtr += 64) // 64 line bytes per iteration, gathered as four 16-byte pieces.
+                                    {
+                                        byte* offset = inBaseOffset + layoutConverter.GetOffsetWithLineOffset64(x);
+                                        byte* offset2 = offset + 0x20; // Line bytes 16..31: what GetOffsetWithLineOffset16 adds for x = 16.
+                                        byte* offset3 = offset + 0x100; // Line bytes 32..47: what it adds for x = 32.
+                                        byte* offset4 = offset + 0x120; // Line bytes 48..63: what it adds for x = 48.
+
+                                        Vector128<byte> value = *(Vector128<byte>*)offset;
+                                        Vector128<byte> value2 = *(Vector128<byte>*)offset2;
+                                        Vector128<byte> value3 = *(Vector128<byte>*)offset3;
+                                        Vector128<byte> value4 = *(Vector128<byte>*)offset4;
+
+                                        *(Vector128<byte>*)outPtr = value;
+                                        *(Vector128<byte>*)(outPtr + 16) = value2;
+                                        *(Vector128<byte>*)(outPtr + 32) = value3;
+                                        *(Vector128<byte>*)(outPtr + 48) = value4;
+                                    }
+
+                                    for (int x = strideTrunc64; x < strideTrunc; x += 16, outPtr += 16) // Remaining whole 16-byte pieces of the row.
+                                    {
+                                        byte* offset = inBaseOffset + layoutConverter.GetOffsetWithLineOffset16(x);
+
+                                        *(Vector128<byte>*)outPtr = *(Vector128<byte>*)offset;
+                                    }
+
+                                    for (int x = xStart; x < w; x++, outPtr += bytesPerPixel) // Remaining pixels that do not fill a 16-byte piece.
+                                    {
+                                        byte* offset = inBaseOffset + layoutConverter.GetOffset(x);
+
+                                        *(T*)outPtr = *(T*)offset;
+                                    }
+
+                                    outPtr += outStrideGap; // Skip the padding between the row's data and the next output row.
+                                }
+                            }
                         }
-
-                        for (int x = xStart; x < w; x++)
-                        {
-                            int offset = inBaseOffset + layoutConverter.GetOffset(x, y, z);
-
-                            Span<byte> dest = output.Slice(outOffs + x * bytesPerPixel, bytesPerPixel);
-
-                            data.Slice(offset, bytesPerPixel).CopyTo(dest);
-                        }
-
-                        outOffs += stride;
+                        outOffs += stride * h * d * layers; // Advance the captured output offset past everything written here (presumably consumed by the next mip level — confirm in the enclosing loop).
                     }
+                    return true; // The value carries no information; it only lets the call be used as a switch expression arm.
                 }
-            }
 
+                bool _ = bytesPerPixel switch // Pick the conversion whose tail element type is exactly bytesPerPixel bytes wide; the result is discarded.
+                {
+                    1 => Convert<byte>(output, data),
+                    2 => Convert<ushort>(output, data),
+                    4 => Convert<uint>(output, data),
+                    8 => Convert<ulong>(output, data),
+                    12 => Convert<Bpp12Pixel>(output, data),
+                    16 => Convert<Vector128<byte>>(output, data),
+                    _ => throw new NotSupportedException($"Unable to convert {bytesPerPixel} bpp pixel format.") // C# interpolation is {expr}; the former "${bytesPerPixel}" printed a stray '$'.
+                };
+            }
             return output;
         }
 
@@ -132,22 +173,18 @@ namespace Ryujinx.Graphics.Texture
             int h = BitUtils.DivRoundUp(height, blockHeight);
 
             int outStride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment);
+            int lineSize = w * bytesPerPixel;
 
             Span<byte> output = new byte[h * outStride];
 
             int outOffs = 0;
+            int inOffs = 0;
 
             for (int y = 0; y < h; y++)
             {
-                for (int x = 0; x < w; x++)
-                {
-                    int offset = y * stride + x * bytesPerPixel;
-
-                    Span<byte> dest = output.Slice(outOffs + x * bytesPerPixel, bytesPerPixel);
-
-                    data.Slice(offset, bytesPerPixel).CopyTo(dest);
-                }
+                data.Slice(inOffs, lineSize).CopyTo(output.Slice(outOffs, lineSize));
 
+                inOffs += stride;
                 outOffs += outStride;
             }
 
@@ -198,8 +235,15 @@ namespace Ryujinx.Graphics.Texture
                     mipGobBlocksInZ >>= 1;
                 }
 
+                int strideTrunc = BitUtils.AlignDown(w * bytesPerPixel, 16);
+                int strideTrunc64 = BitUtils.AlignDown(w * bytesPerPixel, 64);
+
+                int xStart = strideTrunc / bytesPerPixel;
+
                 int stride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment);
 
+                int inStrideGap = stride - w * bytesPerPixel;
+
                 int alignment = gobWidth;
 
                 if (d < gobBlocksInZ || w <= gobWidth || h <= gobHeight)
@@ -217,25 +261,73 @@ namespace Ryujinx.Graphics.Texture
                     mipGobBlocksInZ,
                     bytesPerPixel);
 
-                for (int layer = 0; layer < layers; layer++)
+                unsafe bool Convert<T>(Span<byte> output, ReadOnlySpan<byte> data) where T : unmanaged // Copies every layer, slice and row from linear 'data' into block-linear 'output'; T is only used for the per-pixel tail of each row.
                 {
-                    int outBaseOffset = layer * sizeInfo.LayerSize + sizeInfo.GetMipOffset(level);
-
-                    for (int z = 0; z < d; z++)
-                    for (int y = 0; y < h; y++)
+                    fixed (byte* outputPtr = output, dataPtr = data)
                     {
-                        for (int x = 0; x < w; x++)
+                        byte* inPtr = dataPtr + inOffs;
+                        for (int layer = 0; layer < layers; layer++)
                         {
-                            int offset = outBaseOffset + layoutConverter.GetOffset(x, y, z);
+                            byte* outBaseOffset = outputPtr + (layer * sizeInfo.LayerSize + sizeInfo.GetMipOffset(level));
 
-                            Span<byte> dest = output.Slice(offset, bytesPerPixel);
+                            for (int z = 0; z < d; z++)
+                            {
+                                layoutConverter.SetZ(z);
+                                for (int y = 0; y < h; y++)
+                                {
+                                    layoutConverter.SetY(y);
 
-                            data.Slice(inOffs + x * bytesPerPixel, bytesPerPixel).CopyTo(dest);
+                                    for (int x = 0; x < strideTrunc64; x += 64, inPtr += 64) // 64 line bytes per iteration, scattered as four 16-byte pieces.
+                                    {
+                                        byte* offset = outBaseOffset + layoutConverter.GetOffsetWithLineOffset64(x);
+                                        byte* offset2 = offset + 0x20; // Line bytes 16..31: what GetOffsetWithLineOffset16 adds for x = 16.
+                                        byte* offset3 = offset + 0x100; // Line bytes 32..47: what it adds for x = 32.
+                                        byte* offset4 = offset + 0x120; // Line bytes 48..63: what it adds for x = 48.
+
+                                        Vector128<byte> value = *(Vector128<byte>*)inPtr;
+                                        Vector128<byte> value2 = *(Vector128<byte>*)(inPtr + 16);
+                                        Vector128<byte> value3 = *(Vector128<byte>*)(inPtr + 32);
+                                        Vector128<byte> value4 = *(Vector128<byte>*)(inPtr + 48);
+
+                                        *(Vector128<byte>*)offset = value;
+                                        *(Vector128<byte>*)offset2 = value2;
+                                        *(Vector128<byte>*)offset3 = value3;
+                                        *(Vector128<byte>*)offset4 = value4;
+                                    }
+
+                                    for (int x = strideTrunc64; x < strideTrunc; x += 16, inPtr += 16) // Remaining whole 16-byte pieces of the row.
+                                    {
+                                        byte* offset = outBaseOffset + layoutConverter.GetOffsetWithLineOffset16(x);
+
+                                        *(Vector128<byte>*)offset = *(Vector128<byte>*)inPtr;
+                                    }
+
+                                    for (int x = xStart; x < w; x++, inPtr += bytesPerPixel) // Remaining pixels that do not fill a 16-byte piece.
+                                    {
+                                        byte* offset = outBaseOffset + layoutConverter.GetOffset(x);
+
+                                        *(T*)offset = *(T*)inPtr;
+                                    }
+
+                                    inPtr += inStrideGap; // Skip the padding between the row's data and the next input row.
+                                }
+                            }
                         }
-
-                        inOffs += stride;
+                        inOffs += stride * h * d * layers; // Advance the captured input offset past everything read here (presumably consumed by the next mip level — confirm in the enclosing loop).
                     }
+                    return true; // The value carries no information; it only lets the call be used as a switch expression arm.
                 }
+
+                bool _ = bytesPerPixel switch // Pick the conversion whose tail element type is exactly bytesPerPixel bytes wide; the result is discarded.
+                {
+                    1 => Convert<byte>(output, data),
+                    2 => Convert<ushort>(output, data),
+                    4 => Convert<uint>(output, data),
+                    8 => Convert<ulong>(output, data),
+                    12 => Convert<Bpp12Pixel>(output, data),
+                    16 => Convert<Vector128<byte>>(output, data),
+                    _ => throw new NotSupportedException($"Unable to convert {bytesPerPixel} bpp pixel format.") // C# interpolation is {expr}; the former "${bytesPerPixel}" printed a stray '$'.
+                };
             }
 
             return output;
@@ -254,23 +346,19 @@ namespace Ryujinx.Graphics.Texture
             int h = BitUtils.DivRoundUp(height, blockHeight);
 
             int inStride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment);
+            int lineSize = w * bytesPerPixel; // Same per-row byte count the replaced loop copied (w elements of bytesPerPixel bytes) and the one inStride is built from; 'width' is larger than w whenever w was derived with a block width above 1 (presumably DivRoundUp(width, blockWidth) — confirm above this hunk).
 
             Span<byte> output = new byte[h * stride];
 
             int inOffs = 0;
+            int outOffs = 0;
 
             for (int y = 0; y < h; y++)
             {
-                for (int x = 0; x < w; x++)
-                {
-                    int offset = y * stride + x * bytesPerPixel;
-
-                    Span<byte> dest = output.Slice(offset, bytesPerPixel);
-
-                    data.Slice(inOffs + x * bytesPerPixel, bytesPerPixel).CopyTo(dest);
-                }
+                data.Slice(inOffs, lineSize).CopyTo(output.Slice(outOffs, lineSize));
 
                 inOffs += inStride;
+                outOffs += stride;
             }
 
             return output;
diff --git a/Ryujinx.Graphics.Texture/OffsetCalculator.cs b/Ryujinx.Graphics.Texture/OffsetCalculator.cs
index bb5d606ca4..1f5d9614a8 100644
--- a/Ryujinx.Graphics.Texture/OffsetCalculator.cs
+++ b/Ryujinx.Graphics.Texture/OffsetCalculator.cs
@@ -1,17 +1,22 @@
 using Ryujinx.Common;
-
+using System.Runtime.CompilerServices;
 using static Ryujinx.Graphics.Texture.BlockLinearConstants;
 
 namespace Ryujinx.Graphics.Texture
 {
     public class OffsetCalculator
     {
+        private int  _width;
+        private int  _height;
         private int  _stride;
         private bool _isLinear;
         private int  _bytesPerPixel;
 
         private BlockLinearLayout _layoutConverter;
 
+        // Variables for built-in iteration (row offset cached by SetY for linear layouts).
+        private int _yPart;
+
         public OffsetCalculator(
             int  width,
             int  height,
@@ -20,6 +25,8 @@ namespace Ryujinx.Graphics.Texture
             int  gobBlocksInY,
             int  bytesPerPixel)
         {
+            _width         = width;
+            _height        = height;
             _stride        = stride;
             _isLinear      = isLinear;
             _bytesPerPixel = bytesPerPixel;
@@ -40,6 +47,18 @@ namespace Ryujinx.Graphics.Texture
             }
         }
 
+        public void SetY(int y) // Selects the row used by subsequent GetOffset(int x) calls.
+        {
+            if (_isLinear)
+            {
+                _yPart = y * _stride; // Linear rows are _stride bytes apart.
+            }
+            else
+            {
+                _layoutConverter.SetY(y); // Block-linear: the layout converter caches its own row part.
+            }
+        }
+
         public int GetOffset(int x, int y)
         {
             if (_isLinear)
@@ -51,5 +70,48 @@ namespace Ryujinx.Graphics.Texture
                 return _layoutConverter.GetOffset(x, y, 0);
             }
         }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public int GetOffset(int x) // Offset of pixel column x on the row last selected with SetY.
+        {
+            if (_isLinear)
+            {
+                return x * _bytesPerPixel + _yPart;
+            }
+            else
+            {
+                return _layoutConverter.GetOffset(x);
+            }
+        }
+
+        public (int offset, int size) GetRectangleRange(int x, int y, int width, int height) // Returns the offset of the rectangle's first byte and the size of the byte range running from there to the end of its last pixel.
+        {
+            if (_isLinear)
+            {
+                int start = y * _stride + x * _bytesPerPixel;
+                int end = (y + height - 1) * _stride + (x + width) * _bytesPerPixel; // One past the last byte of the last pixel on the last row.
+                return (start, end - start);
+            }
+            else
+            {
+                return _layoutConverter.GetRectangleRange(x, y, width, height);
+            }
+        }
+
+        public bool LayoutMatches(OffsetCalculator other) // True when both calculators are the same kind (linear or block-linear) and the fields compared for that kind are equal.
+        {
+            if (_isLinear)
+            {
+                return other._isLinear &&
+                       _width == other._width &&
+                       _height == other._height &&
+                       _stride == other._stride &&
+                       _bytesPerPixel == other._bytesPerPixel;
+            }
+            else
+            {
+                return !other._isLinear && _layoutConverter.LayoutMatches(other._layoutConverter); // Block-linear comparison is delegated to BlockLinearLayout.
+            }
+        }
     }
 }
\ No newline at end of file