From 42c75dbb8f9472f434d0324a37a87e91ee7b50f3 Mon Sep 17 00:00:00 2001
From: gdkchan <gab.dark.100@gmail.com>
Date: Sat, 22 Jan 2022 15:23:00 -0300
Subject: [PATCH] Add support for BC1/2/3 decompression (for 3D textures)
 (#2987)

* Add support for BC1/2/3 decompression (for 3D textures)

* Optimize and clean up

* Unsafe not needed here

* Fix alpha value interpolation when a0 <= a1
---
 Ryujinx.Graphics.GAL/Capabilities.cs          |  37 +-
 Ryujinx.Graphics.GAL/Format.cs                |  22 -
 Ryujinx.Graphics.Gpu/GpuContext.cs            |  19 +-
 Ryujinx.Graphics.Gpu/Image/Texture.cs         |  30 +-
 .../Image/TextureCompatibility.cs             |  18 +-
 Ryujinx.Graphics.OpenGL/FormatTable.cs        |   2 -
 Ryujinx.Graphics.OpenGL/Renderer.cs           |   1 +
 Ryujinx.Graphics.Texture/BCnDecoder.cs        | 734 ++++++++++++++++--
 8 files changed, 720 insertions(+), 143 deletions(-)

diff --git a/Ryujinx.Graphics.GAL/Capabilities.cs b/Ryujinx.Graphics.GAL/Capabilities.cs
index c7cedb34b5..4e5dff596c 100644
--- a/Ryujinx.Graphics.GAL/Capabilities.cs
+++ b/Ryujinx.Graphics.GAL/Capabilities.cs
@@ -2,30 +2,32 @@ namespace Ryujinx.Graphics.GAL
 {
     public struct Capabilities
     {
-        public bool HasFrontFacingBug { get; }
-        public bool HasVectorIndexingBug { get; }
+        public readonly bool HasFrontFacingBug;
+        public readonly bool HasVectorIndexingBug;
 
-        public bool SupportsAstcCompression { get; }
-        public bool SupportsBgraFormat { get; }
-        public bool SupportsR4G4Format { get; }
-        public bool SupportsFragmentShaderInterlock { get; }
-        public bool SupportsFragmentShaderOrderingIntel { get; }
-        public bool SupportsImageLoadFormatted { get; }
-        public bool SupportsMismatchingViewFormat { get; }
-        public bool SupportsNonConstantTextureOffset { get; }
-        public bool SupportsShaderBallot { get; }
-        public bool SupportsTextureShadowLod { get; }
-        public bool SupportsViewportSwizzle { get; }
-        public bool SupportsIndirectParameters { get; }
+        public readonly bool SupportsAstcCompression;
+        public readonly bool Supports3DTextureCompression;
+        public readonly bool SupportsBgraFormat;
+        public readonly bool SupportsR4G4Format;
+        public readonly bool SupportsFragmentShaderInterlock;
+        public readonly bool SupportsFragmentShaderOrderingIntel;
+        public readonly bool SupportsImageLoadFormatted;
+        public readonly bool SupportsMismatchingViewFormat;
+        public readonly bool SupportsNonConstantTextureOffset;
+        public readonly bool SupportsShaderBallot;
+        public readonly bool SupportsTextureShadowLod;
+        public readonly bool SupportsViewportSwizzle;
+        public readonly bool SupportsIndirectParameters;
 
-        public int MaximumComputeSharedMemorySize { get; }
-        public float MaximumSupportedAnisotropy { get; }
-        public int StorageBufferOffsetAlignment { get; }
+        public readonly int MaximumComputeSharedMemorySize;
+        public readonly float MaximumSupportedAnisotropy;
+        public readonly int StorageBufferOffsetAlignment;
 
         public Capabilities(
             bool hasFrontFacingBug,
             bool hasVectorIndexingBug,
             bool supportsAstcCompression,
+            bool supports3DTextureCompression,
             bool supportsBgraFormat,
             bool supportsR4G4Format,
             bool supportsFragmentShaderInterlock,
@@ -44,6 +46,7 @@ namespace Ryujinx.Graphics.GAL
             HasFrontFacingBug = hasFrontFacingBug;
             HasVectorIndexingBug = hasVectorIndexingBug;
             SupportsAstcCompression = supportsAstcCompression;
+            Supports3DTextureCompression = supports3DTextureCompression;
             SupportsBgraFormat = supportsBgraFormat;
             SupportsR4G4Format = supportsR4G4Format;
             SupportsFragmentShaderInterlock = supportsFragmentShaderInterlock;
diff --git a/Ryujinx.Graphics.GAL/Format.cs b/Ryujinx.Graphics.GAL/Format.cs
index d5e183bafe..a454413bf7 100644
--- a/Ryujinx.Graphics.GAL/Format.cs
+++ b/Ryujinx.Graphics.GAL/Format.cs
@@ -67,11 +67,9 @@ namespace Ryujinx.Graphics.GAL
         R10G10B10A2Uint,
         R11G11B10Float,
         R9G9B9E5Float,
-        Bc1RgbUnorm,
         Bc1RgbaUnorm,
         Bc2Unorm,
         Bc3Unorm,
-        Bc1RgbSrgb,
         Bc1RgbaSrgb,
         Bc2Srgb,
         Bc3Srgb,
@@ -349,25 +347,5 @@ namespace Ryujinx.Graphics.GAL
         {
             return format.IsUint() || format.IsSint();
         }
-
-        /// <summary>
-        /// Checks if the texture format is a BC4 compressed format.
-        /// </summary>
-        /// <param name="format">Texture format</param>
-        /// <returns>True if the texture format is a BC4 compressed format, false otherwise</returns>
-        public static bool IsBc4(this Format format)
-        {
-            return format == Format.Bc4Unorm || format == Format.Bc4Snorm;
-        }
-
-        /// <summary>
-        /// Checks if the texture format is a BC5 compressed format.
-        /// </summary>
-        /// <param name="format">Texture format</param>
-        /// <returns>True if the texture format is a BC5 compressed format, false otherwise</returns>
-        public static bool IsBc5(this Format format)
-        {
-            return format == Format.Bc5Unorm || format == Format.Bc5Snorm;
-        }
     }
 }
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Gpu/GpuContext.cs b/Ryujinx.Graphics.Gpu/GpuContext.cs
index 5c9af3839e..ddc95b2c0e 100644
--- a/Ryujinx.Graphics.Gpu/GpuContext.cs
+++ b/Ryujinx.Graphics.Gpu/GpuContext.cs
@@ -78,14 +78,27 @@ namespace Ryujinx.Graphics.Gpu
         /// <summary>
         /// Host hardware capabilities.
         /// </summary>
-        internal Capabilities Capabilities => _caps.Value;
+        internal ref Capabilities Capabilities // Returned by reference, so callers read the cached struct in place.
+        {
+            get
+            {
+                if (!_capsLoaded) // First access only: the renderer is queried once and the result is cached.
+                {
+                    _caps = Renderer.GetCapabilities();
+                    _capsLoaded = true; // NOTE(review): no synchronization here, unlike the Lazy<Capabilities> this replaces - confirm access is single-threaded.
+                }
+
+                return ref _caps;
+            }
+        }
 
         /// <summary>
         /// Event for signalling shader cache loading progress.
         /// </summary>
         public event Action<ShaderCacheState, int, int> ShaderCacheStateChanged;
 
-        private readonly Lazy<Capabilities> _caps;
+        private bool _capsLoaded;
+        private Capabilities _caps;
         private Thread _gpuThread;
 
         /// <summary>
@@ -110,8 +123,6 @@ namespace Ryujinx.Graphics.Gpu
             DeferredActions = new Queue<Action>();
 
             PhysicalMemoryRegistry = new ConcurrentDictionary<long, PhysicalMemory>();
-
-            _caps = new Lazy<Capabilities>(Renderer.GetCapabilities);
         }
 
         /// <summary>
diff --git a/Ryujinx.Graphics.Gpu/Image/Texture.cs b/Ryujinx.Graphics.Gpu/Image/Texture.cs
index b2fa15a257..e1f00606fe 100644
--- a/Ryujinx.Graphics.Gpu/Image/Texture.cs
+++ b/Ryujinx.Graphics.Gpu/Image/Texture.cs
@@ -834,13 +834,31 @@ namespace Ryujinx.Graphics.Gpu.Image
             {
                 data = PixelConverter.ConvertR4G4ToR4G4B4A4(data);
             }
-            else if (Target == Target.Texture3D && Format.IsBc4())
+            else if (!_context.Capabilities.Supports3DTextureCompression && Target == Target.Texture3D)
             {
-                data = BCnDecoder.DecodeBC4(data, width, height, depth, levels, layers, Info.FormatInfo.Format == Format.Bc4Snorm);
-            }
-            else if (Target == Target.Texture3D && Format.IsBc5())
-            {
-                data = BCnDecoder.DecodeBC5(data, width, height, depth, levels, layers, Info.FormatInfo.Format == Format.Bc5Snorm);
+                switch (Format)
+                {
+                    case Format.Bc1RgbaSrgb:
+                    case Format.Bc1RgbaUnorm:
+                        data = BCnDecoder.DecodeBC1(data, width, height, depth, levels, layers);
+                        break;
+                    case Format.Bc2Srgb:
+                    case Format.Bc2Unorm:
+                        data = BCnDecoder.DecodeBC2(data, width, height, depth, levels, layers);
+                        break;
+                    case Format.Bc3Srgb:
+                    case Format.Bc3Unorm:
+                        data = BCnDecoder.DecodeBC3(data, width, height, depth, levels, layers);
+                        break;
+                    case Format.Bc4Snorm:
+                    case Format.Bc4Unorm:
+                        data = BCnDecoder.DecodeBC4(data, width, height, depth, levels, layers, Format == Format.Bc4Snorm);
+                        break;
+                    case Format.Bc5Snorm:
+                    case Format.Bc5Unorm:
+                        data = BCnDecoder.DecodeBC5(data, width, height, depth, levels, layers, Format == Format.Bc5Snorm);
+                        break;
+                }
             }
 
             return data;
diff --git a/Ryujinx.Graphics.Gpu/Image/TextureCompatibility.cs b/Ryujinx.Graphics.Gpu/Image/TextureCompatibility.cs
index 0461a81f78..188e1e090a 100644
--- a/Ryujinx.Graphics.Gpu/Image/TextureCompatibility.cs
+++ b/Ryujinx.Graphics.Gpu/Image/TextureCompatibility.cs
@@ -14,9 +14,6 @@ namespace Ryujinx.Graphics.Gpu.Image
         private enum FormatClass
         {
             Unclassified,
-            BCn64,
-            BCn128,
-            Bc1Rgb,
             Bc1Rgba,
             Bc2,
             Bc3,
@@ -88,13 +85,21 @@ namespace Ryujinx.Graphics.Gpu.Image
                 return new FormatInfo(Format.R4G4B4A4Unorm, 1, 1, 2, 4);
             }
 
-            if (info.Target == Target.Texture3D)
+            if (!caps.Supports3DTextureCompression && info.Target == Target.Texture3D)
             {
-                // The host API does not support 3D BC4/BC5 compressed formats.
+                // The host API does not support 3D compressed formats.
                 // We assume software decompression will be done for those textures,
                 // and so we adjust the format here to match the decompressor output.
                 switch (info.FormatInfo.Format)
                 {
+                    case Format.Bc1RgbaSrgb:
+                    case Format.Bc2Srgb:
+                    case Format.Bc3Srgb:
+                        return new FormatInfo(Format.R8G8B8A8Srgb, 1, 1, 4, 4);
+                    case Format.Bc1RgbaUnorm:
+                    case Format.Bc2Unorm:
+                    case Format.Bc3Unorm:
+                        return new FormatInfo(Format.R8G8B8A8Unorm, 1, 1, 4, 4);
                     case Format.Bc4Unorm:
                         return new FormatInfo(Format.R8Unorm, 1, 1, 1, 1);
                     case Format.Bc4Snorm:
@@ -749,9 +754,6 @@ namespace Ryujinx.Graphics.Gpu.Image
         {
             switch (format)
             {
-                case Format.Bc1RgbSrgb:
-                case Format.Bc1RgbUnorm:
-                    return FormatClass.Bc1Rgb;
                 case Format.Bc1RgbaSrgb:
                 case Format.Bc1RgbaUnorm:
                     return FormatClass.Bc1Rgba;
diff --git a/Ryujinx.Graphics.OpenGL/FormatTable.cs b/Ryujinx.Graphics.OpenGL/FormatTable.cs
index e3249cd6f6..41fd9f3701 100644
--- a/Ryujinx.Graphics.OpenGL/FormatTable.cs
+++ b/Ryujinx.Graphics.OpenGL/FormatTable.cs
@@ -80,11 +80,9 @@ namespace Ryujinx.Graphics.OpenGL
             Add(Format.R10G10B10A2Uint,     new FormatInfo(4, false, false, All.Rgb10A2ui,         PixelFormat.RgbaInteger,    PixelType.UnsignedInt2101010Reversed));
             Add(Format.R11G11B10Float,      new FormatInfo(3, false, false, All.R11fG11fB10f,      PixelFormat.Rgb,            PixelType.UnsignedInt10F11F11FRev));
             Add(Format.R9G9B9E5Float,       new FormatInfo(3, false, false, All.Rgb9E5,            PixelFormat.Rgb,            PixelType.UnsignedInt5999Rev));
-            Add(Format.Bc1RgbUnorm,         new FormatInfo(3, true,  false, All.CompressedRgbS3tcDxt1Ext));
             Add(Format.Bc1RgbaUnorm,        new FormatInfo(4, true,  false, All.CompressedRgbaS3tcDxt1Ext));
             Add(Format.Bc2Unorm,            new FormatInfo(4, true,  false, All.CompressedRgbaS3tcDxt3Ext));
             Add(Format.Bc3Unorm,            new FormatInfo(4, true,  false, All.CompressedRgbaS3tcDxt5Ext));
-            Add(Format.Bc1RgbSrgb,          new FormatInfo(3, false, false, All.CompressedSrgbS3tcDxt1Ext));
             Add(Format.Bc1RgbaSrgb,         new FormatInfo(4, true,  false, All.CompressedSrgbAlphaS3tcDxt1Ext));
             Add(Format.Bc2Srgb,             new FormatInfo(4, false, false, All.CompressedSrgbAlphaS3tcDxt3Ext));
             Add(Format.Bc3Srgb,             new FormatInfo(4, false, false, All.CompressedSrgbAlphaS3tcDxt5Ext));
diff --git a/Ryujinx.Graphics.OpenGL/Renderer.cs b/Ryujinx.Graphics.OpenGL/Renderer.cs
index ceacbf2944..8d44f2e447 100644
--- a/Ryujinx.Graphics.OpenGL/Renderer.cs
+++ b/Ryujinx.Graphics.OpenGL/Renderer.cs
@@ -104,6 +104,7 @@ namespace Ryujinx.Graphics.OpenGL
                 hasFrontFacingBug: HwCapabilities.Vendor == HwCapabilities.GpuVendor.IntelWindows,
                 hasVectorIndexingBug: HwCapabilities.Vendor == HwCapabilities.GpuVendor.AmdWindows,
                 supportsAstcCompression: HwCapabilities.SupportsAstcCompression,
+                supports3DTextureCompression: false,
                 supportsBgraFormat: false,
                 supportsR4G4Format: false,
                 supportsFragmentShaderInterlock: HwCapabilities.SupportsFragmentShaderInterlock,
diff --git a/Ryujinx.Graphics.Texture/BCnDecoder.cs b/Ryujinx.Graphics.Texture/BCnDecoder.cs
index b8b04bac2e..b840cac891 100644
--- a/Ryujinx.Graphics.Texture/BCnDecoder.cs
+++ b/Ryujinx.Graphics.Texture/BCnDecoder.cs
@@ -1,7 +1,9 @@
 using Ryujinx.Common;
 using System;
-using System.Runtime.CompilerServices;
+using System.Buffers.Binary;
 using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
 
 namespace Ryujinx.Graphics.Texture
 {
@@ -10,22 +12,30 @@ namespace Ryujinx.Graphics.Texture
         private const int BlockWidth = 4;
         private const int BlockHeight = 4;
 
-        public static byte[] DecodeBC4(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers, bool signed)
+        public static byte[] DecodeBC1(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
         {
             int size = 0;
 
             for (int l = 0; l < levels; l++)
             {
-                size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers;
+                size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 4;
             }
 
             byte[] output = new byte[size];
 
-            ReadOnlySpan<ulong> data64 = MemoryMarshal.Cast<byte, ulong>(data);
+            Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight * 4];
 
-            Span<byte> rPal = stackalloc byte[8];
+            Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile);
+            Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output);
 
-            int baseOOffs = 0;
+            Span<Vector128<byte>> tileAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(tile);
+
+            Span<Vector128<byte>> outputLine0 = default;
+            Span<Vector128<byte>> outputLine1 = default;
+            Span<Vector128<byte>> outputLine2 = default;
+            Span<Vector128<byte>> outputLine3 = default;
+
+            int imageBaseOOffs = 0;
 
             for (int l = 0; l < levels; l++)
             {
@@ -39,11 +49,302 @@ namespace Ryujinx.Graphics.Texture
                         for (int y = 0; y < h; y++)
                         {
                             int baseY = y * BlockHeight;
+                            int copyHeight = Math.Min(BlockHeight, height - baseY);
+                            int lineBaseOOffs = imageBaseOOffs + baseY * width;
+
+                            if (copyHeight == 4)
+                            {
+                                outputLine0 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs));
+                                outputLine1 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width));
+                                outputLine2 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 2));
+                                outputLine3 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 3));
+                            }
 
                             for (int x = 0; x < w; x++)
                             {
                                 int baseX = x * BlockWidth;
-                                int lineBaseOOffs = baseOOffs + baseX;
+                                int copyWidth = Math.Min(BlockWidth, width - baseX);
+
+                                BC1DecodeTileRgb(tile, data);
+
+                                if ((copyWidth | copyHeight) == 4)
+                                {
+                                    outputLine0[x] = tileAsVector128[0];
+                                    outputLine1[x] = tileAsVector128[1];
+                                    outputLine2[x] = tileAsVector128[2];
+                                    outputLine3[x] = tileAsVector128[3];
+                                }
+                                else
+                                {
+                                    int pixelBaseOOffs = lineBaseOOffs + baseX;
+
+                                    for (int tY = 0; tY < copyHeight; tY++)
+                                    {
+                                        tileAsUint.Slice(tY * 4, copyWidth).CopyTo(outputAsUint.Slice(pixelBaseOOffs + width * tY, copyWidth));
+                                    }
+                                }
+
+                                data = data.Slice(8);
+                            }
+                        }
+
+                        imageBaseOOffs += width * height;
+                    }
+                }
+
+                width = Math.Max(1, width >> 1);
+                height = Math.Max(1, height >> 1);
+                depth = Math.Max(1, depth >> 1);
+            }
+
+            return output;
+        }
+
+        public static byte[] DecodeBC2(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers) // Decodes BC2 data (16 bytes per 4x4 block) into a new buffer with 4 bytes per texel.
+        {
+            int size = 0;
+
+            for (int l = 0; l < levels; l++)
+            {
+                size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 4; // 4 output bytes per texel, each dimension halved per level (minimum 1).
+            }
+
+            byte[] output = new byte[size];
+
+            Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight * 4]; // Scratch buffer holding one decoded 4x4 block.
+
+            Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile); // One uint per texel; all output offsets below are in texel (uint) units.
+            Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output);
+
+            Span<Vector128<byte>> tileAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(tile); // Four 16-byte vectors, one per tile row.
+
+            Span<Vector128<byte>> outputLine0 = default; // Views of the four destination rows for the current block row.
+            Span<Vector128<byte>> outputLine1 = default;
+            Span<Vector128<byte>> outputLine2 = default;
+            Span<Vector128<byte>> outputLine3 = default;
+
+            int imageBaseOOffs = 0; // Texel offset of the current 2D slice inside the output.
+
+            for (int l = 0; l < levels; l++)
+            {
+                int w = BitUtils.DivRoundUp(width, BlockWidth); // Block counts for this level.
+                int h = BitUtils.DivRoundUp(height, BlockHeight);
+
+                for (int l2 = 0; l2 < layers; l2++)
+                {
+                    for (int z = 0; z < depth; z++)
+                    {
+                        for (int y = 0; y < h; y++)
+                        {
+                            int baseY = y * BlockHeight;
+                            int copyHeight = Math.Min(BlockHeight, height - baseY); // Rows of this block row that lie inside the image.
+                            int lineBaseOOffs = imageBaseOOffs + baseY * width;
+
+                            if (copyHeight == 4) // The row views are only refreshed, and only used, for full-height block rows.
+                            {
+                                outputLine0 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs));
+                                outputLine1 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width));
+                                outputLine2 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 2));
+                                outputLine3 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 3));
+                            }
+
+                            for (int x = 0; x < w; x++)
+                            {
+                                int baseX = x * BlockWidth;
+                                int copyWidth = Math.Min(BlockWidth, width - baseX); // Columns of this block that lie inside the image.
+
+                                BC23DecodeTileRgb(tile, data.Slice(8)); // Colour data is taken from byte 8 of the block onwards (helper defined outside this view).
+
+                                ulong block = BinaryPrimitives.ReadUInt64LittleEndian(data); // First 8 bytes: sixteen 4-bit alpha values.
+
+                                for (int i = 3; i < BlockWidth * BlockHeight * 4; i += 4, block >>= 4) // Writes byte 3 of every texel.
+                                {
+                                    tile[i] = (byte)((block & 0xf) | (block << 4)); // Replicates the nibble into both halves of the byte (0xN -> 0xNN).
+                                }
+
+                                if ((copyWidth | copyHeight) == 4) // With both values in 1..4, the OR equals 4 only when both are 4 (full block).
+                                {
+                                    outputLine0[x] = tileAsVector128[0]; // Fast path: store each tile row as one 16-byte vector.
+                                    outputLine1[x] = tileAsVector128[1];
+                                    outputLine2[x] = tileAsVector128[2];
+                                    outputLine3[x] = tileAsVector128[3];
+                                }
+                                else
+                                {
+                                    int pixelBaseOOffs = lineBaseOOffs + baseX;
+
+                                    for (int tY = 0; tY < copyHeight; tY++)
+                                    {
+                                        tileAsUint.Slice(tY * 4, copyWidth).CopyTo(outputAsUint.Slice(pixelBaseOOffs + width * tY, copyWidth)); // Edge block: copy only the in-bounds texels of each row.
+                                    }
+                                }
+
+                                data = data.Slice(16); // Advance to the next 16-byte block.
+                            }
+                        }
+
+                        imageBaseOOffs += width * height; // Move to the next 2D slice.
+                    }
+                }
+
+                width = Math.Max(1, width >> 1); // Dimensions of the next level.
+                height = Math.Max(1, height >> 1);
+                depth = Math.Max(1, depth >> 1);
+            }
+
+            return output;
+        }
+
+        public static byte[] DecodeBC3(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers) // Decodes BC3 data (16 bytes per 4x4 block) into a new buffer with 4 bytes per texel.
+        {
+            int size = 0;
+
+            for (int l = 0; l < levels; l++)
+            {
+                size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 4; // 4 output bytes per texel, each dimension halved per level (minimum 1).
+            }
+
+            byte[] output = new byte[size];
+
+            Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight * 4]; // Scratch buffer holding one decoded 4x4 block.
+            Span<byte> rPal = stackalloc byte[8]; // 8-entry alpha palette; entries 0 and 1 come straight from the block.
+
+            Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile); // One uint per texel; all output offsets below are in texel (uint) units.
+            Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output);
+
+            Span<Vector128<byte>> tileAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(tile); // Four 16-byte vectors, one per tile row.
+
+            Span<Vector128<byte>> outputLine0 = default; // Views of the four destination rows for the current block row.
+            Span<Vector128<byte>> outputLine1 = default;
+            Span<Vector128<byte>> outputLine2 = default;
+            Span<Vector128<byte>> outputLine3 = default;
+
+            int imageBaseOOffs = 0; // Texel offset of the current 2D slice inside the output.
+
+            for (int l = 0; l < levels; l++)
+            {
+                int w = BitUtils.DivRoundUp(width, BlockWidth); // Block counts for this level.
+                int h = BitUtils.DivRoundUp(height, BlockHeight);
+
+                for (int l2 = 0; l2 < layers; l2++)
+                {
+                    for (int z = 0; z < depth; z++)
+                    {
+                        for (int y = 0; y < h; y++)
+                        {
+                            int baseY = y * BlockHeight;
+                            int copyHeight = Math.Min(BlockHeight, height - baseY); // Rows of this block row that lie inside the image.
+                            int lineBaseOOffs = imageBaseOOffs + baseY * width;
+
+                            if (copyHeight == 4) // The row views are only refreshed, and only used, for full-height block rows.
+                            {
+                                outputLine0 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs));
+                                outputLine1 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width));
+                                outputLine2 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 2));
+                                outputLine3 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 3));
+                            }
+
+                            for (int x = 0; x < w; x++)
+                            {
+                                int baseX = x * BlockWidth;
+                                int copyWidth = Math.Min(BlockWidth, width - baseX); // Columns of this block that lie inside the image.
+
+                                BC23DecodeTileRgb(tile, data.Slice(8)); // Colour data is taken from byte 8 of the block onwards (helper defined outside this view).
+
+                                ulong block = BinaryPrimitives.ReadUInt64LittleEndian(data); // First 8 bytes: the alpha part of the block.
+
+                                rPal[0] = (byte)block; // Byte 0 of the block.
+                                rPal[1] = (byte)(block >> 8); // Byte 1 of the block.
+
+                                BCnLerpAlphaUnorm(rPal); // Presumably fills the remaining palette entries from the two endpoints - helper not visible here.
+                                BCnDecodeTileAlphaRgba(tile, rPal, block >> 16); // The remaining 48 bits are handed over with the palette; presumably per-texel palette indices - confirm in helper.
+
+                                if ((copyWidth | copyHeight) == 4) // With both values in 1..4, the OR equals 4 only when both are 4 (full block).
+                                {
+                                    outputLine0[x] = tileAsVector128[0]; // Fast path: store each tile row as one 16-byte vector.
+                                    outputLine1[x] = tileAsVector128[1];
+                                    outputLine2[x] = tileAsVector128[2];
+                                    outputLine3[x] = tileAsVector128[3];
+                                }
+                                else
+                                {
+                                    int pixelBaseOOffs = lineBaseOOffs + baseX;
+
+                                    for (int tY = 0; tY < copyHeight; tY++)
+                                    {
+                                        tileAsUint.Slice(tY * 4, copyWidth).CopyTo(outputAsUint.Slice(pixelBaseOOffs + width * tY, copyWidth)); // Edge block: copy only the in-bounds texels of each row.
+                                    }
+                                }
+
+                                data = data.Slice(16); // Advance to the next 16-byte block.
+                            }
+                        }
+
+                        imageBaseOOffs += width * height; // Move to the next 2D slice.
+                    }
+                }
+
+                width = Math.Max(1, width >> 1); // Dimensions of the next level.
+                height = Math.Max(1, height >> 1);
+                depth = Math.Max(1, depth >> 1);
+            }
+
+            return output;
+        }
+
+        public static byte[] DecodeBC4(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers, bool signed)
+        {
+            int size = 0;
+
+            for (int l = 0; l < levels; l++)
+            {
+                size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers;
+            }
+
+            byte[] output = new byte[size];
+            Span<byte> outputSpan = new Span<byte>(output);
+
+            ReadOnlySpan<ulong> data64 = MemoryMarshal.Cast<byte, ulong>(data);
+
+            Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight];
+            Span<byte> rPal = stackalloc byte[8];
+
+            Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile);
+
+            Span<uint> outputLine0 = default;
+            Span<uint> outputLine1 = default;
+            Span<uint> outputLine2 = default;
+            Span<uint> outputLine3 = default;
+
+            int imageBaseOOffs = 0;
+
+            for (int l = 0; l < levels; l++)
+            {
+                int w = BitUtils.DivRoundUp(width, BlockWidth);
+                int h = BitUtils.DivRoundUp(height, BlockHeight);
+
+                for (int l2 = 0; l2 < layers; l2++)
+                {
+                    for (int z = 0; z < depth; z++)
+                    {
+                        for (int y = 0; y < h; y++)
+                        {
+                            int baseY = y * BlockHeight;
+                            int copyHeight = Math.Min(BlockHeight, height - baseY);
+                            int lineBaseOOffs = imageBaseOOffs + baseY * width;
+
+                            if (copyHeight == 4)
+                            {
+                                outputLine0 = MemoryMarshal.Cast<byte, uint>(outputSpan.Slice(lineBaseOOffs));
+                                outputLine1 = MemoryMarshal.Cast<byte, uint>(outputSpan.Slice(lineBaseOOffs + width));
+                                outputLine2 = MemoryMarshal.Cast<byte, uint>(outputSpan.Slice(lineBaseOOffs + width * 2));
+                                outputLine3 = MemoryMarshal.Cast<byte, uint>(outputSpan.Slice(lineBaseOOffs + width * 3));
+                            }
+
+                            for (int x = 0; x < w; x++)
+                            {
+                                int baseX = x * BlockWidth;
+                                int copyWidth = Math.Min(BlockWidth, width - baseX);
 
                                 ulong block = data64[0];
 
@@ -52,45 +353,43 @@ namespace Ryujinx.Graphics.Texture
 
                                 if (signed)
                                 {
-                                    CalculateBC3AlphaS(rPal);
+                                    BCnLerpAlphaSnorm(rPal);
                                 }
                                 else
                                 {
-                                    CalculateBC3Alpha(rPal);
+                                    BCnLerpAlphaUnorm(rPal);
                                 }
 
-                                ulong rI = block >> 16;
+                                BCnDecodeTileAlpha(tile, rPal, block >> 16);
 
-                                for (int texel = 0; texel < BlockWidth * BlockHeight; texel++)
+                                if ((copyWidth | copyHeight) == 4)
                                 {
-                                    int tX = texel & 3;
-                                    int tY = texel >> 2;
+                                    outputLine0[x] = tileAsUint[0];
+                                    outputLine1[x] = tileAsUint[1];
+                                    outputLine2[x] = tileAsUint[2];
+                                    outputLine3[x] = tileAsUint[3];
+                                }
+                                else
+                                {
+                                    int pixelBaseOOffs = lineBaseOOffs + baseX;
 
-                                    if (baseX + tX >= width || baseY + tY >= height)
+                                    for (int tY = 0; tY < copyHeight; tY++)
                                     {
-                                        continue;
+                                        tile.Slice(tY * 4, copyWidth).CopyTo(outputSpan.Slice(pixelBaseOOffs + width * tY, copyWidth));
                                     }
-
-                                    int shift = texel * 3;
-
-                                    byte r = rPal[(int)((rI >> shift) & 7)];
-
-                                    int oOffs = lineBaseOOffs + tY * width + tX;
-
-                                    output[oOffs] = r;
                                 }
 
                                 data64 = data64.Slice(1);
                             }
-
-                            baseOOffs += width * (baseY + BlockHeight > height ? (height & (BlockHeight - 1)) : BlockHeight);
                         }
+
+                        imageBaseOOffs += width * height;
                     }
                 }
 
-                width  = Math.Max(1, width  >> 1);
+                width = Math.Max(1, width >> 1);
                 height = Math.Max(1, height >> 1);
-                depth  = Math.Max(1, depth  >> 1);
+                depth = Math.Max(1, depth >> 1);
             }
 
             return output;
@@ -109,10 +408,22 @@ namespace Ryujinx.Graphics.Texture
 
             ReadOnlySpan<ulong> data64 = MemoryMarshal.Cast<byte, ulong>(data);
 
+            Span<byte> rTile = stackalloc byte[BlockWidth * BlockHeight * 2];
+            Span<byte> gTile = stackalloc byte[BlockWidth * BlockHeight * 2];
             Span<byte> rPal = stackalloc byte[8];
             Span<byte> gPal = stackalloc byte[8];
 
-            int baseOOffs = 0;
+            Span<ushort> outputAsUshort = MemoryMarshal.Cast<byte, ushort>(output);
+
+            Span<uint> rTileAsUint = MemoryMarshal.Cast<byte, uint>(rTile);
+            Span<uint> gTileAsUint = MemoryMarshal.Cast<byte, uint>(gTile);
+
+            Span<ulong> outputLine0 = default;
+            Span<ulong> outputLine1 = default;
+            Span<ulong> outputLine2 = default;
+            Span<ulong> outputLine3 = default;
+
+            int imageBaseOOffs = 0;
 
             for (int l = 0; l < levels; l++)
             {
@@ -126,11 +437,21 @@ namespace Ryujinx.Graphics.Texture
                         for (int y = 0; y < h; y++)
                         {
                             int baseY = y * BlockHeight;
+                            int copyHeight = Math.Min(BlockHeight, height - baseY);
+                            int lineBaseOOffs = imageBaseOOffs + baseY * width;
+
+                            if (copyHeight == 4)
+                            {
+                                outputLine0 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort.Slice(lineBaseOOffs));
+                                outputLine1 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort.Slice(lineBaseOOffs + width));
+                                outputLine2 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort.Slice(lineBaseOOffs + width * 2));
+                                outputLine3 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort.Slice(lineBaseOOffs + width * 3));
+                            }
 
                             for (int x = 0; x < w; x++)
                             {
                                 int baseX = x * BlockWidth;
-                                int lineBaseOOffs = baseOOffs + baseX;
+                                int copyWidth = Math.Min(BlockWidth, width - baseX);
 
                                 ulong blockL = data64[0];
                                 ulong blockH = data64[1];
@@ -142,101 +463,346 @@ namespace Ryujinx.Graphics.Texture
 
                                 if (signed)
                                 {
-                                    CalculateBC3AlphaS(rPal);
-                                    CalculateBC3AlphaS(gPal);
+                                    BCnLerpAlphaSnorm(rPal);
+                                    BCnLerpAlphaSnorm(gPal);
                                 }
                                 else
                                 {
-                                    CalculateBC3Alpha(rPal);
-                                    CalculateBC3Alpha(gPal);
+                                    BCnLerpAlphaUnorm(rPal);
+                                    BCnLerpAlphaUnorm(gPal);
                                 }
 
-                                ulong rI = blockL >> 16;
-                                ulong gI = blockH >> 16;
+                                BCnDecodeTileAlpha(rTile, rPal, blockL >> 16);
+                                BCnDecodeTileAlpha(gTile, gPal, blockH >> 16);
 
-                                for (int texel = 0; texel < BlockWidth * BlockHeight; texel++)
+                                if ((copyWidth | copyHeight) == 4)
                                 {
-                                    int tX = texel & 3;
-                                    int tY = texel >> 2;
+                                    outputLine0[x] = InterleaveBytes(rTileAsUint[0], gTileAsUint[0]);
+                                    outputLine1[x] = InterleaveBytes(rTileAsUint[1], gTileAsUint[1]);
+                                    outputLine2[x] = InterleaveBytes(rTileAsUint[2], gTileAsUint[2]);
+                                    outputLine3[x] = InterleaveBytes(rTileAsUint[3], gTileAsUint[3]);
+                                }
+                                else
+                                {
+                                    int pixelBaseOOffs = lineBaseOOffs + baseX;
 
-                                    if (baseX + tX >= width || baseY + tY >= height)
+                                    for (int tY = 0; tY < copyHeight; tY++)
                                     {
-                                        continue;
+                                        int line = pixelBaseOOffs + width * tY;
+
+                                        for (int tX = 0; tX < copyWidth; tX++)
+                                        {
+                                            int texel = tY * BlockWidth + tX;
+
+                                            outputAsUshort[line + tX] = (ushort)(rTile[texel] | (gTile[texel] << 8));
+                                        }
                                     }
-
-                                    int shift = texel * 3;
-
-                                    byte r = rPal[(int)((rI >> shift) & 7)];
-                                    byte g = gPal[(int)((gI >> shift) & 7)];
-
-                                    int oOffs = (lineBaseOOffs + tY * width + tX) * 2;
-
-                                    output[oOffs + 0] = r;
-                                    output[oOffs + 1] = g;
                                 }
 
                                 data64 = data64.Slice(2);
                             }
-
-                            baseOOffs += width * (baseY + BlockHeight > height ? (height & (BlockHeight - 1)) : BlockHeight);
                         }
+
+                        imageBaseOOffs += width * height;
                     }
                 }
 
-                width  = Math.Max(1, width  >> 1);
+                width = Math.Max(1, width >> 1);
                 height = Math.Max(1, height >> 1);
-                depth  = Math.Max(1, depth  >> 1);
+                depth = Math.Max(1, depth >> 1);
             }
 
             return output;
         }
 
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static void CalculateBC3Alpha(Span<byte> alpha)
+        private static ulong InterleaveBytes(uint left, uint right) // packs the 4 + 4 input bytes into one ulong, alternating left/right
         {
-            for (int i = 2; i < 8; i++)
+            return InterleaveBytesWithZeros(left) | (InterleaveBytesWithZeros(right) << 8); // left bytes occupy the even byte positions, right bytes are shifted into the odd ones
+        }
+
+        private static ulong InterleaveBytesWithZeros(uint value)
+        {
+            // Spread the four input bytes apart so each lands in the low byte of its own 16-bit lane.
+            ulong halves = (value ^ ((ulong)value << 16)) & 0xffff0000ffffUL;
+
+            return (halves ^ (halves << 8)) & 0xff00ff00ff00ffUL;
+        }
+
+        private static void BCnLerpAlphaUnorm(Span<byte> alpha) // fills entries 2..7 of the 8-entry table from the endpoints in entries 0 and 1
+        {
+            byte a0 = alpha[0];
+            byte a1 = alpha[1];
+
+            if (a0 > a1) // all six remaining entries are interpolated, in steps of one seventh
             {
-                if (alpha[0] > alpha[1])
+                alpha[2] = (byte)((6 * a0 + 1 * a1) / 7);
+                alpha[3] = (byte)((5 * a0 + 2 * a1) / 7);
+                alpha[4] = (byte)((4 * a0 + 3 * a1) / 7);
+                alpha[5] = (byte)((3 * a0 + 4 * a1) / 7);
+                alpha[6] = (byte)((2 * a0 + 5 * a1) / 7);
+                alpha[7] = (byte)((1 * a0 + 6 * a1) / 7);
+            }
+            else // a0 <= a1: only four entries are interpolated (steps of one fifth); the last two are constants
+            {
+                alpha[2] = (byte)((4 * a0 + 1 * a1) / 5);
+                alpha[3] = (byte)((3 * a0 + 2 * a1) / 5);
+                alpha[4] = (byte)((2 * a0 + 3 * a1) / 5);
+                alpha[5] = (byte)((1 * a0 + 4 * a1) / 5);
+                alpha[6] = 0; // fixed minimum
+                alpha[7] = 0xff; // fixed maximum
+            }
+        }
+
+        private static void BCnLerpAlphaSnorm(Span<byte> alpha)
+        {
+            // The two endpoints are stored as bytes but interpolated as signed values.
+            int first = (sbyte)alpha[0];
+            int second = (sbyte)alpha[1];
+
+            if (first > second)
+            {
+                for (int i = 2; i < 8; i++) // entries 2..7: weights 6:1 down to 1:6, in sevenths
+                {
+                    alpha[i] = (byte)(((8 - i) * first + (i - 1) * second) / 7);
+                }
+            }
+            else
+            {
+                for (int i = 2; i < 6; i++) // entries 2..5: weights 4:1 down to 1:4, in fifths
+                {
+                    alpha[i] = (byte)(((6 - i) * first + (i - 1) * second) / 5);
+                }
+
+                alpha[6] = 0x80;
+                alpha[7] = 0x7f;
+            }
+        }
+
+        private unsafe static void BCnDecodeTileAlpha(Span<byte> output, Span<byte> rPal, ulong rI) // writes one rPal entry per output byte, selected by consecutive 3-bit fields of rI
+        {
+            if (Avx2.IsSupported) // vector path: all 16 lookups are done with a single byte shuffle
+            {
+                Span<Vector128<byte>> outputAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(output);
+
+                Vector128<uint> shifts = Vector128.Create(0u, 3u, 6u, 9u); // bit offsets of four consecutive 3-bit indices
+                Vector128<uint> masks = Vector128.Create(7u); // keeps the low 3 bits of each lane
+
+                Vector128<byte> vClut;
+
+                fixed (byte* pRPal = rPal)
                 {
-                    alpha[i] = (byte)(((8 - i) * alpha[0] + (i - 1) * alpha[1]) / 7);
+                    vClut = Sse2.LoadScalarVector128((ulong*)pRPal).AsByte(); // reads 8 bytes of rPal into the low half of the vector
                 }
-                else if (i < 6)
+
+                Vector128<uint> indices0 = Vector128.Create((uint)rI); // index bits 0..31, used for outputs 0..7
+                Vector128<uint> indices1 = Vector128.Create((uint)(rI >> 24)); // index bits from 24 upwards, used for outputs 8..15
+                Vector128<uint> indices00 = Avx2.ShiftRightLogicalVariable(indices0, shifts); // outputs 0..3
+                Vector128<uint> indices10 = Avx2.ShiftRightLogicalVariable(indices1, shifts); // outputs 8..11
+                Vector128<uint> indices01 = Sse2.ShiftRightLogical(indices00, 12); // 12 more bits: outputs 4..7
+                Vector128<uint> indices11 = Sse2.ShiftRightLogical(indices10, 12); // 12 more bits: outputs 12..15
+                indices00 = Sse2.And(indices00, masks);
+                indices10 = Sse2.And(indices10, masks);
+                indices01 = Sse2.And(indices01, masks);
+                indices11 = Sse2.And(indices11, masks);
+
+                Vector128<ushort> indicesW0 = Sse41.PackUnsignedSaturate(indices00.AsInt32(), indices01.AsInt32()); // narrow to 16-bit lanes: outputs 0..7
+                Vector128<ushort> indicesW1 = Sse41.PackUnsignedSaturate(indices10.AsInt32(), indices11.AsInt32()); // narrow to 16-bit lanes: outputs 8..15
+
+                Vector128<byte> indices = Sse2.PackUnsignedSaturate(indicesW0.AsInt16(), indicesW1.AsInt16()); // one index byte per output, in order
+
+                outputAsVector128[0] = Ssse3.Shuffle(vClut, indices); // each index byte selects the matching palette byte; stores 16 bytes
+            }
+            else
+            {
+                for (int i = 0; i < BlockWidth * BlockHeight; i++, rI >>= 3) // scalar path: consume one 3-bit index per output byte
                 {
-                    alpha[i] = (byte)(((6 - i) * alpha[0] + (i - 1) * alpha[1]) / 7);
-                }
-                else if (i == 6)
-                {
-                    alpha[i] = 0;
-                }
-                else /* i == 7 */
-                {
-                    alpha[i] = 0xff;
+                    output[i] = rPal[(int)(rI & 7)];
                 }
             }
         }
 
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static void CalculateBC3AlphaS(Span<byte> alpha)
+        private unsafe static void BCnDecodeTileAlphaRgba(Span<byte> output, Span<byte> rPal, ulong rI) // puts an rPal entry, selected by 3-bit fields of rI, into byte 3 of every 4-byte group of output
         {
-            for (int i = 2; i < 8; i++)
+            if (Avx2.IsSupported) // vector path: eight 32-bit groups per 256-bit vector
             {
-                if ((sbyte)alpha[0] > (sbyte)alpha[1])
+                Span<Vector256<uint>> outputAsVector256 = MemoryMarshal.Cast<byte, Vector256<uint>>(output);
+
+                Vector256<uint> shifts = Vector256.Create(0u, 3u, 6u, 9u, 12u, 15u, 18u, 21u); // bit offsets of eight consecutive 3-bit indices
+
+                Vector128<uint> vClut128;
+
+                fixed (byte* pRPal = rPal)
                 {
-                    alpha[i] = (byte)(((8 - i) * (sbyte)alpha[0] + (i - 1) * (sbyte)alpha[1]) / 7);
+                    vClut128 = Sse2.LoadScalarVector128((ulong*)pRPal).AsUInt32(); // reads 8 bytes of rPal
                 }
-                else if (i < 6)
+
+                Vector256<uint> vClut = Avx2.ConvertToVector256Int32(vClut128.AsByte()).AsUInt32(); // widen each of the 8 palette bytes to its own 32-bit lane
+                vClut = Avx2.ShiftLeftLogical(vClut, 24); // move each palette value into the top byte of its lane
+
+                Vector256<uint> indices0 = Vector256.Create((uint)rI); // index bits for groups 0..7
+                Vector256<uint> indices1 = Vector256.Create((uint)(rI >> 24)); // index bits for groups 8..15
+
+                indices0 = Avx2.ShiftRightLogicalVariable(indices0, shifts); // not masked: PermuteVar8x32 reads only the low 3 bits of each index lane
+                indices1 = Avx2.ShiftRightLogicalVariable(indices1, shifts);
+
+                outputAsVector256[0] = Avx2.Or(outputAsVector256[0], Avx2.PermuteVar8x32(vClut, indices0)); // OR leaves the low 24 bits of each group unchanged
+                outputAsVector256[1] = Avx2.Or(outputAsVector256[1], Avx2.PermuteVar8x32(vClut, indices1));
+            }
+            else // NOTE(review): this path overwrites byte 3 while the vector path ORs into it; they match only if byte 3 is zero on entry - confirm callers guarantee that
+            {
+                for (int i = 3; i < BlockWidth * BlockHeight * 4; i += 4, rI >>= 3) // every fourth byte starting at offset 3, one 3-bit index each
                 {
-                    alpha[i] = (byte)(((6 - i) * (sbyte)alpha[0] + (i - 1) * (sbyte)alpha[1]) / 7);
-                }
-                else if (i == 6)
-                {
-                    alpha[i] = 0x80;
-                }
-                else /* i == 7 */
-                {
-                    alpha[i] = 0x7f;
+                    output[i] = rPal[(int)(rI & 7)];
                 }
             }
         }
+
+        private static void BC1DecodeTileRgb(Span<byte> output, ReadOnlySpan<byte> input)
+        {
+            uint endpoints = BinaryPrimitives.ReadUInt32LittleEndian(input); // two packed 16-bit colors
+            uint c0 = (ushort)endpoints;
+            uint c1 = (ushort)(endpoints >> 16);
+
+            // Four-entry color table: the two endpoints (alpha forced to 0xff) and two derived entries.
+            Span<uint> clut = stackalloc uint[4];
+            clut[0] = ConvertRgb565ToRgb888(c0) | 0xff000000;
+            clut[1] = ConvertRgb565ToRgb888(c1) | 0xff000000;
+            clut[2] = BC1LerpRgb2(clut[0], clut[1], c0, c1);
+            clut[3] = BC1LerpRgb3(clut[0], clut[1], c0, c1);
+
+            BCnDecodeTileRgb(clut, output, input);
+        }
+
+        private static void BC23DecodeTileRgb(Span<byte> output, ReadOnlySpan<byte> input)
+        {
+            uint endpoints = BinaryPrimitives.ReadUInt32LittleEndian(input); // two packed 16-bit colors
+            uint c0 = (ushort)endpoints;
+            uint c1 = (ushort)(endpoints >> 16);
+
+            // Four-entry color table; the top (alpha) byte of every entry is left at zero here.
+            Span<uint> clut = stackalloc uint[4];
+            clut[0] = ConvertRgb565ToRgb888(c0);
+            clut[1] = ConvertRgb565ToRgb888(c1);
+            clut[2] = BC23LerpRgb2(clut[0], clut[1]);
+            clut[3] = BC23LerpRgb3(clut[0], clut[1]);
+
+            BCnDecodeTileRgb(clut, output, input);
+        }
+
+        private unsafe static void BCnDecodeTileRgb(Span<uint> clut, Span<byte> output, ReadOnlySpan<byte> input)
+        {
+            if (!Avx2.IsSupported)
+            {
+                // Scalar path: the 32-bit word at byte offset 4 holds one 2-bit table index per output pixel.
+                Span<uint> texels = MemoryMarshal.Cast<byte, uint>(output);
+                uint selectors = BinaryPrimitives.ReadUInt32LittleEndian(input.Slice(4));
+
+                for (int i = 0; i < BlockWidth * BlockHeight; i++)
+                {
+                    texels[i] = clut[(int)(selectors & 3)];
+                    selectors >>= 2;
+                }
+
+                return;
+            }
+
+            // Vector path: each 256-bit store below writes eight 32-bit pixels.
+            Span<Vector256<uint>> halves = MemoryMarshal.Cast<byte, Vector256<uint>>(output);
+
+            Vector256<uint> table;
+            Vector256<uint> packed;
+
+            fixed (uint* pTable = &clut[0])
+            {
+                // Only lanes 0..3 are defined; the indices are masked to that range below.
+                table = Sse2.LoadVector128(pTable).ToVector256Unsafe();
+            }
+
+            // The same selector word is replicated into all eight lanes.
+            fixed (byte* pBlock = input)
+            {
+                packed = Avx2.BroadcastScalarToVector256((uint*)(pBlock + 4));
+            }
+
+            Vector256<uint> twoBits = Vector256.Create(3u);
+            Vector256<uint> lowOffsets = Vector256.Create(0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u);
+            Vector256<uint> highOffsets = Vector256.Create(16u, 18u, 20u, 22u, 24u, 26u, 28u, 30u);
+
+            // Bring each pixel's selector to the bottom of its lane, then pick the table entry.
+            Vector256<uint> lowIndices = Avx2.And(Avx2.ShiftRightLogicalVariable(packed, lowOffsets), twoBits);
+            Vector256<uint> highIndices = Avx2.And(Avx2.ShiftRightLogicalVariable(packed, highOffsets), twoBits);
+
+            halves[0] = Avx2.PermuteVar8x32(table, lowIndices);
+            halves[1] = Avx2.PermuteVar8x32(table, highIndices);
+        }
+
+        private static uint BC1LerpRgb2(uint color0, uint color1, uint c0, uint c1)
+        {
+            if (c0 <= c1)
+            {
+                // Per-channel average of both colors: the shared bits plus half of the differing bits.
+                uint halfDiff = ((color0 ^ color1) >> 1) & 0x7f7f7f;
+                return ((color0 & color1) + halfDiff) | 0xff000000;
+            }
+
+            return BC23LerpRgb2(color0, color1) | 0xff000000;
+        }
+
+        private static uint BC23LerpRgb2(uint color0, uint color1)
+        {
+            // Two thirds of color0 plus one third of color1, evaluated channel by channel.
+            // Green and blue are blended in place (still shifted), so the final masks drop
+            // the fraction bits that the division leaves below each channel.
+
+            const uint RedMask = 0xff;
+            const uint GreenMask = 0xff00;
+            const uint BlueMask = 0xff0000;
+
+            uint red = (2 * (color0 & RedMask) + (color1 & RedMask)) / 3;
+            uint green = (2 * (color0 & GreenMask) + (color1 & GreenMask)) / 3;
+            uint blue = (2 * (color0 & BlueMask) + (color1 & BlueMask)) / 3;
+
+            return red | (green & GreenMask) | (blue & BlueMask);
+        }
+
+        private static uint BC1LerpRgb3(uint color0, uint color1, uint c0, uint c1)
+        {
+            // When c0 > c1 the result is the BC23LerpRgb3 blend with the top byte forced
+            // to 0xff; for c0 <= c1 every byte of the result, including the top one,
+            // is zero.
+            return c0 > c1
+                ? BC23LerpRgb3(color0, color1) | 0xff000000
+                : 0u;
+        }
+
+        private static uint BC23LerpRgb3(uint color0, uint color1)
+        {
+            // One third of color0 plus two thirds of color1, evaluated channel by channel.
+            // Green and blue are blended in place (still shifted), so the final masks drop
+            // the fraction bits that the division leaves below each channel.
+
+            const uint RedMask = 0xff;
+            const uint GreenMask = 0xff00;
+            const uint BlueMask = 0xff0000;
+
+            uint red = (2 * (color1 & RedMask) + (color0 & RedMask)) / 3;
+            uint green = (2 * (color1 & GreenMask) + (color0 & GreenMask)) / 3;
+            uint blue = (2 * (color1 & BlueMask) + (color0 & BlueMask)) / 3;
+
+            return red | (green & GreenMask) | (blue & BlueMask);
+        }
+
+        private static uint ConvertRgb565ToRgb888(uint value)
+        {
+            // Split the packed color: red is bits 11..15, green bits 5..10, blue bits 0..4.
+            uint red = (value >> 11) & 0x1f;
+            uint green = (value >> 5) & 0x3f;
+            uint blue = value & 0x1f;
+
+            // Widen to 8 bits per channel by repeating each channel's top bits in the new low bits.
+            return ((red << 3) | (red >> 2))
+                | (((green << 2) | (green >> 4)) << 8)
+                | (((blue << 3) | (blue >> 2)) << 16);
+        }
     }
 }
\ No newline at end of file