From 31fca432a7274907c46f6ec254d54e96cb6446c6 Mon Sep 17 00:00:00 2001
From: Mary <me@thog.eu>
Date: Tue, 2 Mar 2021 23:50:46 +0100
Subject: [PATCH] Amadeus: Add ARM SIMD fast path (#2069)

Add AArch64 (AdvSimd) fast paths to the audio renderer everywhere an x86 SIMD fast path already exists.
---
 .../Renderer/Dsp/Command/MixCommand.cs        | 26 ++++++++++
 .../Renderer/Dsp/Command/VolumeCommand.cs     | 25 ++++++++++
 .../Renderer/Dsp/DataSourceHelper.cs          | 47 ++++++++++++++++++-
 3 files changed, 97 insertions(+), 1 deletion(-)

diff --git a/Ryujinx.Audio/Renderer/Dsp/Command/MixCommand.cs b/Ryujinx.Audio/Renderer/Dsp/Command/MixCommand.cs
index 566fea92ba..069688711f 100644
--- a/Ryujinx.Audio/Renderer/Dsp/Command/MixCommand.cs
+++ b/Ryujinx.Audio/Renderer/Dsp/Command/MixCommand.cs
@@ -19,6 +19,7 @@ using System;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
 using System.Runtime.Intrinsics.X86;
 
 namespace Ryujinx.Audio.Renderer.Dsp.Command
@@ -89,6 +90,27 @@ namespace Ryujinx.Audio.Renderer.Dsp.Command
             }
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void ProcessMixAdvSimd(Span<float> outputMix, ReadOnlySpan<float> inputMix)
+        { // AdvSimd (ARM) variant of the mix: adds the volume-scaled input samples onto outputMix.
+            Vector128<float> volumeVec = Vector128.Create(Volume); // Volume replicated into all four lanes
+            // View both buffers as 128-bit vectors of four floats; a trailing partial group is not part of the views.
+            ReadOnlySpan<Vector128<float>> inputVec = MemoryMarshal.Cast<float, Vector128<float>>(inputMix);
+            Span<Vector128<float>> outputVec = MemoryMarshal.Cast<float, Vector128<float>>(outputMix);
+
+            int sisdStart = inputVec.Length * 4; // first sample index the vector loop does not cover
+            // Vector loop, four samples per step: outputVec[i] += Ceiling(inputVec[i] * volumeVec).
+            for (int i = 0; i < inputVec.Length; i++)
+            {
+                outputVec[i] = AdvSimd.Add(outputVec[i], AdvSimd.Ceiling(AdvSimd.Multiply(inputVec[i], volumeVec)));
+            }
+            // Scalar tail. NOTE(review): rounds via MultiplyRoundUp while the vector loop uses Ceiling - confirm both agree.
+            for (int i = sisdStart; i < inputMix.Length; i++)
+            {
+                outputMix[i] += FloatingPointHelper.MultiplyRoundUp(inputMix[i], Volume);
+            }
+        }
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private void ProcessMixSlowPath(Span<float> outputMix, ReadOnlySpan<float> inputMix)
         {
@@ -108,6 +130,10 @@ namespace Ryujinx.Audio.Renderer.Dsp.Command
             {
                 ProcessMixSse41(outputMix, inputMix);
             }
+            else if (AdvSimd.IsSupported)
+            {
+                ProcessMixAdvSimd(outputMix, inputMix);
+            }
             else
             {
                 ProcessMixSlowPath(outputMix, inputMix);
diff --git a/Ryujinx.Audio/Renderer/Dsp/Command/VolumeCommand.cs b/Ryujinx.Audio/Renderer/Dsp/Command/VolumeCommand.cs
index b58ae1f814..217d51c9e2 100644
--- a/Ryujinx.Audio/Renderer/Dsp/Command/VolumeCommand.cs
+++ b/Ryujinx.Audio/Renderer/Dsp/Command/VolumeCommand.cs
@@ -19,6 +19,7 @@ using System;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
 using System.Runtime.Intrinsics.X86;
 
 namespace Ryujinx.Audio.Renderer.Dsp.Command
@@ -89,6 +90,26 @@ namespace Ryujinx.Audio.Renderer.Dsp.Command
             }
         }
 
+        private void ProcessVolumeAdvSimd(Span<float> outputBuffer, ReadOnlySpan<float> inputBuffer)
+        { // AdvSimd (ARM) variant of the volume pass: writes the volume-scaled input samples to outputBuffer.
+            Vector128<float> volumeVec = Vector128.Create(Volume); // Volume replicated into all four lanes
+            // View both buffers as 128-bit vectors of four floats; a trailing partial group is not part of the views.
+            ReadOnlySpan<Vector128<float>> inputVec = MemoryMarshal.Cast<float, Vector128<float>>(inputBuffer);
+            Span<Vector128<float>> outputVec = MemoryMarshal.Cast<float, Vector128<float>>(outputBuffer);
+
+            int sisdStart = inputVec.Length * 4; // first sample index the vector loop does not cover
+            // Vector loop, four samples per step: outputVec[i] = Ceiling(inputVec[i] * volumeVec).
+            for (int i = 0; i < inputVec.Length; i++)
+            {
+                outputVec[i] = AdvSimd.Ceiling(AdvSimd.Multiply(inputVec[i], volumeVec));
+            }
+            // Scalar tail. NOTE(review): rounds via MultiplyRoundUp while the vector loop uses Ceiling - confirm both agree.
+            for (int i = sisdStart; i < inputBuffer.Length; i++)
+            {
+                outputBuffer[i] = FloatingPointHelper.MultiplyRoundUp(inputBuffer[i], Volume);
+            }
+        }
+
         private void ProcessVolume(Span<float> outputBuffer, ReadOnlySpan<float> inputBuffer)
         {
             if (Avx.IsSupported)
@@ -99,6 +120,10 @@ namespace Ryujinx.Audio.Renderer.Dsp.Command
             {
                 ProcessVolumeSse41(outputBuffer, inputBuffer);
             }
+            else if (AdvSimd.IsSupported)
+            {
+                ProcessVolumeAdvSimd(outputBuffer, inputBuffer);
+            }
             else
             {
                 ProcessVolumeSlowPath(outputBuffer, inputBuffer);
diff --git a/Ryujinx.Audio/Renderer/Dsp/DataSourceHelper.cs b/Ryujinx.Audio/Renderer/Dsp/DataSourceHelper.cs
index c951452925..373776b71b 100644
--- a/Ryujinx.Audio/Renderer/Dsp/DataSourceHelper.cs
+++ b/Ryujinx.Audio/Renderer/Dsp/DataSourceHelper.cs
@@ -26,6 +26,7 @@ using System.Diagnostics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
 using System.Runtime.Intrinsics.X86;
 using static Ryujinx.Audio.Renderer.Parameter.VoiceInParameter;
 
@@ -320,6 +321,24 @@ namespace Ryujinx.Audio.Renderer.Dsp
             }
         }
 
+        private static void ToFloatAdvSimd(Span<float> output, ReadOnlySpan<int> input, int sampleCount)
+        { // AdvSimd (ARM) variant of the int-to-float sample conversion: fills output from input.
+            ReadOnlySpan<Vector128<int>> inputVec = MemoryMarshal.Cast<int, Vector128<int>>(input);
+            Span<Vector128<float>> outputVec = MemoryMarshal.Cast<float, Vector128<float>>(output);
+
+            int sisdStart = inputVec.Length * 4; // first sample index the vector loop does not cover
+            // Vector loop, four samples per step. NOTE(review): bounded by input's length, not sampleCount - confirm callers size the spans to sampleCount.
+            for (int i = 0; i < inputVec.Length; i++)
+            {
+                outputVec[i] = AdvSimd.ConvertToSingle(inputVec[i]);
+            }
+            // Scalar tail: converts the remaining samples up to sampleCount one at a time (implicit int-to-float conversion).
+            for (int i = sisdStart; i < sampleCount; i++)
+            {
+                output[i] = input[i];
+            }
+        }
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static void ToFloatSlow(Span<float> output, ReadOnlySpan<int> input, int sampleCount)
         {
@@ -339,6 +358,10 @@ namespace Ryujinx.Audio.Renderer.Dsp
             {
                 ToFloatSse2(output, input, sampleCount);
             }
+            else if (AdvSimd.IsSupported)
+            {
+                ToFloatAdvSimd(output, input, sampleCount);
+            }
             else
             {
                 ToFloatSlow(output, input, sampleCount);
@@ -372,7 +395,25 @@ namespace Ryujinx.Audio.Renderer.Dsp
 
             for (int i = 0; i < inputVec.Length; i++)
             {
-                outputVec[i] = Avx.ConvertToVector128Int32(inputVec[i]);
+                outputVec[i] = Sse2.ConvertToVector128Int32(inputVec[i]);
+            }
+
+            for (int i = sisdStart; i < sampleCount; i++)
+            {
+                output[i] = (int)input[i];
+            }
+        }
+
+        public static void ToIntAdvSimd(Span<int> output, ReadOnlySpan<float> input, int sampleCount)
+        {
+            ReadOnlySpan<Vector128<float>> inputVec = MemoryMarshal.Cast<float, Vector128<float>>(input);
+            Span<Vector128<int>> outputVec = MemoryMarshal.Cast<int, Vector128<int>>(output);
+
+            int sisdStart = inputVec.Length * 4;
+
+            for (int i = 0; i < inputVec.Length; i++)
+            {
+                outputVec[i] = AdvSimd.ConvertToInt32RoundToZero(inputVec[i]);
             }
 
             for (int i = sisdStart; i < sampleCount; i++)
@@ -400,6 +441,10 @@ namespace Ryujinx.Audio.Renderer.Dsp
             {
                 ToIntSse2(output, input, sampleCount);
             }
+            else if (AdvSimd.IsSupported)
+            {
+                ToIntAdvSimd(output, input, sampleCount);
+            }
             else
             {
                 ToIntSlow(output, input, sampleCount);