From a11784fcbf7a19b9d36e755cc92a27fe994008c7 Mon Sep 17 00:00:00 2001
From: merry <git@mary.rs>
Date: Thu, 12 Jan 2023 07:05:18 +0000
Subject: [PATCH] Arm64: Cpu feature detection (#4264)

* Arm64: Cpu feature detection

* Ptc: Add Arm64 feature info

* nits

* simplify CheckSysctlName

* restore some macos flags

* feedback
---
 .../CodeGen/Arm64/HardwareCapabilities.cs     | 185 ++++++++++++++++++
 .../Instructions/InstEmitSimdArithmetic.cs    |   2 +-
 ARMeilleure/Optimizations.cs                  |  44 +++--
 ARMeilleure/Translation/PTC/Ptc.cs            |  37 +++-
 4 files changed, 237 insertions(+), 31 deletions(-)
 create mode 100644 ARMeilleure/CodeGen/Arm64/HardwareCapabilities.cs

diff --git a/ARMeilleure/CodeGen/Arm64/HardwareCapabilities.cs b/ARMeilleure/CodeGen/Arm64/HardwareCapabilities.cs
new file mode 100644
index 0000000000..99ff299e9e
--- /dev/null
+++ b/ARMeilleure/CodeGen/Arm64/HardwareCapabilities.cs
@@ -0,0 +1,185 @@
+using System;
+using System.Linq;
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Versioning;
+
+namespace ARMeilleure.CodeGen.Arm64
+{
+    static partial class HardwareCapabilities
+    {
+        static HardwareCapabilities() // Static constructor: probes the OS for Arm64 CPU features.
+        {
+            if (!ArmBase.Arm64.IsSupported)
+            {
+                return; // Not an Arm64 process: every feature word keeps its initial value of 0.
+            }
+
+            if (OperatingSystem.IsLinux()) // Linux: read both hwcap words through getauxval.
+            {
+                LinuxFeatureInfoHwCap = (LinuxFeatureFlagsHwCap)getauxval(AT_HWCAP);
+                LinuxFeatureInfoHwCap2 = (LinuxFeatureFlagsHwCap2)getauxval(AT_HWCAP2);
+            }
+
+            if (OperatingSystem.IsMacOS()) // macOS: query every name in _sysctlNames individually.
+            {
+                for (int i = 0; i < _sysctlNames.Length; i++)
+                {
+                    if (CheckSysctlName(_sysctlNames[i]))
+                    {
+                        MacOsFeatureInfo |= (MacOsFeatureFlags)(1 << i); // Entry i of _sysctlNames maps to bit i.
+                    }
+                }
+            }
+        }
+
+#region Linux
+
+        private const ulong AT_HWCAP = 16; // getauxval key read into LinuxFeatureInfoHwCap.
+        private const ulong AT_HWCAP2 = 26; // getauxval key read into LinuxFeatureInfoHwCap2.
+
+        [LibraryImport("libc", SetLastError = true)] // Source-generated P/Invoke into libc; only called from the static constructor.
+        private static partial ulong getauxval(ulong type);
+
+        [Flags] // Bit layout applied to the value returned by getauxval(AT_HWCAP).
+        public enum LinuxFeatureFlagsHwCap : ulong // Every member shifts 1UL so the shift is always done in 64 bits.
+        {
+            Fp        = 1UL << 0,
+            Asimd     = 1UL << 1,
+            Evtstrm   = 1UL << 2,
+            Aes       = 1UL << 3,
+            Pmull     = 1UL << 4,
+            Sha1      = 1UL << 5,
+            Sha2      = 1UL << 6,
+            Crc32     = 1UL << 7,
+            Atomics   = 1UL << 8,
+            FpHp      = 1UL << 9,
+            AsimdHp   = 1UL << 10,
+            CpuId     = 1UL << 11,
+            AsimdRdm  = 1UL << 12,
+            Jscvt     = 1UL << 13,
+            Fcma      = 1UL << 14,
+            Lrcpc     = 1UL << 15,
+            DcpOp     = 1UL << 16,
+            Sha3      = 1UL << 17,
+            Sm3       = 1UL << 18,
+            Sm4       = 1UL << 19,
+            AsimdDp   = 1UL << 20,
+            Sha512    = 1UL << 21,
+            Sve       = 1UL << 22,
+            AsimdFhm  = 1UL << 23,
+            Dit       = 1UL << 24,
+            Uscat     = 1UL << 25,
+            Ilrcpc    = 1UL << 26,
+            FlagM     = 1UL << 27,
+            Ssbs      = 1UL << 28,
+            Sb        = 1UL << 29,
+            Paca      = 1UL << 30,
+            Pacg      = 1UL << 31
+        }
+
+        [Flags] // Bit layout applied to the value returned by getauxval(AT_HWCAP2).
+        public enum LinuxFeatureFlagsHwCap2 : ulong // Every member shifts 1UL: an int-typed 1 << 32 would wrap around to 1.
+        {
+            Dcpodp      = 1UL << 0,
+            Sve2        = 1UL << 1,
+            SveAes      = 1UL << 2,
+            SvePmull    = 1UL << 3,
+            SveBitperm  = 1UL << 4,
+            SveSha3     = 1UL << 5,
+            SveSm4      = 1UL << 6,
+            FlagM2      = 1UL << 7,
+            Frint       = 1UL << 8,
+            SveI8mm     = 1UL << 9,
+            SveF32mm    = 1UL << 10,
+            SveF64mm    = 1UL << 11,
+            SveBf16     = 1UL << 12,
+            I8mm        = 1UL << 13,
+            Bf16        = 1UL << 14,
+            Dgh         = 1UL << 15,
+            Rng         = 1UL << 16,
+            Bti         = 1UL << 17,
+            Mte         = 1UL << 18,
+            Ecv         = 1UL << 19,
+            Afp         = 1UL << 20,
+            Rpres       = 1UL << 21,
+            Mte3        = 1UL << 22,
+            Sme         = 1UL << 23,
+            Sme_i16i64  = 1UL << 24,
+            Sme_f64f64  = 1UL << 25,
+            Sme_i8i32   = 1UL << 26,
+            Sme_f16f32  = 1UL << 27,
+            Sme_b16f32  = 1UL << 28,
+            Sme_f32f32  = 1UL << 29,
+            Sme_fa64    = 1UL << 30,
+            Wfxt        = 1UL << 31,
+            Ebf16       = 1UL << 32,
+            Sve_Ebf16   = 1UL << 33,
+            Cssc        = 1UL << 34,
+            Rprfm       = 1UL << 35,
+            Sve2p1      = 1UL << 36
+        }
+
+        public static LinuxFeatureFlagsHwCap LinuxFeatureInfoHwCap { get; } = 0; // Assigned by the static constructor on Linux/Arm64; otherwise stays 0.
+        public static LinuxFeatureFlagsHwCap2 LinuxFeatureInfoHwCap2 { get; } = 0; // Assigned by the static constructor on Linux/Arm64; otherwise stays 0.
+
+#endregion
+
+#region macOS
+
+        [LibraryImport("libSystem.dylib", SetLastError = true)] // Source-generated P/Invoke; only called from CheckSysctlName.
+        private static unsafe partial int sysctlbyname([MarshalAs(UnmanagedType.LPStr)] string name, out int oldValue, ref ulong oldSize, IntPtr newValue, ulong newValueSize);
+
+        [SupportedOSPlatform("macos")]
+        private static bool CheckSysctlName(string name)
+        {
+            // True only if the query succeeds, yields exactly sizeof(int) bytes, and the value read is non-zero.
+            ulong reportedSize = sizeof(int);
+            bool querySucceeded = sysctlbyname(name, out int flag, ref reportedSize, IntPtr.Zero, 0) == 0;
+
+            // A failed query or an unexpected value size is reported the same way as an absent feature.
+            return querySucceeded && reportedSize == sizeof(int) && flag != 0;
+        }
+
+        private static readonly string[] _sysctlNames = new string[] // Order matters: entry i sets bit i of MacOsFeatureFlags.
+        {
+            "hw.optional.floatingpoint",
+            "hw.optional.AdvSIMD",
+            "hw.optional.arm.FEAT_FP16",
+            "hw.optional.arm.FEAT_AES",
+            "hw.optional.arm.FEAT_PMULL",
+            "hw.optional.arm.FEAT_LSE",
+            "hw.optional.armv8_crc32",
+            "hw.optional.arm.FEAT_SHA1",
+            "hw.optional.arm.FEAT_SHA256"
+        };
+
+        [Flags]
+        public enum MacOsFeatureFlags // Bit i corresponds to entry i of _sysctlNames (see the static constructor).
+        {
+            Fp      = 1 << 0,
+            AdvSimd = 1 << 1,
+            Fp16    = 1 << 2,
+            Aes     = 1 << 3,
+            Pmull   = 1 << 4,
+            Lse     = 1 << 5,
+            Crc32   = 1 << 6,
+            Sha1    = 1 << 7,
+            Sha256  = 1 << 8
+        }
+
+        public static MacOsFeatureFlags MacOsFeatureInfo { get; } = 0; // Built up by the static constructor on macOS/Arm64; otherwise stays 0.
+
+#endregion
+        // True when either OS-specific word reports the feature. NOTE(review): on OSes other than Linux/macOS both words stay 0, so all of these are false - confirm intended.
+        public static bool SupportsAdvSimd => LinuxFeatureInfoHwCap.HasFlag(LinuxFeatureFlagsHwCap.Asimd) || MacOsFeatureInfo.HasFlag(MacOsFeatureFlags.AdvSimd);
+        public static bool SupportsAes => LinuxFeatureInfoHwCap.HasFlag(LinuxFeatureFlagsHwCap.Aes) || MacOsFeatureInfo.HasFlag(MacOsFeatureFlags.Aes);
+        public static bool SupportsPmull => LinuxFeatureInfoHwCap.HasFlag(LinuxFeatureFlagsHwCap.Pmull) || MacOsFeatureInfo.HasFlag(MacOsFeatureFlags.Pmull);
+        public static bool SupportsLse => LinuxFeatureInfoHwCap.HasFlag(LinuxFeatureFlagsHwCap.Atomics) || MacOsFeatureInfo.HasFlag(MacOsFeatureFlags.Lse);
+        public static bool SupportsCrc32 => LinuxFeatureInfoHwCap.HasFlag(LinuxFeatureFlagsHwCap.Crc32) || MacOsFeatureInfo.HasFlag(MacOsFeatureFlags.Crc32);
+        public static bool SupportsSha1 => LinuxFeatureInfoHwCap.HasFlag(LinuxFeatureFlagsHwCap.Sha1) || MacOsFeatureInfo.HasFlag(MacOsFeatureFlags.Sha1);
+        public static bool SupportsSha256 => LinuxFeatureInfoHwCap.HasFlag(LinuxFeatureFlagsHwCap.Sha2) || MacOsFeatureInfo.HasFlag(MacOsFeatureFlags.Sha256);
+    }
+}
diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
index 3e65db23d9..d0bb68e4f7 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
@@ -2556,7 +2556,7 @@ namespace ARMeilleure.Instructions
         {
             OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
 
-            if (Optimizations.UseAdvSimd && false) // Not supported by all Arm CPUs.
+            if (Optimizations.UseArm64Pmull)
             {
                 InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64PmullV);
             }
diff --git a/ARMeilleure/Optimizations.cs b/ARMeilleure/Optimizations.cs
index 0810d96c91..9044314f60 100644
--- a/ARMeilleure/Optimizations.cs
+++ b/ARMeilleure/Optimizations.cs
@@ -1,8 +1,10 @@
-using ARMeilleure.CodeGen.X86;
 using System.Runtime.Intrinsics.Arm;
 
 namespace ARMeilleure
 {
+    using Arm64HardwareCapabilities = ARMeilleure.CodeGen.Arm64.HardwareCapabilities;
+    using X86HardwareCapabilities = ARMeilleure.CodeGen.X86.HardwareCapabilities;
+
     public static class Optimizations
     {
         public static bool FastFP { get; set; } = true;
@@ -10,7 +12,8 @@ namespace ARMeilleure
         public static bool AllowLcqInFunctionTable  { get; set; } = true;
         public static bool UseUnmanagedDispatchLoop { get; set; } = true;
 
-        public static bool UseAdvSimdIfAvailable { get; set; } = true;
+        public static bool UseAdvSimdIfAvailable    { get; set; } = true;
+        public static bool UseArm64PmullIfAvailable { get; set; } = true;
 
         public static bool UseSseIfAvailable       { get; set; } = true;
         public static bool UseSse2IfAvailable      { get; set; } = true;
@@ -29,25 +32,26 @@ namespace ARMeilleure
 
         public static bool ForceLegacySse
         {
-            get => HardwareCapabilities.ForceLegacySse;
-            set => HardwareCapabilities.ForceLegacySse = value;
+            get => X86HardwareCapabilities.ForceLegacySse;
+            set => X86HardwareCapabilities.ForceLegacySse = value;
         }
 
-        internal static bool UseAdvSimd => UseAdvSimdIfAvailable && AdvSimd.IsSupported;
+        internal static bool UseAdvSimd    => UseAdvSimdIfAvailable    && Arm64HardwareCapabilities.SupportsAdvSimd;
+        internal static bool UseArm64Pmull => UseArm64PmullIfAvailable && Arm64HardwareCapabilities.SupportsPmull;
 
-        internal static bool UseSse       => UseSseIfAvailable       && HardwareCapabilities.SupportsSse;
-        internal static bool UseSse2      => UseSse2IfAvailable      && HardwareCapabilities.SupportsSse2;
-        internal static bool UseSse3      => UseSse3IfAvailable      && HardwareCapabilities.SupportsSse3;
-        internal static bool UseSsse3     => UseSsse3IfAvailable     && HardwareCapabilities.SupportsSsse3;
-        internal static bool UseSse41     => UseSse41IfAvailable     && HardwareCapabilities.SupportsSse41;
-        internal static bool UseSse42     => UseSse42IfAvailable     && HardwareCapabilities.SupportsSse42;
-        internal static bool UsePopCnt    => UsePopCntIfAvailable    && HardwareCapabilities.SupportsPopcnt;
-        internal static bool UseAvx       => UseAvxIfAvailable       && HardwareCapabilities.SupportsAvx && !ForceLegacySse;
-        internal static bool UseF16c      => UseF16cIfAvailable      && HardwareCapabilities.SupportsF16c;
-        internal static bool UseFma       => UseFmaIfAvailable       && HardwareCapabilities.SupportsFma;
-        internal static bool UseAesni     => UseAesniIfAvailable     && HardwareCapabilities.SupportsAesni;
-        internal static bool UsePclmulqdq => UsePclmulqdqIfAvailable && HardwareCapabilities.SupportsPclmulqdq;
-        internal static bool UseSha       => UseShaIfAvailable       && HardwareCapabilities.SupportsSha;
-        internal static bool UseGfni      => UseGfniIfAvailable      && HardwareCapabilities.SupportsGfni;
+        internal static bool UseSse       => UseSseIfAvailable       && X86HardwareCapabilities.SupportsSse;
+        internal static bool UseSse2      => UseSse2IfAvailable      && X86HardwareCapabilities.SupportsSse2;
+        internal static bool UseSse3      => UseSse3IfAvailable      && X86HardwareCapabilities.SupportsSse3;
+        internal static bool UseSsse3     => UseSsse3IfAvailable     && X86HardwareCapabilities.SupportsSsse3;
+        internal static bool UseSse41     => UseSse41IfAvailable     && X86HardwareCapabilities.SupportsSse41;
+        internal static bool UseSse42     => UseSse42IfAvailable     && X86HardwareCapabilities.SupportsSse42;
+        internal static bool UsePopCnt    => UsePopCntIfAvailable    && X86HardwareCapabilities.SupportsPopcnt;
+        internal static bool UseAvx       => UseAvxIfAvailable       && X86HardwareCapabilities.SupportsAvx && !ForceLegacySse;
+        internal static bool UseF16c      => UseF16cIfAvailable      && X86HardwareCapabilities.SupportsF16c;
+        internal static bool UseFma       => UseFmaIfAvailable       && X86HardwareCapabilities.SupportsFma;
+        internal static bool UseAesni     => UseAesniIfAvailable     && X86HardwareCapabilities.SupportsAesni;
+        internal static bool UsePclmulqdq => UsePclmulqdqIfAvailable && X86HardwareCapabilities.SupportsPclmulqdq;
+        internal static bool UseSha       => UseShaIfAvailable       && X86HardwareCapabilities.SupportsSha;
+        internal static bool UseGfni      => UseGfniIfAvailable      && X86HardwareCapabilities.SupportsGfni;
     }
-}
\ No newline at end of file
+}
diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs
index 6f57e1883c..a59bc58883 100644
--- a/ARMeilleure/Translation/PTC/Ptc.cs
+++ b/ARMeilleure/Translation/PTC/Ptc.cs
@@ -1,7 +1,6 @@
 using ARMeilleure.CodeGen;
 using ARMeilleure.CodeGen.Linking;
 using ARMeilleure.CodeGen.Unwinding;
-using ARMeilleure.CodeGen.X86;
 using ARMeilleure.Common;
 using ARMeilleure.Memory;
 using Ryujinx.Common;
@@ -22,12 +21,15 @@ using static ARMeilleure.Translation.PTC.PtcFormatter;
 
 namespace ARMeilleure.Translation.PTC
 {
+    using Arm64HardwareCapabilities = ARMeilleure.CodeGen.Arm64.HardwareCapabilities;
+    using X86HardwareCapabilities = ARMeilleure.CodeGen.X86.HardwareCapabilities;
+
     class Ptc : IPtcLoadState
     {
         private const string OuterHeaderMagicString = "PTCohd\0\0";
         private const string InnerHeaderMagicString = "PTCihd\0\0";
 
-        private const uint InternalVersion = 4114; //! To be incremented manually for each change to the ARMeilleure project.
+        private const uint InternalVersion = 4264; //! To be incremented manually for each change to the ARMeilleure project.
 
         private const string ActualDir = "0";
         private const string BackupDir = "1";
@@ -952,11 +954,26 @@ namespace ARMeilleure.Translation.PTC
 
         private static FeatureInfo GetFeatureInfo()
         {
-            return new FeatureInfo(
-                (uint)HardwareCapabilities.FeatureInfo1Ecx,
-                (uint)HardwareCapabilities.FeatureInfo1Edx,
-                (uint)HardwareCapabilities.FeatureInfo7Ebx,
-                (uint)HardwareCapabilities.FeatureInfo7Ecx);
+            if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64) // Arm64: slots 0-1 hold the Linux hwcap words, slot 2 the macOS mask.
+            {
+                return new FeatureInfo(
+                    (ulong)Arm64HardwareCapabilities.LinuxFeatureInfoHwCap,
+                    (ulong)Arm64HardwareCapabilities.LinuxFeatureInfoHwCap2,
+                    (ulong)Arm64HardwareCapabilities.MacOsFeatureInfo,
+                    0); // Fourth slot is unused on Arm64.
+            }
+            else if (RuntimeInformation.ProcessArchitecture == Architecture.X64) // x64: the four X86HardwareCapabilities feature words, widened to ulong.
+            {
+                return new FeatureInfo(
+                    (ulong)X86HardwareCapabilities.FeatureInfo1Ecx,
+                    (ulong)X86HardwareCapabilities.FeatureInfo1Edx,
+                    (ulong)X86HardwareCapabilities.FeatureInfo7Ebx,
+                    (ulong)X86HardwareCapabilities.FeatureInfo7Ecx);
+            }
+            else
+            {
+                return new FeatureInfo(0, 0, 0, 0); // Any other architecture: all-zero feature info.
+            }
         }
 
         private byte GetMemoryManagerMode()
@@ -976,7 +993,7 @@ namespace ARMeilleure.Translation.PTC
             return osPlatform;
         }
 
-        [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 58*/)]
+        [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 74*/)]
         private struct OuterHeader
         {
             public ulong Magic;
@@ -1007,8 +1024,8 @@ namespace ARMeilleure.Translation.PTC
             }
         }
 
-        [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 16*/)]
-        private record struct FeatureInfo(uint FeatureInfo0, uint FeatureInfo1, uint FeatureInfo2, uint FeatureInfo3);
+        [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 32*/)]
+        private record struct FeatureInfo(ulong FeatureInfo0, ulong FeatureInfo1, ulong FeatureInfo2, ulong FeatureInfo3);
 
         [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 128*/)]
         private struct InnerHeader