From 0679084f115b6838dec4d8c5e85044c33d4122d0 Mon Sep 17 00:00:00 2001 From: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com> Date: Wed, 18 Nov 2020 19:35:54 +0100 Subject: [PATCH] CPU (A64): Add FP16/FP32 fast paths (F16C Intrinsics) for Fcvt_S, Fcvtl_V & Fcvtn_V Instructions. Now HardwareCapabilities uses CpuId. (#1650) * net5.0 * CPU (A64): Add FP16/FP32 fast paths (F16C Intrinsics) for Fcvt_S, Fcvtl_V & Fcvtn_V Instructions. Switch to .NET 5.0. Nits. Tests performed successfully in both debug and release mode (for all instructions involved). * Address comment. * Update appveyor.yml * Revert "Update appveyor.yml" This reverts commit 27cdd59e8b90e227e6924d9c162af26c00a89013. * Remove Assembler CpuId. * Update appveyor.yml * Address comment. --- ARMeilleure/CodeGen/X86/Assembler.cs | 8 +- .../CodeGen/X86/HardwareCapabilities.cs | 62 ++++++++++--- ARMeilleure/CodeGen/X86/IntrinsicTable.cs | 2 + ARMeilleure/CodeGen/X86/X86Instruction.cs | 3 +- ARMeilleure/Instructions/InstEmitSimdCvt.cs | 88 ++++++++++++++----- .../IntermediateRepresentation/Intrinsic.cs | 2 + ARMeilleure/Optimizations.cs | 2 + ARMeilleure/Translation/PTC/Ptc.cs | 20 +---- Ryujinx.Tests/Cpu/CpuTestSimd.cs | 11 ++- 9 files changed, 136 insertions(+), 62 deletions(-) diff --git a/ARMeilleure/CodeGen/X86/Assembler.cs b/ARMeilleure/CodeGen/X86/Assembler.cs index b855f1b11a..48053efc97 100644 --- a/ARMeilleure/CodeGen/X86/Assembler.cs +++ b/ARMeilleure/CodeGen/X86/Assembler.cs @@ -104,7 +104,6 @@ namespace ARMeilleure.CodeGen.X86 Add(X86Instruction.Cmpxchg8, new InstructionInfo(0x00000fb0, BadOp, BadOp, BadOp, BadOp, InstructionFlags.Reg8Src)); Add(X86Instruction.Comisd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f2f, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Comiss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f2f, InstructionFlags.Vex)); - Add(X86Instruction.Cpuid, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fa2, 
InstructionFlags.RegOnly)); Add(X86Instruction.Crc32, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38f1, InstructionFlags.PrefixF2)); Add(X86Instruction.Crc32_16, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38f1, InstructionFlags.PrefixF2 | InstructionFlags.Prefix66)); Add(X86Instruction.Crc32_8, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38f0, InstructionFlags.PrefixF2 | InstructionFlags.Reg8Src)); @@ -270,6 +269,8 @@ namespace ARMeilleure.CodeGen.X86 Add(X86Instruction.Unpcklps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f14, InstructionFlags.Vex)); Add(X86Instruction.Vblendvpd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4b, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Vblendvps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4a, InstructionFlags.Vex | InstructionFlags.Prefix66)); + Add(X86Instruction.Vcvtph2ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3813, InstructionFlags.Vex | InstructionFlags.Prefix66)); + Add(X86Instruction.Vcvtps2ph, new InstructionInfo(0x000f3a1d, BadOp, BadOp, BadOp, BadOp, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Vpblendvb, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4c, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Xor, new InstructionInfo(0x00000031, 0x06000083, 0x06000081, BadOp, 0x00000033, InstructionFlags.None)); Add(X86Instruction.Xorpd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f57, InstructionFlags.Vex | InstructionFlags.Prefix66)); @@ -386,11 +387,6 @@ namespace ARMeilleure.CodeGen.X86 WriteInstruction(src1, null, src2, X86Instruction.Comiss); } - public void Cpuid() - { - WriteInstruction(null, null, OperandType.None, X86Instruction.Cpuid); - } - public void Cvtsd2ss(Operand dest, Operand src1, Operand src2) { WriteInstruction(dest, src1, src2, X86Instruction.Cvtsd2ss); diff --git a/ARMeilleure/CodeGen/X86/HardwareCapabilities.cs 
b/ARMeilleure/CodeGen/X86/HardwareCapabilities.cs index b622c65cd0..aa103e30b6 100644 --- a/ARMeilleure/CodeGen/X86/HardwareCapabilities.cs +++ b/ARMeilleure/CodeGen/X86/HardwareCapabilities.cs @@ -1,20 +1,60 @@ +using System; using System.Runtime.Intrinsics.X86; namespace ARMeilleure.CodeGen.X86 { static class HardwareCapabilities { - public static bool SupportsSse => Sse.IsSupported; - public static bool SupportsSse2 => Sse2.IsSupported; - public static bool SupportsSse3 => Sse3.IsSupported; - public static bool SupportsSsse3 => Ssse3.IsSupported; - public static bool SupportsSse41 => Sse41.IsSupported; - public static bool SupportsSse42 => Sse42.IsSupported; - public static bool SupportsPclmulqdq => Pclmulqdq.IsSupported; - public static bool SupportsFma => Fma.IsSupported; - public static bool SupportsPopcnt => Popcnt.IsSupported; - public static bool SupportsAesni => Aes.IsSupported; - public static bool SupportsAvx => Avx.IsSupported; + static HardwareCapabilities() + { + if (!X86Base.IsSupported) + { + return; + } + + (_, _, int ecx, int edx) = X86Base.CpuId(0x00000001, 0x00000000); + + FeatureInfoEdx = (FeatureFlagsEdx)edx; + FeatureInfoEcx = (FeatureFlagsEcx)ecx; + } + + [Flags] + public enum FeatureFlagsEdx + { + Sse = 1 << 25, + Sse2 = 1 << 26 + } + + [Flags] + public enum FeatureFlagsEcx + { + Sse3 = 1 << 0, + Pclmulqdq = 1 << 1, + Ssse3 = 1 << 9, + Fma = 1 << 12, + Sse41 = 1 << 19, + Sse42 = 1 << 20, + Popcnt = 1 << 23, + Aes = 1 << 25, + Avx = 1 << 28, + F16c = 1 << 29 + } + + public static FeatureFlagsEdx FeatureInfoEdx { get; } + public static FeatureFlagsEcx FeatureInfoEcx { get; } + + public static bool SupportsSse => FeatureInfoEdx.HasFlag(FeatureFlagsEdx.Sse); + public static bool SupportsSse2 => FeatureInfoEdx.HasFlag(FeatureFlagsEdx.Sse2); + public static bool SupportsSse3 => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Sse3); + public static bool SupportsPclmulqdq => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Pclmulqdq); + public static bool 
SupportsSsse3 => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Ssse3); + public static bool SupportsFma => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Fma); + public static bool SupportsSse41 => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Sse41); + public static bool SupportsSse42 => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Sse42); + public static bool SupportsPopcnt => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Popcnt); + public static bool SupportsAesni => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Aes); + public static bool SupportsAvx => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Avx); + public static bool SupportsF16c => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.F16c); public static bool ForceLegacySse { get; set; } diff --git a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs index f7469badc0..864b0a108e 100644 --- a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs +++ b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs @@ -162,6 +162,8 @@ namespace ARMeilleure.CodeGen.X86 Add(Intrinsic.X86Unpckhps, new IntrinsicInfo(X86Instruction.Unpckhps, IntrinsicType.Binary)); Add(Intrinsic.X86Unpcklpd, new IntrinsicInfo(X86Instruction.Unpcklpd, IntrinsicType.Binary)); Add(Intrinsic.X86Unpcklps, new IntrinsicInfo(X86Instruction.Unpcklps, IntrinsicType.Binary)); + Add(Intrinsic.X86Vcvtph2ps, new IntrinsicInfo(X86Instruction.Vcvtph2ps, IntrinsicType.Unary)); + Add(Intrinsic.X86Vcvtps2ph, new IntrinsicInfo(X86Instruction.Vcvtps2ph, IntrinsicType.BinaryImm)); Add(Intrinsic.X86Xorpd, new IntrinsicInfo(X86Instruction.Xorpd, IntrinsicType.Binary)); Add(Intrinsic.X86Xorps, new IntrinsicInfo(X86Instruction.Xorps, IntrinsicType.Binary)); } diff --git a/ARMeilleure/CodeGen/X86/X86Instruction.cs b/ARMeilleure/CodeGen/X86/X86Instruction.cs index f9b35d371d..fae17b862b 100644 --- a/ARMeilleure/CodeGen/X86/X86Instruction.cs +++ b/ARMeilleure/CodeGen/X86/X86Instruction.cs @@ -33,7 +33,6 @@ namespace ARMeilleure.CodeGen.X86 Cmpxchg8, Comisd, Comiss, - Cpuid, Crc32, Crc32_16, Crc32_8, @@ -199,6 
+198,8 @@ namespace ARMeilleure.CodeGen.X86 Unpcklps, Vblendvpd, Vblendvps, + Vcvtph2ps, + Vcvtps2ph, Vpblendvb, Xor, Xorpd, diff --git a/ARMeilleure/Instructions/InstEmitSimdCvt.cs b/ARMeilleure/Instructions/InstEmitSimdCvt.cs index edcf35d5ad..0350427cb1 100644 --- a/ARMeilleure/Instructions/InstEmitSimdCvt.cs +++ b/ARMeilleure/Instructions/InstEmitSimdCvt.cs @@ -60,21 +60,48 @@ namespace ARMeilleure.Instructions } else if (op.Size == 0 && op.Opc == 3) // Single -> Half. { - Operand ne = context.VectorExtract(OperandType.FP32, GetVec(op.Rn), 0); + if (Optimizations.UseF16c) + { + Debug.Assert(!Optimizations.ForceLegacySse); - Operand res = context.Call(typeof(SoftFloat32_16).GetMethod(nameof(SoftFloat32_16.FPConvert)), ne); + Operand n = GetVec(op.Rn); - res = context.ZeroExtend16(OperandType.I64, res); + Operand res = context.AddIntrinsic(Intrinsic.X86Vcvtps2ph, n, Const(X86GetRoundControl(FPRoundingMode.ToNearest))); + res = context.AddIntrinsic(Intrinsic.X86Pslldq, res, Const(14)); // VectorZeroUpper112() + res = context.AddIntrinsic(Intrinsic.X86Psrldq, res, Const(14)); - context.Copy(GetVec(op.Rd), EmitVectorInsert(context, context.VectorZero(), res, 0, 1)); + context.Copy(GetVec(op.Rd), res); + } + else + { + Operand ne = context.VectorExtract(OperandType.FP32, GetVec(op.Rn), 0); + + Operand res = context.Call(typeof(SoftFloat32_16).GetMethod(nameof(SoftFloat32_16.FPConvert)), ne); + + res = context.ZeroExtend16(OperandType.I64, res); + + context.Copy(GetVec(op.Rd), EmitVectorInsert(context, context.VectorZero(), res, 0, 1)); + } } else if (op.Size == 3 && op.Opc == 0) // Half -> Single. 
{ - Operand ne = EmitVectorExtractZx(context, op.Rn, 0, 1); + if (Optimizations.UseF16c) + { + Debug.Assert(!Optimizations.ForceLegacySse); - Operand res = context.Call(typeof(SoftFloat16_32).GetMethod(nameof(SoftFloat16_32.FPConvert)), ne); + Operand res = context.AddIntrinsic(Intrinsic.X86Vcvtph2ps, GetVec(op.Rn)); + res = context.VectorZeroUpper96(res); - context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0)); + context.Copy(GetVec(op.Rd), res); + } + else + { + Operand ne = EmitVectorExtractZx(context, op.Rn, 0, 1); + + Operand res = context.Call(typeof(SoftFloat16_32).GetMethod(nameof(SoftFloat16_32.FPConvert)), ne); + + context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0)); + } } else if (op.Size == 1 && op.Opc == 3) // Double -> Half. { @@ -129,18 +156,20 @@ namespace ARMeilleure.Instructions if (Optimizations.UseSse2 && sizeF == 1) { Operand n = GetVec(op.Rn); - Operand res; - if (op.RegisterSize == RegisterSize.Simd128) - { - res = context.AddIntrinsic(Intrinsic.X86Movhlps, n, n); - } - else - { - res = n; - } + Operand res = op.RegisterSize == RegisterSize.Simd128 ? context.AddIntrinsic(Intrinsic.X86Movhlps, n, n) : n; + res = context.AddIntrinsic(Intrinsic.X86Cvtps2pd, res); - res = context.AddIntrinsic(Intrinsic.X86Cvtps2pd, res); + context.Copy(GetVec(op.Rd), res); + } + else if (Optimizations.UseF16c && sizeF == 0) + { + Debug.Assert(!Optimizations.ForceLegacySse); + + Operand n = GetVec(op.Rn); + + Operand res = op.RegisterSize == RegisterSize.Simd128 ? context.AddIntrinsic(Intrinsic.X86Movhlps, n, n) : n; + res = context.AddIntrinsic(Intrinsic.X86Vcvtph2ps, res); context.Copy(GetVec(op.Rd), res); } @@ -210,17 +239,30 @@ namespace ARMeilleure.Instructions { Operand d = GetVec(op.Rd); - Operand res = context.VectorZeroUpper64(d); + Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128 ? 
Intrinsic.X86Movlhps : Intrinsic.X86Movhlps; Operand nInt = context.AddIntrinsic(Intrinsic.X86Cvtpd2ps, GetVec(op.Rn)); + nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt); - nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt); + Operand res = context.VectorZeroUpper64(d); + res = context.AddIntrinsic(movInst, res, nInt); - Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128 - ? Intrinsic.X86Movlhps - : Intrinsic.X86Movhlps; + context.Copy(d, res); + } + else if (Optimizations.UseF16c && sizeF == 0) + { + Debug.Assert(!Optimizations.ForceLegacySse); - res = context.AddIntrinsic(movInst, res, nInt); + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128 ? Intrinsic.X86Movlhps : Intrinsic.X86Movhlps; + + Operand nInt = context.AddIntrinsic(Intrinsic.X86Vcvtps2ph, n, Const(X86GetRoundControl(FPRoundingMode.ToNearest))); + nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt); + + Operand res = context.VectorZeroUpper64(d); + res = context.AddIntrinsic(movInst, res, nInt); context.Copy(d, res); } diff --git a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs index 7f89117005..cbfa8c71ab 100644 --- a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs +++ b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs @@ -151,6 +151,8 @@ namespace ARMeilleure.IntermediateRepresentation X86Unpckhps, X86Unpcklpd, X86Unpcklps, + X86Vcvtph2ps, + X86Vcvtps2ph, X86Xorpd, X86Xorps } diff --git a/ARMeilleure/Optimizations.cs b/ARMeilleure/Optimizations.cs index fbbbfdb261..f568eb055d 100644 --- a/ARMeilleure/Optimizations.cs +++ b/ARMeilleure/Optimizations.cs @@ -14,6 +14,7 @@ namespace ARMeilleure public static bool UseSse42IfAvailable { get; set; } = true; public static bool UsePopCntIfAvailable { get; set; } = true; public static bool UseAvxIfAvailable { get; set; } = true; + public static bool UseF16cIfAvailable { get; set; } 
= true; public static bool UseAesniIfAvailable { get; set; } = true; public static bool UsePclmulqdqIfAvailable { get; set; } = true; @@ -31,6 +32,7 @@ namespace ARMeilleure internal static bool UseSse42 => UseSse42IfAvailable && HardwareCapabilities.SupportsSse42; internal static bool UsePopCnt => UsePopCntIfAvailable && HardwareCapabilities.SupportsPopcnt; internal static bool UseAvx => UseAvxIfAvailable && HardwareCapabilities.SupportsAvx && !ForceLegacySse; + internal static bool UseF16c => UseF16cIfAvailable && HardwareCapabilities.SupportsF16c && !ForceLegacySse; /* The F16C conversions are registered as VEX-encoded instructions, so (like UseAvx) this is turned off when legacy SSE is forced; the F16C fast paths assert !ForceLegacySse. */ internal static bool UseAesni => UseAesniIfAvailable && HardwareCapabilities.SupportsAesni; internal static bool UsePclmulqdq => UsePclmulqdqIfAvailable && HardwareCapabilities.SupportsPclmulqdq; } diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs index dd1c44b23b..3baef401a6 100644 --- a/ARMeilleure/Translation/PTC/Ptc.cs +++ b/ARMeilleure/Translation/PTC/Ptc.cs @@ -1,5 +1,6 @@ using ARMeilleure.CodeGen; using ARMeilleure.CodeGen.Unwinding; +using ARMeilleure.CodeGen.X86; using ARMeilleure.Memory; using Ryujinx.Common.Configuration; using Ryujinx.Common.Logging; @@ -10,7 +11,6 @@ using System.Diagnostics; using System.IO; using System.IO.Compression; using System.Runtime.InteropServices; -using System.Runtime.Intrinsics.X86; using System.Runtime.Serialization.Formatters.Binary; using System.Threading; using System.Threading.Tasks; @@ -21,7 +21,7 @@ namespace ARMeilleure.Translation.PTC { private const string HeaderMagic = "PTChd"; - private const int InternalVersion = 1273; //! To be incremented manually for each change to the ARMeilleure project. + private const int InternalVersion = 1650; //! To be incremented manually for each change to the ARMeilleure project. 
private const string ActualDir = "0"; private const string BackupDir = "1"; @@ -646,21 +646,7 @@ namespace ARMeilleure.Translation.PTC private static ulong GetFeatureInfo() { - ulong featureInfo = 0ul; - - featureInfo |= (Sse3.IsSupported ? 1ul : 0ul) << 0; - featureInfo |= (Pclmulqdq.IsSupported ? 1ul : 0ul) << 1; - featureInfo |= (Ssse3.IsSupported ? 1ul : 0ul) << 9; - featureInfo |= (Fma.IsSupported ? 1ul : 0ul) << 12; - featureInfo |= (Sse41.IsSupported ? 1ul : 0ul) << 19; - featureInfo |= (Sse42.IsSupported ? 1ul : 0ul) << 20; - featureInfo |= (Popcnt.IsSupported ? 1ul : 0ul) << 23; - featureInfo |= (Aes.IsSupported ? 1ul : 0ul) << 25; - featureInfo |= (Avx.IsSupported ? 1ul : 0ul) << 28; - featureInfo |= (Sse.IsSupported ? 1ul : 0ul) << 57; - featureInfo |= (Sse2.IsSupported ? 1ul : 0ul) << 58; - - return featureInfo; + return (ulong)HardwareCapabilities.FeatureInfoEdx << 32 | (uint)HardwareCapabilities.FeatureInfoEcx; } private struct Header diff --git a/Ryujinx.Tests/Cpu/CpuTestSimd.cs b/Ryujinx.Tests/Cpu/CpuTestSimd.cs index 249447d765..1371de4b76 100644 --- a/Ryujinx.Tests/Cpu/CpuTestSimd.cs +++ b/Ryujinx.Tests/Cpu/CpuTestSimd.cs @@ -1973,15 +1973,18 @@ namespace Ryujinx.Tests.Cpu CompareAgainstUnicorn(); } - [Test, Pairwise] [Explicit] + [Test, Pairwise] [Explicit] // Unicorn seems to default all rounding modes to RMode.Rn. 
public void F_Cvt_S_SH([ValueSource("_F_Cvt_S_SH_")] uint opcodes, - [ValueSource("_1S_F_")] ulong a) + [ValueSource("_1S_F_")] ulong a, + [Values(RMode.Rn)] RMode rMode) { ulong z = TestContext.CurrentContext.Random.NextULong(); V128 v0 = MakeVectorE0E1(z, z); V128 v1 = MakeVectorE0(a); - SingleOpcode(opcodes, v0: v0, v1: v1); + int fpcr = (int)rMode << (int)Fpcr.RMode; + + SingleOpcode(opcodes, v0: v0, v1: v1, fpcr: fpcr); CompareAgainstUnicorn(); } @@ -2134,7 +2137,7 @@ namespace Ryujinx.Tests.Cpu CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Ofc | Fpsr.Ufc | Fpsr.Ixc | Fpsr.Idc); } - [Test, Pairwise] [Explicit] // Unicorn seems to default all rounding modes to RMode.Rn. + [Test, Pairwise] [Explicit] public void F_Cvtn_V_2D2S_2D4S([ValueSource("_F_Cvtn_V_2D2S_2D4S_")] uint opcodes, [Values(0u)] uint rd, [Values(1u, 0u)] uint rn,