From b8ee5b15abc750e0484195633e6c4bb6e05eab6f Mon Sep 17 00:00:00 2001 From: gdkchan <gab.dark.100@gmail.com> Date: Sat, 29 Feb 2020 17:51:17 -0300 Subject: [PATCH] Implement FACGE and FACGT (Scalar and Vector) AArch64 SIMD instructions (#956) --- ARMeilleure/Decoders/OpCodeTable.cs | 6 +- ARMeilleure/Instructions/InstEmitSimdCmp.cs | 75 ++++++++++++++++++++- ARMeilleure/Instructions/InstName.cs | 4 ++ Ryujinx.Tests/Cpu/CpuTestSimdReg.cs | 58 +++++++++------- 4 files changed, 115 insertions(+), 28 deletions(-) diff --git a/ARMeilleure/Decoders/OpCodeTable.cs b/ARMeilleure/Decoders/OpCodeTable.cs index 3915ac87bb..96847dd8ea 100644 --- a/ARMeilleure/Decoders/OpCodeTable.cs +++ b/ARMeilleure/Decoders/OpCodeTable.cs @@ -244,6 +244,10 @@ namespace ARMeilleure.Decoders SetA64("0>1011101<1xxxxx110101xxxxxxxxxx", InstName.Fabd_V, InstEmit.Fabd_V, typeof(OpCodeSimdReg)); SetA64("000111100x100000110000xxxxxxxxxx", InstName.Fabs_S, InstEmit.Fabs_S, typeof(OpCodeSimd)); SetA64("0>0011101<100000111110xxxxxxxxxx", InstName.Fabs_V, InstEmit.Fabs_V, typeof(OpCodeSimd)); + SetA64("011111100x1xxxxx111011xxxxxxxxxx", InstName.Facge_S, InstEmit.Facge_S, typeof(OpCodeSimdReg)); + SetA64("0>1011100<1xxxxx111011xxxxxxxxxx", InstName.Facge_V, InstEmit.Facge_V, typeof(OpCodeSimdReg)); + SetA64("011111101x1xxxxx111011xxxxxxxxxx", InstName.Facgt_S, InstEmit.Facgt_S, typeof(OpCodeSimdReg)); + SetA64("0>1011101<1xxxxx111011xxxxxxxxxx", InstName.Facgt_V, InstEmit.Facgt_V, typeof(OpCodeSimdReg)); SetA64("000111100x1xxxxx001010xxxxxxxxxx", InstName.Fadd_S, InstEmit.Fadd_S, typeof(OpCodeSimdReg)); SetA64("0>0011100<1xxxxx110101xxxxxxxxxx", InstName.Fadd_V, InstEmit.Fadd_V, typeof(OpCodeSimdReg)); SetA64("011111100x110000110110xxxxxxxxxx", InstName.Faddp_S, InstEmit.Faddp_S, typeof(OpCodeSimd)); @@ -751,7 +755,7 @@ namespace ARMeilleure.Decoders SetA32("111100110x11xxxxxxxx0001xxx1xxxx", InstName.Vbif, InstEmit32.Vbif, typeof(OpCode32SimdBinary)); SetA32("111100110x10xxxxxxxx0001xxx1xxxx", InstName.Vbit, InstEmit32.Vbit, typeof(OpCode32SimdBinary)); SetA32("111100110x01xxxxxxxx0001xxx1xxxx", InstName.Vbsl, InstEmit32.Vbsl, typeof(OpCode32SimdBinary)); - SetA32("111100110x<<xxxxxxxx1000xxx1xxxx", InstName.Vceq, InstEmit32.Vceq_I, typeof(OpCode32SimdReg)); + SetA32("111100110x<<xxxxxxxx1000xxx1xxxx", InstName.Vceq, InstEmit32.Vceq_I, typeof(OpCode32SimdReg)); SetA32("111100100x00xxxxxxxx1110xxx0xxxx", InstName.Vceq, InstEmit32.Vceq_V, typeof(OpCode32SimdReg)); SetA32("111100111x11xx01xxxx0x010xx0xxxx", InstName.Vceq, InstEmit32.Vceq_Z, typeof(OpCode32SimdCmpZ)); SetA32("1111001x0x<<xxxxxxxx0011xxx1xxxx", InstName.Vcge, InstEmit32.Vcge_I, typeof(OpCode32SimdReg)); diff --git a/ARMeilleure/Instructions/InstEmitSimdCmp.cs b/ARMeilleure/Instructions/InstEmitSimdCmp.cs index e70f56a0a3..d11adf1943 100644 --- a/ARMeilleure/Instructions/InstEmitSimdCmp.cs +++ b/ARMeilleure/Instructions/InstEmitSimdCmp.cs @@ -286,6 +286,54 @@ namespace ARMeilleure.Instructions EmitCmtstOp(context, scalar: false); } + public static void Facge_S(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitSse2CmpOpF(context, CmpCondition.GreaterThanOrEqual, scalar: true, absolute: true); + } + else + { + EmitCmpOpF(context, SoftFloat32.FPCompareGE, SoftFloat64.FPCompareGE, scalar: true, absolute: true); + } + } + + public static void Facge_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitSse2CmpOpF(context, CmpCondition.GreaterThanOrEqual, scalar: false, absolute: true); + } + else + { + EmitCmpOpF(context, SoftFloat32.FPCompareGE, SoftFloat64.FPCompareGE, scalar: false, absolute: true); + } + } + + public static void Facgt_S(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitSse2CmpOpF(context, CmpCondition.GreaterThan, scalar: true, absolute: true); + } + else + { + EmitCmpOpF(context, SoftFloat32.FPCompareGT, SoftFloat64.FPCompareGT, scalar: true, absolute: true); + } + } + + public static void Facgt_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitSse2CmpOpF(context, CmpCondition.GreaterThan, scalar: false, absolute: true); + } + else + { + EmitCmpOpF(context, SoftFloat32.FPCompareGT, SoftFloat64.FPCompareGT, scalar: false, absolute: true); + } + } + public static void Fccmp_S(ArmEmitterContext context) { EmitFccmpOrFccmpe(context, signalNaNs: false); @@ -639,7 +687,8 @@ namespace ARMeilleure.Instructions ArmEmitterContext context, _F32_F32_F32 f32, _F64_F64_F64 f64, - bool scalar) + bool scalar, + bool absolute = false) { OpCodeSimd op = (OpCodeSimd)context.CurrOp; @@ -665,6 +714,12 @@ namespace ARMeilleure.Instructions me = sizeF == 0 ? ConstF(0f) : ConstF(0d); } + if (absolute) + { + ne = EmitUnaryMathCall(context, MathF.Abs, Math.Abs, ne); + me = EmitUnaryMathCall(context, MathF.Abs, Math.Abs, me); + } + Operand e = EmitSoftFloatCall(context, f32, f64, ne, me); res = context.VectorInsert(res, e, index); @@ -673,7 +728,7 @@ namespace ARMeilleure.Instructions context.Copy(GetVec(op.Rd), res); } - private static void EmitSse2CmpOpF(ArmEmitterContext context, CmpCondition cond, bool scalar) + private static void EmitSse2CmpOpF(ArmEmitterContext context, CmpCondition cond, bool scalar, bool absolute = false) { OpCodeSimd op = (OpCodeSimd)context.CurrOp; @@ -684,6 +739,14 @@ namespace ARMeilleure.Instructions if (sizeF == 0) { + if (absolute) + { + Operand mask = scalar ? X86GetScalar(context, int.MaxValue) : X86GetAllElements(context, int.MaxValue); + + n = context.AddIntrinsic(Intrinsic.X86Andps, n, mask); + m = context.AddIntrinsic(Intrinsic.X86Andps, m, mask); + } + Intrinsic inst = scalar ? Intrinsic.X86Cmpss : Intrinsic.X86Cmpps; Operand res = context.AddIntrinsic(inst, n, m, Const((int)cond)); @@ -701,6 +764,14 @@ namespace ARMeilleure.Instructions } else /* if (sizeF == 1) */ { + if (absolute) + { + Operand mask = scalar ? X86GetScalar(context, long.MaxValue) : X86GetAllElements(context, long.MaxValue); + + n = context.AddIntrinsic(Intrinsic.X86Andpd, n, mask); + m = context.AddIntrinsic(Intrinsic.X86Andpd, m, mask); + } + Intrinsic inst = scalar ? Intrinsic.X86Cmpsd : Intrinsic.X86Cmppd; Operand res = context.AddIntrinsic(inst, n, m, Const((int)cond)); diff --git a/ARMeilleure/Instructions/InstName.cs b/ARMeilleure/Instructions/InstName.cs index 0c2dd18d49..157feacfa0 100644 --- a/ARMeilleure/Instructions/InstName.cs +++ b/ARMeilleure/Instructions/InstName.cs @@ -152,6 +152,10 @@ namespace ARMeilleure.Instructions Fabd_V, Fabs_S, Fabs_V, + Facge_S, + Facge_V, + Facgt_S, + Facgt_V, Fadd_S, Fadd_V, Faddp_S, diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs b/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs index 9b767db408..a545838273 100644 --- a/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs +++ b/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs @@ -259,40 +259,48 @@ namespace Ryujinx.Tests.Cpu }; } - private static uint[] _F_Cm_EqGeGt_S_S_() + private static uint[] _F_AcCm_EqGeGt_S_S_() { return new uint[] { + 0x7E22EC20u, // FACGE S0, S1, S2 + 0x7EA2EC20u, // FACGT S0, S1, S2 0x5E22E420u, // FCMEQ S0, S1, S2 0x7E22E420u, // FCMGE S0, S1, S2 0x7EA2E420u // FCMGT S0, S1, S2 }; } - private static uint[] _F_Cm_EqGeGt_S_D_() + private static uint[] _F_AcCm_EqGeGt_S_D_() { return new uint[] { + 0x7E62EC20u, // FACGE D0, D1, D2 + 0x7EE2EC20u, // FACGT D0, D1, D2 0x5E62E420u, // FCMEQ D0, D1, D2 0x7E62E420u, // FCMGE D0, D1, D2 0x7EE2E420u // FCMGT D0, D1, D2 }; } - private static uint[] _F_Cm_EqGeGt_V_2S_4S_() + private static uint[] _F_AcCm_EqGeGt_V_2S_4S_() { return new uint[] { + 0x2E20EC00u, // FACGE V0.2S, V0.2S, V0.2S + 0x2EA0EC00u, // FACGT V0.2S, V0.2S, V0.2S 0x0E20E400u, // FCMEQ V0.2S, V0.2S, V0.2S 0x2E20E400u, // FCMGE V0.2S, V0.2S, V0.2S 0x2EA0E400u // FCMGT V0.2S, V0.2S, V0.2S }; } - private static uint[] _F_Cm_EqGeGt_V_2D_() + private static uint[] _F_AcCm_EqGeGt_V_2D_() { return new uint[] { + 0x6E60EC00u, // FACGE V0.2D, V0.2D, V0.2D + 0x6EE0EC00u, // FACGT V0.2D, V0.2D, V0.2D 0x4E60E400u, // FCMEQ V0.2D, V0.2D, V0.2D 0x6E60E400u, // FCMGE V0.2D, V0.2D, V0.2D 0x6EE0E400u // FCMGT V0.2D, V0.2D, V0.2D @@ -1429,9 +1437,9 @@ namespace Ryujinx.Tests.Cpu } [Test, Pairwise] [Explicit] - public void F_Cm_EqGeGt_S_S([ValueSource("_F_Cm_EqGeGt_S_S_")] uint opcodes, - [ValueSource("_1S_F_")] ulong a, - [ValueSource("_1S_F_")] ulong b) + public void F_AcCm_EqGeGt_S_S([ValueSource("_F_AcCm_EqGeGt_S_S_")] uint opcodes, + [ValueSource("_1S_F_")] ulong a, + [ValueSource("_1S_F_")] ulong b) { ulong z = TestContext.CurrentContext.Random.NextULong(); V128 v0 = MakeVectorE0E1(z, z); @@ -1448,9 +1456,9 @@ namespace Ryujinx.Tests.Cpu } [Test, Pairwise] [Explicit] - public void F_Cm_EqGeGt_S_D([ValueSource("_F_Cm_EqGeGt_S_D_")] uint opcodes, - [ValueSource("_1D_F_")] ulong a, - [ValueSource("_1D_F_")] ulong b) + public void F_AcCm_EqGeGt_S_D([ValueSource("_F_AcCm_EqGeGt_S_D_")] uint opcodes, + [ValueSource("_1D_F_")] ulong a, + [ValueSource("_1D_F_")] ulong b) { ulong z = TestContext.CurrentContext.Random.NextULong(); V128 v0 = MakeVectorE1(z); @@ -1467,14 +1475,14 @@ namespace Ryujinx.Tests.Cpu } [Test, Pairwise] [Explicit] - public void F_Cm_EqGeGt_V_2S_4S([ValueSource("_F_Cm_EqGeGt_V_2S_4S_")] uint opcodes, - [Values(0u)] uint rd, - [Values(1u, 0u)] uint rn, - [Values(2u, 0u)] uint rm, - [ValueSource("_2S_F_")] ulong z, - [ValueSource("_2S_F_")] ulong a, - [ValueSource("_2S_F_")] ulong b, - [Values(0b0u, 0b1u)] uint q) // <2S, 4S> + public void F_AcCm_EqGeGt_V_2S_4S([ValueSource("_F_AcCm_EqGeGt_V_2S_4S_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 0u)] uint rn, + [Values(2u, 0u)] uint rm, + [ValueSource("_2S_F_")] ulong z, + [ValueSource("_2S_F_")] ulong a, + [ValueSource("_2S_F_")] ulong b, + [Values(0b0u, 0b1u)] uint q) // <2S, 4S> { opcodes |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); opcodes |= ((q & 1) << 30); @@ -1493,13 +1501,13 @@ namespace Ryujinx.Tests.Cpu } [Test, Pairwise] [Explicit] - public void F_Cm_EqGeGt_V_2D([ValueSource("_F_Cm_EqGeGt_V_2D_")] uint opcodes, - [Values(0u)] uint rd, - [Values(1u, 0u)] uint rn, - [Values(2u, 0u)] uint rm, - [ValueSource("_1D_F_")] ulong z, - [ValueSource("_1D_F_")] ulong a, - [ValueSource("_1D_F_")] ulong b) + public void F_AcCm_EqGeGt_V_2D([ValueSource("_F_AcCm_EqGeGt_V_2D_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 0u)] uint rn, + [Values(2u, 0u)] uint rm, + [ValueSource("_1D_F_")] ulong z, + [ValueSource("_1D_F_")] ulong a, + [ValueSource("_1D_F_")] ulong b) { opcodes |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0);