From 9a49f8aec92f7707037f5d1e677078451d07036b Mon Sep 17 00:00:00 2001
From: riperiperi <rhy3756547@hotmail.com>
Date: Wed, 24 Jun 2020 01:43:44 +0100
Subject: [PATCH] Fix VMVN (immediate), Add VPMIN, VPMAX, VMVN (register)
 (#1303)

* Add Vmvn (register), tests for both Vmvn variants.

* Add Vpmin, Vpmax, improve Non-FastFp accuracy for Vpadd

* Rebase on top of PTC.

* Add Nop opcode

* Increment PTC version.

* Fix nits.
---
 ARMeilleure/Decoders/OpCodeTable.cs           | 12 ++-
 .../Instructions/InstEmitSimdArithmetic32.cs  | 62 ++++++++++++-
 .../Instructions/InstEmitSimdMove32.cs        | 18 +++-
 ARMeilleure/Instructions/InstName.cs          |  2 +
 ARMeilleure/Translation/PTC/Ptc.cs            |  2 +-
 Ryujinx.Tests/Cpu/CpuTestSimdMov32.cs         | 84 +++++++++++++++++-
 Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs         | 86 ++++++++++++++++---
 7 files changed, 243 insertions(+), 23 deletions(-)

diff --git a/ARMeilleure/Decoders/OpCodeTable.cs b/ARMeilleure/Decoders/OpCodeTable.cs
index ec7b8bd91c..8567e1ce69 100644
--- a/ARMeilleure/Decoders/OpCodeTable.cs
+++ b/ARMeilleure/Decoders/OpCodeTable.cs
@@ -704,6 +704,7 @@ namespace ARMeilleure.Decoders
             SetA32("<<<<0011111x0000xxxxxxxxxxxxxxxx", InstName.Mvn,     InstEmit32.Mvn,     typeof(OpCode32AluImm));
             SetA32("<<<<0001111x0000xxxxxxxxxxx0xxxx", InstName.Mvn,     InstEmit32.Mvn,     typeof(OpCode32AluRsImm));
             SetA32("<<<<0001111x0000xxxxxxxx0xx1xxxx", InstName.Mvn,     InstEmit32.Mvn,     typeof(OpCode32AluRsReg));
+            SetA32("<<<<0011001000001111000000000000", InstName.Nop,     InstEmit32.Nop,     typeof(OpCode32));
             SetA32("<<<<0011100xxxxxxxxxxxxxxxxxxxxx", InstName.Orr,     InstEmit32.Orr,     typeof(OpCode32AluImm));
             SetA32("<<<<0001100xxxxxxxxxxxxxxxx0xxxx", InstName.Orr,     InstEmit32.Orr,     typeof(OpCode32AluRsImm));
             SetA32("<<<<0001100xxxxxxxxxxxxx0xx1xxxx", InstName.Orr,     InstEmit32.Orr,     typeof(OpCode32AluRsReg));
@@ -878,9 +879,10 @@ namespace ARMeilleure.Decoders
             SetA32("1111001x1x<<xxxxxxx01010x1x0xxxx", InstName.Vmull,    InstEmit32.Vmull_1,  typeof(OpCode32SimdRegElemLong));
             SetA32("1111001x1x<<xxxxxxx01100x0x0xxxx", InstName.Vmull,    InstEmit32.Vmull_I,  typeof(OpCode32SimdRegLong));
             SetA32("111100101x00xxxxxxx01110x0x0xxxx", InstName.Vmull,    InstEmit32.Vmull_I,  typeof(OpCode32SimdRegLong)); // Polynomial
-            SetA32("1111001x1x000xxxxxxx0xx00x11xxxx", InstName.Vmvn,     InstEmit32.Vmvn_I,   typeof(OpCode32SimdImm)); // D/Q vector I32.
-            SetA32("1111001x1x000xxxxxxx10x00x11xxxx", InstName.Vmvn,     InstEmit32.Vmvn_I,   typeof(OpCode32SimdImm));
-            SetA32("1111001x1x000xxxxxxx110x0x11xxxx", InstName.Vmvn,     InstEmit32.Vmvn_I,   typeof(OpCode32SimdImm));
+            SetA32("111100111x110000xxxx01011xx0xxxx", InstName.Vmvn,     InstEmit32.Vmvn_I,   typeof(OpCode32SimdBinary));
+            SetA32("1111001x1x000xxxxxxx0xx00x11xxxx", InstName.Vmvn,     InstEmit32.Vmvn_II,  typeof(OpCode32SimdImm)); // D/Q vector I32.
+            SetA32("1111001x1x000xxxxxxx10x00x11xxxx", InstName.Vmvn,     InstEmit32.Vmvn_II,  typeof(OpCode32SimdImm));
+            SetA32("1111001x1x000xxxxxxx110x0x11xxxx", InstName.Vmvn,     InstEmit32.Vmvn_II,  typeof(OpCode32SimdImm));
             SetA32("<<<<11101x110001xxxx101x01x0xxxx", InstName.Vneg,     InstEmit32.Vneg_S,   typeof(OpCode32SimdS));
             SetA32("111100111x11xx01xxxx0x111xx0xxxx", InstName.Vneg,     InstEmit32.Vneg_V,   typeof(OpCode32Simd));
             SetA32("<<<<11100x01xxxxxxxx101xx1x0xxxx", InstName.Vnmla,    InstEmit32.Vnmla_S,  typeof(OpCode32SimdRegS));
@@ -890,6 +892,10 @@ namespace ARMeilleure.Decoders
             SetA32("1111001x1x000xxxxxxx0xx10x01xxxx", InstName.Vorr,     InstEmit32.Vorr_II,  typeof(OpCode32SimdImm));
             SetA32("111100100x<<xxxxxxxx1011x0x1xxxx", InstName.Vpadd,    InstEmit32.Vpadd_I,  typeof(OpCode32SimdReg));
             SetA32("111100110x00xxxxxxxx1101x0x0xxxx", InstName.Vpadd,    InstEmit32.Vpadd_V,  typeof(OpCode32SimdReg));
+            SetA32("1111001x0x<<xxxxxxxx1010x0x0xxxx", InstName.Vpmax,    InstEmit32.Vpmax_I,  typeof(OpCode32SimdReg));
+            SetA32("111100110x00xxxxxxxx1111x0x0xxxx", InstName.Vpmax,    InstEmit32.Vpmax_V,  typeof(OpCode32SimdReg));
+            SetA32("1111001x0x<<xxxxxxxx1010x0x1xxxx", InstName.Vpmin,    InstEmit32.Vpmin_I,  typeof(OpCode32SimdReg));
+            SetA32("111100110x10xxxxxxxx1111x0x0xxxx", InstName.Vpmin,    InstEmit32.Vpmin_V,  typeof(OpCode32SimdReg));
             SetA32("1111001x1x>>>xxxxxxx100101x1xxx0", InstName.Vqrshrn,  InstEmit32.Vqrshrn,  typeof(OpCode32SimdShImmNarrow));
             SetA32("111100111x>>>xxxxxxx100001x1xxx0", InstName.Vqrshrun, InstEmit32.Vqrshrun, typeof(OpCode32SimdShImmNarrow));
             SetA32("111100111x111011xxxx010x0xx0xxxx", InstName.Vrecpe,   InstEmit32.Vrecpe,   typeof(OpCode32SimdSqrte));
diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
index fdc1bb469f..82f57d63e1 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
@@ -817,7 +817,7 @@ namespace ARMeilleure.Instructions
             }
             else
             {
-                EmitVectorPairwiseOpF32(context, (op1, op2) => context.Add(op1, op2));
+                EmitVectorPairwiseOpF32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPAddFpscr), op1, op2));
             }
         }
 
@@ -835,6 +835,66 @@ namespace ARMeilleure.Instructions
             }
         }
 
+        public static void Vpmax_V(ArmEmitterContext context)
+        {
+            if (Optimizations.FastFP && Optimizations.UseSse2) // Fast path: pairwise op built on the MAXPS intrinsic.
+            {
+                EmitSse2VectorPairwiseOpF32(context, Intrinsic.X86Maxps);
+            }
+            else // Otherwise go through the soft-float helper, which is handed only the method name.
+            {
+                EmitVectorPairwiseOpF32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMaxFpscr), op1, op2));
+            }
+        }
+
+        public static void Vpmax_I(ArmEmitterContext context)
+        {
+            // Integer VPMAX: op.U picks the unsigned or signed flavour on either code path.
+            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+
+            if (Optimizations.UseSsse3)
+            {
+                EmitSsse3VectorPairwiseOp32(context, op.U ? X86PmaxuInstruction : X86PmaxsInstruction);
+                return;
+            }
+
+            EmitVectorPairwiseOpI32(context, (n, m) =>
+            {
+                Operand nIsGreater = op.U ? context.ICompareGreaterUI(n, m) : context.ICompareGreater(n, m);
+                return context.ConditionalSelect(nIsGreater, n, m);
+            }, !op.U);
+        }
+
+        public static void Vpmin_V(ArmEmitterContext context)
+        {
+            // F32 VPMIN: MINPS-based pairwise op when FastFP and SSE2 are both enabled, soft-float call otherwise.
+            if (Optimizations.FastFP && Optimizations.UseSse2)
+            {
+                EmitSse2VectorPairwiseOpF32(context, Intrinsic.X86Minps);
+                return;
+            }
+
+            EmitVectorPairwiseOpF32(context, (n, m) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMinFpscr), n, m));
+        }
+
+        public static void Vpmin_I(ArmEmitterContext context)
+        {
+            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+            // op.U picks the unsigned or signed flavour on both the SSSE3 and the fallback path.
+            if (Optimizations.UseSsse3)
+            {
+                EmitSsse3VectorPairwiseOp32(context, op.U ? X86PminuInstruction : X86PminsInstruction);
+            }
+            else
+            {
+                EmitVectorPairwiseOpI32(context, (op1, op2) =>
+                {
+                    Operand less = op.U ? context.ICompareLessUI(op1, op2) : context.ICompareLess(op1, op2);
+                    return context.ConditionalSelect(less, op1, op2);
+                }, !op.U);
+            }
+        }
+
         public static void Vrev(ArmEmitterContext context)
         {
             OpCode32SimdRev op = (OpCode32SimdRev)context.CurrOp;
diff --git a/ARMeilleure/Instructions/InstEmitSimdMove32.cs b/ARMeilleure/Instructions/InstEmitSimdMove32.cs
index f11f9cc593..b484381fe8 100644
--- a/ARMeilleure/Instructions/InstEmitSimdMove32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdMove32.cs
@@ -34,7 +34,23 @@ namespace ARMeilleure.Instructions
 
         public static void Vmvn_I(ArmEmitterContext context)
         {
-            EmitVectorImmUnaryOp32(context, (op1) => context.BitwiseExclusiveOr(op1, op1));
+            if (Optimizations.UseSse2) // Vmvn_I is now the register form (decoded as OpCode32SimdBinary); the immediate form is Vmvn_II.
+            {
+                EmitVectorUnaryOpSimd32(context, (op1) =>
+                {
+                    Operand mask = X86GetAllElements(context, -1L); // -1L has every bit set; presumably replicated across the vector by the helper.
+                    return context.AddIntrinsic(Intrinsic.X86Pandn, op1, mask); // x86 PANDN is (NOT first) AND second, so an all-ones mask yields ~op1.
+                });
+            }
+            else
+            {
+                EmitVectorUnaryOpZx32(context, (op1) => context.BitwiseNot(op1)); // Non-SSE2 path: bitwise NOT applied through the unary-op helper.
+            }
+        }
+
+        public static void Vmvn_II(ArmEmitterContext context)
+        {
+            EmitVectorImmUnaryOp32(context, value => context.BitwiseNot(value)); // Immediate form: bitwise NOT of the operand the helper supplies.
         }
 
         public static void Vmov_GS(ArmEmitterContext context)
diff --git a/ARMeilleure/Instructions/InstName.cs b/ARMeilleure/Instructions/InstName.cs
index 9bf319aaf5..e4d084560d 100644
--- a/ARMeilleure/Instructions/InstName.cs
+++ b/ARMeilleure/Instructions/InstName.cs
@@ -582,6 +582,8 @@ namespace ARMeilleure.Instructions
         Vnmls,
         Vorr,
         Vpadd,
+        Vpmax,
+        Vpmin,
         Vqrshrn,
         Vqrshrun,
         Vrev,
diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs
index 135a45f035..2b4059ec29 100644
--- a/ARMeilleure/Translation/PTC/Ptc.cs
+++ b/ARMeilleure/Translation/PTC/Ptc.cs
@@ -20,7 +20,7 @@ namespace ARMeilleure.Translation.PTC
     {
         private const string HeaderMagic = "PTChd";
 
-        private const int InternalVersion = 1; //! To be incremented manually for each change to the ARMeilleure project.
+        private const int InternalVersion = 2; //! To be incremented manually for each change to the ARMeilleure project.
 
         private const string BaseDir = "Ryujinx";
 
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdMov32.cs b/Ryujinx.Tests/Cpu/CpuTestSimdMov32.cs
index 8c9627ce0f..aba0f02dbf 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimdMov32.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdMov32.cs
@@ -39,8 +39,8 @@ namespace Ryujinx.Tests.Cpu
                 0b1110_1
             };
 
-
             uint opcode = 0xf2800010u; // VMOV.I32 D0, #0
+
             uint cmodeOp = variants[variant];
 
             if (q)
@@ -49,11 +49,11 @@ namespace Ryujinx.Tests.Cpu
             }
 
             opcode |= ((cmodeOp & 1) << 5) | ((cmodeOp & 0x1e) << 7);
-            opcode |= ((q ? 1u : 0u) << 6);
+            opcode |= (q ? 1u : 0u) << 6;
             opcode |= (imm & 0xf) | ((imm & 0x70) << 12) | ((imm & 0x80) << 16);
 
-            opcode |= ((vd & 0x10) << 18);
-            opcode |= ((vd & 0xf) << 12);
+            opcode |= (vd & 0x10) << 18;
+            opcode |= (vd & 0xf) << 12;
 
             SingleOpcode(opcode);
 
@@ -258,6 +258,82 @@ namespace Ryujinx.Tests.Cpu
             CompareAgainstUnicorn();
         }
 
+        [Test, Pairwise, Description("VMVN.<size> <Vd>, <Vm>")]
+        public void Vmvn([Values(0u)] uint size, // Bits 19:18 must stay 00: the VMVN (register) decode pattern fixes them.
+                         [Values(0u, 1u, 2u, 3u)] uint vd,
+                         [Values(0u, 2u, 4u, 8u)] uint vm,
+                         [Values] bool q)
+        {
+            uint opcode = 0xf3b00580u; // VMVN D0, D0
+
+            if (q) // Q form: set bit 6 and double both register indices.
+            {
+                opcode |= 1 << 6;
+                vm <<= 1;
+                vd <<= 1;
+            }
+
+            opcode |= (size & 0x3) << 18;
+            opcode |= (vm & 0x10) << 1;
+            opcode |= (vm & 0xf) << 0;
+
+            opcode |= (vd & 0x10) << 18;
+            opcode |= (vd & 0xf) << 12;
+
+            V128 v0 = new V128(TestContext.CurrentContext.Random.NextULong(), TestContext.CurrentContext.Random.NextULong());
+            V128 v1 = new V128(TestContext.CurrentContext.Random.NextULong(), TestContext.CurrentContext.Random.NextULong());
+            V128 v2 = new V128(TestContext.CurrentContext.Random.NextULong(), TestContext.CurrentContext.Random.NextULong());
+            V128 v3 = new V128(TestContext.CurrentContext.Random.NextULong(), TestContext.CurrentContext.Random.NextULong());
+
+            SingleOpcode(opcode, v0: v0, v1: v1, v2: v2, v3: v3);
+
+            CompareAgainstUnicorn();
+        }
+
+        [Test, Pairwise, Description("VMVN.I<size> <Dd/Qd>, #<imm>")]
+        public void Mvni_V([Range(0u, 7u)] uint variant,
+                           [Values(0u, 1u, 2u, 3u)] uint vd,
+                           [Values(0x0u)] [Random(1u, 0xffu, RndCntImm)] uint imm,
+                           [Values] bool q)
+        {
+            uint[] variants =
+            {
+                // I32
+                0b0000,
+                0b0010,
+                0b0100,
+                0b0110,
+
+                // I16
+                0b1000,
+                0b1010,
+
+                // I32
+                0b1100,
+                0b1101,
+            };
+
+            uint opcode = 0xf2800030u; // VMVN.I32 D0, #0
+
+            uint cmodeOp = variants[variant];
+
+            if (q)
+            {
+                vd <<= 1;
+            }
+            // Field layout: cmode -> bits 11:8, Q -> bit 6, imm8 split as imm4 (3:0), imm3 (18:16) and i (bit 24; bit 23 is fixed to 1).
+            opcode |= (cmodeOp & 0xf) << 8;
+            opcode |= (q ? 1u : 0u) << 6;
+            opcode |= (imm & 0xf) | ((imm & 0x70) << 12) | ((imm & 0x80) << 17);
+
+            opcode |= (vd & 0x10) << 18;
+            opcode |= (vd & 0xf) << 12;
+
+            SingleOpcode(opcode);
+
+            CompareAgainstUnicorn();
+        }
+
         [Test, Pairwise, Description("VTRN.<size> <Vd>, <Vm>")]
         public void Vtrn([Values(0u, 1u, 2u, 3u)] uint vm,
                          [Values(0u, 1u, 2u, 3u)] uint vd,
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs b/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs
index 66db63bc49..1581e85044 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs
@@ -12,6 +12,31 @@ namespace Ryujinx.Tests.Cpu
     {
 #if SimdReg32
 
+#region "ValueSource (Opcodes)"
+        // Opcode templates consumed by Vp_Add_Max_Min_F through [ValueSource].
+        // Every register field is D0 here; the test ORs the actual
+        // rd/rn/rm numbers into the opcode afterwards.
+        private static uint[] _Vp_Add_Max_Min_F_() => new uint[]
+        {
+            0xf3000d00u, // VPADD.F32 D0, D0, D0
+            0xf3000f00u, // VPMAX.F32 D0, D0, D0
+            0xf3200f00u // VPMIN.F32 D0, D0, D0
+        };
+
+        // VPADD has no unsigned (U) bit, so Vp_Add_Max_Min_I compares the opcode against this template before setting bit 24.
+        private const uint VpaddI8 = 0xf2000b10u; // VPADD.I8 D0, D0, D0
+
+        // Opcode templates consumed by Vp_Add_Max_Min_I through [ValueSource].
+        // All are the 8-bit forms on D0; the test ORs in the size field,
+        // the register numbers and (except for VPADD) the U bit.
+        private static uint[] _Vp_Add_Max_Min_I_() => new uint[]
+        {
+            VpaddI8,
+            0xf2000a00u, // VPMAX.S8 D0, D0, D0
+            0xf2000a10u // VPMIN.S8 D0, D0, D0
+        };
+#endregion
+
 #region "ValueSource (Types)"
         private static ulong[] _1B1H1S1D_()
         {
@@ -296,7 +321,7 @@ namespace Ryujinx.Tests.Cpu
         {
             uint opcode = 0xf2800a00u; // VMLSL.S8 Q0, D0, D0
 
-            opcode |= ((rm & 0xf) << 0) | ((rm & 0x10) << 1);
+            opcode |= ((rm & 0xf) << 0)  | ((rm & 0x10) << 1);
             opcode |= ((rd & 0xf) << 12) | ((rd & 0x10) << 18);
             opcode |= ((rn & 0xf) << 16) | ((rn & 0x10) << 3);
 
@@ -329,7 +354,7 @@ namespace Ryujinx.Tests.Cpu
         {
             uint opcode = 0xf2800c00u; // VMULL.S8 Q0, D0, D0
 
-            opcode |= ((rm & 0xf) << 0) | ((rm & 0x10) << 1);
+            opcode |= ((rm & 0xf) << 0)  | ((rm & 0x10) << 1);
             opcode |= ((rd & 0xf) << 12) | ((rd & 0x10) << 18);
             opcode |= ((rn & 0xf) << 16) | ((rn & 0x10) << 3);
 
@@ -381,7 +406,7 @@ namespace Ryujinx.Tests.Cpu
                 opcode |= 1 << 24;
             }
 
-            opcode |= ((rm & 0xf) << 0) | ((rm & 0x10) << 1);
+            opcode |= ((rm & 0xf) << 0)  | ((rm & 0x10) << 1);
             opcode |= ((rd & 0xf) << 12) | ((rd & 0x10) << 18);
             opcode |= ((rn & 0xf) << 16) | ((rn & 0x10) << 3);
 
@@ -397,22 +422,57 @@ namespace Ryujinx.Tests.Cpu
         }
 
         [Explicit]
-        [Test, Pairwise, Description("VPADD.f32 V0, V0, V0")]
-        public void Vpadd_f32([Values(0u)] uint rd,
-                              [Range(0u, 7u)] uint rn,
-                              [Range(0u, 7u)] uint rm)
+        [Test, Pairwise] // Base opcode now comes from _Vp_Add_Max_Min_F_ (VPADD/VPMAX/VPMIN .F32 templates).
+        public void Vp_Add_Max_Min_F([ValueSource("_Vp_Add_Max_Min_F_")] uint opcode,
+                                     [Values(0u)] uint rd,
+                                     [Range(0u, 7u)] uint rn,
+                                     [Range(0u, 7u)] uint rm,
+                                     [ValueSource("_2S_F_")] ulong z0,
+                                     [ValueSource("_2S_F_")] ulong z1,
+                                     [ValueSource("_2S_F_")] ulong a0,
+                                     [ValueSource("_2S_F_")] ulong a1,
+                                     [ValueSource("_2S_F_")] ulong b0,
+                                     [ValueSource("_2S_F_")] ulong b1)
         {
-            // not currently a slow path test - just a sanity check for pairwise
-            uint opcode = 0xf3000d00u; // VPADD.F32 D0, D0, D0
-
             opcode |= ((rm & 0xf) << 0)  | ((rm & 0x10) << 1);
             opcode |= ((rd & 0xf) << 12) | ((rd & 0x10) << 18);
             opcode |= ((rn & 0xf) << 16) | ((rn & 0x10) << 3);
 
             var rnd = TestContext.CurrentContext.Random;
-            V128 v0 = new V128(rnd.NextFloat(int.MinValue, int.MaxValue), rnd.NextFloat(int.MinValue, int.MaxValue), rnd.NextFloat(int.MinValue, int.MaxValue), rnd.NextFloat(int.MinValue, int.MaxValue));
-            V128 v1 = new V128(rnd.NextFloat(int.MinValue, int.MaxValue), rnd.NextFloat(int.MinValue, int.MaxValue), rnd.NextFloat(int.MinValue, int.MaxValue), rnd.NextFloat(int.MinValue, int.MaxValue));
-            V128 v2 = new V128(rnd.NextFloat(int.MinValue, int.MaxValue), rnd.NextFloat(int.MinValue, int.MaxValue), rnd.NextFloat(int.MinValue, int.MaxValue), rnd.NextFloat(int.MinValue, int.MaxValue));
+            V128 v0 = MakeVectorE0E1(z0, z1); // Inputs now come from the _2S_F_ value-source parameters. NOTE(review): 'rnd' above is no longer read in this method.
+            V128 v1 = MakeVectorE0E1(a0, a1);
+            V128 v2 = MakeVectorE0E1(b0, b1);
+
+            SingleOpcode(opcode, v0: v0, v1: v1, v2: v2);
+
+            CompareAgainstUnicorn(); // Compares the result of the run above against the Unicorn reference.
+        }
+
+        [Test, Pairwise]
+        public void Vp_Add_Max_Min_I([ValueSource("_Vp_Add_Max_Min_I_")] uint opcode,
+                                     [Values(0u)] uint rd,
+                                     [Range(0u, 5u)] uint rn,
+                                     [Range(0u, 5u)] uint rm,
+                                     [Values(0u, 1u, 2u)] uint size,
+                                     [Random(RndCnt)] ulong z,
+                                     [Random(RndCnt)] ulong a,
+                                     [Random(RndCnt)] ulong b,
+                                     [Values] bool u)
+        {
+            if (u && opcode != VpaddI8)
+            {
+                opcode |= 1 << 24;
+            }
+
+            opcode |= ((rm & 0xf) << 0)  | ((rm & 0x10) << 1);
+            opcode |= ((rd & 0xf) << 12) | ((rd & 0x10) << 18);
+            opcode |= ((rn & 0xf) << 16) | ((rn & 0x10) << 3);
+
+            opcode |= size << 20;
+
+            V128 v0 = MakeVectorE0E1(z, z);
+            V128 v1 = MakeVectorE0E1(a, z);
+            V128 v2 = MakeVectorE0E1(b, z);
 
             SingleOpcode(opcode, v0: v0, v1: v1, v2: v2);