From 88619d71b8e4840218c68b712aa184098d2dbccf Mon Sep 17 00:00:00 2001
From: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>
Date: Fri, 17 Jul 2020 06:21:40 +0200
Subject: [PATCH] CPU: A32: Add Vadd & Vsub Wide (S/U_8/16/32) Inst.s with
 Test. (#1390)

---
 ARMeilleure/Decoders/OpCode32SimdRegWide.cs   | 17 ++++
 ARMeilleure/Decoders/OpCodeTable.cs           |  2 +
 .../Instructions/InstEmitSimdArithmetic32.cs  | 14 +++
 .../Instructions/InstEmitSimdHelper32.cs      | 24 +++++
 ARMeilleure/Instructions/InstName.cs          |  2 +
 ARMeilleure/Translation/PTC/Ptc.cs            |  2 +-
 Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs         | 97 ++++++++-----------
 7 files changed, 103 insertions(+), 55 deletions(-)
 create mode 100644 ARMeilleure/Decoders/OpCode32SimdRegWide.cs

diff --git a/ARMeilleure/Decoders/OpCode32SimdRegWide.cs b/ARMeilleure/Decoders/OpCode32SimdRegWide.cs
new file mode 100644
index 0000000000..55384b2bd8
--- /dev/null
+++ b/ARMeilleure/Decoders/OpCode32SimdRegWide.cs
@@ -0,0 +1,17 @@
+namespace ARMeilleure.Decoders
+{
+    sealed class OpCode32SimdRegWide : OpCode32SimdReg
+    {
+        public OpCode32SimdRegWide(InstDescriptor inst, ulong address, int opCode) : base(inst, address, opCode)
+        {
+            Q = false;
+            RegisterSize = RegisterSize.Simd64;
+
+            // Vd and Vn are validated here; the class is sealed, so no runtime type test is needed first.
+            if (DecoderHelper.VectorArgumentsInvalid(true, Vd, Vn))
+            {
+                Instruction = InstDescriptor.Undefined;
+            }
+        }
+    }
+}
diff --git a/ARMeilleure/Decoders/OpCodeTable.cs b/ARMeilleure/Decoders/OpCodeTable.cs
index b98fcab12d..5923941503 100644
--- a/ARMeilleure/Decoders/OpCodeTable.cs
+++ b/ARMeilleure/Decoders/OpCodeTable.cs
@@ -803,6 +803,7 @@ namespace ARMeilleure.Decoders
             SetA32("111100100xxxxxxxxxxx1000xxx0xxxx", InstName.Vadd,     InstEmit32.Vadd_I,   typeof(OpCode32SimdReg));
             SetA32("<<<<11100x11xxxxxxxx101xx0x0xxxx", InstName.Vadd,     InstEmit32.Vadd_S,   typeof(OpCode32SimdRegS));
             SetA32("111100100x00xxxxxxxx1101xxx0xxxx", InstName.Vadd,     InstEmit32.Vadd_V,   typeof(OpCode32SimdReg));
+            SetA32("1111001x1x<<xxxxxxxx0001x0x0xxxx", InstName.Vaddw,    InstEmit32.Vaddw_I,  typeof(OpCode32SimdRegWide));
             SetA32("111100100x00xxxxxxxx0001xxx1xxxx", InstName.Vand,     InstEmit32.Vand_I,   typeof(OpCode32SimdBinary));
             SetA32("111100110x11xxxxxxxx0001xxx1xxxx", InstName.Vbif,     InstEmit32.Vbif,     typeof(OpCode32SimdBinary));
             SetA32("111100110x10xxxxxxxx0001xxx1xxxx", InstName.Vbit,     InstEmit32.Vbit,     typeof(OpCode32SimdBinary));
@@ -946,6 +947,7 @@ namespace ARMeilleure.Decoders
             SetA32("111100110xxxxxxxxxxx1000xxx0xxxx", InstName.Vsub,     InstEmit32.Vsub_I,   typeof(OpCode32SimdReg));
             SetA32("<<<<11100x11xxxxxxxx101xx1x0xxxx", InstName.Vsub,     InstEmit32.Vsub_S,   typeof(OpCode32SimdRegS));
             SetA32("111100100x10xxxxxxxx1101xxx0xxxx", InstName.Vsub,     InstEmit32.Vsub_V,   typeof(OpCode32SimdReg));
+            SetA32("1111001x1x<<xxxxxxxx0011x0x0xxxx", InstName.Vsubw,    InstEmit32.Vsubw_I,  typeof(OpCode32SimdRegWide));
             SetA32("111100111x11xxxxxxxx10xxxxx0xxxx", InstName.Vtbl,     InstEmit32.Vtbl,     typeof(OpCode32SimdTbl));
             SetA32("111100111x11<<10xxxx00001xx0xxxx", InstName.Vtrn,     InstEmit32.Vtrn,     typeof(OpCode32SimdCmpZ));
             SetA32("111100111x11<<10xxxx00010xx0xxxx", InstName.Vuzp,     InstEmit32.Vuzp,     typeof(OpCode32SimdCmpZ));
diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
index eb86ac9e73..cc6e6edbed 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
@@ -107,6 +107,13 @@ namespace ARMeilleure.Instructions
             }
         }
 
+        public static void Vaddw_I(ArmEmitterContext context)
+        {
+            // The helper's 'signed' flag is the inverse of the current opcode's U flag.
+            bool isSigned = !((OpCode32SimdRegWide)context.CurrOp).U;
+            EmitVectorBinaryWideOpI32(context, (n, m) => context.Add(n, m), isSigned);
+        }
+
         public static void Vdup(ArmEmitterContext context)
         {
             OpCode32SimdDupGP op = (OpCode32SimdDupGP)context.CurrOp;
@@ -1191,6 +1198,13 @@ namespace ARMeilleure.Instructions
             }
         }
 
+        public static void Vsubw_I(ArmEmitterContext context)
+        {
+            // The helper's 'signed' flag is the inverse of the current opcode's U flag.
+            bool isSigned = !((OpCode32SimdRegWide)context.CurrOp).U;
+            EmitVectorBinaryWideOpI32(context, (n, m) => context.Subtract(n, m), isSigned);
+        }
+
         private static void EmitSse41MaxMinNumOpF32(ArmEmitterContext context, bool isMaxNum, bool scalar)
         {
             IOpCode32Simd op = (IOpCode32Simd)context.CurrOp;
diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
index 9697715a1f..9753af661e 100644
--- a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
@@ -281,6 +281,30 @@ namespace ARMeilleure.Instructions
             context.Copy(GetVecA32(op.Qd), res);
         }
 
+        public static void EmitVectorBinaryWideOpI32(ArmEmitterContext context, Func2I emit, bool signed)
+        {
+            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+            // Wide form: n elements are read at op.Size + 1, m elements at op.Size; results are stored at op.Size + 1.
+            Operand res = context.VectorZero();
+            // One loop iteration per op.Size-sized element in op.GetBytesCount() bytes.
+            int elems = op.GetBytesCount() >> op.Size;
+
+            for (int index = 0; index < elems; index++)
+            {
+                Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size + 1, signed);
+                Operand me = EmitVectorExtract32(context, op.Qm, op.Im + index, op.Size,     signed);
+                // When op.Size == 2, me is extended to I64 (sign or zero, per 'signed') before emit sees it.
+                if (op.Size == 2)
+                {
+                    me = signed ? context.SignExtend32(OperandType.I64, me) : context.ZeroExtend32(OperandType.I64, me);
+                }
+
+                res = EmitVectorInsert(context, res, emit(ne, me), index, op.Size + 1);
+            }
+
+            context.Copy(GetVecA32(op.Qd), res);
+        }
+
         public static void EmitVectorTernaryLongOpI32(ArmEmitterContext context, Func3I emit, bool signed)
         {
             OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
diff --git a/ARMeilleure/Instructions/InstName.cs b/ARMeilleure/Instructions/InstName.cs
index 69b5d3fc7c..28041874f1 100644
--- a/ARMeilleure/Instructions/InstName.cs
+++ b/ARMeilleure/Instructions/InstName.cs
@@ -545,6 +545,7 @@ namespace ARMeilleure.Instructions
         // FP & SIMD (AArch32)
         Vabs,
         Vadd,
+        Vaddw,
         Vand,
         Vbif,
         Vbit,
@@ -611,6 +612,7 @@ namespace ARMeilleure.Instructions
         Vrsqrte,
         Vrsqrts,
         Vsub,
+        Vsubw,
         Vtbl,
         Vtrn,
         Vuzp,
diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs
index b951caf83e..d5fb88284b 100644
--- a/ARMeilleure/Translation/PTC/Ptc.cs
+++ b/ARMeilleure/Translation/PTC/Ptc.cs
@@ -20,7 +20,7 @@ namespace ARMeilleure.Translation.PTC
     {
         private const string HeaderMagic = "PTChd";
       
-        private const int InternalVersion = 10; //! To be incremented manually for each change to the ARMeilleure project.
+        private const int InternalVersion = 11; //! To be incremented manually for each change to the ARMeilleure project.
 
         private const string BaseDir = "Ryujinx";
 
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs b/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs
index 1581e85044..dbe69124e0 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs
@@ -13,6 +13,15 @@ namespace Ryujinx.Tests.Cpu
 #if SimdReg32
 
 #region "ValueSource (Opcodes)"
+        private static uint[] _V_Add_Sub_Wide_I_()
+        {
+            // Base encodings; the consuming test ORs the U bit, size and register fields into them.
+            uint[] encodings = { 0xf2800100u,   // VADDW.S8 Q0, Q0, D0
+                                 0xf2800300u }; // VSUBW.S8 Q0, Q0, D0
+
+            return encodings;
+        }
+
         private static uint[] _Vp_Add_Max_Min_F_()
         {
             return new uint[]
@@ -38,60 +47,6 @@ namespace Ryujinx.Tests.Cpu
 #endregion
 
 #region "ValueSource (Types)"
-        private static ulong[] _1B1H1S1D_()
-        {
-            return new ulong[] { 0x0000000000000000ul, 0x000000000000007Ful,
-                                 0x0000000000000080ul, 0x00000000000000FFul,
-                                 0x0000000000007FFFul, 0x0000000000008000ul,
-                                 0x000000000000FFFFul, 0x000000007FFFFFFFul,
-                                 0x0000000080000000ul, 0x00000000FFFFFFFFul,
-                                 0x7FFFFFFFFFFFFFFFul, 0x8000000000000000ul,
-                                 0xFFFFFFFFFFFFFFFFul };
-        }
-
-        private static ulong[] _1D_()
-        {
-            return new ulong[] { 0x0000000000000000ul, 0x7FFFFFFFFFFFFFFFul,
-                                 0x8000000000000000ul, 0xFFFFFFFFFFFFFFFFul };
-        }
-
-        private static ulong[] _1H1S_()
-        {
-            return new ulong[] { 0x0000000000000000ul, 0x0000000000007FFFul,
-                                 0x0000000000008000ul, 0x000000000000FFFFul,
-                                 0x000000007FFFFFFFul, 0x0000000080000000ul,
-                                 0x00000000FFFFFFFFul };
-        }
-
-        private static ulong[] _4H2S_()
-        {
-            return new ulong[] { 0x0000000000000000ul, 0x7FFF7FFF7FFF7FFFul,
-                                 0x8000800080008000ul, 0x7FFFFFFF7FFFFFFFul,
-                                 0x8000000080000000ul, 0xFFFFFFFFFFFFFFFFul };
-        }
-
-        private static ulong[] _4H2S1D_()
-        {
-            return new ulong[] { 0x0000000000000000ul, 0x7FFF7FFF7FFF7FFFul,
-                                 0x8000800080008000ul, 0x7FFFFFFF7FFFFFFFul,
-                                 0x8000000080000000ul, 0x7FFFFFFFFFFFFFFFul,
-                                 0x8000000000000000ul, 0xFFFFFFFFFFFFFFFFul };
-        }
-
-        private static ulong[] _8B_()
-        {
-            return new ulong[] { 0x0000000000000000ul, 0x7F7F7F7F7F7F7F7Ful,
-                                 0x8080808080808080ul, 0xFFFFFFFFFFFFFFFFul };
-        }
-
-        private static ulong[] _8B4H2S_()
-        {
-            return new ulong[] { 0x0000000000000000ul, 0x7F7F7F7F7F7F7F7Ful,
-                                 0x8080808080808080ul, 0x7FFF7FFF7FFF7FFFul,
-                                 0x8000800080008000ul, 0x7FFFFFFF7FFFFFFFul,
-                                 0x8000000080000000ul, 0xFFFFFFFFFFFFFFFFul };
-        }
-
         private static ulong[] _8B4H2S1D_()
         {
             return new ulong[] { 0x0000000000000000ul, 0x7F7F7F7F7F7F7F7Ful,
@@ -267,6 +222,40 @@ namespace Ryujinx.Tests.Cpu
             CompareAgainstUnicorn();
         }
 
+        [Test, Pairwise]
+        public void V_Add_Sub_Wide_I([ValueSource("_V_Add_Sub_Wide_I_")] uint opcode,
+                                     [Range(0u, 5u)] uint rd,
+                                     [Range(0u, 5u)] uint rn,
+                                     [Range(0u, 5u)] uint rm,
+                                     [ValueSource("_8B4H2S1D_")] [Random(RndCnt)] ulong z,
+                                     [ValueSource("_8B4H2S1D_")] [Random(RndCnt)] ulong a,
+                                     [ValueSource("_8B4H2S1D_")] [Random(RndCnt)] ulong b,
+                                     [Values(0u, 1u, 2u)] uint size, // <SU8, SU16, SU32>
+                                     [Values] bool u) // <S, U>
+        {
+            // Bit 24 of the encoding is set for the unsigned variant.
+            uint uBit = u ? 1u << 24 : 0u;
+
+            // rd and rn are rounded down to even numbers by clearing bit 0.
+            rd &= ~1u;
+            rn &= ~1u;
+
+            // Each register number is split into its low four bits and its fifth bit.
+            uint dField = ((rd & 0xf) << 12) | ((rd & 0x10) << 18);
+            uint nField = ((rn & 0xf) << 16) | ((rn & 0x10) << 3);
+            uint mField = ((rm & 0xf) << 0)  | ((rm & 0x10) << 1);
+
+            opcode |= uBit | dField | nField | mField | ((size & 0x3) << 20);
+
+            V128 v0 = MakeVectorE0E1(z, ~z);
+            V128 v1 = MakeVectorE0E1(a, ~a);
+            V128 v2 = MakeVectorE0E1(b, ~b);
+
+            SingleOpcode(opcode, v0: v0, v1: v1, v2: v2);
+
+            CompareAgainstUnicorn();
+        }
+
         [Test, Pairwise, Description("VCMP.f<size> Vd, Vm")]
         public void Vcmp([Values(2u, 3u)] uint size,
                          [ValueSource("_1S_F_")] ulong a,