From 3332b29f01993d22195f417d39ae4074d53fb55c Mon Sep 17 00:00:00 2001
From: sharmander <saldabain.dev@gmail.com>
Date: Mon, 14 Dec 2020 22:01:52 -0500
Subject: [PATCH] CPU: Implement VFMA (Vector) (#1762)

* Implement VFMA.F64

* Simplify switch

* Simplify FMA Instructions into their own IntrinsicType.

* Remove whitespace

* Fix indentation

* Change tests for Vfnms -- disable inf / nan

* Move args up, not description ;)

* Implementation Complete.

All Tests Pass (Slow / Fast Path)

* Move location of function in assembler + test updates.

* Shift params upwards

* Remove unused function

* Update PTC version.

* Add comments / re-oreder opcode table.

* Remove whitespace

* Fix nit

* Fix nit.

* Fix whitespace

* Wrong opcode was used by a bad merge.

* Addressed rip's comments.
---
 ARMeilleure/CodeGen/X86/Assembler.cs          |  8 ++--
 ARMeilleure/Decoders/OpCodeTable.cs           |  1 +
 .../Instructions/InstEmitSimdArithmetic32.cs  | 17 +++++++
 .../Instructions/InstEmitSimdHelper32.cs      | 12 +++++
 Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs         | 46 +++++++++++++++++++
 5 files changed, 80 insertions(+), 4 deletions(-)

diff --git a/ARMeilleure/CodeGen/X86/Assembler.cs b/ARMeilleure/CodeGen/X86/Assembler.cs
index b242a17185..7f19c3c43e 100644
--- a/ARMeilleure/CodeGen/X86/Assembler.cs
+++ b/ARMeilleure/CodeGen/X86/Assembler.cs
@@ -273,10 +273,10 @@ namespace ARMeilleure.CodeGen.X86
             Add(X86Instruction.Vblendvps,    new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f3a4a, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vcvtph2ps,    new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f3813, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vcvtps2ph,    new InstructionInfo(0x000f3a1d, BadOp,      BadOp,      BadOp,      BadOp,      InstructionFlags.Vex | InstructionFlags.Prefix66));
-            Add(X86Instruction.Vfmadd231ps,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38bc, InstructionFlags.Vex | InstructionFlags.Prefix66));
-            Add(X86Instruction.Vfmadd231pd,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38bc, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
-            Add(X86Instruction.Vfmadd231ss,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38bd, InstructionFlags.Vex | InstructionFlags.Prefix66));
-            Add(X86Instruction.Vfmadd231sd,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38bd, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
+            Add(X86Instruction.Vfmadd231ps,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38b8, InstructionFlags.Vex | InstructionFlags.Prefix66));
+            Add(X86Instruction.Vfmadd231pd,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38b8, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
+            Add(X86Instruction.Vfmadd231ss,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38b9, InstructionFlags.Vex | InstructionFlags.Prefix66));
+            Add(X86Instruction.Vfmadd231sd,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38b9, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
             Add(X86Instruction.Vfmsub231ps,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38ba, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vfmsub231pd,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38ba, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
             Add(X86Instruction.Vfmsub231ss,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38bb, InstructionFlags.Vex | InstructionFlags.Prefix66));
diff --git a/ARMeilleure/Decoders/OpCodeTable.cs b/ARMeilleure/Decoders/OpCodeTable.cs
index 3d3b26f91f..5cf83476a1 100644
--- a/ARMeilleure/Decoders/OpCodeTable.cs
+++ b/ARMeilleure/Decoders/OpCodeTable.cs
@@ -820,6 +820,7 @@ namespace ARMeilleure.Decoders
             SetA32("111100110x00xxxxxxxx0001xxx1xxxx", InstName.Veor,     InstEmit32.Veor_I,   OpCode32SimdBinary.Create);
             SetA32("111100101x11xxxxxxxxxxxxxxx0xxxx", InstName.Vext,     InstEmit32.Vext,     OpCode32SimdExt.Create);
             SetA32("<<<<11101x10xxxxxxxx101xx0x0xxxx", InstName.Vfma,     InstEmit32.Vfma_S,   OpCode32SimdRegS.Create);
+            SetA32("111100100x00xxxxxxxx1100xxx1xxxx", InstName.Vfma,     InstEmit32.Vfma_V,   OpCode32SimdReg.Create);
             SetA32("<<<<11101x10xxxxxxxx101xx1x0xxxx", InstName.Vfms,     InstEmit32.Vfms_S,   OpCode32SimdRegS.Create);
             SetA32("<<<<11101x01xxxxxxxx101xx1x0xxxx", InstName.Vfnma,    InstEmit32.Vfnma_S,  OpCode32SimdRegS.Create);
             SetA32("<<<<11101x01xxxxxxxx101xx0x0xxxx", InstName.Vfnms,    InstEmit32.Vfnms_S,  OpCode32SimdRegS.Create);
diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
index d72df97cbb..40289520c1 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
@@ -252,6 +252,23 @@ namespace ARMeilleure.Instructions
             }
         }
 
+        public static void Vfma_V(ArmEmitterContext context) // Fused.
+        {
+            if (Optimizations.FastFP && Optimizations.UseFma)
+            {
+                // Vectors contain elements that are 32-bits in length always. The only thing that will change is the number of elements in a vector. 
+                // The 64-bit variant will never be used.
+                EmitVectorTernaryOpF32(context, Intrinsic.X86Vfmadd231ps, Intrinsic.X86Vfmadd231pd);
+            }
+            else
+            {
+                EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
+                {
+                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), op1, op2, op3);
+                });
+            }
+        }
+
         public static void Vfma_S(ArmEmitterContext context) // Fused.
         {
             if (Optimizations.FastFP && Optimizations.UseSse2)
diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
index 75aa7220bb..2d5d4ba946 100644
--- a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
@@ -820,6 +820,18 @@ namespace ARMeilleure.Instructions
             });
         }
 
+        public static void EmitVectorTernaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
+        {
+            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+
+            Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32;
+
+            EmitVectorTernaryOpSimd32(context, (d, n, m) =>
+            {
+                return context.AddIntrinsic(inst, d, n, m);
+            });
+        }
+
         public static void EmitScalarUnaryOpSimd32(ArmEmitterContext context, Func1I scalarFunc)
         {
             OpCode32SimdS op = (OpCode32SimdS)context.CurrOp;
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs b/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs
index dc5903d589..4298bd1f6f 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs
@@ -293,6 +293,52 @@ namespace Ryujinx.Tests.Cpu
             CompareAgainstUnicorn(fpsrMask: Fpsr.Nzcv);
         }
 
+        [Test, Pairwise, Description("VFMA.F<size> <Vd>, <Vn>, <Vm>")]
+        public void Vfma([Values(0u, 1u)] uint rd,
+                         [Values(0u, 1u)] uint rn,
+                         [Values(0u, 1u)] uint rm,
+                         [Values(0u, 1u)] uint Q,
+                         [ValueSource("_2S_F_")] ulong z,
+                         [ValueSource("_2S_F_")] ulong a,
+                         [ValueSource("_2S_F_")] ulong b )
+        {
+            uint opcode = 0xf2000c10;
+            
+            V128 v0;
+            V128 v1;
+            V128 v2;
+
+            uint c = (uint) BitConverter.SingleToInt32Bits(z);
+            uint d = (uint) BitConverter.SingleToInt32Bits(a);
+            uint e = (uint) BitConverter.SingleToInt32Bits(b);
+            if (Q == 0)
+            {
+                opcode |= (((rm & 0x1) << 5) | (rm & 0x1e) >> 1);
+                opcode |= (((rd & 0x1) << 22) | (rd & 0x1e) << 11);
+                opcode |= (((rn & 0x1) << 7) | (rn & 0x1e) >> 15);
+
+                v0 = MakeVectorE0E1(c, c);
+                v1 = MakeVectorE0E1(d, c);
+                v2 = MakeVectorE0E1(e, c);
+            }
+            else
+            {
+                rd = rn = rm = 0; // Needed, as these values cannot be odd values if Q == 1.
+                opcode |= (((rm & 0x10) << 1) | (rm & 0xf) << 0);
+                opcode |= (((rd & 0x10) << 18) | (rd & 0xf) << 12);
+                opcode |= (((rn & 0x10) << 3) | (rn & 0xf) << 16);
+
+                v0 = MakeVectorE0E1E2E3(c, c, d, e);
+                v1 = MakeVectorE0E1E2E3(d, c, e, c);
+                v2 = MakeVectorE0E1E2E3(e, c, d, c);
+            }
+
+            opcode |= ((Q & 1)  << 6);
+
+            SingleOpcode(opcode, v0: v0, v1: v1, v2: v2);
+            CompareAgainstUnicorn();
+        }
+        
         [Test, Pairwise, Description("VFNMA.F<size> <Vd>, <Vn>, <Vm>")]
         public void Vfnma([Values(0u, 1u)] uint rd,
                           [Values(0u, 1u)] uint rn,