From 8a33e884f8f482e93e2b90380b158c1417cc50f8 Mon Sep 17 00:00:00 2001
From: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>
Date: Thu, 17 Dec 2020 20:43:41 +0100
Subject: [PATCH] Fix Vnmls_S fast path (F64: losing input d value). Fix
 Vnmla_S & Vnmls_S slow paths (using fused inst.s). Fix Vfma_V slow path not
 using StandardFPSCRValue(). (#1775)

* Fix Vnmls_S fast path (F64: losing input d value). Fix Vnmla_S & Vnmls_S slow paths (using fused inst.s).

Add Vfma_S & Vfms_S Fma fast paths.
Add Vfnma_S inst. with Fma/Sse fast paths and slow path.
Add Vfnms_S Sse fast path.

Add Tests for affected inst.s.

Nits.

* InternalVersion = 1775

* Nits.

* Fix Vfma_V slow path not using StandardFPSCRValue().

* Nit: Fix Vfma_V order.

* Add Vfms_V Fma fast path and slow path.

* Add Vfma_V and Vfms_V Test.
---
 ARMeilleure/CodeGen/X86/Assembler.cs          |  14 +-
 ARMeilleure/CodeGen/X86/CodeGenerator.cs      |  14 +-
 ARMeilleure/CodeGen/X86/IntrinsicTable.cs     |   8 +-
 ARMeilleure/CodeGen/X86/X86Instruction.cs     |  12 +-
 ARMeilleure/Decoders/OpCodeTable.cs           |   1 +
 ARMeilleure/Instructions/InstEmitAlu32.cs     |   2 +-
 ARMeilleure/Instructions/InstEmitMul32.cs     |   2 +-
 .../Instructions/InstEmitSimdArithmetic32.cs  | 114 ++++----
 .../Instructions/InstEmitSimdHelper32.cs      |  26 +-
 .../IntermediateRepresentation/Intrinsic.cs   |   8 +-
 ARMeilleure/Translation/PTC/Ptc.cs            |   2 +-
 Ryujinx.Tests/Cpu/CpuTestSimdCvt32.cs         |  59 ++--
 Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs         | 251 +++++++++++-------
 13 files changed, 292 insertions(+), 221 deletions(-)

diff --git a/ARMeilleure/CodeGen/X86/Assembler.cs b/ARMeilleure/CodeGen/X86/Assembler.cs
index 7f19c3c43e..2484e25191 100644
--- a/ARMeilleure/CodeGen/X86/Assembler.cs
+++ b/ARMeilleure/CodeGen/X86/Assembler.cs
@@ -274,17 +274,15 @@ namespace ARMeilleure.CodeGen.X86
             Add(X86Instruction.Vcvtph2ps,    new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f3813, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vcvtps2ph,    new InstructionInfo(0x000f3a1d, BadOp,      BadOp,      BadOp,      BadOp,      InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vfmadd231ps,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38b8, InstructionFlags.Vex | InstructionFlags.Prefix66));
-            Add(X86Instruction.Vfmadd231pd,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38b8, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
-            Add(X86Instruction.Vfmadd231ss,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38b9, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vfmadd231sd,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38b9, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
-            Add(X86Instruction.Vfmsub231ps,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38ba, InstructionFlags.Vex | InstructionFlags.Prefix66));
-            Add(X86Instruction.Vfmsub231pd,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38ba, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
-            Add(X86Instruction.Vfmsub231ss,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38bb, InstructionFlags.Vex | InstructionFlags.Prefix66));
+            Add(X86Instruction.Vfmadd231ss,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38b9, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vfmsub231sd,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38bb, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
-            Add(X86Instruction.Vfnmsub231ps, new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38be, InstructionFlags.Vex | InstructionFlags.Prefix66));
-            Add(X86Instruction.Vfnmsub231pd, new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38be, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
-            Add(X86Instruction.Vfnmsub231ss, new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38bf, InstructionFlags.Vex | InstructionFlags.Prefix66));
+            Add(X86Instruction.Vfmsub231ss,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38bb, InstructionFlags.Vex | InstructionFlags.Prefix66));
+            Add(X86Instruction.Vfnmadd231ps, new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38bc, InstructionFlags.Vex | InstructionFlags.Prefix66));
+            Add(X86Instruction.Vfnmadd231sd, new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38bd, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
+            Add(X86Instruction.Vfnmadd231ss, new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38bd, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vfnmsub231sd, new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38bf, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
+            Add(X86Instruction.Vfnmsub231ss, new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38bf, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vpblendvb,    new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f3a4c, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Xor,          new InstructionInfo(0x00000031, 0x06000083, 0x06000081, BadOp,      0x00000033, InstructionFlags.None));
             Add(X86Instruction.Xorpd,        new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000f57, InstructionFlags.Vex | InstructionFlags.Prefix66));
diff --git a/ARMeilleure/CodeGen/X86/CodeGenerator.cs b/ARMeilleure/CodeGen/X86/CodeGenerator.cs
index 29a4cd78a7..5f41ff7901 100644
--- a/ARMeilleure/CodeGen/X86/CodeGenerator.cs
+++ b/ARMeilleure/CodeGen/X86/CodeGenerator.cs
@@ -440,9 +440,12 @@ namespace ARMeilleure.CodeGen.X86
                         else
                         {
                             EnsureSameReg(dest, src1);
+
                             Debug.Assert(src3.GetRegister().Index == 0);
+
                             context.Assembler.WriteInstruction(info.Inst, dest, src1, src2);
                         }
+
                         break;
                     }
 
@@ -474,11 +477,16 @@ namespace ARMeilleure.CodeGen.X86
                         Operand src2 = operation.GetSource(1);
                         Operand src3 = operation.GetSource(2);
 
-                        EnsureSameType(dest, src1, src2, src3);
-                        EnsureSameReg(dest, src1);
-                        Debug.Assert(!dest.Type.IsInteger());
                         Debug.Assert(HardwareCapabilities.SupportsVexEncoding);
 
+                        Debug.Assert(dest.Kind == OperandKind.Register && src1.Kind == OperandKind.Register && src2.Kind == OperandKind.Register);
+                        Debug.Assert(src3.Kind == OperandKind.Register || src3.Kind == OperandKind.Memory);
+
+                        EnsureSameType(dest, src1, src2, src3);
+                        Debug.Assert(dest.Type == OperandType.V128);
+
+                        Debug.Assert(dest.Value == src1.Value);
+
                         context.Assembler.WriteInstruction(info.Inst, dest, src2, src3);
 
                         break;
diff --git a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
index 195ce91d65..9030be3c1e 100644
--- a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
+++ b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
@@ -166,16 +166,14 @@ namespace ARMeilleure.CodeGen.X86
             Add(Intrinsic.X86Unpcklps,     new IntrinsicInfo(X86Instruction.Unpcklps,     IntrinsicType.Binary));
             Add(Intrinsic.X86Vcvtph2ps,    new IntrinsicInfo(X86Instruction.Vcvtph2ps,    IntrinsicType.Unary));
             Add(Intrinsic.X86Vcvtps2ph,    new IntrinsicInfo(X86Instruction.Vcvtps2ph,    IntrinsicType.BinaryImm));
-            Add(Intrinsic.X86Vfmadd231pd,  new IntrinsicInfo(X86Instruction.Vfmadd231pd,  IntrinsicType.Fma));
             Add(Intrinsic.X86Vfmadd231ps,  new IntrinsicInfo(X86Instruction.Vfmadd231ps,  IntrinsicType.Fma));
             Add(Intrinsic.X86Vfmadd231sd,  new IntrinsicInfo(X86Instruction.Vfmadd231sd,  IntrinsicType.Fma));
             Add(Intrinsic.X86Vfmadd231ss,  new IntrinsicInfo(X86Instruction.Vfmadd231ss,  IntrinsicType.Fma));
-            Add(Intrinsic.X86Vfmsub231pd,  new IntrinsicInfo(X86Instruction.Vfmsub231pd,  IntrinsicType.Fma));
-            Add(Intrinsic.X86Vfmsub231ps,  new IntrinsicInfo(X86Instruction.Vfmsub231ps,  IntrinsicType.Fma));
             Add(Intrinsic.X86Vfmsub231sd,  new IntrinsicInfo(X86Instruction.Vfmsub231sd,  IntrinsicType.Fma));
             Add(Intrinsic.X86Vfmsub231ss,  new IntrinsicInfo(X86Instruction.Vfmsub231ss,  IntrinsicType.Fma));
-            Add(Intrinsic.X86Vfnmsub231pd, new IntrinsicInfo(X86Instruction.Vfnmsub231pd, IntrinsicType.Fma));
-            Add(Intrinsic.X86Vfnmsub231ps, new IntrinsicInfo(X86Instruction.Vfnmsub231ps, IntrinsicType.Fma));
+            Add(Intrinsic.X86Vfnmadd231ps, new IntrinsicInfo(X86Instruction.Vfnmadd231ps, IntrinsicType.Fma));
+            Add(Intrinsic.X86Vfnmadd231sd, new IntrinsicInfo(X86Instruction.Vfnmadd231sd, IntrinsicType.Fma));
+            Add(Intrinsic.X86Vfnmadd231ss, new IntrinsicInfo(X86Instruction.Vfnmadd231ss, IntrinsicType.Fma));
             Add(Intrinsic.X86Vfnmsub231sd, new IntrinsicInfo(X86Instruction.Vfnmsub231sd, IntrinsicType.Fma));
             Add(Intrinsic.X86Vfnmsub231ss, new IntrinsicInfo(X86Instruction.Vfnmsub231ss, IntrinsicType.Fma));
             Add(Intrinsic.X86Xorpd,        new IntrinsicInfo(X86Instruction.Xorpd,        IntrinsicType.Binary));
diff --git a/ARMeilleure/CodeGen/X86/X86Instruction.cs b/ARMeilleure/CodeGen/X86/X86Instruction.cs
index 7ed4841c20..ed5b50c559 100644
--- a/ARMeilleure/CodeGen/X86/X86Instruction.cs
+++ b/ARMeilleure/CodeGen/X86/X86Instruction.cs
@@ -203,18 +203,16 @@ namespace ARMeilleure.CodeGen.X86
         Vblendvps,
         Vcvtph2ps,
         Vcvtps2ph,
-        Vfmadd231pd,
         Vfmadd231ps,
         Vfmadd231sd,
         Vfmadd231ss,
-        Vfmsub231ps,
-        Vfmsub231pd,
-        Vfmsub231ss,
         Vfmsub231sd,
-        Vfnmsub231ps,
-        Vfnmsub231pd,
-        Vfnmsub231ss,
+        Vfmsub231ss,
+        Vfnmadd231ps,
+        Vfnmadd231sd,
+        Vfnmadd231ss,
         Vfnmsub231sd,
+        Vfnmsub231ss,
         Vpblendvb,
         Xor,
         Xorpd,
diff --git a/ARMeilleure/Decoders/OpCodeTable.cs b/ARMeilleure/Decoders/OpCodeTable.cs
index 88c68644fc..665e71290d 100644
--- a/ARMeilleure/Decoders/OpCodeTable.cs
+++ b/ARMeilleure/Decoders/OpCodeTable.cs
@@ -822,6 +822,7 @@ namespace ARMeilleure.Decoders
             SetA32("<<<<11101x10xxxxxxxx101xx0x0xxxx", InstName.Vfma,     InstEmit32.Vfma_S,   OpCode32SimdRegS.Create);
             SetA32("111100100x00xxxxxxxx1100xxx1xxxx", InstName.Vfma,     InstEmit32.Vfma_V,   OpCode32SimdReg.Create);
             SetA32("<<<<11101x10xxxxxxxx101xx1x0xxxx", InstName.Vfms,     InstEmit32.Vfms_S,   OpCode32SimdRegS.Create);
+            SetA32("111100100x10xxxxxxxx1100xxx1xxxx", InstName.Vfms,     InstEmit32.Vfms_V,   OpCode32SimdReg.Create);
             SetA32("<<<<11101x01xxxxxxxx101xx1x0xxxx", InstName.Vfnma,    InstEmit32.Vfnma_S,  OpCode32SimdRegS.Create);
             SetA32("<<<<11101x01xxxxxxxx101xx0x0xxxx", InstName.Vfnms,    InstEmit32.Vfnms_S,  OpCode32SimdRegS.Create);
             SetA32("1111001x0x<<xxxxxxxx0000xxx0xxxx", InstName.Vhadd,    InstEmit32.Vhadd,    OpCode32SimdReg.Create);
diff --git a/ARMeilleure/Instructions/InstEmitAlu32.cs b/ARMeilleure/Instructions/InstEmitAlu32.cs
index d57ff0b65d..f3da121c2c 100644
--- a/ARMeilleure/Instructions/InstEmitAlu32.cs
+++ b/ARMeilleure/Instructions/InstEmitAlu32.cs
@@ -591,7 +591,7 @@ namespace ARMeilleure.Instructions
             EmitAluStore(context, res);
         }
 
-        public static void EmitDiv(ArmEmitterContext context, bool unsigned)
+        private static void EmitDiv(ArmEmitterContext context, bool unsigned)
         {
             Operand n = GetAluN(context);
             Operand m = GetAluM(context);
diff --git a/ARMeilleure/Instructions/InstEmitMul32.cs b/ARMeilleure/Instructions/InstEmitMul32.cs
index 454d44a460..fa744d2528 100644
--- a/ARMeilleure/Instructions/InstEmitMul32.cs
+++ b/ARMeilleure/Instructions/InstEmitMul32.cs
@@ -329,7 +329,7 @@ namespace ARMeilleure.Instructions
             EmitGenericAluStoreA32(context, op.RdLo, op.SetFlags, lo);
         }
 
-        public static void EmitMlal(ArmEmitterContext context, bool signed)
+        private static void EmitMlal(ArmEmitterContext context, bool signed)
         {
             OpCode32AluUmull op = (OpCode32AluUmull)context.CurrOp;
 
diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
index 40289520c1..d35af20912 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
@@ -252,28 +252,14 @@ namespace ARMeilleure.Instructions
             }
         }
 
-        public static void Vfma_V(ArmEmitterContext context) // Fused.
+        public static void Vfma_S(ArmEmitterContext context) // Fused.
         {
             if (Optimizations.FastFP && Optimizations.UseFma)
             {
-                // Vectors contain elements that are 32-bits in length always. The only thing that will change is the number of elements in a vector. 
-                // The 64-bit variant will never be used.
-                EmitVectorTernaryOpF32(context, Intrinsic.X86Vfmadd231ps, Intrinsic.X86Vfmadd231pd);
+                EmitScalarTernaryOpF32(context, Intrinsic.X86Vfmadd231ss, Intrinsic.X86Vfmadd231sd);
             }
-            else
+            else if (Optimizations.FastFP && Optimizations.UseSse2)
             {
-                EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
-                {
-                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), op1, op2, op3);
-                });
-            }
-        }
-
-        public static void Vfma_S(ArmEmitterContext context) // Fused.
-        {
-            if (Optimizations.FastFP && Optimizations.UseSse2)
-            {
-                // TODO: Use FMA instruction set.
                 EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd);
             }
             else
@@ -285,11 +271,29 @@ namespace ARMeilleure.Instructions
             }
         }
 
+        public static void Vfma_V(ArmEmitterContext context) // Fused.
+        {
+            if (Optimizations.FastFP && Optimizations.UseFma)
+            {
+                EmitVectorTernaryOpF32(context, Intrinsic.X86Vfmadd231ps);
+            }
+            else
+            {
+                EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
+                {
+                    return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulAddFpscr), op1, op2, op3);
+                });
+            }
+        }
+
         public static void Vfms_S(ArmEmitterContext context) // Fused.
         {
-            if (Optimizations.FastFP && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseFma)
+            {
+                EmitScalarTernaryOpF32(context, Intrinsic.X86Vfnmadd231ss, Intrinsic.X86Vfnmadd231sd);
+            }
+            else if (Optimizations.FastFP && Optimizations.UseSse2)
             {
-                // TODO: Use FMA instruction set.
                 EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd);
             }
             else
@@ -301,17 +305,36 @@ namespace ARMeilleure.Instructions
             }
         }
 
+        public static void Vfms_V(ArmEmitterContext context) // Fused.
+        {
+            if (Optimizations.FastFP && Optimizations.UseFma)
+            {
+                EmitVectorTernaryOpF32(context, Intrinsic.X86Vfnmadd231ps);
+            }
+            else
+            {
+                EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
+                {
+                    return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulSubFpscr), op1, op2, op3);
+                });
+            }
+        }
+
         public static void Vfnma_S(ArmEmitterContext context) // Fused.
         {
             if (Optimizations.FastFP && Optimizations.UseFma)
             {
                 EmitScalarTernaryOpF32(context, Intrinsic.X86Vfnmsub231ss, Intrinsic.X86Vfnmsub231sd);
             }
+            else if (Optimizations.FastFP && Optimizations.UseSse2)
+            {
+                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd, isNegD: true);
+            }
             else
             {
                 EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                 {
-                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), context.Negate(op1), context.Negate(op2), op3);
+                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulAdd), op1, op2, op3);
                 });
             }
         }
@@ -322,11 +345,15 @@ namespace ARMeilleure.Instructions
             {
                 EmitScalarTernaryOpF32(context, Intrinsic.X86Vfmsub231ss, Intrinsic.X86Vfmsub231sd);
             }
+            else if (Optimizations.FastFP && Optimizations.UseSse2)
+            {
+                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd, isNegD: true);
+            }
             else
             {
                 EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                 {
-                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), context.Negate(op1), op2, op3);
+                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulSub), op1, op2, op3);
                 });
             }
         }
@@ -422,36 +449,21 @@ namespace ARMeilleure.Instructions
 
             if (Optimizations.FastFP && Optimizations.UseSse2)
             {
-                EmitScalarTernaryOpSimd32(context, (d, n, m) =>
-                {
-                    if ((op.Size & 1) == 0)
-                    {
-                        Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
-                        res = context.AddIntrinsic(Intrinsic.X86Addss, d, res);
-                        Operand mask = X86GetScalar(context, -0f);
-                        return context.AddIntrinsic(Intrinsic.X86Xorps, mask, res);
-                    }
-                    else
-                    {
-                        Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
-                        res = context.AddIntrinsic(Intrinsic.X86Addsd, d, res);
-                        Operand mask = X86GetScalar(context, -0d);
-                        return context.AddIntrinsic(Intrinsic.X86Xorpd, mask, res);
-                    }
-                });
+                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd, isNegD: true);
             }
             else if (Optimizations.FastFP)
             {
                 EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                 {
-                    return context.Negate(context.Add(op1, context.Multiply(op2, op3)));
+                    return context.Subtract(context.Negate(op1), context.Multiply(op2, op3));
                 });
             }
             else
             {
                 EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                 {
-                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulAdd), op1, op2, op3);
+                    Operand res = EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul), op2, op3);
+                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPSub), context.Negate(op1), res);
                 });
             }
         }
@@ -462,24 +474,7 @@ namespace ARMeilleure.Instructions
 
             if (Optimizations.FastFP && Optimizations.UseSse2)
             {
-                EmitScalarTernaryOpSimd32(context, (d, n, m) =>
-                {
-                    if ((op.Size & 1) == 0)
-                    {
-                        Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
-                        Operand mask = X86GetScalar(context, -0f);
-                        d = context.AddIntrinsic(Intrinsic.X86Xorps, mask, d);
-                        return context.AddIntrinsic(Intrinsic.X86Addss, d, res);
-
-                    }
-                    else
-                    {
-                        Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
-                        Operand mask = X86GetScalar(context, -0d);
-                        d = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, res);
-                        return context.AddIntrinsic(Intrinsic.X86Addsd, d, res);
-                    }
-                });
+                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd, isNegD: true);
             }
             else if (Optimizations.FastFP)
             {
@@ -492,7 +487,8 @@ namespace ARMeilleure.Instructions
             {
                 EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                 {
-                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulSub), op1, op2, op3);
+                    Operand res = EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul), op2, op3);
+                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd), context.Negate(op1), res);
                 });
             }
         }
diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
index 2d5d4ba946..3919505779 100644
--- a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
@@ -820,15 +820,15 @@ namespace ARMeilleure.Instructions
             });
         }
 
-        public static void EmitVectorTernaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
+        public static void EmitVectorTernaryOpF32(ArmEmitterContext context, Intrinsic inst32)
         {
             OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
 
-            Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32;
+            Debug.Assert((op.Size & 1) == 0);
 
             EmitVectorTernaryOpSimd32(context, (d, n, m) =>
             {
-                return context.AddIntrinsic(inst, d, n, m);
+                return context.AddIntrinsic(inst32, d, n, m);
             });
         }
 
@@ -927,7 +927,13 @@ namespace ARMeilleure.Instructions
             });
         }
 
-        public static void EmitScalarTernaryOpF32(ArmEmitterContext context, Intrinsic inst32pt1, Intrinsic inst64pt1, Intrinsic inst32pt2, Intrinsic inst64pt2)
+        public static void EmitScalarTernaryOpF32(
+            ArmEmitterContext context,
+            Intrinsic inst32pt1,
+            Intrinsic inst64pt1,
+            Intrinsic inst32pt2,
+            Intrinsic inst64pt2,
+            bool isNegD = false)
         {
             OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;
 
@@ -939,6 +945,18 @@ namespace ARMeilleure.Instructions
             EmitScalarTernaryOpSimd32(context, (d, n, m) =>
             {
                 Operand res = context.AddIntrinsic(inst1, n, m);
+
+                if (isNegD)
+                {
+                    Operand mask = doubleSize
+                        ? X86GetScalar(context, -0d)
+                        : X86GetScalar(context, -0f);
+
+                    d = doubleSize
+                        ? context.AddIntrinsic(Intrinsic.X86Xorpd, mask, d)
+                        : context.AddIntrinsic(Intrinsic.X86Xorps, mask, d);
+                }
+
                 return context.AddIntrinsic(inst2, d, res);
             });
         }
diff --git a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
index 515f11434a..e2989863b8 100644
--- a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
+++ b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
@@ -155,16 +155,14 @@ namespace ARMeilleure.IntermediateRepresentation
         X86Unpcklps,
         X86Vcvtph2ps,
         X86Vcvtps2ph,
-        X86Vfmadd231pd,
         X86Vfmadd231ps,
         X86Vfmadd231sd,
         X86Vfmadd231ss,
-        X86Vfmsub231pd,
-        X86Vfmsub231ps,
         X86Vfmsub231sd,
         X86Vfmsub231ss,
-        X86Vfnmsub231pd,
-        X86Vfnmsub231ps,
+        X86Vfnmadd231ps,
+        X86Vfnmadd231sd,
+        X86Vfnmadd231ss,
         X86Vfnmsub231sd,
         X86Vfnmsub231ss,
         X86Xorpd,
diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs
index 3150c97c5c..b5a92b9765 100644
--- a/ARMeilleure/Translation/PTC/Ptc.cs
+++ b/ARMeilleure/Translation/PTC/Ptc.cs
@@ -22,7 +22,7 @@ namespace ARMeilleure.Translation.PTC
     {
         private const string HeaderMagic = "PTChd";
 
-        private const uint InternalVersion = 1713; //! To be incremented manually for each change to the ARMeilleure project.
+        private const uint InternalVersion = 1775; //! To be incremented manually for each change to the ARMeilleure project.
 
         private const string ActualDir = "0";
         private const string BackupDir = "1";
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdCvt32.cs b/Ryujinx.Tests/Cpu/CpuTestSimdCvt32.cs
index 565d231a7f..395f246431 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimdCvt32.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdCvt32.cs
@@ -22,41 +22,45 @@ namespace Ryujinx.Tests.Cpu
                                 0x80000000u, 0xFFFFFFFFu };
         }
 
-        private static IEnumerable<uint> _1S_F_()
+        private static IEnumerable<ulong> _1S_F_()
         {
-            yield return 0xFF7FFFFFu; // -Max Normal    (float.MinValue)
-            yield return 0x80800000u; // -Min Normal
-            yield return 0x807FFFFFu; // -Max Subnormal
-            yield return 0x80000001u; // -Min Subnormal (-float.Epsilon)
-            yield return 0x7F7FFFFFu; // +Max Normal    (float.MaxValue)
-            yield return 0x00800000u; // +Min Normal
-            yield return 0x007FFFFFu; // +Max Subnormal
-            yield return 0x00000001u; // +Min Subnormal (float.Epsilon)
+            yield return 0x00000000FF7FFFFFul; // -Max Normal    (float.MinValue)
+            yield return 0x0000000080800000ul; // -Min Normal
+            yield return 0x00000000807FFFFFul; // -Max Subnormal
+            yield return 0x0000000080000001ul; // -Min Subnormal (-float.Epsilon)
+            yield return 0x000000007F7FFFFFul; // +Max Normal    (float.MaxValue)
+            yield return 0x0000000000800000ul; // +Min Normal
+            yield return 0x00000000007FFFFFul; // +Max Subnormal
+            yield return 0x0000000000000001ul; // +Min Subnormal (float.Epsilon)
 
             if (!NoZeros)
             {
-                yield return 0x80000000u; // -Zero
-                yield return 0x00000000u; // +Zero
+                yield return 0x0000000080000000ul; // -Zero
+                yield return 0x0000000000000000ul; // +Zero
             }
 
             if (!NoInfs)
             {
-                yield return 0xFF800000u; // -Infinity
-                yield return 0x7F800000u; // +Infinity
+                yield return 0x00000000FF800000ul; // -Infinity
+                yield return 0x000000007F800000ul; // +Infinity
             }
 
             if (!NoNaNs)
             {
-                yield return 0xFFC00000u; // -QNaN (all zeros payload) (float.NaN)
-                yield return 0xFFBFFFFFu; // -SNaN (all ones  payload)
-                yield return 0x7FC00000u; // +QNaN (all zeros payload) (-float.NaN) (DefaultNaN)
-                yield return 0x7FBFFFFFu; // +SNaN (all ones  payload)
+                yield return 0x00000000FFC00000ul; // -QNaN (all zeros payload) (float.NaN)
+                yield return 0x00000000FFBFFFFFul; // -SNaN (all ones  payload)
+                yield return 0x000000007FC00000ul; // +QNaN (all zeros payload) (-float.NaN) (DefaultNaN)
+                yield return 0x000000007FBFFFFFul; // +SNaN (all ones  payload)
             }
 
             for (int cnt = 1; cnt <= RndCnt; cnt++)
             {
-                yield return GenNormalS();
-                yield return GenSubnormalS();
+                ulong grbg = TestContext.CurrentContext.Random.NextUInt();
+                ulong rnd1 = GenNormalS();
+                ulong rnd2 = GenSubnormalS();
+
+                yield return (grbg << 32) | rnd1;
+                yield return (grbg << 32) | rnd2;
             }
         }
 
@@ -93,8 +97,11 @@ namespace Ryujinx.Tests.Cpu
 
             for (int cnt = 1; cnt <= RndCnt; cnt++)
             {
-                yield return GenNormalD();
-                yield return GenSubnormalD();
+                ulong rnd1 = GenNormalD();
+                ulong rnd2 = GenSubnormalD();
+
+                yield return rnd1;
+                yield return rnd2;
             }
         }
 #endregion
@@ -109,10 +116,10 @@ namespace Ryujinx.Tests.Cpu
         [Test, Pairwise, Description("VCVT.<dt>.F32 <Sd>, <Sm>")]
         public void Vcvt_F32_I32([Values(0u, 1u, 2u, 3u)] uint rd,
                                  [Values(0u, 1u, 2u, 3u)] uint rm,
-                                 [ValueSource(nameof(_1S_F_))] uint s0,
-                                 [ValueSource(nameof(_1S_F_))] uint s1,
-                                 [ValueSource(nameof(_1S_F_))] uint s2,
-                                 [ValueSource(nameof(_1S_F_))] uint s3,
+                                 [ValueSource(nameof(_1S_F_))] ulong s0,
+                                 [ValueSource(nameof(_1S_F_))] ulong s1,
+                                 [ValueSource(nameof(_1S_F_))] ulong s2,
+                                 [ValueSource(nameof(_1S_F_))] ulong s3,
                                  [Values] bool unsigned) // <U32, S32>
         {
             uint opcode = 0xeebc0ac0u; // VCVT.U32.F32 S0, S0
@@ -125,7 +132,7 @@ namespace Ryujinx.Tests.Cpu
             opcode |= ((rd & 0x1e) << 11) | ((rd & 0x1) << 22);
             opcode |= ((rm & 0x1e) >> 1) | ((rm & 0x1) << 5);
 
-            V128 v0 = MakeVectorE0E1E2E3(s0, s1, s2, s3);
+            V128 v0 = MakeVectorE0E1E2E3((uint)s0, (uint)s1, (uint)s2, (uint)s3);
 
             SingleOpcode(opcode, v0: v0);
 
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs b/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs
index 4298bd1f6f..e8298521a7 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs
@@ -22,6 +22,59 @@ namespace Ryujinx.Tests.Cpu
             };
         }
 
+        private static uint[] _Vfma_Vfms_Vfnma_Vfnms_S_F32_() // Value source: base opcodes for the scalar F32 fused tests; the test ORs register numbers into them.
+        {
+            return new uint[]
+            {
+                0xEEA00A00u, // VFMA. F32 S0, S0, S0
+                0xEEA00A40u, // VFMS. F32 S0, S0, S0
+                0xEE900A40u, // VFNMA.F32 S0, S0, S0
+                0xEE900A00u  // VFNMS.F32 S0, S0, S0
+            };
+        }
+
+        private static uint[] _Vfma_Vfms_Vfnma_Vfnms_S_F64_() // Value source: base opcodes for the scalar F64 fused tests; they differ from the F32 set only in bit 8.
+        {
+            return new uint[]
+            {
+                0xEEA00B00u, // VFMA. F64 D0, D0, D0
+                0xEEA00B40u, // VFMS. F64 D0, D0, D0
+                0xEE900B40u, // VFNMA.F64 D0, D0, D0
+                0xEE900B00u  // VFNMS.F64 D0, D0, D0
+            };
+        }
+
+        private static uint[] _Vfma_Vfms_V_F32_() // Value source: base opcodes for the vector F32 VFMA/VFMS test; the two entries differ only in bit 21.
+        {
+            return new uint[]
+            {
+                0xF2000C10u, // VFMA.F32 D0, D0, D0
+                0xF2200C10u  // VFMS.F32 D0, D0, D0
+            };
+        }
+
+        private static uint[] _Vmla_Vmls_Vnmla_Vnmls_S_F32_() // Value source: base opcodes for the scalar F32 multiply-accumulate tests; the test ORs register numbers into them.
+        {
+            return new uint[]
+            {
+                0xEE000A00u, // VMLA. F32 S0, S0, S0
+                0xEE000A40u, // VMLS. F32 S0, S0, S0
+                0xEE100A40u, // VNMLA.F32 S0, S0, S0
+                0xEE100A00u  // VNMLS.F32 S0, S0, S0
+            };
+        }
+
+        private static uint[] _Vmla_Vmls_Vnmla_Vnmls_S_F64_() // Value source: base opcodes for the scalar F64 multiply-accumulate tests; they differ from the F32 set only in bit 8.
+        {
+            return new uint[]
+            {
+                0xEE000B00u, // VMLA. F64 D0, D0, D0
+                0xEE000B40u, // VMLS. F64 D0, D0, D0
+                0xEE100B40u, // VNMLA.F64 D0, D0, D0
+                0xEE100B00u  // VNMLS.F64 D0, D0, D0
+            };
+        }
+
         private static uint[] _Vp_Add_Max_Min_F_()
         {
             return new uint[]
@@ -184,8 +237,8 @@ namespace Ryujinx.Tests.Cpu
         private const int RndCnt = 2;
 
         private static readonly bool NoZeros = false;
-        private static readonly bool NoInfs  = true;
-        private static readonly bool NoNaNs  = true;
+        private static readonly bool NoInfs  = false; // NOTE(review): presumably read by the value generators to decide whether infinities are emitted - confirm.
+        private static readonly bool NoNaNs  = false; // NOTE(review): presumably read by the value generators to decide whether NaNs are emitted - confirm.
 
         [Explicit]
         [Test, Pairwise, Description("VADD.f32 V0, V0, V0")]
@@ -293,119 +346,115 @@ namespace Ryujinx.Tests.Cpu
             CompareAgainstUnicorn(fpsrMask: Fpsr.Nzcv);
         }
 
-        [Test, Pairwise, Description("VFMA.F<size> <Vd>, <Vn>, <Vm>")]
-        public void Vfma([Values(0u, 1u)] uint rd,
-                         [Values(0u, 1u)] uint rn,
-                         [Values(0u, 1u)] uint rm,
-                         [Values(0u, 1u)] uint Q,
-                         [ValueSource("_2S_F_")] ulong z,
-                         [ValueSource("_2S_F_")] ulong a,
-                         [ValueSource("_2S_F_")] ulong b )
+        [Test, Pairwise] [Explicit] // Fused. Scalar F32 VFMA/VFMS/VFNMA/VFNMS; [Explicit] means it runs only when selected explicitly.
+        public void Vfma_Vfms_Vfnma_Vfnms_S_F32([ValueSource(nameof(_Vfma_Vfms_Vfnma_Vfnms_S_F32_))] uint opcode,
+                                                [Values(0u, 1u, 2u, 3u)] uint rd,
+                                                [Values(0u, 1u, 2u, 3u)] uint rn,
+                                                [Values(0u, 1u, 2u, 3u)] uint rm,
+                                                [ValueSource(nameof(_1S_F_))] ulong s0,
+                                                [ValueSource(nameof(_1S_F_))] ulong s1,
+                                                [ValueSource(nameof(_1S_F_))] ulong s2,
+                                                [ValueSource(nameof(_1S_F_))] ulong s3)
         {
-            uint opcode = 0xf2000c10;
-            
-            V128 v0;
-            V128 v1;
-            V128 v2;
+            opcode |= (((rd & 0x1) << 22) | (rd & 0x1e) << 11); // rd bit 0 -> opcode bit 22; rd bits 4:1 -> opcode bits 15:12.
+            opcode |= (((rn & 0x1) << 7)  | (rn & 0x1e) << 15); // rn bit 0 -> opcode bit 7;  rn bits 4:1 -> opcode bits 19:16.
+            opcode |= (((rm & 0x1) << 5)  | (rm & 0x1e) >> 1); // rm bit 0 -> opcode bit 5;  rm bits 4:1 -> opcode bits 3:0.
 
-            uint c = (uint) BitConverter.SingleToInt32Bits(z);
-            uint d = (uint) BitConverter.SingleToInt32Bits(a);
-            uint e = (uint) BitConverter.SingleToInt32Bits(b);
-            if (Q == 0)
-            {
-                opcode |= (((rm & 0x1) << 5) | (rm & 0x1e) >> 1);
-                opcode |= (((rd & 0x1) << 22) | (rd & 0x1e) << 11);
-                opcode |= (((rn & 0x1) << 7) | (rn & 0x1e) >> 15);
+            V128 v0 = MakeVectorE0E1E2E3((uint)s0, (uint)s1, (uint)s2, (uint)s3); // The ulong source values are cast to uint; rd/rn/rm range over 0..3, so all operands come from this one vector.
 
-                v0 = MakeVectorE0E1(c, c);
-                v1 = MakeVectorE0E1(d, c);
-                v2 = MakeVectorE0E1(e, c);
-            }
-            else
-            {
-                rd = rn = rm = 0; // Needed, as these values cannot be odd values if Q == 1.
-                opcode |= (((rm & 0x10) << 1) | (rm & 0xf) << 0);
-                opcode |= (((rd & 0x10) << 18) | (rd & 0xf) << 12);
-                opcode |= (((rn & 0x10) << 3) | (rn & 0xf) << 16);
+            SingleOpcode(opcode, v0: v0); // Only v0 is passed as initial vector state.
 
-                v0 = MakeVectorE0E1E2E3(c, c, d, e);
-                v1 = MakeVectorE0E1E2E3(d, c, e, c);
-                v2 = MakeVectorE0E1E2E3(e, c, d, c);
-            }
-
-            opcode |= ((Q & 1)  << 6);
-
-            SingleOpcode(opcode, v0: v0, v1: v1, v2: v2);
-            CompareAgainstUnicorn();
-        }
-        
-        [Test, Pairwise, Description("VFNMA.F<size> <Vd>, <Vn>, <Vm>")]
-        public void Vfnma([Values(0u, 1u)] uint rd,
-                          [Values(0u, 1u)] uint rn,
-                          [Values(0u, 1u)] uint rm,
-                          [Values(2u, 3u)] uint size,
-                          [ValueSource("_2S_F_")] ulong z,
-                          [ValueSource("_2S_F_")] ulong a,
-                          [ValueSource("_2S_F_")] ulong b)
-        {
-            uint opcode = 0xe900840;
-
-            if (size == 2)
-            {
-                opcode |= (((rm & 0x1) << 5) | (rm & 0x1e) >> 1);
-                opcode |= (((rd & 0x1) << 22) | (rd & 0x1e) << 11);
-                opcode |= (((rn & 0x1) << 7) | (rn & 0x1e) >> 15);
-
-            }
-            else
-            {
-                opcode |= (((rm & 0x10) << 1) | (rm & 0xf) << 0);
-                opcode |= (((rd & 0x10) << 18) | (rd & 0xf) << 12);
-                opcode |= (((rn & 0x10) << 3) | (rn & 0xf) << 16);
-            }
-
-            opcode |= ((size & 3) << 8);
-
-            V128 v0 = MakeVectorE0E1(z, z);
-            V128 v1 = MakeVectorE0E1(a, z);
-            V128 v2 = MakeVectorE0E1(b, z);
-
-            SingleOpcode(opcode, v0: v0, v1: v1, v2: v2);
             CompareAgainstUnicorn();
         }
 
-        [Test, Pairwise, Description("VFNMS.F<size> <Vd>, <Vn>, <Vm>")]
-        public void Vfnms([Values(0u, 1u)] uint rd,
-                          [Values(0u, 1u)] uint rn,
-                          [Values(0u, 1u)] uint rm,
-                          [Values(2u, 3u)] uint size,
-                          [ValueSource("_2S_F_")] ulong z,
-                          [ValueSource("_2S_F_")] ulong a,
-                          [ValueSource("_2S_F_")] ulong b)
+        [Test, Pairwise] [Explicit] // Fused. Scalar F64 VFMA/VFMS/VFNMA/VFNMS; [Explicit] means it runs only when selected explicitly.
+        public void Vfma_Vfms_Vfnma_Vfnms_S_F64([ValueSource(nameof(_Vfma_Vfms_Vfnma_Vfnms_S_F64_))] uint opcode,
+                                                [Values(0u, 1u)] uint rd,
+                                                [Values(0u, 1u)] uint rn,
+                                                [Values(0u, 1u)] uint rm,
+                                                [ValueSource(nameof(_1D_F_))] ulong d0,
+                                                [ValueSource(nameof(_1D_F_))] ulong d1)
         {
-            uint opcode = 0xee900a00;
+            opcode |= (((rd & 0x10) << 18) | (rd & 0xf) << 12); // rd bit 4 -> opcode bit 22 (always 0 here, rd is 0 or 1); rd bits 3:0 -> opcode bits 15:12.
+            opcode |= (((rn & 0x10) << 3)  | (rn & 0xf) << 16); // rn bit 4 -> opcode bit 7;  rn bits 3:0 -> opcode bits 19:16.
+            opcode |= (((rm & 0x10) << 1)  | (rm & 0xf) << 0); // rm bit 4 -> opcode bit 5;  rm bits 3:0 -> opcode bits 3:0.
 
-            if (size == 2)
+            V128 v0 = MakeVectorE0E1(d0, d1); // Both 64-bit inputs go into v0; register numbers are limited to 0 and 1.
+
+            SingleOpcode(opcode, v0: v0); // Only v0 is passed as initial vector state.
+
+            CompareAgainstUnicorn();
+        }
+
+        [Test, Pairwise] [Explicit] // Fused. Vector F32 VFMA/VFMS with bit 6 clear or set; [Explicit] means it runs only when selected explicitly.
+        public void Vfma_Vfms_V_F32([ValueSource(nameof(_Vfma_Vfms_V_F32_))] uint opcode,
+                                    [Values(0u, 1u, 2u, 3u)] uint rd,
+                                    [Values(0u, 1u, 2u, 3u)] uint rn,
+                                    [Values(0u, 1u, 2u, 3u)] uint rm,
+                                    [ValueSource(nameof(_2S_F_))] ulong d0,
+                                    [ValueSource(nameof(_2S_F_))] ulong d1,
+                                    [ValueSource(nameof(_2S_F_))] ulong d2,
+                                    [ValueSource(nameof(_2S_F_))] ulong d3,
+                                    [Values] bool q)
+        {
+            if (q)
             {
-                opcode |= (((rm & 0x1) << 5)  | (rm & 0x1e) >> 1);
-                opcode |= (((rd & 0x1) << 22) | (rd & 0x1e) << 11);
-                opcode |= (((rn & 0x1) << 7)  | (rn & 0x1e) >> 15);
-               
-            }
-            else
-            {
-                opcode |= (((rm & 0x10) << 1)  | (rm & 0xf) << 0);
-                opcode |= (((rd & 0x10) << 18) | (rd & 0xf) << 12);
-                opcode |= (((rn & 0x10) << 3)  | (rn & 0xf) << 16);
+                opcode |= 1 << 6; // Set opcode bit 6 when q is true.
+
+                rd >>= 1; rd <<= 1; // Clear bit 0 so the register number is even (0 or 2) when q is set.
+                rn >>= 1; rn <<= 1; // Same rounding down to an even number for rn.
+                rm >>= 1; rm <<= 1; // Same rounding down to an even number for rm.
             }
 
-            opcode |= ((size & 3) << 8);
+            opcode |= ((rd & 0xf) << 12) | ((rd & 0x10) << 18); // rd bits 3:0 -> opcode bits 15:12; rd bit 4 -> opcode bit 22.
+            opcode |= ((rn & 0xf) << 16) | ((rn & 0x10) << 3); // rn bits 3:0 -> opcode bits 19:16; rn bit 4 -> opcode bit 7.
+            opcode |= ((rm & 0xf) << 0)  | ((rm & 0x10) << 1); // rm bits 3:0 -> opcode bits 3:0;   rm bit 4 -> opcode bit 5.
 
-            V128 v0 = MakeVectorE0E1(z, z);
-            V128 v1 = MakeVectorE0E1(a, z);
-            V128 v2 = MakeVectorE0E1(b, z);
+            V128 v0 = MakeVectorE0E1(d0, d1); // First two 64-bit inputs.
+            V128 v1 = MakeVectorE0E1(d2, d3); // Last two 64-bit inputs; register numbers range up to 3, hence a second vector.
+
+            SingleOpcode(opcode, v0: v0, v1: v1); // v0 and v1 are passed as initial vector state.
+
+            CompareAgainstUnicorn();
+        }
+
+        [Test, Pairwise] [Explicit] // Scalar F32 VMLA/VMLS/VNMLA/VNMLS; [Explicit] means it runs only when selected explicitly.
+        public void Vmla_Vmls_Vnmla_Vnmls_S_F32([ValueSource(nameof(_Vmla_Vmls_Vnmla_Vnmls_S_F32_))] uint opcode,
+                                                [Values(0u, 1u, 2u, 3u)] uint rd,
+                                                [Values(0u, 1u, 2u, 3u)] uint rn,
+                                                [Values(0u, 1u, 2u, 3u)] uint rm,
+                                                [ValueSource(nameof(_1S_F_))] ulong s0,
+                                                [ValueSource(nameof(_1S_F_))] ulong s1,
+                                                [ValueSource(nameof(_1S_F_))] ulong s2,
+                                                [ValueSource(nameof(_1S_F_))] ulong s3)
+        {
+            opcode |= (((rd & 0x1) << 22) | (rd & 0x1e) << 11); // rd bit 0 -> opcode bit 22; rd bits 4:1 -> opcode bits 15:12.
+            opcode |= (((rn & 0x1) << 7)  | (rn & 0x1e) << 15); // rn bit 0 -> opcode bit 7;  rn bits 4:1 -> opcode bits 19:16.
+            opcode |= (((rm & 0x1) << 5)  | (rm & 0x1e) >> 1); // rm bit 0 -> opcode bit 5;  rm bits 4:1 -> opcode bits 3:0.
+
+            V128 v0 = MakeVectorE0E1E2E3((uint)s0, (uint)s1, (uint)s2, (uint)s3); // The ulong source values are cast to uint; rd/rn/rm range over 0..3, so all operands come from this one vector.
+
+            SingleOpcode(opcode, v0: v0); // Only v0 is passed as initial vector state.
+
+            CompareAgainstUnicorn();
+        }
+
+        [Test, Pairwise] [Explicit] // Scalar F64 VMLA/VMLS/VNMLA/VNMLS; [Explicit] means it runs only when selected explicitly.
+        public void Vmla_Vmls_Vnmla_Vnmls_S_F64([ValueSource(nameof(_Vmla_Vmls_Vnmla_Vnmls_S_F64_))] uint opcode,
+                                                [Values(0u, 1u)] uint rd,
+                                                [Values(0u, 1u)] uint rn,
+                                                [Values(0u, 1u)] uint rm,
+                                                [ValueSource(nameof(_1D_F_))] ulong d0,
+                                                [ValueSource(nameof(_1D_F_))] ulong d1)
+        {
+            opcode |= (((rd & 0x10) << 18) | (rd & 0xf) << 12); // rd bit 4 -> opcode bit 22 (always 0 here, rd is 0 or 1); rd bits 3:0 -> opcode bits 15:12.
+            opcode |= (((rn & 0x10) << 3)  | (rn & 0xf) << 16); // rn bit 4 -> opcode bit 7;  rn bits 3:0 -> opcode bits 19:16.
+            opcode |= (((rm & 0x10) << 1)  | (rm & 0xf) << 0); // rm bit 4 -> opcode bit 5;  rm bits 3:0 -> opcode bits 3:0.
+
+            V128 v0 = MakeVectorE0E1(d0, d1); // Both 64-bit inputs go into v0; register numbers are limited to 0 and 1.
+
+            SingleOpcode(opcode, v0: v0); // Only v0 is passed as initial vector state.
 
-            SingleOpcode(opcode, v0: v0, v1: v1, v2: v2);
             CompareAgainstUnicorn();
         }