diff --git a/ARMeilleure/CodeGen/Arm64/IntrinsicTable.cs b/ARMeilleure/CodeGen/Arm64/IntrinsicTable.cs
index 53ef152e57..a309d56d95 100644
--- a/ARMeilleure/CodeGen/Arm64/IntrinsicTable.cs
+++ b/ARMeilleure/CodeGen/Arm64/IntrinsicTable.cs
@@ -226,6 +226,8 @@ namespace ARMeilleure.CodeGen.Arm64
             Add(Intrinsic.Arm64MlsVe,         new IntrinsicInfo(0x2f004000u, IntrinsicType.VectorTernaryRdByElem));
             Add(Intrinsic.Arm64MlsV,          new IntrinsicInfo(0x2e209400u, IntrinsicType.VectorTernaryRd));
             Add(Intrinsic.Arm64MoviV,         new IntrinsicInfo(0x0f000400u, IntrinsicType.VectorMovi));
+            Add(Intrinsic.Arm64MrsFpcr,       new IntrinsicInfo(0xd53b4400u, IntrinsicType.GetRegister));
+            Add(Intrinsic.Arm64MsrFpcr,       new IntrinsicInfo(0xd51b4400u, IntrinsicType.SetRegister));
             Add(Intrinsic.Arm64MrsFpsr,       new IntrinsicInfo(0xd53b4420u, IntrinsicType.GetRegister));
             Add(Intrinsic.Arm64MsrFpsr,       new IntrinsicInfo(0xd51b4420u, IntrinsicType.SetRegister));
             Add(Intrinsic.Arm64MulVe,         new IntrinsicInfo(0x0f008000u, IntrinsicType.VectorBinaryByElem));
diff --git a/ARMeilleure/CodeGen/X86/AssemblerTable.cs b/ARMeilleure/CodeGen/X86/AssemblerTable.cs
index b47b3ecd1a..e6a2ff07f9 100644
--- a/ARMeilleure/CodeGen/X86/AssemblerTable.cs
+++ b/ARMeilleure/CodeGen/X86/AssemblerTable.cs
@@ -268,11 +268,13 @@ namespace ARMeilleure.CodeGen.X86
             Add(X86Instruction.Vblendvps,     new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f3a4a, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vcvtph2ps,     new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f3813, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vcvtps2ph,     new InstructionInfo(0x000f3a1d, BadOp,      BadOp,      BadOp,      BadOp,      InstructionFlags.Vex | InstructionFlags.Prefix66));
+            Add(X86Instruction.Vfmadd231pd,   new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38b8, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
             Add(X86Instruction.Vfmadd231ps,   new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38b8, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vfmadd231sd,   new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38b9, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
             Add(X86Instruction.Vfmadd231ss,   new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38b9, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vfmsub231sd,   new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38bb, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
             Add(X86Instruction.Vfmsub231ss,   new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38bb, InstructionFlags.Vex | InstructionFlags.Prefix66));
+            Add(X86Instruction.Vfnmadd231pd,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38bc, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
             Add(X86Instruction.Vfnmadd231ps,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38bc, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vfnmadd231sd,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38bd, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
             Add(X86Instruction.Vfnmadd231ss,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38bd, InstructionFlags.Vex | InstructionFlags.Prefix66));
diff --git a/ARMeilleure/CodeGen/X86/CodeGenerator.cs b/ARMeilleure/CodeGen/X86/CodeGenerator.cs
index 8b5a3fc577..e7179b517e 100644
--- a/ARMeilleure/CodeGen/X86/CodeGenerator.cs
+++ b/ARMeilleure/CodeGen/X86/CodeGenerator.cs
@@ -249,10 +249,9 @@ namespace ARMeilleure.CodeGen.X86
                     case IntrinsicType.Mxcsr:
                     {
                         Operand offset = operation.GetSource(0);
-                        Operand bits   = operation.GetSource(1);
 
-                        Debug.Assert(offset.Kind == OperandKind.Constant && bits.Kind == OperandKind.Constant);
-                        Debug.Assert(offset.Type == OperandType.I32 && bits.Type == OperandType.I32);
+                        Debug.Assert(offset.Kind == OperandKind.Constant);
+                        Debug.Assert(offset.Type == OperandType.I32);
 
                         int offs = offset.AsInt32() + context.CallArgsRegionSize;
 
@@ -261,21 +260,23 @@ namespace ARMeilleure.CodeGen.X86
 
                         Debug.Assert(HardwareCapabilities.SupportsSse || HardwareCapabilities.SupportsVexEncoding);
 
-                        context.Assembler.Stmxcsr(memOp);
-
-                        if (operation.Intrinsic == Intrinsic.X86Mxcsrmb)
+                        if (operation.Intrinsic == Intrinsic.X86Ldmxcsr)
                         {
-                            context.Assembler.Or(memOp, bits, OperandType.I32);
+                            Operand bits = operation.GetSource(1);
+                            Debug.Assert(bits.Type == OperandType.I32);
+
+                            context.Assembler.Mov(memOp, bits, OperandType.I32);
+                            context.Assembler.Ldmxcsr(memOp);
                         }
-                        else /* if (intrinOp.Intrinsic == Intrinsic.X86Mxcsrub) */
+                        else if (operation.Intrinsic == Intrinsic.X86Stmxcsr)
                         {
-                            Operand notBits = Const(~bits.AsInt32());
+                            Operand dest = operation.Destination;
+                            Debug.Assert(dest.Type == OperandType.I32);
 
-                            context.Assembler.And(memOp, notBits, OperandType.I32);
+                            context.Assembler.Stmxcsr(memOp);
+                            context.Assembler.Mov(dest, memOp, OperandType.I32);
                         }
 
-                        context.Assembler.Ldmxcsr(memOp);
-
                         break;
                     }
 
diff --git a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
index c788fa4424..e3d94b7ae3 100644
--- a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
+++ b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
@@ -60,6 +60,7 @@ namespace ARMeilleure.CodeGen.X86
             Add(Intrinsic.X86Haddpd,        new IntrinsicInfo(X86Instruction.Haddpd,        IntrinsicType.Binary));
             Add(Intrinsic.X86Haddps,        new IntrinsicInfo(X86Instruction.Haddps,        IntrinsicType.Binary));
             Add(Intrinsic.X86Insertps,      new IntrinsicInfo(X86Instruction.Insertps,      IntrinsicType.TernaryImm));
+            Add(Intrinsic.X86Ldmxcsr,       new IntrinsicInfo(X86Instruction.None,          IntrinsicType.Mxcsr));
             Add(Intrinsic.X86Maxpd,         new IntrinsicInfo(X86Instruction.Maxpd,         IntrinsicType.Binary));
             Add(Intrinsic.X86Maxps,         new IntrinsicInfo(X86Instruction.Maxps,         IntrinsicType.Binary));
             Add(Intrinsic.X86Maxsd,         new IntrinsicInfo(X86Instruction.Maxsd,         IntrinsicType.Binary));
@@ -75,8 +76,6 @@ namespace ARMeilleure.CodeGen.X86
             Add(Intrinsic.X86Mulps,         new IntrinsicInfo(X86Instruction.Mulps,         IntrinsicType.Binary));
             Add(Intrinsic.X86Mulsd,         new IntrinsicInfo(X86Instruction.Mulsd,         IntrinsicType.Binary));
             Add(Intrinsic.X86Mulss,         new IntrinsicInfo(X86Instruction.Mulss,         IntrinsicType.Binary));
-            Add(Intrinsic.X86Mxcsrmb,       new IntrinsicInfo(X86Instruction.None,          IntrinsicType.Mxcsr)); // Mask bits.
-            Add(Intrinsic.X86Mxcsrub,       new IntrinsicInfo(X86Instruction.None,          IntrinsicType.Mxcsr)); // Unmask bits.
             Add(Intrinsic.X86Paddb,         new IntrinsicInfo(X86Instruction.Paddb,         IntrinsicType.Binary));
             Add(Intrinsic.X86Paddd,         new IntrinsicInfo(X86Instruction.Paddd,         IntrinsicType.Binary));
             Add(Intrinsic.X86Paddq,         new IntrinsicInfo(X86Instruction.Paddq,         IntrinsicType.Binary));
@@ -160,6 +159,7 @@ namespace ARMeilleure.CodeGen.X86
             Add(Intrinsic.X86Sqrtps,        new IntrinsicInfo(X86Instruction.Sqrtps,        IntrinsicType.Unary));
             Add(Intrinsic.X86Sqrtsd,        new IntrinsicInfo(X86Instruction.Sqrtsd,        IntrinsicType.Unary));
             Add(Intrinsic.X86Sqrtss,        new IntrinsicInfo(X86Instruction.Sqrtss,        IntrinsicType.Unary));
+            Add(Intrinsic.X86Stmxcsr,       new IntrinsicInfo(X86Instruction.None,          IntrinsicType.Mxcsr));
             Add(Intrinsic.X86Subpd,         new IntrinsicInfo(X86Instruction.Subpd,         IntrinsicType.Binary));
             Add(Intrinsic.X86Subps,         new IntrinsicInfo(X86Instruction.Subps,         IntrinsicType.Binary));
             Add(Intrinsic.X86Subsd,         new IntrinsicInfo(X86Instruction.Subsd,         IntrinsicType.Binary));
@@ -170,11 +170,13 @@ namespace ARMeilleure.CodeGen.X86
             Add(Intrinsic.X86Unpcklps,      new IntrinsicInfo(X86Instruction.Unpcklps,      IntrinsicType.Binary));
             Add(Intrinsic.X86Vcvtph2ps,     new IntrinsicInfo(X86Instruction.Vcvtph2ps,     IntrinsicType.Unary));
             Add(Intrinsic.X86Vcvtps2ph,     new IntrinsicInfo(X86Instruction.Vcvtps2ph,     IntrinsicType.BinaryImm));
+            Add(Intrinsic.X86Vfmadd231pd,   new IntrinsicInfo(X86Instruction.Vfmadd231pd,   IntrinsicType.Fma));
             Add(Intrinsic.X86Vfmadd231ps,   new IntrinsicInfo(X86Instruction.Vfmadd231ps,   IntrinsicType.Fma));
             Add(Intrinsic.X86Vfmadd231sd,   new IntrinsicInfo(X86Instruction.Vfmadd231sd,   IntrinsicType.Fma));
             Add(Intrinsic.X86Vfmadd231ss,   new IntrinsicInfo(X86Instruction.Vfmadd231ss,   IntrinsicType.Fma));
             Add(Intrinsic.X86Vfmsub231sd,   new IntrinsicInfo(X86Instruction.Vfmsub231sd,   IntrinsicType.Fma));
             Add(Intrinsic.X86Vfmsub231ss,   new IntrinsicInfo(X86Instruction.Vfmsub231ss,   IntrinsicType.Fma));
+            Add(Intrinsic.X86Vfnmadd231pd,  new IntrinsicInfo(X86Instruction.Vfnmadd231pd,  IntrinsicType.Fma));
             Add(Intrinsic.X86Vfnmadd231ps,  new IntrinsicInfo(X86Instruction.Vfnmadd231ps,  IntrinsicType.Fma));
             Add(Intrinsic.X86Vfnmadd231sd,  new IntrinsicInfo(X86Instruction.Vfnmadd231sd,  IntrinsicType.Fma));
             Add(Intrinsic.X86Vfnmadd231ss,  new IntrinsicInfo(X86Instruction.Vfnmadd231ss,  IntrinsicType.Fma));
diff --git a/ARMeilleure/CodeGen/X86/Mxcsr.cs b/ARMeilleure/CodeGen/X86/Mxcsr.cs
new file mode 100644
index 0000000000..c61eac31aa
--- /dev/null
+++ b/ARMeilleure/CodeGen/X86/Mxcsr.cs
@@ -0,0 +1,15 @@
+using System;
+
+namespace ARMeilleure.CodeGen.X86
+{
+    [Flags]
+    enum Mxcsr // Single-bit masks for fields of the x86 MXCSR register; combine with bitwise OR.
+    {
+        Ftz = 1 << 15, // Flush To Zero (bit 15).
+        Rhi = 1 << 14, // Round Mode high bit (bit 14).
+        Rlo = 1 << 13, // Round Mode low bit (bit 13).
+        Um = 1 << 11,  // Underflow Mask (bit 11).
+        Dm = 1 << 8,   // Denormal Mask (bit 8).
+        Daz = 1 << 6   // Denormals Are Zero (bit 6).
+    }
+}
diff --git a/ARMeilleure/CodeGen/X86/PreAllocator.cs b/ARMeilleure/CodeGen/X86/PreAllocator.cs
index 72f56514f1..cb742d67f0 100644
--- a/ARMeilleure/CodeGen/X86/PreAllocator.cs
+++ b/ARMeilleure/CodeGen/X86/PreAllocator.cs
@@ -120,12 +120,18 @@ namespace ARMeilleure.CodeGen.X86
                             break;
 
                         case Instruction.Extended:
-                            if (node.Intrinsic == Intrinsic.X86Mxcsrmb || node.Intrinsic == Intrinsic.X86Mxcsrub)
+                            if (node.Intrinsic == Intrinsic.X86Ldmxcsr)
                             {
                                 int stackOffset = stackAlloc.Allocate(OperandType.I32);
 
                                 node.SetSources(new Operand[] { Const(stackOffset), node.GetSource(0) });
                             }
+                            else if (node.Intrinsic == Intrinsic.X86Stmxcsr)
+                            {
+                                int stackOffset = stackAlloc.Allocate(OperandType.I32);
+
+                                node.SetSources(new Operand[] { Const(stackOffset) });
+                            }
                             break;
                     }
                 }
diff --git a/ARMeilleure/CodeGen/X86/X86Instruction.cs b/ARMeilleure/CodeGen/X86/X86Instruction.cs
index ecfc432d70..9a85c516f8 100644
--- a/ARMeilleure/CodeGen/X86/X86Instruction.cs
+++ b/ARMeilleure/CodeGen/X86/X86Instruction.cs
@@ -208,11 +208,13 @@ namespace ARMeilleure.CodeGen.X86
         Vblendvps,
         Vcvtph2ps,
         Vcvtps2ph,
+        Vfmadd231pd,
         Vfmadd231ps,
         Vfmadd231sd,
         Vfmadd231ss,
         Vfmsub231sd,
         Vfmsub231ss,
+        Vfnmadd231pd,
         Vfnmadd231ps,
         Vfnmadd231sd,
         Vfnmadd231ss,
diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
index d0bb68e4f7..7e7f26b1a2 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
@@ -615,14 +615,11 @@ namespace ARMeilleure.Instructions
                 {
                     return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                     {
-                        return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                        {
-                            IOpCodeSimd op = (IOpCodeSimd)context.CurrOp;
+                        IOpCodeSimd op = (IOpCodeSimd)context.CurrOp;
 
-                            Intrinsic addInst = (op.Size & 1) == 0 ? Intrinsic.X86Addps : Intrinsic.X86Addpd;
+                        Intrinsic addInst = (op.Size & 1) == 0 ? Intrinsic.X86Addps : Intrinsic.X86Addpd;
 
-                            return context.AddIntrinsic(addInst, op1, op2);
-                        }, scalar: false, op1, op2);
+                        return context.AddIntrinsic(addInst, op1, op2);
                     }, scalar: false, op1, op2);
                 });
             }
@@ -696,17 +693,33 @@ namespace ARMeilleure.Instructions
                 Operand n = GetVec(op.Rn);
                 Operand m = GetVec(op.Rm);
 
+                Operand res;
+
                 if (op.Size == 0)
                 {
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
-                            res = context.AddIntrinsic(Intrinsic.X86Addss, a, res);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfmadd231ss, a, n, m);
+                    }
+                    else
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Addss, a, res);
+                    }
 
                     context.Copy(d, context.VectorZeroUpper96(res));
                 }
                 else /* if (op.Size == 1) */
                 {
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
-                            res = context.AddIntrinsic(Intrinsic.X86Addsd, a, res);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfmadd231sd, a, n, m);
+                    }
+                    else
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Addsd, a, res);
+                    }
 
                     context.Copy(d, context.VectorZeroUpper64(res));
                 }
@@ -730,10 +743,7 @@ namespace ARMeilleure.Instructions
             {
                 EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                 {
-                    return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                    {
-                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
-                    }, scalar: true, op1, op2);
+                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
                 }, scalar: true);
             }
             else
@@ -755,10 +765,7 @@ namespace ARMeilleure.Instructions
             {
                 EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                 {
-                    return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                    {
-                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
-                    }, scalar: false, op1, op2);
+                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
                 }, scalar: false);
             }
             else
@@ -886,10 +893,7 @@ namespace ARMeilleure.Instructions
                 {
                     return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                     {
-                        return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                        {
-                            return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
-                        }, scalar: false, op1, op2);
+                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
                     }, scalar: false, op1, op2);
                 });
             }
@@ -914,10 +918,7 @@ namespace ARMeilleure.Instructions
                 {
                     return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                     {
-                        return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                        {
-                            return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
-                        }, scalar: false, op1, op2);
+                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
                     }, scalar: false, op1, op2);
                 });
             }
@@ -940,10 +941,7 @@ namespace ARMeilleure.Instructions
             {
                 EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                 {
-                    return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                    {
-                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
-                    }, scalar: true, op1, op2);
+                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
                 }, scalar: true);
             }
             else
@@ -965,10 +963,7 @@ namespace ARMeilleure.Instructions
             {
                 EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                 {
-                    return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                    {
-                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
-                    }, scalar: false, op1, op2);
+                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
                 }, scalar: false);
             }
             else
@@ -1096,10 +1091,7 @@ namespace ARMeilleure.Instructions
                 {
                     return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                     {
-                        return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                        {
-                            return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
-                        }, scalar: false, op1, op2);
+                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
                     }, scalar: false, op1, op2);
                 });
             }
@@ -1124,10 +1116,7 @@ namespace ARMeilleure.Instructions
                 {
                     return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                     {
-                        return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                        {
-                            return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
-                        }, scalar: false, op1, op2);
+                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
                     }, scalar: false, op1, op2);
                 });
             }
@@ -1146,6 +1135,37 @@ namespace ARMeilleure.Instructions
             {
                 InstEmitSimdHelperArm64.EmitScalarTernaryOpFRdByElem(context, Intrinsic.Arm64FmlaSe);
             }
+            else if (Optimizations.UseFma)
+            {
+                OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp;
+
+                Operand d = GetVec(op.Rd);
+                Operand n = GetVec(op.Rn);
+                Operand m = GetVec(op.Rm);
+
+                int sizeF = op.Size & 1;
+
+                if (sizeF == 0)
+                {
+                    int shuffleMask = op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6;
+
+                    Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask));
+
+                    res = context.AddIntrinsic(Intrinsic.X86Vfmadd231ss, d, n, res);
+
+                    context.Copy(d, context.VectorZeroUpper96(res));
+                }
+                else /* if (sizeF == 1) */
+                {
+                    int shuffleMask = op.Index | op.Index << 1;
+
+                    Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask));
+
+                    res = context.AddIntrinsic(Intrinsic.X86Vfmadd231sd, d, n, res);
+
+                    context.Copy(d, context.VectorZeroUpper64(res));
+                }
+            }
             else
             {
                 EmitScalarTernaryOpByElemF(context, (op1, op2, op3) =>
@@ -1171,11 +1191,19 @@ namespace ARMeilleure.Instructions
 
                 int sizeF = op.Size & 1;
 
+                Operand res;
+
                 if (sizeF == 0)
                 {
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
-
-                    res = context.AddIntrinsic(Intrinsic.X86Addps, d, res);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfmadd231ps, d, n, m);
+                    }
+                    else
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Addps, d, res);
+                    }
 
                     if (op.RegisterSize == RegisterSize.Simd64)
                     {
@@ -1186,9 +1214,15 @@ namespace ARMeilleure.Instructions
                 }
                 else /* if (sizeF == 1) */
                 {
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
-
-                    res = context.AddIntrinsic(Intrinsic.X86Addpd, d, res);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfmadd231pd, d, n, m);
+                    }
+                    else
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Addpd, d, res);
+                    }
 
                     context.Copy(d, res);
                 }
@@ -1224,8 +1258,15 @@ namespace ARMeilleure.Instructions
 
                     Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask));
 
-                    res = context.AddIntrinsic(Intrinsic.X86Mulps, n, res);
-                    res = context.AddIntrinsic(Intrinsic.X86Addps, d, res);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfmadd231ps, d, n, res);
+                    }
+                    else
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Mulps, n, res);
+                        res = context.AddIntrinsic(Intrinsic.X86Addps, d, res);
+                    }
 
                     if (op.RegisterSize == RegisterSize.Simd64)
                     {
@@ -1240,8 +1281,15 @@ namespace ARMeilleure.Instructions
 
                     Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask));
 
-                    res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, res);
-                    res = context.AddIntrinsic(Intrinsic.X86Addpd, d, res);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfmadd231pd, d, n, res);
+                    }
+                    else
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, res);
+                        res = context.AddIntrinsic(Intrinsic.X86Addpd, d, res);
+                    }
 
                     context.Copy(d, res);
                 }
@@ -1261,6 +1309,37 @@ namespace ARMeilleure.Instructions
             {
                 InstEmitSimdHelperArm64.EmitScalarTernaryOpFRdByElem(context, Intrinsic.Arm64FmlsSe);
             }
+            else if (Optimizations.UseFma)
+            {
+                OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp;
+
+                Operand d = GetVec(op.Rd);
+                Operand n = GetVec(op.Rn);
+                Operand m = GetVec(op.Rm);
+
+                int sizeF = op.Size & 1;
+
+                if (sizeF == 0)
+                {
+                    int shuffleMask = op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6;
+
+                    Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask));
+
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ss, d, n, res);
+
+                    context.Copy(d, context.VectorZeroUpper96(res));
+                }
+                else /* if (sizeF == 1) */
+                {
+                    int shuffleMask = op.Index | op.Index << 1;
+
+                    Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask));
+
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231sd, d, n, res);
+
+                    context.Copy(d, context.VectorZeroUpper64(res));
+                }
+            }
             else
             {
                 EmitScalarTernaryOpByElemF(context, (op1, op2, op3) =>
@@ -1286,11 +1365,19 @@ namespace ARMeilleure.Instructions
 
                 int sizeF = op.Size & 1;
 
+                Operand res;
+
                 if (sizeF == 0)
                 {
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
-
-                    res = context.AddIntrinsic(Intrinsic.X86Subps, d, res);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ps, d, n, m);
+                    }
+                    else
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Subps, d, res);
+                    }
 
                     if (op.RegisterSize == RegisterSize.Simd64)
                     {
@@ -1301,9 +1388,15 @@ namespace ARMeilleure.Instructions
                 }
                 else /* if (sizeF == 1) */
                 {
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
-
-                    res = context.AddIntrinsic(Intrinsic.X86Subpd, d, res);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231pd, d, n, m);
+                    }
+                    else
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Subpd, d, res);
+                    }
 
                     context.Copy(d, res);
                 }
@@ -1339,8 +1432,15 @@ namespace ARMeilleure.Instructions
 
                     Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask));
 
-                    res = context.AddIntrinsic(Intrinsic.X86Mulps, n, res);
-                    res = context.AddIntrinsic(Intrinsic.X86Subps, d, res);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ps, d, n, res);
+                    }
+                    else
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Mulps, n, res);
+                        res = context.AddIntrinsic(Intrinsic.X86Subps, d, res);
+                    }
 
                     if (op.RegisterSize == RegisterSize.Simd64)
                     {
@@ -1355,8 +1455,15 @@ namespace ARMeilleure.Instructions
 
                     Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask));
 
-                    res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, res);
-                    res = context.AddIntrinsic(Intrinsic.X86Subpd, d, res);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231pd, d, n, res);
+                    }
+                    else
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, res);
+                        res = context.AddIntrinsic(Intrinsic.X86Subpd, d, res);
+                    }
 
                     context.Copy(d, res);
                 }
@@ -1385,17 +1492,33 @@ namespace ARMeilleure.Instructions
                 Operand n = GetVec(op.Rn);
                 Operand m = GetVec(op.Rm);
 
+                Operand res;
+
                 if (op.Size == 0)
                 {
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
-                            res = context.AddIntrinsic(Intrinsic.X86Subss, a, res);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ss, a, n, m);
+                    }
+                    else
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Subss, a, res);
+                    }
 
                     context.Copy(d, context.VectorZeroUpper96(res));
                 }
                 else /* if (op.Size == 1) */
                 {
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
-                            res = context.AddIntrinsic(Intrinsic.X86Subsd, a, res);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231sd, a, n, m);
+                    }
+                    else
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Subsd, a, res);
+                    }
 
                     context.Copy(d, context.VectorZeroUpper64(res));
                 }
@@ -1669,25 +1792,39 @@ namespace ARMeilleure.Instructions
                 Operand n = GetVec(op.Rn);
                 Operand m = GetVec(op.Rm);
 
+                Operand res;
+
                 if (op.Size == 0)
                 {
-                    Operand mask = X86GetScalar(context, -0f);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfnmsub231ss, a, n, m);
+                    }
+                    else
+                    {
+                        Operand mask = X86GetScalar(context, -0f);
+                        Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorps, mask, a);
 
-                    Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorps, mask, a);
-
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
-                            res = context.AddIntrinsic(Intrinsic.X86Subss, aNeg, res);
+                        res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Subss, aNeg, res);
+                    }
 
                     context.Copy(d, context.VectorZeroUpper96(res));
                 }
                 else /* if (op.Size == 1) */
                 {
-                    Operand mask = X86GetScalar(context, -0d);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfnmsub231sd, a, n, m);
+                    }
+                    else
+                    {
+                        Operand mask = X86GetScalar(context, -0d);
+                        Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, a);
 
-                    Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, a);
-
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
-                            res = context.AddIntrinsic(Intrinsic.X86Subsd, aNeg, res);
+                        res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Subsd, aNeg, res);
+                    }
 
                     context.Copy(d, context.VectorZeroUpper64(res));
                 }
@@ -1716,25 +1853,39 @@ namespace ARMeilleure.Instructions
                 Operand n = GetVec(op.Rn);
                 Operand m = GetVec(op.Rm);
 
+                Operand res;
+
                 if (op.Size == 0)
                 {
-                    Operand mask = X86GetScalar(context, -0f);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfmsub231ss, a, n, m);
+                    }
+                    else
+                    {
+                        Operand mask = X86GetScalar(context, -0f);
+                        Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorps, mask, a);
 
-                    Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorps, mask, a);
-
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
-                            res = context.AddIntrinsic(Intrinsic.X86Addss, aNeg, res);
+                        res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Addss, aNeg, res);
+                    }
 
                     context.Copy(d, context.VectorZeroUpper96(res));
                 }
                 else /* if (op.Size == 1) */
                 {
-                    Operand mask = X86GetScalar(context, -0d);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfmsub231sd, a, n, m);
+                    }
+                    else
+                    {
+                        Operand mask = X86GetScalar(context, -0d);
+                        Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, a);
 
-                    Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, a);
-
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
-                            res = context.AddIntrinsic(Intrinsic.X86Addsd, aNeg, res);
+                        res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Addsd, aNeg, res);
+                    }
 
                     context.Copy(d, context.VectorZeroUpper64(res));
                 }
@@ -1830,13 +1981,22 @@ namespace ARMeilleure.Instructions
 
                 int sizeF = op.Size & 1;
 
+                Operand res;
+
                 if (sizeF == 0)
                 {
                     Operand mask = X86GetScalar(context, 2f);
 
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ss, mask, n, m);
+                    }
+                    else
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Subss, mask, res);
+                    }
 
-                    res = context.AddIntrinsic(Intrinsic.X86Subss, mask, res);
                     res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: true, sizeF);
 
                     context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res));
@@ -1845,9 +2005,16 @@ namespace ARMeilleure.Instructions
                 {
                     Operand mask = X86GetScalar(context, 2d);
 
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231sd, mask, n, m);
+                    }
+                    else
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Subsd, mask, res);
+                    }
 
-                    res = context.AddIntrinsic(Intrinsic.X86Subsd, mask, res);
                     res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: true, sizeF);
 
                     context.Copy(GetVec(op.Rd), context.VectorZeroUpper64(res));
@@ -1877,14 +2044,23 @@ namespace ARMeilleure.Instructions
 
                 int sizeF = op.Size & 1;
 
+                Operand res;
+
                 if (sizeF == 0)
                 {
                     Operand mask = X86GetAllElements(context, 2f);
 
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
-                    res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: false, sizeF);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ps, mask, n, m);
+                    }
+                    else
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Subps, mask, res);
+                    }
 
-                    res = context.AddIntrinsic(Intrinsic.X86Subps, mask, res);
+                    res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: false, sizeF);
 
                     if (op.RegisterSize == RegisterSize.Simd64)
                     {
@@ -1897,10 +2073,17 @@ namespace ARMeilleure.Instructions
                 {
                     Operand mask = X86GetAllElements(context, 2d);
 
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
-                    res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: false, sizeF);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231pd, mask, n, m);
+                    }
+                    else
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Subpd, mask, res);
+                    }
 
-                    res = context.AddIntrinsic(Intrinsic.X86Subpd, mask, res);
+                    res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: false, sizeF);
 
                     context.Copy(GetVec(op.Rd), res);
                 }
@@ -2113,20 +2296,32 @@ namespace ARMeilleure.Instructions
 
         public static void Frintx_S(ArmEmitterContext context)
         {
-            // TODO Arm64: Fast path. Should we set host FPCR?
-            EmitScalarUnaryOpF(context, (op1) =>
+            if (Optimizations.UseAdvSimd)
             {
-                return EmitRoundByRMode(context, op1);
-            });
+                InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FrintxS);
+            }
+            else
+            {
+                EmitScalarUnaryOpF(context, (op1) =>
+                {
+                    return EmitRoundByRMode(context, op1);
+                });
+            }
         }
 
         public static void Frintx_V(ArmEmitterContext context)
         {
-            // TODO Arm64: Fast path. Should we set host FPCR?
-            EmitVectorUnaryOpF(context, (op1) =>
+            if (Optimizations.UseAdvSimd)
             {
-                return EmitRoundByRMode(context, op1);
-            });
+                InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FrintxV);
+            }
+            else
+            {
+                EmitVectorUnaryOpF(context, (op1) =>
+                {
+                    return EmitRoundByRMode(context, op1);
+                });
+            }
         }
 
         public static void Frintz_S(ArmEmitterContext context)
@@ -2237,16 +2432,25 @@ namespace ARMeilleure.Instructions
 
                 int sizeF = op.Size & 1;
 
+                Operand res;
+
                 if (sizeF == 0)
                 {
                     Operand maskHalf    = X86GetScalar(context, 0.5f);
                     Operand maskThree   = X86GetScalar(context, 3f);
                     Operand maskOneHalf = X86GetScalar(context, 1.5f);
 
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ss, maskThree, n, m);
+                    }
+                    else
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Subss, maskThree, res);
+                    }
 
-                    res = context.AddIntrinsic(Intrinsic.X86Subss, maskThree, res);
-                    res = context.AddIntrinsic(Intrinsic.X86Mulss, maskHalf,  res);
+                    res = context.AddIntrinsic(Intrinsic.X86Mulss, maskHalf, res);
                     res = EmitSse41RecipStepSelectOpF(context, n, m, res, maskOneHalf, scalar: true, sizeF);
 
                     context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res));
@@ -2257,10 +2461,17 @@ namespace ARMeilleure.Instructions
                     Operand maskThree   = X86GetScalar(context, 3d);
                     Operand maskOneHalf = X86GetScalar(context, 1.5d);
 
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231sd, maskThree, n, m);
+                    }
+                    else
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Subsd, maskThree, res);
+                    }
 
-                    res = context.AddIntrinsic(Intrinsic.X86Subsd, maskThree, res);
-                    res = context.AddIntrinsic(Intrinsic.X86Mulsd, maskHalf,  res);
+                    res = context.AddIntrinsic(Intrinsic.X86Mulsd, maskHalf, res);
                     res = EmitSse41RecipStepSelectOpF(context, n, m, res, maskOneHalf, scalar: true, sizeF);
 
                     context.Copy(GetVec(op.Rd), context.VectorZeroUpper64(res));
@@ -2290,15 +2501,24 @@ namespace ARMeilleure.Instructions
 
                 int sizeF = op.Size & 1;
 
+                Operand res;
+
                 if (sizeF == 0)
                 {
                     Operand maskHalf    = X86GetAllElements(context, 0.5f);
                     Operand maskThree   = X86GetAllElements(context, 3f);
                     Operand maskOneHalf = X86GetAllElements(context, 1.5f);
 
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ps, maskThree, n, m);
+                    }
+                    else
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Subps, maskThree, res);
+                    }
 
-                    res = context.AddIntrinsic(Intrinsic.X86Subps, maskThree, res);
                     res = context.AddIntrinsic(Intrinsic.X86Mulps, maskHalf,  res);
                     res = EmitSse41RecipStepSelectOpF(context, n, m, res, maskOneHalf, scalar: false, sizeF);
 
@@ -2315,9 +2535,16 @@ namespace ARMeilleure.Instructions
                     Operand maskThree   = X86GetAllElements(context, 3d);
                     Operand maskOneHalf = X86GetAllElements(context, 1.5d);
 
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231pd, maskThree, n, m);
+                    }
+                    else
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Subpd, maskThree, res);
+                    }
 
-                    res = context.AddIntrinsic(Intrinsic.X86Subpd, maskThree, res);
                     res = context.AddIntrinsic(Intrinsic.X86Mulpd, maskHalf,  res);
                     res = EmitSse41RecipStepSelectOpF(context, n, m, res, maskOneHalf, scalar: false, sizeF);
 
@@ -4728,53 +4955,6 @@ namespace ARMeilleure.Instructions
             }
         }
 
-        public static Operand EmitSseOrAvxHandleFzModeOpF(
-            ArmEmitterContext context,
-            Func2I emit,
-            bool scalar,
-            Operand n = default,
-            Operand m = default)
-        {
-            Operand nCopy = n == default ? context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rn)) : n;
-            Operand mCopy = m == default ? context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rm)) : m;
-
-            EmitSseOrAvxEnterFtzAndDazModesOpF(context, out Operand isTrue);
-
-            Operand res = emit(nCopy, mCopy);
-
-            EmitSseOrAvxExitFtzAndDazModesOpF(context, isTrue);
-
-            if (n != default || m != default)
-            {
-                return res;
-            }
-
-            int sizeF = ((IOpCodeSimd)context.CurrOp).Size & 1;
-
-            if (sizeF == 0)
-            {
-                if (scalar)
-                {
-                    res = context.VectorZeroUpper96(res);
-                }
-                else if (((OpCodeSimdReg)context.CurrOp).RegisterSize == RegisterSize.Simd64)
-                {
-                    res = context.VectorZeroUpper64(res);
-                }
-            }
-            else /* if (sizeF == 1) */
-            {
-                if (scalar)
-                {
-                    res = context.VectorZeroUpper64(res);
-                }
-            }
-
-            context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rd), res);
-
-            return default;
-        }
-
         private static Operand EmitSse2VectorMaxMinOpF(ArmEmitterContext context, Operand n, Operand m, bool isMax)
         {
             IOpCodeSimd op = (IOpCodeSimd)context.CurrOp;
@@ -4834,10 +5014,7 @@ namespace ARMeilleure.Instructions
 
                 Operand res = EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                 {
-                    return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                    {
-                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: isMaxNum);
-                    }, scalar: scalar, op1, op2);
+                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: isMaxNum);
                 }, scalar: scalar, nCopy, mCopy);
 
                 if (n != default || m != default)
@@ -4872,10 +5049,7 @@ namespace ARMeilleure.Instructions
 
                 Operand res = EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                 {
-                    return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                    {
-                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: isMaxNum);
-                    }, scalar: scalar, op1, op2);
+                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: isMaxNum);
                 }, scalar: scalar, nCopy, mCopy);
 
                 if (n != default || m != default)
diff --git a/ARMeilleure/Instructions/InstEmitSimdCvt32.cs b/ARMeilleure/Instructions/InstEmitSimdCvt32.cs
index 5fdc3b5ad7..33ae83df64 100644
--- a/ARMeilleure/Instructions/InstEmitSimdCvt32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdCvt32.cs
@@ -356,9 +356,11 @@ namespace ARMeilleure.Instructions
                         ? typeof(SoftFloat64_16).GetMethod(nameof(SoftFloat64_16.FPConvert))
                         : typeof(SoftFloat32_16).GetMethod(nameof(SoftFloat32_16.FPConvert));
 
+                    context.ExitArmFpMode();
                     context.StoreToContext();
                     Operand res = context.Call(method, src);
                     context.LoadFromContext();
+                    context.EnterArmFpMode();
 
                     InsertScalar16(context, op.Vd, op.T, res);
                 }
@@ -372,9 +374,11 @@ namespace ARMeilleure.Instructions
                         ? typeof(SoftFloat16_64).GetMethod(nameof(SoftFloat16_64.FPConvert))
                         : typeof(SoftFloat16_32).GetMethod(nameof(SoftFloat16_32.FPConvert));
 
+                    context.ExitArmFpMode();
                     context.StoreToContext();
                     Operand res = context.Call(method, src);
                     context.LoadFromContext();
+                    context.EnterArmFpMode();
 
                     InsertScalar(context, op.Vd, res);
                 }
@@ -542,10 +546,17 @@ namespace ARMeilleure.Instructions
         // VRINTX (floating-point).
         public static void Vrintx_S(ArmEmitterContext context)
         {
-            EmitScalarUnaryOpF32(context, (op1) =>
+            if (Optimizations.UseAdvSimd)
             {
-                return EmitRoundByRMode(context, op1);
-            });
+                InstEmitSimdHelper32Arm64.EmitScalarUnaryOpF32(context, Intrinsic.Arm64FrintxS);
+            }
+            else
+            {
+                EmitScalarUnaryOpF32(context, (op1) =>
+                {
+                    return EmitRoundByRMode(context, op1);
+                });
+            }
         }
 
         private static Operand EmitFPConvert(ArmEmitterContext context, Operand value, OperandType type, bool signed)
diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper.cs b/ARMeilleure/Instructions/InstEmitSimdHelper.cs
index 0e7af794a5..c44c9b4d95 100644
--- a/ARMeilleure/Instructions/InstEmitSimdHelper.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdHelper.cs
@@ -1,3 +1,4 @@
+using ARMeilleure.CodeGen.X86;
 using ARMeilleure.Decoders;
 using ARMeilleure.IntermediateRepresentation;
 using ARMeilleure.State;
@@ -158,6 +159,75 @@ namespace ARMeilleure.Instructions
         };
 #endregion
 
+        public static void EnterArmFpMode(EmitterContext context, Func<FPState, Operand> getFpFlag)
+        {
+            if (Optimizations.UseSse2)
+            {
+                Operand mxcsr = context.AddIntrinsicInt(Intrinsic.X86Stmxcsr);
+
+                Operand fzTrue = getFpFlag(FPState.FzFlag);
+                Operand r0True = getFpFlag(FPState.RMode0Flag);
+                Operand r1True = getFpFlag(FPState.RMode1Flag);
+
+                mxcsr = context.BitwiseAnd(mxcsr, Const(~(int)(Mxcsr.Ftz | Mxcsr.Daz | Mxcsr.Rhi | Mxcsr.Rlo)));
+                // When the guest FZ flag is set, enable FTZ and DAZ and also set the Um and Dm bits.
+                mxcsr = context.BitwiseOr(mxcsr, context.ConditionalSelect(fzTrue, Const((int)(Mxcsr.Ftz | Mxcsr.Daz | Mxcsr.Um | Mxcsr.Dm)), Const(0)));
+
+                // X86 round modes in order: nearest, negative, positive, zero.
+                // ARM round modes in order: nearest, positive, negative, zero.
+                // Swap the two bits (RMode0 -> Rhi, RMode1 -> Rlo) to translate between the encodings.
+
+                mxcsr = context.BitwiseOr(mxcsr, context.ConditionalSelect(r0True, Const((int)Mxcsr.Rhi), Const(0)));
+                mxcsr = context.BitwiseOr(mxcsr, context.ConditionalSelect(r1True, Const((int)Mxcsr.Rlo), Const(0)));
+
+                context.AddIntrinsicNoRet(Intrinsic.X86Ldmxcsr, mxcsr);
+            }
+            else if (Optimizations.UseAdvSimd)
+            {
+                Operand fpcr = context.AddIntrinsicInt(Intrinsic.Arm64MrsFpcr);
+
+                Operand fzTrue = getFpFlag(FPState.FzFlag);
+                Operand r0True = getFpFlag(FPState.RMode0Flag);
+                Operand r1True = getFpFlag(FPState.RMode1Flag);
+
+                fpcr = context.BitwiseAnd(fpcr, Const(~(int)(FPCR.Fz | FPCR.RMode0 | FPCR.RMode1)));
+                // Mirror the guest FZ and RMode flags onto the same-named host FPCR bits (no swap needed here).
+                fpcr = context.BitwiseOr(fpcr, context.ConditionalSelect(fzTrue, Const((int)FPCR.Fz), Const(0)));
+                fpcr = context.BitwiseOr(fpcr, context.ConditionalSelect(r0True, Const((int)FPCR.RMode0), Const(0)));
+                fpcr = context.BitwiseOr(fpcr, context.ConditionalSelect(r1True, Const((int)FPCR.RMode1), Const(0)));
+
+                context.AddIntrinsicNoRet(Intrinsic.Arm64MsrFpcr, fpcr);
+
+                // TODO: Restore FPSR.
+            }
+        }
+
+        public static void ExitArmFpMode(EmitterContext context, Action<FPState, Operand> setFpFlag)
+        {
+            if (Optimizations.UseSse2)
+            {
+                Operand mxcsr = context.AddIntrinsicInt(Intrinsic.X86Stmxcsr);
+
+                // Reset the round mode (to nearest) and clear both FTZ and DAZ; the Um/Dm bits are left untouched.
+                mxcsr = context.BitwiseAnd(mxcsr, Const(~(int)(Mxcsr.Ftz | Mxcsr.Daz | Mxcsr.Rhi | Mxcsr.Rlo)));
+
+                context.AddIntrinsicNoRet(Intrinsic.X86Ldmxcsr, mxcsr);
+
+                // Status flags would be stored here if they were used; setFpFlag is currently unused.
+            }
+            else if (Optimizations.UseAdvSimd)
+            {
+                Operand fpcr = context.AddIntrinsicInt(Intrinsic.Arm64MrsFpcr);
+
+                // Reset the round mode (to nearest) and clear FZ.
+                fpcr = context.BitwiseAnd(fpcr, Const(~(int)(FPCR.Fz | FPCR.RMode0 | FPCR.RMode1)));
+
+                context.AddIntrinsicNoRet(Intrinsic.Arm64MsrFpcr, fpcr);
+
+                // TODO: Store FPSR (setFpFlag is currently unused on this path as well).
+            }
+        }
+
         public static int GetImmShl(OpCodeSimdShImm op)
         {
             return op.Imm - (8 << op.Size);
@@ -465,9 +535,11 @@ namespace ARMeilleure.Instructions
                 ? typeof(SoftFloat32).GetMethod(name)
                 : typeof(SoftFloat64).GetMethod(name);
 
+            context.ExitArmFpMode();
             context.StoreToContext();
             Operand res = context.Call(info, callArgs);
             context.LoadFromContext();
+            context.EnterArmFpMode();
 
             return res;
         }
@@ -1358,39 +1430,6 @@ namespace ARMeilleure.Instructions
             }
         }
 
-        [Flags]
-        public enum Mxcsr
-        {
-            Ftz = 1 << 15, // Flush To Zero.
-            Um  = 1 << 11, // Underflow Mask.
-            Dm  = 1 << 8,  // Denormal Mask.
-            Daz = 1 << 6   // Denormals Are Zero.
-        }
-
-        public static void EmitSseOrAvxEnterFtzAndDazModesOpF(ArmEmitterContext context, out Operand isTrue)
-        {
-            isTrue = GetFpFlag(FPState.FzFlag);
-
-            Operand lblTrue = Label();
-            context.BranchIfFalse(lblTrue, isTrue);
-
-            context.AddIntrinsicNoRet(Intrinsic.X86Mxcsrmb, Const((int)(Mxcsr.Ftz | Mxcsr.Um | Mxcsr.Dm | Mxcsr.Daz)));
-
-            context.MarkLabel(lblTrue);
-        }
-
-        public static void EmitSseOrAvxExitFtzAndDazModesOpF(ArmEmitterContext context, Operand isTrue = default)
-        {
-            isTrue = isTrue == default ? GetFpFlag(FPState.FzFlag) : isTrue;
-
-            Operand lblTrue = Label();
-            context.BranchIfFalse(lblTrue, isTrue);
-
-            context.AddIntrinsicNoRet(Intrinsic.X86Mxcsrub, Const((int)(Mxcsr.Ftz | Mxcsr.Daz)));
-
-            context.MarkLabel(lblTrue);
-        }
-
         public enum CmpCondition
         {
             // Legacy Sse.
diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
index 84b01d05ca..36d27d4252 100644
--- a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
@@ -1197,9 +1197,11 @@ namespace ARMeilleure.Instructions
             Array.Resize(ref callArgs, callArgs.Length + 1);
             callArgs[callArgs.Length - 1] = Const(1);
 
+            context.ExitArmFpMode();
             context.StoreToContext();
             Operand res = context.Call(info, callArgs);
             context.LoadFromContext();
+            context.EnterArmFpMode();
 
             return res;
         }
diff --git a/ARMeilleure/Instructions/InstEmitSystem.cs b/ARMeilleure/Instructions/InstEmitSystem.cs
index 1345bbf109..f668b83b6e 100644
--- a/ARMeilleure/Instructions/InstEmitSystem.cs
+++ b/ARMeilleure/Instructions/InstEmitSystem.cs
@@ -192,6 +192,8 @@ namespace ARMeilleure.Instructions
                     SetFpFlag(context, (FPState)flag, context.BitwiseAnd(context.ShiftRightUI(fpcr, Const(flag)), Const(1)));
                 }
             }
+
+            context.UpdateArmFpMode();
         }
 
         private static void EmitSetFpsr(ArmEmitterContext context)
@@ -210,6 +212,8 @@ namespace ARMeilleure.Instructions
                     SetFpFlag(context, (FPState)flag, context.BitwiseAnd(context.ShiftRightUI(fpsr, Const(flag)), Const(1)));
                 }
             }
+
+            context.UpdateArmFpMode();
         }
     }
 }
diff --git a/ARMeilleure/Instructions/InstEmitSystem32.cs b/ARMeilleure/Instructions/InstEmitSystem32.cs
index e07db41218..2f6cf19d68 100644
--- a/ARMeilleure/Instructions/InstEmitSystem32.cs
+++ b/ARMeilleure/Instructions/InstEmitSystem32.cs
@@ -321,6 +321,8 @@ namespace ARMeilleure.Instructions
                     SetFpFlag(context, (FPState)flag, context.BitwiseAnd(context.ShiftRightUI(fpscr, Const(flag)), Const(1)));
                 }
             }
+
+            context.UpdateArmFpMode();
         }
     }
 }
diff --git a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
index b629345ee4..f5a776fa23 100644
--- a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
+++ b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
@@ -53,6 +53,7 @@ namespace ARMeilleure.IntermediateRepresentation
         X86Haddpd,
         X86Haddps,
         X86Insertps,
+        X86Ldmxcsr,
         X86Maxpd,
         X86Maxps,
         X86Maxsd,
@@ -68,8 +69,6 @@ namespace ARMeilleure.IntermediateRepresentation
         X86Mulps,
         X86Mulsd,
         X86Mulss,
-        X86Mxcsrmb,
-        X86Mxcsrub,
         X86Paddb,
         X86Paddd,
         X86Paddq,
@@ -153,6 +152,7 @@ namespace ARMeilleure.IntermediateRepresentation
         X86Sqrtps,
         X86Sqrtsd,
         X86Sqrtss,
+        X86Stmxcsr,
         X86Subpd,
         X86Subps,
         X86Subsd,
@@ -163,11 +163,13 @@ namespace ARMeilleure.IntermediateRepresentation
         X86Unpcklps,
         X86Vcvtph2ps,
         X86Vcvtps2ph,
+        X86Vfmadd231pd,
         X86Vfmadd231ps,
         X86Vfmadd231sd,
         X86Vfmadd231ss,
         X86Vfmsub231sd,
         X86Vfmsub231ss,
+        X86Vfnmadd231pd,
         X86Vfnmadd231ps,
         X86Vfnmadd231sd,
         X86Vfnmadd231ss,
@@ -394,6 +396,8 @@ namespace ARMeilleure.IntermediateRepresentation
         Arm64MlsVe,
         Arm64MlsV,
         Arm64MoviV,
+        Arm64MrsFpcr,
+        Arm64MsrFpcr,
         Arm64MrsFpsr,
         Arm64MsrFpsr,
         Arm64MulVe,
diff --git a/ARMeilleure/Translation/ArmEmitterContext.cs b/ARMeilleure/Translation/ArmEmitterContext.cs
index 238f85082c..565d2aada3 100644
--- a/ARMeilleure/Translation/ArmEmitterContext.cs
+++ b/ARMeilleure/Translation/ArmEmitterContext.cs
@@ -188,6 +188,21 @@ namespace ARMeilleure.Translation
             }
         }
 
+        public void EnterArmFpMode()
+        {
+            InstEmitSimdHelper.EnterArmFpMode(this, InstEmitHelper.GetFpFlag);
+        }
+
+        public void UpdateArmFpMode()
+        {
+            EnterArmFpMode();
+        }
+
+        public void ExitArmFpMode()
+        {
+            InstEmitSimdHelper.ExitArmFpMode(this, (flag, value) => InstEmitHelper.SetFpFlag(this, flag, value));
+        }
+
         public Operand TryGetComparisonResult(Condition condition)
         {
             if (_optOpLastCompare == null || _optOpLastCompare != _optOpLastFlagSet)
diff --git a/ARMeilleure/Translation/DispatcherFunction.cs b/ARMeilleure/Translation/DispatcherFunction.cs
index e3ea21f67b..7d5a3388ef 100644
--- a/ARMeilleure/Translation/DispatcherFunction.cs
+++ b/ARMeilleure/Translation/DispatcherFunction.cs
@@ -3,4 +3,5 @@
 namespace ARMeilleure.Translation
 {
     delegate void DispatcherFunction(IntPtr nativeContext, ulong startAddress);
+    delegate ulong WrapperFunction(IntPtr nativeContext, ulong startAddress);
 }
diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs
index 17f6870623..5970c4ff9c 100644
--- a/ARMeilleure/Translation/PTC/Ptc.cs
+++ b/ARMeilleure/Translation/PTC/Ptc.cs
@@ -30,7 +30,7 @@ namespace ARMeilleure.Translation.PTC
         private const string OuterHeaderMagicString = "PTCohd\0\0";
         private const string InnerHeaderMagicString = "PTCihd\0\0";
 
-        private const uint InternalVersion = 4485; //! To be incremented manually for each change to the ARMeilleure project.
+        private const uint InternalVersion = 4626; //! To be incremented manually for each change to the ARMeilleure project.
 
         private const string ActualDir = "0";
         private const string BackupDir = "1";
diff --git a/ARMeilleure/Translation/TranslatedFunction.cs b/ARMeilleure/Translation/TranslatedFunction.cs
index 71eec08ac2..f007883efb 100644
--- a/ARMeilleure/Translation/TranslatedFunction.cs
+++ b/ARMeilleure/Translation/TranslatedFunction.cs
@@ -25,5 +25,10 @@ namespace ARMeilleure.Translation
         {
             return _func(context.NativeContextPtr);
         }
+
+        public ulong Execute(WrapperFunction dispatcher, State.ExecutionContext context)
+        {
+            return dispatcher(context.NativeContextPtr, (ulong)FuncPointer);
+        }
     }
 }
\ No newline at end of file
diff --git a/ARMeilleure/Translation/Translator.cs b/ARMeilleure/Translation/Translator.cs
index 0c05b2b49d..f349c5ebf5 100644
--- a/ARMeilleure/Translation/Translator.cs
+++ b/ARMeilleure/Translation/Translator.cs
@@ -183,7 +183,7 @@ namespace ARMeilleure.Translation
 
             Statistics.StartTimer();
 
-            ulong nextAddr = func.Execute(context);
+            ulong nextAddr = func.Execute(Stubs.ContextWrapper, context);
 
             Statistics.StopTimer(address);
 
@@ -194,7 +194,7 @@ namespace ARMeilleure.Translation
         {
             TranslatedFunction func = Translate(address, context.ExecutionMode, highCq: false, singleStep: true);
 
-            address = func.Execute(context);
+            address = func.Execute(Stubs.ContextWrapper, context);
 
             EnqueueForDeletion(address, func);
 
diff --git a/ARMeilleure/Translation/TranslatorStubs.cs b/ARMeilleure/Translation/TranslatorStubs.cs
index 6ed84de80b..69648df449 100644
--- a/ARMeilleure/Translation/TranslatorStubs.cs
+++ b/ARMeilleure/Translation/TranslatorStubs.cs
@@ -21,6 +21,7 @@ namespace ARMeilleure.Translation
         private readonly Translator _translator;
         private readonly Lazy<IntPtr> _dispatchStub;
         private readonly Lazy<DispatcherFunction> _dispatchLoop;
+        private readonly Lazy<WrapperFunction> _contextWrapper;
 
         /// <summary>
         /// Gets the dispatch stub.
@@ -64,6 +65,20 @@ namespace ARMeilleure.Translation
             }
         }
 
+        /// <summary>
+        /// Gets the context wrapper function, generating it on first access. The wrapper syncs FP state around a guest function call.
+        /// </summary>
+        /// <exception cref="ObjectDisposedException"><see cref="TranslatorStubs"/> instance was disposed</exception>
+        public WrapperFunction ContextWrapper
+        {
+            get
+            {
+                ObjectDisposedException.ThrowIf(_disposed, this);
+
+                return _contextWrapper.Value;
+            }
+        }
+
         /// <summary>
         /// Initializes a new instance of the <see cref="TranslatorStubs"/> class with the specified
         /// <see cref="Translator"/> instance.
@@ -77,6 +92,7 @@ namespace ARMeilleure.Translation
             _translator = translator;
             _dispatchStub = new(GenerateDispatchStub, isThreadSafe: true);
             _dispatchLoop = new(GenerateDispatchLoop, isThreadSafe: true);
+            _contextWrapper = new(GenerateContextWrapper, isThreadSafe: true);
         }
 
         /// <summary>
@@ -202,6 +218,36 @@ namespace ARMeilleure.Translation
             return Marshal.GetFunctionPointerForDelegate(func);
         }
 
+        /// <summary>
+        /// Emits code that syncs FP state before executing guest code, or returns it to normal.
+        /// The callbacks handed to the helpers load an FP flag from the native context (enter) or store one to it (exit).
+        /// </summary>
+        /// <param name="context">Emitter context for the method</param>
+        /// <param name="nativeContext">Pointer to the native context</param>
+        /// <param name="enter">True if entering guest code, false otherwise</param>
+        private void EmitSyncFpContext(EmitterContext context, Operand nativeContext, bool enter)
+        {
+            // Emits the address of the native context slot that holds the FP flag with the given index.
+            Operand EmitFlagAddress(int flagIndex)
+            {
+                ulong offset = (ulong)NativeContext.GetRegisterOffset(new Register(flagIndex, RegisterType.FpFlag));
+
+                return context.Add(nativeContext, Const(offset));
+            }
+
+            if (!enter)
+            {
+                InstEmitSimdHelper.ExitArmFpMode(context, (flag, value) =>
+                {
+                    context.Store(EmitFlagAddress((int)flag), value);
+                });
+
+                return;
+            }
+
+            InstEmitSimdHelper.EnterArmFpMode(context, (flag) => context.Load(OperandType.I32, EmitFlagAddress((int)flag)));
+        }
+
         /// <summary>
         /// Generates a <see cref="DispatchLoop"/> function.
         /// </summary>
@@ -221,6 +263,8 @@ namespace ARMeilleure.Translation
             Operand runningAddress = context.Add(nativeContext, Const((ulong)NativeContext.GetRunningOffset()));
             Operand dispatchAddress = context.Add(nativeContext, Const((ulong)NativeContext.GetDispatchAddressOffset()));
 
+            EmitSyncFpContext(context, nativeContext, true);
+
             context.MarkLabel(beginLbl);
             context.Store(dispatchAddress, guestAddress);
             context.Copy(guestAddress, context.Call(Const((ulong)DispatchStub), OperandType.I64, nativeContext));
@@ -229,6 +273,9 @@ namespace ARMeilleure.Translation
             context.Branch(beginLbl);
 
             context.MarkLabel(endLbl);
+
+            EmitSyncFpContext(context, nativeContext, false);
+
             context.Return();
 
             var cfg = context.GetControlFlowGraph();
@@ -237,5 +284,30 @@ namespace ARMeilleure.Translation
 
             return Compiler.Compile(cfg, argTypes, retType, CompilerOptions.HighCq, RuntimeInformation.ProcessArchitecture).Map<DispatcherFunction>();
         }
+
+        /// <summary>
+        /// Generates a <see cref="ContextWrapper"/> function, which syncs FP state around a single guest function call.
+        /// </summary>
+        /// <returns><see cref="ContextWrapper"/> function</returns>
+        private WrapperFunction GenerateContextWrapper()
+        {
+            EmitterContext emitter = new();
+
+            // Argument 0 is the native context pointer, argument 1 the address of the guest function to call.
+            Operand nativeContextPtr = emitter.LoadArgument(OperandType.I64, 0);
+            Operand guestFunctionPtr = emitter.LoadArgument(OperandType.I64, 1);
+
+            EmitSyncFpContext(emitter, nativeContextPtr, enter: true);
+            Operand guestResult = emitter.Call(guestFunctionPtr, OperandType.I64, nativeContextPtr);
+            EmitSyncFpContext(emitter, nativeContextPtr, enter: false);
+
+            emitter.Return(guestResult);
+
+            OperandType[] argTypes = { OperandType.I64, OperandType.I64 };
+
+            return Compiler
+                .Compile(emitter.GetControlFlowGraph(), argTypes, OperandType.I64, CompilerOptions.HighCq, RuntimeInformation.ProcessArchitecture)
+                .Map<WrapperFunction>();
+        }
     }
 }
diff --git a/ARMeilleure/Translation/TranslatorTestMethods.cs b/ARMeilleure/Translation/TranslatorTestMethods.cs
new file mode 100644
index 0000000000..ab96019a68
--- /dev/null
+++ b/ARMeilleure/Translation/TranslatorTestMethods.cs
@@ -0,0 +1,141 @@
+using ARMeilleure.CodeGen.X86;
+using ARMeilleure.IntermediateRepresentation;
+using ARMeilleure.State;
+using ARMeilleure.Translation;
+using System;
+using System.Runtime.InteropServices;
+using static ARMeilleure.IntermediateRepresentation.Operand.Factory;
+
+namespace ARMeilleure.Translation
+{
+    /// <summary>
+    /// Emitters for native helper functions that tests use to probe the host floating point environment.
+    /// </summary>
+    public static class TranslatorTestMethods
+    {
+        /// <summary>
+        /// Signature of the function built by <see cref="GenerateFpFlagsPInvokeTest"/>.
+        /// </summary>
+        /// <param name="managedMethod">Address of the method to call while flush-to-zero is set</param>
+        /// <returns>0 on success, otherwise the number (1 to 3) of the check that failed</returns>
+        public delegate int FpFlagsPInvokeTest(IntPtr managedMethod);
+
+        /// <summary>
+        /// Emits code that sets or clears the host flush-to-zero bit: MXCSR on the SSE2 path, FPCR on the AdvSimd path.
+        /// </summary>
+        /// <param name="context">Emitter context for the method</param>
+        /// <param name="ftz">True to set flush-to-zero, false to clear it</param>
+        /// <returns>True if code was emitted, false if neither path is enabled</returns>
+        private static bool SetPlatformFtz(EmitterContext context, bool ftz)
+        {
+            if (Optimizations.UseSse2)
+            {
+                Operand mxcsr = context.AddIntrinsicInt(Intrinsic.X86Stmxcsr);
+
+                // Setting also ORs in the Um and Dm bits; clearing removes only Ftz.
+                mxcsr = ftz
+                    ? context.BitwiseOr(mxcsr, Const((int)(Mxcsr.Ftz | Mxcsr.Um | Mxcsr.Dm)))
+                    : context.BitwiseAnd(mxcsr, Const(~(int)Mxcsr.Ftz));
+
+                context.AddIntrinsicNoRet(Intrinsic.X86Ldmxcsr, mxcsr);
+
+                return true;
+            }
+
+            if (Optimizations.UseAdvSimd)
+            {
+                Operand fpcr = context.AddIntrinsicInt(Intrinsic.Arm64MrsFpcr);
+
+                fpcr = ftz
+                    ? context.BitwiseOr(fpcr, Const((int)FPCR.Fz))
+                    : context.BitwiseAnd(fpcr, Const(~(int)FPCR.Fz));
+
+                context.AddIntrinsicNoRet(Intrinsic.Arm64MsrFpcr, fpcr);
+
+                return true;
+            }
+
+            return false;
+        }
+
+        /// <summary>
+        /// Emits code that reads the raw bits of a scalar float as an I32 by passing it through element 0 of a vector.
+        /// </summary>
+        /// <param name="context">Emitter context for the method</param>
+        /// <param name="fp">Floating point operand to reinterpret</param>
+        /// <returns>I32 operand extracted from element 0</returns>
+        private static Operand FpBitsToInt(EmitterContext context, Operand fp) =>
+            context.VectorExtract(OperandType.I32, context.VectorInsert(context.VectorZero(), fp, 0), 0);
+
+        /// <summary>
+        /// Emits a check that zero + denormal is zero, i.e. that flush-to-zero is in effect.
+        /// When the check fails, the emitted code clears flush-to-zero and returns <paramref name="failureCode"/>.
+        /// </summary>
+        private static void EmitFlushedSumCheck(EmitterContext context, Operand zeroF, Operand denormal, Operand zero, int failureCode)
+        {
+            Operand flushedSum = context.Add(zeroF, denormal);
+            Operand passedLabel = Label();
+
+            context.BranchIfTrue(passedLabel, context.ICompareEqual(FpBitsToInt(context, flushedSum), zero));
+
+            SetPlatformFtz(context, false);
+            context.Return(Const(failureCode));
+
+            context.MarkLabel(passedLabel);
+        }
+
+        /// <summary>
+        /// Generates a function that checks denormal handling with the default FP state, with flush-to-zero set,
+        /// and again after calling the method whose address it receives as its only argument.
+        /// </summary>
+        /// <returns>Compiled test function</returns>
+        public static FpFlagsPInvokeTest GenerateFpFlagsPInvokeTest()
+        {
+            var context = new EmitterContext();
+
+            Operand methodAddress = context.Copy(context.LoadArgument(OperandType.I64, 0));
+
+            Operand denormal = ConstF(BitConverter.Int32BitsToSingle(1)); // 1.40129846432e-45
+            Operand zeroF = ConstF(0f);
+            Operand zero = Const(0);
+
+            // Check 1: with the default dotnet FP state, zero + denormal must not be flushed to zero.
+            // This is required for SoftFloat to function.
+            Operand defaultSum = context.Add(zeroF, denormal);
+            Operand check1Passed = Label();
+
+            context.BranchIfFalse(check1Passed, context.ICompareEqual(FpBitsToInt(context, defaultSum), zero));
+            context.Return(Const(1));
+
+            context.MarkLabel(check1Passed);
+
+            // Set the flush-to-zero flag. If the backend cannot do that, report success (0) right away.
+            bool ftzSupported = SetPlatformFtz(context, true);
+
+            if (!ftzSupported)
+            {
+                context.Return(Const(0));
+            }
+
+            // Check 2: with flush-to-zero set, zero + denormal must be zero.
+            EmitFlushedSumCheck(context, zeroF, denormal, zero, failureCode: 2);
+
+            // Call the managed method. It should not change the Fz state.
+            context.Call(methodAddress, OperandType.None);
+
+            // Check 3: zero + denormal must still be zero after the managed call.
+            EmitFlushedSumCheck(context, zeroF, denormal, zero, failureCode: 3);
+
+            // Success: clear flush-to-zero again and return 0.
+            SetPlatformFtz(context, false);
+            context.Return(Const(0));
+
+            // Compile and return the function.
+            OperandType[] argTypes = { OperandType.I64 };
+
+            return Compiler
+                .Compile(context.GetControlFlowGraph(), argTypes, OperandType.I32, CompilerOptions.HighCq, RuntimeInformation.ProcessArchitecture)
+                .Map<FpFlagsPInvokeTest>();
+        }
+    }
+}
diff --git a/Ryujinx.Tests/Cpu/EnvironmentTests.cs b/Ryujinx.Tests/Cpu/EnvironmentTests.cs
new file mode 100644
index 0000000000..d374c08a5d
--- /dev/null
+++ b/Ryujinx.Tests/Cpu/EnvironmentTests.cs
@@ -0,0 +1,102 @@
+using ARMeilleure.Translation;
+using NUnit.Framework;
+using Ryujinx.Cpu.Jit;
+using Ryujinx.Tests.Memory;
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace Ryujinx.Tests.Cpu
+{
+    internal class EnvironmentTests
+    {
+        private static Translator _translator;
+
+        private void EnsureTranslator()
+        {
+            // Create a translator, as one is needed to register the signal handler or emit methods.
+            _translator ??= new Translator(new JitMemoryAllocator(), new MockMemoryManager(), true);
+        }
+
+        /// <summary>
+        /// Returns the single-precision value with bit pattern 1, a denormal.
+        /// Inlining and optimization are disabled so arithmetic on the result cannot be constant-folded.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.NoOptimization)]
+        private float GetDenormal()
+        {
+            return BitConverter.Int32BitsToSingle(1);
+        }
+
+        /// <summary>
+        /// Returns the single-precision value with bit pattern 0 (zero), with inlining and optimization disabled.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.NoOptimization)]
+        private float GetZero()
+        {
+            return BitConverter.Int32BitsToSingle(0);
+        }
+
+        /// <summary>
+        /// This test ensures that managed methods do not reset floating point control flags.
+        /// This is used to avoid changing control flags when running methods that don't require it, such as SVC calls, software memory...
+        /// </summary>
+        [Test]
+        public void FpFlagsPInvoke()
+        {
+            EnsureTranslator();
+
+            // Subnormal results are not flushed to zero by default.
+            // This operation should not be allowed to do constant propagation, hence the methods that explicitly disallow inlining.
+            Assert.AreNotEqual(0f, GetDenormal() + GetZero());
+
+            bool methodCalled = false;
+            bool isFz = false;
+
+            var managedMethod = () =>
+            {
+                // Floating point math should not modify fp flags.
+                float test = 2f * 3.5f;
+
+                if (test < 4f)
+                {
+                    throw new System.Exception("Sanity check.");
+                }
+
+                isFz = GetDenormal() + GetZero() == 0f;
+
+                try
+                {
+                    if (test >= 4f)
+                    {
+                        throw new System.Exception("Always throws.");
+                    }
+                }
+                catch
+                {
+                    // Exception handling should not modify fp flags.
+
+                    methodCalled = true;
+                }
+            };
+
+            var method = TranslatorTestMethods.GenerateFpFlagsPInvokeTest();
+
+            // This method sets flush-to-zero and then calls the managed method.
+            // Before and after setting the flags, it ensures subnormal addition works as expected.
+            // It returns a positive result if any tests fail, and 0 on success (or if the platform cannot change FP flags)
+            int result = method(Marshal.GetFunctionPointerForDelegate(managedMethod));
+
+            // The native code holds only a raw function pointer, which does not keep the delegate alive.
+            // Keep it reachable until the call has returned so the GC cannot collect it mid-call.
+            GC.KeepAlive(managedMethod);
+
+            // Subnormal results are not flushed to zero by default, which we should have returned to on exiting the method.
+            Assert.AreNotEqual(0f, GetDenormal() + GetZero());
+
+            Assert.AreEqual(0, result, "The generated test function reported a failed check.");
+            Assert.True(methodCalled);
+            Assert.True(isFz);
+        }
+    }
+}