diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
index a35e28a15f..b91c522ec9 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
@@ -1617,18 +1617,32 @@ namespace ARMeilleure.Instructions
 
         public static void Frinta_S(ArmEmitterContext context)
         {
-            EmitScalarUnaryOpF(context, (op1) =>
+            if (Optimizations.UseSse41)
             {
-                return EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1);
-            });
+                EmitSse41ScalarRoundOpF(context, FPRoundingMode.ToNearestAway);
+            }
+            else
+            {
+                EmitScalarUnaryOpF(context, (op1) =>
+                {
+                    return EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1);
+                });
+            }
         }
 
         public static void Frinta_V(ArmEmitterContext context)
         {
-            EmitVectorUnaryOpF(context, (op1) =>
+            if (Optimizations.UseSse41)
             {
-                return EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1);
-            });
+                EmitSse41VectorRoundOpF(context, FPRoundingMode.ToNearestAway);
+            }
+            else
+            {
+                EmitVectorUnaryOpF(context, (op1) =>
+                {
+                    return EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1);
+                });
+            }
         }
 
         public static void Frinti_S(ArmEmitterContext context)
@@ -3516,9 +3530,18 @@ namespace ARMeilleure.Instructions
 
             Operand n = GetVec(op.Rn);
 
-            Intrinsic inst = (op.Size & 1) != 0 ? Intrinsic.X86Roundsd : Intrinsic.X86Roundss;
+            Operand res;
 
-            Operand res = context.AddIntrinsic(inst, n, Const(X86GetRoundControl(roundMode)));
+            if (roundMode != FPRoundingMode.ToNearestAway)
+            {
+                Intrinsic inst = (op.Size & 1) != 0 ? Intrinsic.X86Roundsd : Intrinsic.X86Roundss;
+
+                res = context.AddIntrinsic(inst, n, Const(X86GetRoundControl(roundMode)));
+            }
+            else
+            {
+                res = EmitSse41RoundToNearestWithTiesToAwayOpF(context, n, scalar: true);
+            }
 
             if ((op.Size & 1) != 0)
             {
@@ -3538,9 +3561,18 @@ namespace ARMeilleure.Instructions
 
             Operand n = GetVec(op.Rn);
 
-            Intrinsic inst = (op.Size & 1) != 0 ? Intrinsic.X86Roundpd : Intrinsic.X86Roundps;
+            Operand res;
 
-            Operand res = context.AddIntrinsic(inst, n, Const(X86GetRoundControl(roundMode)));
+            if (roundMode != FPRoundingMode.ToNearestAway)
+            {
+                Intrinsic inst = (op.Size & 1) != 0 ? Intrinsic.X86Roundpd : Intrinsic.X86Roundps;
+
+                res = context.AddIntrinsic(inst, n, Const(X86GetRoundControl(roundMode)));
+            }
+            else
+            {
+                res = EmitSse41RoundToNearestWithTiesToAwayOpF(context, n, scalar: false);
+            }
 
             if (op.RegisterSize == RegisterSize.Simd64)
             {
diff --git a/ARMeilleure/Instructions/InstEmitSimdCvt.cs b/ARMeilleure/Instructions/InstEmitSimdCvt.cs
index c8c427b790..9329f2b757 100644
--- a/ARMeilleure/Instructions/InstEmitSimdCvt.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdCvt.cs
@@ -164,32 +164,74 @@ namespace ARMeilleure.Instructions
 
         public static void Fcvtas_Gp(ArmEmitterContext context)
         {
-            EmitFcvt_s_Gp(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1));
+            if (Optimizations.UseSse41)
+            {
+                EmitSse41Fcvts_Gp(context, FPRoundingMode.ToNearestAway, isFixed: false);
+            }
+            else
+            {
+                EmitFcvt_s_Gp(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1));
+            }
         }
 
         public static void Fcvtas_S(ArmEmitterContext context)
         {
-            EmitFcvt(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1), signed: true, scalar: true);
+            if (Optimizations.UseSse41)
+            {
+                EmitSse41FcvtsOpF(context, FPRoundingMode.ToNearestAway, scalar: true);
+            }
+            else
+            {
+                EmitFcvt(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1), signed: true, scalar: true);
+            }
         }
 
         public static void Fcvtas_V(ArmEmitterContext context)
         {
-            EmitFcvt(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1), signed: true, scalar: false);
+            if (Optimizations.UseSse41)
+            {
+                EmitSse41FcvtsOpF(context, FPRoundingMode.ToNearestAway, scalar: false);
+            }
+            else
+            {
+                EmitFcvt(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1), signed: true, scalar: false);
+            }
         }
 
         public static void Fcvtau_Gp(ArmEmitterContext context)
         {
-            EmitFcvt_u_Gp(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1));
+            if (Optimizations.UseSse41)
+            {
+                EmitSse41Fcvtu_Gp(context, FPRoundingMode.ToNearestAway, isFixed: false);
+            }
+            else
+            {
+                EmitFcvt_u_Gp(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1));
+            }
         }
 
         public static void Fcvtau_S(ArmEmitterContext context)
         {
-            EmitFcvt(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1), signed: false, scalar: true);
+            if (Optimizations.UseSse41)
+            {
+                EmitSse41FcvtuOpF(context, FPRoundingMode.ToNearestAway, scalar: true);
+            }
+            else
+            {
+                EmitFcvt(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1), signed: false, scalar: true);
+            }
         }
 
         public static void Fcvtau_V(ArmEmitterContext context)
         {
-            EmitFcvt(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1), signed: false, scalar: false);
+            if (Optimizations.UseSse41)
+            {
+                EmitSse41FcvtuOpF(context, FPRoundingMode.ToNearestAway, scalar: false);
+            }
+            else
+            {
+                EmitFcvt(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1), signed: false, scalar: false);
+            }
         }
 
         public static void Fcvtl_V(ArmEmitterContext context)
@@ -1223,7 +1265,14 @@ namespace ARMeilleure.Instructions
                     nRes = context.AddIntrinsic(Intrinsic.X86Mulps, nRes, fpScaledMask);
                 }
 
-                nRes = context.AddIntrinsic(Intrinsic.X86Roundps, nRes, Const(X86GetRoundControl(roundMode)));
+                if (roundMode != FPRoundingMode.ToNearestAway)
+                {
+                    nRes = context.AddIntrinsic(Intrinsic.X86Roundps, nRes, Const(X86GetRoundControl(roundMode)));
+                }
+                else
+                {
+                    nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar);
+                }
 
                 Operand nInt = context.AddIntrinsic(Intrinsic.X86Cvtps2dq, nRes);
 
@@ -1265,7 +1314,14 @@ namespace ARMeilleure.Instructions
                     nRes = context.AddIntrinsic(Intrinsic.X86Mulpd, nRes, fpScaledMask);
                 }
 
-                nRes = context.AddIntrinsic(Intrinsic.X86Roundpd, nRes, Const(X86GetRoundControl(roundMode)));
+                if (roundMode != FPRoundingMode.ToNearestAway)
+                {
+                    nRes = context.AddIntrinsic(Intrinsic.X86Roundpd, nRes, Const(X86GetRoundControl(roundMode)));
+                }
+                else
+                {
+                    nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar);
+                }
 
                 Operand nLong = EmitSse2CvtDoubleToInt64OpF(context, nRes, scalar);
 
@@ -1314,7 +1370,14 @@ namespace ARMeilleure.Instructions
                     nRes = context.AddIntrinsic(Intrinsic.X86Mulps, nRes, fpScaledMask);
                 }
 
-                nRes = context.AddIntrinsic(Intrinsic.X86Roundps, nRes, Const(X86GetRoundControl(roundMode)));
+                if (roundMode != FPRoundingMode.ToNearestAway)
+                {
+                    nRes = context.AddIntrinsic(Intrinsic.X86Roundps, nRes, Const(X86GetRoundControl(roundMode)));
+                }
+                else
+                {
+                    nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar);
+                }
 
                 Operand zero = context.VectorZero();
 
@@ -1369,7 +1432,14 @@ namespace ARMeilleure.Instructions
                     nRes = context.AddIntrinsic(Intrinsic.X86Mulpd, nRes, fpScaledMask);
                 }
 
-                nRes = context.AddIntrinsic(Intrinsic.X86Roundpd, nRes, Const(X86GetRoundControl(roundMode)));
+                if (roundMode != FPRoundingMode.ToNearestAway)
+                {
+                    nRes = context.AddIntrinsic(Intrinsic.X86Roundpd, nRes, Const(X86GetRoundControl(roundMode)));
+                }
+                else
+                {
+                    nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar);
+                }
 
                 Operand zero = context.VectorZero();
 
@@ -1424,7 +1494,14 @@ namespace ARMeilleure.Instructions
                     nRes = context.AddIntrinsic(Intrinsic.X86Mulss, nRes, fpScaledMask);
                 }
 
-                nRes = context.AddIntrinsic(Intrinsic.X86Roundss, nRes, Const(X86GetRoundControl(roundMode)));
+                if (roundMode != FPRoundingMode.ToNearestAway)
+                {
+                    nRes = context.AddIntrinsic(Intrinsic.X86Roundss, nRes, Const(X86GetRoundControl(roundMode)));
+                }
+                else
+                {
+                    nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar: true);
+                }
 
                 Operand nIntOrLong = op.RegisterSize == RegisterSize.Int32
                     ? context.AddIntrinsicInt (Intrinsic.X86Cvtss2si, nRes)
@@ -1464,7 +1541,14 @@ namespace ARMeilleure.Instructions
                     nRes = context.AddIntrinsic(Intrinsic.X86Mulsd, nRes, fpScaledMask);
                 }
 
-                nRes = context.AddIntrinsic(Intrinsic.X86Roundsd, nRes, Const(X86GetRoundControl(roundMode)));
+                if (roundMode != FPRoundingMode.ToNearestAway)
+                {
+                    nRes = context.AddIntrinsic(Intrinsic.X86Roundsd, nRes, Const(X86GetRoundControl(roundMode)));
+                }
+                else
+                {
+                    nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar: true);
+                }
 
                 Operand nIntOrLong = op.RegisterSize == RegisterSize.Int32
                     ? context.AddIntrinsicInt (Intrinsic.X86Cvtsd2si, nRes)
@@ -1512,7 +1596,14 @@ namespace ARMeilleure.Instructions
                     nRes = context.AddIntrinsic(Intrinsic.X86Mulss, nRes, fpScaledMask);
                 }
 
-                nRes = context.AddIntrinsic(Intrinsic.X86Roundss, nRes, Const(X86GetRoundControl(roundMode)));
+                if (roundMode != FPRoundingMode.ToNearestAway)
+                {
+                    nRes = context.AddIntrinsic(Intrinsic.X86Roundss, nRes, Const(X86GetRoundControl(roundMode)));
+                }
+                else
+                {
+                    nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar: true);
+                }
 
                 Operand zero = context.VectorZero();
 
@@ -1567,7 +1658,14 @@ namespace ARMeilleure.Instructions
                     nRes = context.AddIntrinsic(Intrinsic.X86Mulsd, nRes, fpScaledMask);
                 }
 
-                nRes = context.AddIntrinsic(Intrinsic.X86Roundsd, nRes, Const(X86GetRoundControl(roundMode)));
+                if (roundMode != FPRoundingMode.ToNearestAway)
+                {
+                    nRes = context.AddIntrinsic(Intrinsic.X86Roundsd, nRes, Const(X86GetRoundControl(roundMode)));
+                }
+                else
+                {
+                    nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar: true);
+                }
 
                 Operand zero = context.VectorZero();
 
diff --git a/ARMeilleure/Instructions/InstEmitSimdCvt32.cs b/ARMeilleure/Instructions/InstEmitSimdCvt32.cs
index 69ba427471..c76634ebf9 100644
--- a/ARMeilleure/Instructions/InstEmitSimdCvt32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdCvt32.cs
@@ -203,6 +203,9 @@ namespace ARMeilleure.Instructions
             FPRoundingMode roundMode;
             switch (rm)
             {
+                case 0b00:
+                    roundMode = FPRoundingMode.ToNearestAway;
+                    break;
                 case 0b01:
                     roundMode = FPRoundingMode.ToNearest;
                     break;
@@ -228,7 +231,7 @@ namespace ARMeilleure.Instructions
             bool unsigned = op.Opc == 0;
             int rm = op.Opc2 & 3;
 
-            if (Optimizations.UseSse41 && rm != 0b00)
+            if (Optimizations.UseSse41)
             {
                 EmitSse41ConvertInt32(context, RMToRoundMode(rm), !unsigned);
             }
@@ -267,15 +270,21 @@ namespace ARMeilleure.Instructions
 
             int rm = op.Opc2 & 3;
 
-            if (Optimizations.UseSse2 && rm != 0b00)
+            if (Optimizations.UseSse41)
             {
                 EmitScalarUnaryOpSimd32(context, (m) =>
                 {
-                    Intrinsic inst = (op.Size & 1) == 0 ? Intrinsic.X86Roundss : Intrinsic.X86Roundsd;
-
                     FPRoundingMode roundMode = RMToRoundMode(rm);
 
-                    return context.AddIntrinsic(inst, m, Const(X86GetRoundControl(roundMode)));
+                    if (roundMode != FPRoundingMode.ToNearestAway)
+                    {
+                        Intrinsic inst = (op.Size & 1) == 0 ? Intrinsic.X86Roundss : Intrinsic.X86Roundsd;
+                        return context.AddIntrinsic(inst, m, Const(X86GetRoundControl(roundMode)));
+                    }
+                    else
+                    {
+                        return EmitSse41RoundToNearestWithTiesToAwayOpF(context, m, scalar: true);
+                    }
                 });
             }
             else
@@ -305,7 +314,17 @@ namespace ARMeilleure.Instructions
         // VRINTA (vector).
         public static void Vrinta_V(ArmEmitterContext context)
         {
-            EmitVectorUnaryOpF32(context, (m) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, m));
+            if (Optimizations.UseSse41)
+            {
+                EmitVectorUnaryOpSimd32(context, (m) =>
+                {
+                    return EmitSse41RoundToNearestWithTiesToAwayOpF(context, m, scalar: false);
+                });
+            }
+            else
+            {
+                EmitVectorUnaryOpF32(context, (m) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, m));
+            }
         }
 
         // VRINTM (vector).
@@ -413,7 +432,14 @@ namespace ARMeilleure.Instructions
                 Operand nRes = context.AddIntrinsic(Intrinsic.X86Cmpss, n, n, Const((int)CmpCondition.OrderedQ));
                 nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, n);
 
-                nRes = context.AddIntrinsic(Intrinsic.X86Roundss, nRes, Const(X86GetRoundControl(roundMode)));
+                if (roundMode != FPRoundingMode.ToNearestAway)
+                {
+                    nRes = context.AddIntrinsic(Intrinsic.X86Roundss, nRes, Const(X86GetRoundControl(roundMode)));
+                }
+                else
+                {
+                    nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar: true);
+                }
 
                 Operand zero = context.VectorZero();
 
@@ -464,7 +490,14 @@ namespace ARMeilleure.Instructions
                 Operand nRes = context.AddIntrinsic(Intrinsic.X86Cmpsd, n, n, Const((int)CmpCondition.OrderedQ));
                 nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, n);
 
-                nRes = context.AddIntrinsic(Intrinsic.X86Roundsd, nRes, Const(X86GetRoundControl(roundMode)));
+                if (roundMode != FPRoundingMode.ToNearestAway)
+                {
+                    nRes = context.AddIntrinsic(Intrinsic.X86Roundsd, nRes, Const(X86GetRoundControl(roundMode)));
+                }
+                else
+                {
+                    nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar: true);
+                }
 
                 Operand zero = context.VectorZero();
 
diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper.cs b/ARMeilleure/Instructions/InstEmitSimdHelper.cs
index 49c17560b4..0e7af794a5 100644
--- a/ARMeilleure/Instructions/InstEmitSimdHelper.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdHelper.cs
@@ -33,6 +33,14 @@ namespace ARMeilleure.Instructions
         };
 
         public static readonly long ZeroMask = 128L << 56 | 128L << 48 | 128L << 40 | 128L << 32 | 128L << 24 | 128L << 16 | 128L << 8 | 128L << 0;
+
+        public static ulong X86GetGf2p8LogicalShiftLeft(int shift)
+        {
+            ulong identity = (0b00000001UL << 56) | (0b00000010UL << 48) | (0b00000100UL << 40) | (0b00001000UL << 32) |
+                             (0b00010000UL << 24) | (0b00100000UL << 16) | (0b01000000UL <<  8) | (0b10000000UL <<  0);
+
+            return shift >= 0 ? identity >> (shift * 8) : identity << (-shift * 8);
+        }
 #endregion
 
 #region "X86 SSE Intrinsics"
@@ -243,19 +251,44 @@ namespace ARMeilleure.Instructions
             throw new ArgumentException($"Invalid rounding mode \"{roundMode}\".");
         }
 
-        public static ulong X86GetGf2p8LogicalShiftLeft(int shift)
+        public static Operand EmitSse41RoundToNearestWithTiesToAwayOpF(ArmEmitterContext context, Operand n, bool scalar)
         {
-            ulong identity = 
-                (0b00000001UL << 56) |
-                (0b00000010UL << 48) |
-                (0b00000100UL << 40) |
-                (0b00001000UL << 32) |
-                (0b00010000UL << 24) |
-                (0b00100000UL << 16) |
-                (0b01000000UL <<  8) |
-                (0b10000000UL <<  0);
+            Debug.Assert(n.Type == OperandType.V128);
 
-            return shift >= 0 ? identity >> (shift * 8) : identity << (-shift * 8);
+            Operand nCopy = context.Copy(n);
+
+            Operand rC = Const(X86GetRoundControl(FPRoundingMode.TowardsZero));
+
+            IOpCodeSimd op = (IOpCodeSimd)context.CurrOp;
+
+            if ((op.Size & 1) == 0)
+            {
+                Operand signMask = scalar ? X86GetScalar(context, int.MinValue) : X86GetAllElements(context, int.MinValue);
+                        signMask = context.AddIntrinsic(Intrinsic.X86Pand, signMask, nCopy);
+
+                // 0x3EFFFFFF == BitConverter.SingleToInt32Bits(0.5f) - 1
+                Operand valueMask = scalar ? X86GetScalar(context, 0x3EFFFFFF) : X86GetAllElements(context, 0x3EFFFFFF);
+                        valueMask = context.AddIntrinsic(Intrinsic.X86Por, valueMask, signMask);
+
+                nCopy = context.AddIntrinsic(scalar ? Intrinsic.X86Addss : Intrinsic.X86Addps, nCopy, valueMask);
+
+                nCopy = context.AddIntrinsic(scalar ? Intrinsic.X86Roundss : Intrinsic.X86Roundps, nCopy, rC);
+            }
+            else
+            {
+                Operand signMask = scalar ? X86GetScalar(context, long.MinValue) : X86GetAllElements(context, long.MinValue);
+                        signMask = context.AddIntrinsic(Intrinsic.X86Pand, signMask, nCopy);
+
+                // 0x3FDFFFFFFFFFFFFFL == BitConverter.DoubleToInt64Bits(0.5d) - 1L
+                Operand valueMask = scalar ? X86GetScalar(context, 0x3FDFFFFFFFFFFFFFL) : X86GetAllElements(context, 0x3FDFFFFFFFFFFFFFL);
+                        valueMask = context.AddIntrinsic(Intrinsic.X86Por, valueMask, signMask);
+
+                nCopy = context.AddIntrinsic(scalar ? Intrinsic.X86Addsd : Intrinsic.X86Addpd, nCopy, valueMask);
+
+                nCopy = context.AddIntrinsic(scalar ? Intrinsic.X86Roundsd : Intrinsic.X86Roundpd, nCopy, rC);
+            }
+
+            return nCopy;
         }
 
         public static Operand EmitCountSetBits8(ArmEmitterContext context, Operand op) // "size" is 8 (SIMD&FP Inst.).
diff --git a/ARMeilleure/State/FPRoundingMode.cs b/ARMeilleure/State/FPRoundingMode.cs
index ee4f876686..8d757a151f 100644
--- a/ARMeilleure/State/FPRoundingMode.cs
+++ b/ARMeilleure/State/FPRoundingMode.cs
@@ -2,9 +2,10 @@ namespace ARMeilleure.State
 {
     public enum FPRoundingMode
     {
-        ToNearest            = 0,
+        ToNearest            = 0, // With ties to even.
         TowardsPlusInfinity  = 1,
         TowardsMinusInfinity = 2,
-        TowardsZero          = 3
+        TowardsZero          = 3,
+        ToNearestAway        = 4 // With ties to away.
     }
 }
diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs
index 1515713be5..70f6e0127b 100644
--- a/ARMeilleure/Translation/PTC/Ptc.cs
+++ b/ARMeilleure/Translation/PTC/Ptc.cs
@@ -27,7 +27,7 @@ namespace ARMeilleure.Translation.PTC
         private const string OuterHeaderMagicString = "PTCohd\0\0";
         private const string InnerHeaderMagicString = "PTCihd\0\0";
 
-        private const uint InternalVersion = 3710; //! To be incremented manually for each change to the ARMeilleure project.
+        private const uint InternalVersion = 3713; //! To be incremented manually for each change to the ARMeilleure project.
 
         private const string ActualDir = "0";
         private const string BackupDir = "1";