diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs index a35e28a15f..b91c522ec9 100644 --- a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs +++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs @@ -1617,18 +1617,32 @@ namespace ARMeilleure.Instructions public static void Frinta_S(ArmEmitterContext context) { - EmitScalarUnaryOpF(context, (op1) => + if (Optimizations.UseSse41) { - return EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1); - }); + EmitSse41ScalarRoundOpF(context, FPRoundingMode.ToNearestAway); + } + else + { + EmitScalarUnaryOpF(context, (op1) => + { + return EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1); + }); + } } public static void Frinta_V(ArmEmitterContext context) { - EmitVectorUnaryOpF(context, (op1) => + if (Optimizations.UseSse41) { - return EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1); - }); + EmitSse41VectorRoundOpF(context, FPRoundingMode.ToNearestAway); + } + else + { + EmitVectorUnaryOpF(context, (op1) => + { + return EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1); + }); + } } public static void Frinti_S(ArmEmitterContext context) @@ -3516,9 +3530,18 @@ namespace ARMeilleure.Instructions Operand n = GetVec(op.Rn); - Intrinsic inst = (op.Size & 1) != 0 ? Intrinsic.X86Roundsd : Intrinsic.X86Roundss; + Operand res; - Operand res = context.AddIntrinsic(inst, n, Const(X86GetRoundControl(roundMode))); + if (roundMode != FPRoundingMode.ToNearestAway) + { + Intrinsic inst = (op.Size & 1) != 0 ? Intrinsic.X86Roundsd : Intrinsic.X86Roundss; + + res = context.AddIntrinsic(inst, n, Const(X86GetRoundControl(roundMode))); + } + else + { + res = EmitSse41RoundToNearestWithTiesToAwayOpF(context, n, scalar: true); + } if ((op.Size & 1) != 0) { @@ -3538,9 +3561,18 @@ namespace ARMeilleure.Instructions Operand n = GetVec(op.Rn); - Intrinsic inst = (op.Size & 1) != 0 ? Intrinsic.X86Roundpd : Intrinsic.X86Roundps; + Operand res; - Operand res = context.AddIntrinsic(inst, n, Const(X86GetRoundControl(roundMode))); + if (roundMode != FPRoundingMode.ToNearestAway) + { + Intrinsic inst = (op.Size & 1) != 0 ? Intrinsic.X86Roundpd : Intrinsic.X86Roundps; + + res = context.AddIntrinsic(inst, n, Const(X86GetRoundControl(roundMode))); + } + else + { + res = EmitSse41RoundToNearestWithTiesToAwayOpF(context, n, scalar: false); + } if (op.RegisterSize == RegisterSize.Simd64) { diff --git a/ARMeilleure/Instructions/InstEmitSimdCvt.cs b/ARMeilleure/Instructions/InstEmitSimdCvt.cs index c8c427b790..9329f2b757 100644 --- a/ARMeilleure/Instructions/InstEmitSimdCvt.cs +++ b/ARMeilleure/Instructions/InstEmitSimdCvt.cs @@ -164,32 +164,74 @@ namespace ARMeilleure.Instructions public static void Fcvtas_Gp(ArmEmitterContext context) { - EmitFcvt_s_Gp(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1)); + if (Optimizations.UseSse41) + { + EmitSse41Fcvts_Gp(context, FPRoundingMode.ToNearestAway, isFixed: false); + } + else + { + EmitFcvt_s_Gp(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1)); + } } public static void Fcvtas_S(ArmEmitterContext context) { - EmitFcvt(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1), signed: true, scalar: true); + if (Optimizations.UseSse41) + { + EmitSse41FcvtsOpF(context, FPRoundingMode.ToNearestAway, scalar: true); + } + else + { + EmitFcvt(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1), signed: true, scalar: true); + } } public static void Fcvtas_V(ArmEmitterContext context) { - EmitFcvt(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1), signed: true, scalar: false); + if (Optimizations.UseSse41) + { + EmitSse41FcvtsOpF(context, FPRoundingMode.ToNearestAway, scalar: false); + } + else + { + EmitFcvt(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1), signed: true, scalar: false); + } } public static void Fcvtau_Gp(ArmEmitterContext context) { - EmitFcvt_u_Gp(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1)); + if (Optimizations.UseSse41) + { + EmitSse41Fcvtu_Gp(context, FPRoundingMode.ToNearestAway, isFixed: false); + } + else + { + EmitFcvt_u_Gp(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1)); + } } public static void Fcvtau_S(ArmEmitterContext context) { - EmitFcvt(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1), signed: false, scalar: true); + if (Optimizations.UseSse41) + { + EmitSse41FcvtuOpF(context, FPRoundingMode.ToNearestAway, scalar: true); + } + else + { + EmitFcvt(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1), signed: false, scalar: true); + } } public static void Fcvtau_V(ArmEmitterContext context) { - EmitFcvt(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1), signed: false, scalar: false); + if (Optimizations.UseSse41) + { + EmitSse41FcvtuOpF(context, FPRoundingMode.ToNearestAway, scalar: false); + } + else + { + EmitFcvt(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1), signed: false, scalar: false); + } } public static void Fcvtl_V(ArmEmitterContext context) @@ -1223,7 +1265,14 @@ namespace ARMeilleure.Instructions nRes = context.AddIntrinsic(Intrinsic.X86Mulps, nRes, fpScaledMask); } - nRes = context.AddIntrinsic(Intrinsic.X86Roundps, nRes, Const(X86GetRoundControl(roundMode))); + if (roundMode != FPRoundingMode.ToNearestAway) + { + nRes = context.AddIntrinsic(Intrinsic.X86Roundps, nRes, Const(X86GetRoundControl(roundMode))); + } + else + { + nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar); + } Operand nInt = context.AddIntrinsic(Intrinsic.X86Cvtps2dq, nRes); @@ -1265,7 +1314,14 @@ namespace ARMeilleure.Instructions nRes = context.AddIntrinsic(Intrinsic.X86Mulpd, nRes, fpScaledMask); } - nRes = context.AddIntrinsic(Intrinsic.X86Roundpd, nRes, Const(X86GetRoundControl(roundMode))); + if (roundMode != FPRoundingMode.ToNearestAway) + { + nRes = context.AddIntrinsic(Intrinsic.X86Roundpd, nRes, Const(X86GetRoundControl(roundMode))); + } + else + { + nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar); + } Operand nLong = EmitSse2CvtDoubleToInt64OpF(context, nRes, scalar); @@ -1314,7 +1370,14 @@ namespace ARMeilleure.Instructions nRes = context.AddIntrinsic(Intrinsic.X86Mulps, nRes, fpScaledMask); } - nRes = context.AddIntrinsic(Intrinsic.X86Roundps, nRes, Const(X86GetRoundControl(roundMode))); + if (roundMode != FPRoundingMode.ToNearestAway) + { + nRes = context.AddIntrinsic(Intrinsic.X86Roundps, nRes, Const(X86GetRoundControl(roundMode))); + } + else + { + nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar); + } Operand zero = context.VectorZero(); @@ -1369,7 +1432,14 @@ namespace ARMeilleure.Instructions nRes = context.AddIntrinsic(Intrinsic.X86Mulpd, nRes, fpScaledMask); } - nRes = context.AddIntrinsic(Intrinsic.X86Roundpd, nRes, Const(X86GetRoundControl(roundMode))); + if (roundMode != FPRoundingMode.ToNearestAway) + { + nRes = context.AddIntrinsic(Intrinsic.X86Roundpd, nRes, Const(X86GetRoundControl(roundMode))); + } + else + { + nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar); + } Operand zero = context.VectorZero(); @@ -1424,7 +1494,14 @@ namespace ARMeilleure.Instructions nRes = context.AddIntrinsic(Intrinsic.X86Mulss, nRes, fpScaledMask); } - nRes = context.AddIntrinsic(Intrinsic.X86Roundss, nRes, Const(X86GetRoundControl(roundMode))); + if (roundMode != FPRoundingMode.ToNearestAway) + { + nRes = context.AddIntrinsic(Intrinsic.X86Roundss, nRes, Const(X86GetRoundControl(roundMode))); + } + else + { + nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar: true); + } Operand nIntOrLong = op.RegisterSize == RegisterSize.Int32 ? context.AddIntrinsicInt (Intrinsic.X86Cvtss2si, nRes) @@ -1464,7 +1541,14 @@ namespace ARMeilleure.Instructions nRes = context.AddIntrinsic(Intrinsic.X86Mulsd, nRes, fpScaledMask); } - nRes = context.AddIntrinsic(Intrinsic.X86Roundsd, nRes, Const(X86GetRoundControl(roundMode))); + if (roundMode != FPRoundingMode.ToNearestAway) + { + nRes = context.AddIntrinsic(Intrinsic.X86Roundsd, nRes, Const(X86GetRoundControl(roundMode))); + } + else + { + nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar: true); + } Operand nIntOrLong = op.RegisterSize == RegisterSize.Int32 ? context.AddIntrinsicInt (Intrinsic.X86Cvtsd2si, nRes) @@ -1512,7 +1596,14 @@ namespace ARMeilleure.Instructions nRes = context.AddIntrinsic(Intrinsic.X86Mulss, nRes, fpScaledMask); } - nRes = context.AddIntrinsic(Intrinsic.X86Roundss, nRes, Const(X86GetRoundControl(roundMode))); + if (roundMode != FPRoundingMode.ToNearestAway) + { + nRes = context.AddIntrinsic(Intrinsic.X86Roundss, nRes, Const(X86GetRoundControl(roundMode))); + } + else + { + nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar: true); + } Operand zero = context.VectorZero(); @@ -1567,7 +1658,14 @@ namespace ARMeilleure.Instructions nRes = context.AddIntrinsic(Intrinsic.X86Mulsd, nRes, fpScaledMask); } - nRes = context.AddIntrinsic(Intrinsic.X86Roundsd, nRes, Const(X86GetRoundControl(roundMode))); + if (roundMode != FPRoundingMode.ToNearestAway) + { + nRes = context.AddIntrinsic(Intrinsic.X86Roundsd, nRes, Const(X86GetRoundControl(roundMode))); + } + else + { + nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar: true); + } Operand zero = context.VectorZero(); diff --git a/ARMeilleure/Instructions/InstEmitSimdCvt32.cs b/ARMeilleure/Instructions/InstEmitSimdCvt32.cs index 69ba427471..c76634ebf9 100644 --- a/ARMeilleure/Instructions/InstEmitSimdCvt32.cs +++ b/ARMeilleure/Instructions/InstEmitSimdCvt32.cs @@ -203,6 +203,9 @@ namespace ARMeilleure.Instructions FPRoundingMode roundMode; switch (rm) { + case 0b00: + roundMode = FPRoundingMode.ToNearestAway; + break; case 0b01: roundMode = FPRoundingMode.ToNearest; break; @@ -228,7 +231,7 @@ namespace ARMeilleure.Instructions bool unsigned = op.Opc == 0; int rm = op.Opc2 & 3; - if (Optimizations.UseSse41 && rm != 0b00) + if (Optimizations.UseSse41) { EmitSse41ConvertInt32(context, RMToRoundMode(rm), !unsigned); } @@ -267,15 +270,21 @@ namespace ARMeilleure.Instructions int rm = op.Opc2 & 3; - if (Optimizations.UseSse2 && rm != 0b00) + if (Optimizations.UseSse41) { EmitScalarUnaryOpSimd32(context, (m) => { - Intrinsic inst = (op.Size & 1) == 0 ? Intrinsic.X86Roundss : Intrinsic.X86Roundsd; - FPRoundingMode roundMode = RMToRoundMode(rm); - return context.AddIntrinsic(inst, m, Const(X86GetRoundControl(roundMode))); + if (roundMode != FPRoundingMode.ToNearestAway) + { + Intrinsic inst = (op.Size & 1) == 0 ? Intrinsic.X86Roundss : Intrinsic.X86Roundsd; + return context.AddIntrinsic(inst, m, Const(X86GetRoundControl(roundMode))); + } + else + { + return EmitSse41RoundToNearestWithTiesToAwayOpF(context, m, scalar: true); + } }); } else @@ -305,7 +314,17 @@ namespace ARMeilleure.Instructions // VRINTA (vector). public static void Vrinta_V(ArmEmitterContext context) { - EmitVectorUnaryOpF32(context, (m) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, m)); + if (Optimizations.UseSse41) + { + EmitVectorUnaryOpSimd32(context, (m) => + { + return EmitSse41RoundToNearestWithTiesToAwayOpF(context, m, scalar: false); + }); + } + else + { + EmitVectorUnaryOpF32(context, (m) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, m)); + } } // VRINTM (vector). @@ -413,7 +432,14 @@ namespace ARMeilleure.Instructions Operand nRes = context.AddIntrinsic(Intrinsic.X86Cmpss, n, n, Const((int)CmpCondition.OrderedQ)); nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, n); - nRes = context.AddIntrinsic(Intrinsic.X86Roundss, nRes, Const(X86GetRoundControl(roundMode))); + if (roundMode != FPRoundingMode.ToNearestAway) + { + nRes = context.AddIntrinsic(Intrinsic.X86Roundss, nRes, Const(X86GetRoundControl(roundMode))); + } + else + { + nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar: true); + } Operand zero = context.VectorZero(); @@ -464,7 +490,14 @@ namespace ARMeilleure.Instructions Operand nRes = context.AddIntrinsic(Intrinsic.X86Cmpsd, n, n, Const((int)CmpCondition.OrderedQ)); nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, n); - nRes = context.AddIntrinsic(Intrinsic.X86Roundsd, nRes, Const(X86GetRoundControl(roundMode))); + if (roundMode != FPRoundingMode.ToNearestAway) + { + nRes = context.AddIntrinsic(Intrinsic.X86Roundsd, nRes, Const(X86GetRoundControl(roundMode))); + } + else + { + nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar: true); + } Operand zero = context.VectorZero(); diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper.cs b/ARMeilleure/Instructions/InstEmitSimdHelper.cs index 49c17560b4..0e7af794a5 100644 --- a/ARMeilleure/Instructions/InstEmitSimdHelper.cs +++ b/ARMeilleure/Instructions/InstEmitSimdHelper.cs @@ -33,6 +33,14 @@ namespace ARMeilleure.Instructions }; public static readonly long ZeroMask = 128L << 56 | 128L << 48 | 128L << 40 | 128L << 32 | 128L << 24 | 128L << 16 | 128L << 8 | 128L << 0; + + public static ulong X86GetGf2p8LogicalShiftLeft(int shift) + { + ulong identity = (0b00000001UL << 56) | (0b00000010UL << 48) | (0b00000100UL << 40) | (0b00001000UL << 32) | + (0b00010000UL << 24) | (0b00100000UL << 16) | (0b01000000UL << 8) | (0b10000000UL << 0); + + return shift >= 0 ? identity >> (shift * 8) : identity << (-shift * 8); + } #endregion #region "X86 SSE Intrinsics" @@ -243,19 +251,44 @@ namespace ARMeilleure.Instructions throw new ArgumentException($"Invalid rounding mode \"{roundMode}\"."); } - public static ulong X86GetGf2p8LogicalShiftLeft(int shift) + public static Operand EmitSse41RoundToNearestWithTiesToAwayOpF(ArmEmitterContext context, Operand n, bool scalar) { - ulong identity = - (0b00000001UL << 56) | - (0b00000010UL << 48) | - (0b00000100UL << 40) | - (0b00001000UL << 32) | - (0b00010000UL << 24) | - (0b00100000UL << 16) | - (0b01000000UL << 8) | - (0b10000000UL << 0); + Debug.Assert(n.Type == OperandType.V128); - return shift >= 0 ? identity >> (shift * 8) : identity << (-shift * 8); + Operand nCopy = context.Copy(n); + + Operand rC = Const(X86GetRoundControl(FPRoundingMode.TowardsZero)); + + IOpCodeSimd op = (IOpCodeSimd)context.CurrOp; + + if ((op.Size & 1) == 0) + { + Operand signMask = scalar ? X86GetScalar(context, int.MinValue) : X86GetAllElements(context, int.MinValue); + signMask = context.AddIntrinsic(Intrinsic.X86Pand, signMask, nCopy); + + // 0x3EFFFFFF == BitConverter.SingleToInt32Bits(0.5f) - 1 + Operand valueMask = scalar ? X86GetScalar(context, 0x3EFFFFFF) : X86GetAllElements(context, 0x3EFFFFFF); + valueMask = context.AddIntrinsic(Intrinsic.X86Por, valueMask, signMask); + + nCopy = context.AddIntrinsic(scalar ? Intrinsic.X86Addss : Intrinsic.X86Addps, nCopy, valueMask); + + nCopy = context.AddIntrinsic(scalar ? Intrinsic.X86Roundss : Intrinsic.X86Roundps, nCopy, rC); + } + else + { + Operand signMask = scalar ? X86GetScalar(context, long.MinValue) : X86GetAllElements(context, long.MinValue); + signMask = context.AddIntrinsic(Intrinsic.X86Pand, signMask, nCopy); + + // 0x3FDFFFFFFFFFFFFFL == BitConverter.DoubleToInt64Bits(0.5d) - 1L + Operand valueMask = scalar ? X86GetScalar(context, 0x3FDFFFFFFFFFFFFFL) : X86GetAllElements(context, 0x3FDFFFFFFFFFFFFFL); + valueMask = context.AddIntrinsic(Intrinsic.X86Por, valueMask, signMask); + + nCopy = context.AddIntrinsic(scalar ? Intrinsic.X86Addsd : Intrinsic.X86Addpd, nCopy, valueMask); + + nCopy = context.AddIntrinsic(scalar ? Intrinsic.X86Roundsd : Intrinsic.X86Roundpd, nCopy, rC); + } + + return nCopy; } public static Operand EmitCountSetBits8(ArmEmitterContext context, Operand op) // "size" is 8 (SIMD&FP Inst.). diff --git a/ARMeilleure/State/FPRoundingMode.cs b/ARMeilleure/State/FPRoundingMode.cs index ee4f876686..8d757a151f 100644 --- a/ARMeilleure/State/FPRoundingMode.cs +++ b/ARMeilleure/State/FPRoundingMode.cs @@ -2,9 +2,10 @@ namespace ARMeilleure.State { public enum FPRoundingMode { - ToNearest = 0, + ToNearest = 0, // With ties to even. TowardsPlusInfinity = 1, TowardsMinusInfinity = 2, - TowardsZero = 3 + TowardsZero = 3, + ToNearestAway = 4 // With ties to away. } } diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs index 1515713be5..70f6e0127b 100644 --- a/ARMeilleure/Translation/PTC/Ptc.cs +++ b/ARMeilleure/Translation/PTC/Ptc.cs @@ -27,7 +27,7 @@ namespace ARMeilleure.Translation.PTC private const string OuterHeaderMagicString = "PTCohd\0\0"; private const string InnerHeaderMagicString = "PTCihd\0\0"; - private const uint InternalVersion = 3710; //! To be incremented manually for each change to the ARMeilleure project. + private const uint InternalVersion = 3713; //! To be incremented manually for each change to the ARMeilleure project. private const string ActualDir = "0"; private const string BackupDir = "1";