From dcce4070719a3798bb96d3aa02b9ba02a7fecc16 Mon Sep 17 00:00:00 2001
From: gdkchan <gab.dark.100@gmail.com>
Date: Wed, 27 Jan 2021 20:23:00 -0300
Subject: [PATCH] Lower precision of estimate instruction results to match Arm
 behavior (#1943)

* Lower precision of estimate instruction results to match Arm behavior

* PTC version update

* Nits
---
 .../Instructions/InstEmitSimdArithmetic.cs    | 83 +++++++++++++++----
 ARMeilleure/Translation/PTC/Ptc.cs            |  2 +-
 2 files changed, 66 insertions(+), 19 deletions(-)

diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
index f18b91cfcc..deaa6f5acd 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
@@ -1475,9 +1475,11 @@ namespace ARMeilleure.Instructions
 
             int sizeF = op.Size & 1;
 
-            if (Optimizations.FastFP && Optimizations.UseSse && sizeF == 0)
+            if (Optimizations.FastFP && Optimizations.UseSse41 && sizeF == 0)
             {
-                EmitScalarUnaryOpF(context, Intrinsic.X86Rcpss, 0);
+                Operand res = EmitSse41FP32RoundExp8(context, context.AddIntrinsic(Intrinsic.X86Rcpss, GetVec(op.Rn)), scalar: true);
+
+                context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res));
             }
             else
             {
@@ -1494,9 +1496,16 @@ namespace ARMeilleure.Instructions
 
             int sizeF = op.Size & 1;
 
-            if (Optimizations.FastFP && Optimizations.UseSse && sizeF == 0)
+            if (Optimizations.FastFP && Optimizations.UseSse41 && sizeF == 0)
             {
-                EmitVectorUnaryOpF(context, Intrinsic.X86Rcpps, 0);
+                Operand res = EmitSse41FP32RoundExp8(context, context.AddIntrinsic(Intrinsic.X86Rcpps, GetVec(op.Rn)), scalar: false);
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    res = context.VectorZeroUpper64(res);
+                }
+
+                context.Copy(GetVec(op.Rd), res);
             }
             else
             {
@@ -1652,7 +1661,7 @@ namespace ARMeilleure.Instructions
         {
             if (Optimizations.UseSse41)
             {
-                EmitScalarRoundOpF(context, FPRoundingMode.TowardsMinusInfinity);
+                EmitSse41ScalarRoundOpF(context, FPRoundingMode.TowardsMinusInfinity);
             }
             else
             {
@@ -1667,7 +1676,7 @@ namespace ARMeilleure.Instructions
         {
             if (Optimizations.UseSse41)
             {
-                EmitVectorRoundOpF(context, FPRoundingMode.TowardsMinusInfinity);
+                EmitSse41VectorRoundOpF(context, FPRoundingMode.TowardsMinusInfinity);
             }
             else
             {
@@ -1682,7 +1691,7 @@ namespace ARMeilleure.Instructions
         {
             if (Optimizations.UseSse41)
             {
-                EmitScalarRoundOpF(context, FPRoundingMode.ToNearest);
+                EmitSse41ScalarRoundOpF(context, FPRoundingMode.ToNearest);
             }
             else
             {
@@ -1697,7 +1706,7 @@ namespace ARMeilleure.Instructions
         {
             if (Optimizations.UseSse41)
             {
-                EmitVectorRoundOpF(context, FPRoundingMode.ToNearest);
+                EmitSse41VectorRoundOpF(context, FPRoundingMode.ToNearest);
             }
             else
             {
@@ -1712,7 +1721,7 @@ namespace ARMeilleure.Instructions
         {
             if (Optimizations.UseSse41)
             {
-                EmitScalarRoundOpF(context, FPRoundingMode.TowardsPlusInfinity);
+                EmitSse41ScalarRoundOpF(context, FPRoundingMode.TowardsPlusInfinity);
             }
             else
             {
@@ -1727,7 +1736,7 @@ namespace ARMeilleure.Instructions
         {
             if (Optimizations.UseSse41)
             {
-                EmitVectorRoundOpF(context, FPRoundingMode.TowardsPlusInfinity);
+                EmitSse41VectorRoundOpF(context, FPRoundingMode.TowardsPlusInfinity);
             }
             else
             {
@@ -1778,7 +1787,7 @@ namespace ARMeilleure.Instructions
         {
             if (Optimizations.UseSse41)
             {
-                EmitScalarRoundOpF(context, FPRoundingMode.TowardsZero);
+                EmitSse41ScalarRoundOpF(context, FPRoundingMode.TowardsZero);
             }
             else
             {
@@ -1793,7 +1802,7 @@ namespace ARMeilleure.Instructions
         {
             if (Optimizations.UseSse41)
             {
-                EmitVectorRoundOpF(context, FPRoundingMode.TowardsZero);
+                EmitSse41VectorRoundOpF(context, FPRoundingMode.TowardsZero);
             }
             else
             {
@@ -1810,9 +1819,11 @@ namespace ARMeilleure.Instructions
 
             int sizeF = op.Size & 1;
 
-            if (Optimizations.FastFP && Optimizations.UseSse && sizeF == 0)
+            if (Optimizations.FastFP && Optimizations.UseSse41 && sizeF == 0)
             {
-                EmitScalarUnaryOpF(context, Intrinsic.X86Rsqrtss, 0);
+                Operand res = EmitSse41FP32RoundExp8(context, context.AddIntrinsic(Intrinsic.X86Rsqrtss, GetVec(op.Rn)), scalar: true);
+
+                context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res));
             }
             else
             {
@@ -1829,9 +1840,16 @@ namespace ARMeilleure.Instructions
 
             int sizeF = op.Size & 1;
 
-            if (Optimizations.FastFP && Optimizations.UseSse && sizeF == 0)
+            if (Optimizations.FastFP && Optimizations.UseSse41 && sizeF == 0)
             {
-                EmitVectorUnaryOpF(context, Intrinsic.X86Rsqrtps, 0);
+                Operand res = EmitSse41FP32RoundExp8(context, context.AddIntrinsic(Intrinsic.X86Rsqrtps, GetVec(op.Rn)), scalar: false);
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    res = context.VectorZeroUpper64(res);
+                }
+
+                context.Copy(GetVec(op.Rd), res);
             }
             else
             {
@@ -3498,7 +3516,7 @@ namespace ARMeilleure.Instructions
             return context.ConditionalSelect(cmp, op1, op2);
         }
 
-        private static void EmitScalarRoundOpF(ArmEmitterContext context, FPRoundingMode roundMode)
+        private static void EmitSse41ScalarRoundOpF(ArmEmitterContext context, FPRoundingMode roundMode)
         {
             OpCodeSimd op = (OpCodeSimd)context.CurrOp;
 
@@ -3520,7 +3538,7 @@ namespace ARMeilleure.Instructions
             context.Copy(GetVec(op.Rd), res);
         }
 
-        private static void EmitVectorRoundOpF(ArmEmitterContext context, FPRoundingMode roundMode)
+        private static void EmitSse41VectorRoundOpF(ArmEmitterContext context, FPRoundingMode roundMode)
         {
             OpCodeSimd op = (OpCodeSimd)context.CurrOp;
 
@@ -3538,6 +3556,35 @@ namespace ARMeilleure.Instructions
             context.Copy(GetVec(op.Rd), res);
         }
 
+        private static Operand EmitSse41FP32RoundExp8(ArmEmitterContext context, Operand value, bool scalar)
+        { // Returns "value" with roundMask added and the low 15 bits of each 32-bit element cleared; elements whose exponent field is all ones are passed through unchanged.
+            Operand roundMask; // 0x4000: bit 14, i.e. half of the lowest bit kept by truncMask.
+            Operand truncMask; // 0xFFFF8000: keeps bits 31..15 and clears bits 14..0.
+            Operand expMask; // 0x7F800000: the 8 exponent bits of an IEEE-754 single.
+
+            if (scalar)
+            { // Scalar form: constants come from X86GetScalar (presumably only element 0 is populated; helper not visible here).
+                roundMask = X86GetScalar(context, 0x4000);
+                truncMask = X86GetScalar(context, unchecked((int)0xFFFF8000));
+                expMask = X86GetScalar(context, 0x7F800000);
+            }
+            else
+            { // Vector form: constants come from X86GetAllElements (presumably the same value in every element; helper not visible here).
+                roundMask = X86GetAllElements(context, 0x4000);
+                truncMask = X86GetAllElements(context, unchecked((int)0xFFFF8000));
+                expMask = X86GetAllElements(context, 0x7F800000);
+            }
+
+            Operand oValue = value; // Keep the unmodified input for the final blend.
+            Operand masked = context.AddIntrinsic(Intrinsic.X86Pand, value, expMask); // Isolate the exponent bits.
+            Operand isNaNInf = context.AddIntrinsic(Intrinsic.X86Pcmpeqw, masked, expMask); // 16-bit compare: the upper word of an element matches only when every exponent bit is set (Inf/NaN).
+
+            value = context.AddIntrinsic(Intrinsic.X86Paddw, value, roundMask); // 16-bit add: a carry out of the low word does not reach the upper word. NOTE(review): confirm this is the intended rounding.
+            value = context.AddIntrinsic(Intrinsic.X86Pand, value, truncMask); // Drop the bits below bit 15.
+
+            return context.AddIntrinsic(Intrinsic.X86Blendvps, value, oValue, isNaNInf); // Assuming operands follow BLENDVPS order: take oValue where the mask's top bit is set, otherwise the rounded value.
+        }
+
         public static void EmitSse2VectorIsNaNOpF(
             ArmEmitterContext context,
             Operand opF,
diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs
index 75a801e5f8..fd69077b02 100644
--- a/ARMeilleure/Translation/PTC/Ptc.cs
+++ b/ARMeilleure/Translation/PTC/Ptc.cs
@@ -22,7 +22,7 @@ namespace ARMeilleure.Translation.PTC
     {
         private const string HeaderMagic = "PTChd";
 
-        private const int InternalVersion = 1956; //! To be incremented manually for each change to the ARMeilleure project.
+        private const int InternalVersion = 1943; //! To be incremented manually for each change to the ARMeilleure project.
 
         private const string ActualDir = "0";
         private const string BackupDir = "1";