From 464ec7ced8bd8dc9ea8e4021cf602e6caedfffcf Mon Sep 17 00:00:00 2001
From: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>
Date: Mon, 25 Mar 2019 00:23:27 +0100
Subject: [PATCH] Add Cmeq_V, Cmge_V, Cmgt_V, Cmle_V & Cmlt_V (Z & ~Z) Sse
 opt.. (#646)

* Follow-up (Neg_V).

* Follow-up (Not_V & Orn_V).

* Add Cmeq/ge/gt/le/lt_V (Z & ~Z) Sse optimizations.

* Add EmitLd/Stvectmp2/3.

* Remove Dup (EmitVectorPairwiseSseOrSse2OpF).

* Remove Dup (EmitFcmpOrFcmpe).

* Add S/Uabd/l_V Sse optimizations. Remove Dup (Srhadd_V).

* Nit.
---
 .../Instructions/InstEmitSimdArithmetic.cs    | 279 ++++++++++++++++--
 ChocolArm64/Instructions/InstEmitSimdCmp.cs   | 165 +++++++++--
 .../Instructions/InstEmitSimdHelper.cs        |   2 +-
 .../Instructions/InstEmitSimdLogical.cs       |  12 +-
 4 files changed, 403 insertions(+), 55 deletions(-)

diff --git a/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs b/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs
index 5ceea77491..d2d87beffe 100644
--- a/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs
+++ b/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs
@@ -1863,13 +1863,7 @@ namespace ChocolArm64.Instructions
 
                 Type[] typesSub = new Type[] { VectorIntTypesPerSizeLog2[op.Size], VectorIntTypesPerSizeLog2[op.Size] };
 
-                string[] namesSzv = new string[] { nameof(VectorHelper.VectorSByteZero),
-                                                   nameof(VectorHelper.VectorInt16Zero),
-                                                   nameof(VectorHelper.VectorInt32Zero),
-                                                   nameof(VectorHelper.VectorInt64Zero) };
-
-                VectorHelper.EmitCall(context, namesSzv[op.Size]);
-
+                VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
                 context.EmitLdvec(op.Rn);
 
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesSub));
@@ -1921,20 +1915,125 @@ namespace ChocolArm64.Instructions
 
         public static void Sabd_V(ILEmitterCtx context)
         {
-            EmitVectorBinaryOpSx(context, () =>
+            if (Optimizations.UseSse2) // SSE2 path; the else branch keeps the previous Sub + EmitAbs emitter.
             {
-                context.Emit(OpCodes.Sub);
-                EmitAbs(context);
-            });
+                OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+                Type[] typesCmpSub = new Type[] { VectorIntTypesPerSizeLog2[op.Size], VectorIntTypesPerSizeLog2[op.Size] }; // Overload for compare/subtract, chosen by op.Size.
+                Type[] typesAndOr  = new Type[] { typeof(Vector128<long>), typeof(Vector128<long>) }; // Bitwise ops always use the Vector128<long> overloads.
+
+                context.EmitLdvec(op.Rn);
+                context.EmitLdvec(op.Rm);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.CompareGreaterThan), typesCmpSub)); // mask = (Rn > Rm) per element.
+
+                context.EmitStvectmp(); // Cmp mask
+                context.EmitLdvectmp(); // Cmp mask
+
+                context.EmitLdvec(op.Rn);
+                context.EmitLdvec(op.Rm);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesCmpSub)); // Rn - Rm.
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.And), typesAndOr)); // mask & (Rn - Rm): kept where Rn > Rm.
+
+                context.EmitLdvectmp(); // Cmp mask
+
+                context.EmitLdvec(op.Rm);
+                context.EmitLdvec(op.Rn);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesCmpSub)); // Rm - Rn.
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAndOr)); // ~mask & (Rm - Rn): kept where Rn <= Rm.
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Or), typesAndOr)); // Merge both selections into the absolute difference.
+
+                context.EmitStvec(op.Rd);
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    EmitVectorZeroUpper(context, op.Rd); // NOTE(review): presumably clears the upper half of Rd for the 64-bit form - confirm in the helper.
+                }
+            }
+            else
+            {
+                EmitVectorBinaryOpSx(context, () =>
+                {
+                    context.Emit(OpCodes.Sub);
+                    EmitAbs(context);
+                });
+            }
         }
 
         public static void Sabdl_V(ILEmitterCtx context)
         {
-            EmitVectorWidenRnRmBinaryOpSx(context, () =>
+            OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+            if (Optimizations.UseSse41 && op.Size < 2) // SSE4.1 path for Size 0 and 1 only; everything else uses the fallback below.
             {
-                context.Emit(OpCodes.Sub);
-                EmitAbs(context);
-            });
+                Type[] typesCmpSub = new Type[] { VectorIntTypesPerSizeLog2[op.Size + 1],
+                                                  VectorIntTypesPerSizeLog2[op.Size + 1] }; // Compare/subtract run on the widened (Size + 1) elements.
+                Type[] typesSrl    = new Type[] { VectorIntTypesPerSizeLog2[op.Size], typeof(byte) };
+                Type[] typesAndOr  = new Type[] { typeof(Vector128<long>), typeof(Vector128<long>) };
+                Type[] typesCvt    = new Type[] { VectorIntTypesPerSizeLog2[op.Size] }; // Source element type for the widening conversion.
+
+                string nameCvt = op.Size == 0
+                    ? nameof(Sse41.ConvertToVector128Int16)
+                    : nameof(Sse41.ConvertToVector128Int32);
+
+                context.EmitLdvec(op.Rn);
+
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl)); // Shift Rn right by 8 bytes so its upper half becomes the conversion input.
+                }
+
+                context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt)); // Widen Rn to the next element size.
+
+                context.EmitLdvec(op.Rm);
+
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl)); // Same 8-byte shift for Rm.
+                }
+
+                context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt)); // Widen Rm to the next element size.
+
+                context.EmitStvectmp2(); // Long Rm
+                context.EmitStvectmp();  // Long Rn
+
+                context.EmitLdvectmp();  // Long Rn
+                context.EmitLdvectmp2(); // Long Rm
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.CompareGreaterThan), typesCmpSub)); // mask = (Long Rn > Long Rm) per element.
+
+                context.EmitStvectmp3(); // Cmp mask
+                context.EmitLdvectmp3(); // Cmp mask
+
+                context.EmitLdvectmp();  // Long Rn
+                context.EmitLdvectmp2(); // Long Rm
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesCmpSub)); // Long Rn - Long Rm.
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.And), typesAndOr)); // mask & (Rn - Rm): kept where Rn > Rm.
+
+                context.EmitLdvectmp3(); // Cmp mask
+
+                context.EmitLdvectmp2(); // Long Rm
+                context.EmitLdvectmp();  // Long Rn
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesCmpSub)); // Long Rm - Long Rn.
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAndOr)); // ~mask & (Rm - Rn): kept where Rn <= Rm.
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Or), typesAndOr)); // Merge both selections into the absolute difference.
+
+                context.EmitStvec(op.Rd);
+            }
+            else
+            {
+                EmitVectorWidenRnRmBinaryOpSx(context, () =>
+                {
+                    context.Emit(OpCodes.Sub);
+                    EmitAbs(context);
+                });
+            }
         }
 
         public static void Sadalp_V(ILEmitterCtx context)
@@ -2430,8 +2529,8 @@ namespace ChocolArm64.Instructions
                 context.EmitLdc_I4(op.Size == 0 ? sbyte.MinValue : short.MinValue);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav));
 
-                context.Emit(OpCodes.Dup);
                 context.EmitStvectmp();
+                context.EmitLdvectmp();
 
                 context.EmitLdvec(op.Rn);
                 context.EmitLdvectmp();
@@ -2604,20 +2703,152 @@ namespace ChocolArm64.Instructions
 
         public static void Uabd_V(ILEmitterCtx context)
         {
-            EmitVectorBinaryOpZx(context, () =>
+            if (Optimizations.UseSse41) // SSE4.1 path; the else branch keeps the previous Sub + EmitAbs emitter.
             {
-                context.Emit(OpCodes.Sub);
-                EmitAbs(context);
-            });
+                OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+                Type[] typesMax    = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], VectorUIntTypesPerSizeLog2[op.Size] };
+                Type[] typesCmpSub = new Type[] { VectorIntTypesPerSizeLog2 [op.Size], VectorIntTypesPerSizeLog2 [op.Size] };
+                Type[] typesAndOr  = new Type[] { typeof(Vector128<long>), typeof(Vector128<long>) };
+                Type[] typesSav    = new Type[] { typeof(long) };
+
+                Type typeSse = op.Size == 0 ? typeof(Sse2) : typeof(Sse41); // Max is looked up on Sse2 for Size 0 and on Sse41 otherwise.
+
+                context.EmitLdvec(op.Rm);
+                context.EmitLdvec(op.Rn);
+
+                context.EmitCall(typeSse.GetMethod(nameof(Sse2.Max), typesMax)); // max(Rm, Rn) using the VectorUIntTypesPerSizeLog2 overload.
+
+                context.EmitLdvec(op.Rm);
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.CompareEqual), typesCmpSub)); // Set where max(Rm, Rn) == Rm.
+
+                context.EmitLdc_I8(-1L);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav)); // Vector with every bit set.
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAndOr)); // Invert the compare: mask set where max(Rm, Rn) != Rm, i.e. Rn > Rm.
+
+                context.EmitStvectmp(); // Cmp mask
+                context.EmitLdvectmp(); // Cmp mask
+
+                context.EmitLdvec(op.Rn);
+                context.EmitLdvec(op.Rm);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesCmpSub)); // Rn - Rm.
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.And), typesAndOr)); // mask & (Rn - Rm): kept where Rn > Rm.
+
+                context.EmitLdvectmp(); // Cmp mask
+
+                context.EmitLdvec(op.Rm);
+                context.EmitLdvec(op.Rn);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesCmpSub)); // Rm - Rn.
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAndOr)); // ~mask & (Rm - Rn): kept where Rn <= Rm.
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Or), typesAndOr)); // Merge both selections into the absolute difference.
+
+                context.EmitStvec(op.Rd);
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    EmitVectorZeroUpper(context, op.Rd); // NOTE(review): presumably clears the upper half of Rd for the 64-bit form - confirm in the helper.
+                }
+            }
+            else
+            {
+                EmitVectorBinaryOpZx(context, () =>
+                {
+                    context.Emit(OpCodes.Sub);
+                    EmitAbs(context);
+                });
+            }
         }
 
         public static void Uabdl_V(ILEmitterCtx context)
         {
-            EmitVectorWidenRnRmBinaryOpZx(context, () =>
+            OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+            if (Optimizations.UseSse41 && op.Size < 2) // SSE4.1 path for Size 0 and 1 only; everything else uses the fallback below.
             {
-                context.Emit(OpCodes.Sub);
-                EmitAbs(context);
-            });
+                Type[] typesMax    = new Type[] { VectorUIntTypesPerSizeLog2[op.Size + 1],
+                                                  VectorUIntTypesPerSizeLog2[op.Size + 1] }; // Max runs on the widened (Size + 1) elements.
+                Type[] typesCmpSub = new Type[] { VectorIntTypesPerSizeLog2 [op.Size + 1],
+                                                  VectorIntTypesPerSizeLog2 [op.Size + 1] }; // Compare/subtract also run on the widened elements.
+                Type[] typesSrl    = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], typeof(byte) };
+                Type[] typesAndOr  = new Type[] { typeof(Vector128<long>), typeof(Vector128<long>) };
+                Type[] typesCvt    = new Type[] { VectorUIntTypesPerSizeLog2[op.Size] }; // Source element type for the widening conversion.
+                Type[] typesSav    = new Type[] { typeof(long) };
+
+                string nameCvt = op.Size == 0
+                    ? nameof(Sse41.ConvertToVector128Int16)
+                    : nameof(Sse41.ConvertToVector128Int32);
+
+                context.EmitLdvec(op.Rn);
+
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl)); // Shift Rn right by 8 bytes so its upper half becomes the conversion input.
+                }
+
+                context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt)); // Widen Rn to the next element size.
+
+                context.EmitLdvec(op.Rm);
+
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl)); // Same 8-byte shift for Rm.
+                }
+
+                context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt)); // Widen Rm to the next element size.
+
+                context.EmitStvectmp2(); // Long Rm
+                context.EmitStvectmp();  // Long Rn
+
+                context.EmitLdvectmp2(); // Long Rm
+                context.EmitLdvectmp();  // Long Rn
+
+                context.EmitCall(typeof(Sse41).GetMethod(nameof(Sse41.Max), typesMax)); // max(Long Rm, Long Rn) using the VectorUIntTypesPerSizeLog2 overload.
+
+                context.EmitLdvectmp2(); // Long Rm
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.CompareEqual), typesCmpSub)); // Set where max(Rm, Rn) == Rm.
+
+                context.EmitLdc_I8(-1L);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav)); // Vector with every bit set.
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAndOr)); // Invert the compare: mask set where max(Rm, Rn) != Rm, i.e. Rn > Rm.
+
+                context.EmitStvectmp3(); // Cmp mask
+                context.EmitLdvectmp3(); // Cmp mask
+
+                context.EmitLdvectmp();  // Long Rn
+                context.EmitLdvectmp2(); // Long Rm
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesCmpSub)); // Long Rn - Long Rm.
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.And), typesAndOr)); // mask & (Rn - Rm): kept where Rn > Rm.
+
+                context.EmitLdvectmp3(); // Cmp mask
+
+                context.EmitLdvectmp2(); // Long Rm
+                context.EmitLdvectmp();  // Long Rn
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesCmpSub)); // Long Rm - Long Rn.
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAndOr)); // ~mask & (Rm - Rn): kept where Rn <= Rm.
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Or), typesAndOr)); // Merge both selections into the absolute difference.
+
+                context.EmitStvec(op.Rd);
+            }
+            else
+            {
+                EmitVectorWidenRnRmBinaryOpZx(context, () =>
+                {
+                    context.Emit(OpCodes.Sub);
+                    EmitAbs(context);
+                });
+            }
         }
 
         public static void Uadalp_V(ILEmitterCtx context)
diff --git a/ChocolArm64/Instructions/InstEmitSimdCmp.cs b/ChocolArm64/Instructions/InstEmitSimdCmp.cs
index 62cf772091..e6b33f797b 100644
--- a/ChocolArm64/Instructions/InstEmitSimdCmp.cs
+++ b/ChocolArm64/Instructions/InstEmitSimdCmp.cs
@@ -20,19 +20,32 @@ namespace ChocolArm64.Instructions
 
         public static void Cmeq_V(ILEmitterCtx context)
         {
-            if (context.CurrOp is OpCodeSimdReg64 op)
+            if (Optimizations.UseSse41)
             {
-                if (op.Size < 3 && Optimizations.UseSse2)
+                OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+                Type[] typesCmp = new Type[] { VectorIntTypesPerSizeLog2[op.Size], VectorIntTypesPerSizeLog2[op.Size] };
+
+                Type typeSse = op.Size != 3 ? typeof(Sse2) : typeof(Sse41);
+
+                context.EmitLdvec(op.Rn);
+
+                if (op is OpCodeSimdReg64 binOp)
                 {
-                    EmitSse2Op(context, nameof(Sse2.CompareEqual));
-                }
-                else if (op.Size == 3 && Optimizations.UseSse41)
-                {
-                    EmitSse41Op(context, nameof(Sse41.CompareEqual));
+                    context.EmitLdvec(binOp.Rm);
                 }
                 else
                 {
-                    EmitCmpOp(context, OpCodes.Beq_S, scalar: false);
+                    VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
+                }
+
+                context.EmitCall(typeSse.GetMethod(nameof(Sse2.CompareEqual), typesCmp));
+
+                context.EmitStvec(op.Rd);
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    EmitVectorZeroUpper(context, op.Rd);
                 }
             }
             else
@@ -48,7 +61,45 @@ namespace ChocolArm64.Instructions
 
         public static void Cmge_V(ILEmitterCtx context)
         {
-            EmitCmpOp(context, OpCodes.Bge_S, scalar: false);
+            if (Optimizations.UseSse42) // SSE4.2 path; the else branch keeps the previous EmitCmpOp emitter.
+            {
+                OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+                Type[] typesCmp = new Type[] { VectorIntTypesPerSizeLog2[op.Size], VectorIntTypesPerSizeLog2[op.Size] };
+                Type[] typesAnt = new Type[] { typeof(Vector128<long>), typeof(Vector128<long>) };
+                Type[] typesSav = new Type[] { typeof(long) };
+
+                Type typeSse = op.Size != 3 ? typeof(Sse2) : typeof(Sse42); // The Size 3 compare is looked up on Sse42, smaller sizes on Sse2.
+
+                if (op is OpCodeSimdReg64 binOp) // Register form: the second operand is Rm.
+                {
+                    context.EmitLdvec(binOp.Rm);
+                }
+                else // Otherwise the second operand is a zero vector.
+                {
+                    VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
+                }
+
+                context.EmitLdvec(op.Rn);
+
+                context.EmitCall(typeSse.GetMethod(nameof(Sse2.CompareGreaterThan), typesCmp)); // (Rm or zero) > Rn.
+
+                context.EmitLdc_I8(-1L);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav)); // Vector with every bit set.
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAnt)); // Invert the compare, giving Rn >= (Rm or zero).
+
+                context.EmitStvec(op.Rd);
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    EmitVectorZeroUpper(context, op.Rd); // NOTE(review): presumably clears the upper half of Rd for the 64-bit form - confirm in the helper.
+                }
+            }
+            else
+            {
+                EmitCmpOp(context, OpCodes.Bge_S, scalar: false);
+            }
         }
 
         public static void Cmgt_S(ILEmitterCtx context)
@@ -58,19 +109,32 @@ namespace ChocolArm64.Instructions
 
         public static void Cmgt_V(ILEmitterCtx context)
         {
-            if (context.CurrOp is OpCodeSimdReg64 op)
+            if (Optimizations.UseSse42)
             {
-                if (op.Size < 3 && Optimizations.UseSse2)
+                OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+                Type[] typesCmp = new Type[] { VectorIntTypesPerSizeLog2[op.Size], VectorIntTypesPerSizeLog2[op.Size] };
+
+                Type typeSse = op.Size != 3 ? typeof(Sse2) : typeof(Sse42);
+
+                context.EmitLdvec(op.Rn);
+
+                if (op is OpCodeSimdReg64 binOp)
                 {
-                    EmitSse2Op(context, nameof(Sse2.CompareGreaterThan));
-                }
-                else if (op.Size == 3 && Optimizations.UseSse42)
-                {
-                    EmitSse42Op(context, nameof(Sse42.CompareGreaterThan));
+                    context.EmitLdvec(binOp.Rm);
                 }
                 else
                 {
-                    EmitCmpOp(context, OpCodes.Bgt_S, scalar: false);
+                    VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
+                }
+
+                context.EmitCall(typeSse.GetMethod(nameof(Sse2.CompareGreaterThan), typesCmp));
+
+                context.EmitStvec(op.Rd);
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    EmitVectorZeroUpper(context, op.Rd);
                 }
             }
             else
@@ -92,8 +156,8 @@ namespace ChocolArm64.Instructions
             {
                 Type[] typesMax = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], VectorUIntTypesPerSizeLog2[op.Size] };
                 Type[] typesCmp = new Type[] { VectorIntTypesPerSizeLog2 [op.Size], VectorIntTypesPerSizeLog2 [op.Size] };
-                Type[] typesAnt = new Type[] { typeof(Vector128<byte>), typeof(Vector128<byte>) };
-                Type[] typesSav = new Type[] { typeof(byte) };
+                Type[] typesAnt = new Type[] { typeof(Vector128<long>), typeof(Vector128<long>) };
+                Type[] typesSav = new Type[] { typeof(long) };
 
                 Type typeSse = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);
 
@@ -106,7 +170,7 @@ namespace ChocolArm64.Instructions
 
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.CompareEqual), typesCmp));
 
-                context.EmitLdc_I4(byte.MaxValue);
+                context.EmitLdc_I8(-1L);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav));
 
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAnt));
@@ -169,7 +233,37 @@ namespace ChocolArm64.Instructions
 
         public static void Cmle_V(ILEmitterCtx context)
         {
-            EmitCmpOp(context, OpCodes.Ble_S, scalar: false);
+            if (Optimizations.UseSse42) // SSE4.2 path; the else branch keeps the previous EmitCmpOp emitter.
+            {
+                OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+                Type[] typesCmp = new Type[] { VectorIntTypesPerSizeLog2[op.Size], VectorIntTypesPerSizeLog2[op.Size] };
+                Type[] typesAnt = new Type[] { typeof(Vector128<long>), typeof(Vector128<long>) };
+                Type[] typesSav = new Type[] { typeof(long) };
+
+                Type typeSse = op.Size != 3 ? typeof(Sse2) : typeof(Sse42); // The Size 3 compare is looked up on Sse42, smaller sizes on Sse2.
+
+                context.EmitLdvec(op.Rn);
+                VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero)); // Second operand is a zero vector.
+
+                context.EmitCall(typeSse.GetMethod(nameof(Sse2.CompareGreaterThan), typesCmp)); // Rn > zero.
+
+                context.EmitLdc_I8(-1L);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav)); // Vector with every bit set.
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAnt)); // Invert the compare, giving Rn <= zero.
+
+                context.EmitStvec(op.Rd);
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    EmitVectorZeroUpper(context, op.Rd); // NOTE(review): presumably clears the upper half of Rd for the 64-bit form - confirm in the helper.
+                }
+            }
+            else
+            {
+                EmitCmpOp(context, OpCodes.Ble_S, scalar: false);
+            }
         }
 
         public static void Cmlt_S(ILEmitterCtx context)
@@ -179,7 +273,30 @@ namespace ChocolArm64.Instructions
 
         public static void Cmlt_V(ILEmitterCtx context)
         {
-            EmitCmpOp(context, OpCodes.Blt_S, scalar: false);
+            if (Optimizations.UseSse42) // SSE4.2 path; the else branch keeps the previous EmitCmpOp emitter.
+            {
+                OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+                Type[] typesCmp = new Type[] { VectorIntTypesPerSizeLog2[op.Size], VectorIntTypesPerSizeLog2[op.Size] };
+
+                Type typeSse = op.Size != 3 ? typeof(Sse2) : typeof(Sse42); // The Size 3 compare is looked up on Sse42, smaller sizes on Sse2.
+
+                VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero)); // First operand is a zero vector.
+                context.EmitLdvec(op.Rn);
+
+                context.EmitCall(typeSse.GetMethod(nameof(Sse2.CompareGreaterThan), typesCmp)); // zero > Rn, i.e. Rn < zero; no inversion needed.
+
+                context.EmitStvec(op.Rd);
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    EmitVectorZeroUpper(context, op.Rd); // NOTE(review): presumably clears the upper half of Rd for the 64-bit form - confirm in the helper.
+                }
+            }
+            else
+            {
+                EmitCmpOp(context, OpCodes.Blt_S, scalar: false);
+            }
         }
 
         public static void Cmtst_S(ILEmitterCtx context)
@@ -390,8 +507,8 @@ namespace ChocolArm64.Instructions
                         context.EmitLdvec(op.Rm);
                     }
 
-                    context.Emit(OpCodes.Dup);
                     context.EmitStvectmp();
+                    context.EmitLdvectmp();
 
                     context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.CompareOrderedScalar), typesCmp));
                     VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
@@ -453,8 +570,8 @@ namespace ChocolArm64.Instructions
                         context.EmitLdvec(op.Rm);
                     }
 
-                    context.Emit(OpCodes.Dup);
                     context.EmitStvectmp();
+                    context.EmitLdvectmp();
 
                     context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.CompareOrderedScalar), typesCmp));
                     VectorHelper.EmitCall(context, nameof(VectorHelper.VectorDoubleZero));
diff --git a/ChocolArm64/Instructions/InstEmitSimdHelper.cs b/ChocolArm64/Instructions/InstEmitSimdHelper.cs
index 10b86a3e17..56ef1fdca4 100644
--- a/ChocolArm64/Instructions/InstEmitSimdHelper.cs
+++ b/ChocolArm64/Instructions/InstEmitSimdHelper.cs
@@ -872,8 +872,8 @@ namespace ChocolArm64.Instructions
 
                     context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.UnpackLow), types));
 
-                    context.Emit(OpCodes.Dup);
                     context.EmitStvectmp();
+                    context.EmitLdvectmp();
 
                     VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
 
diff --git a/ChocolArm64/Instructions/InstEmitSimdLogical.cs b/ChocolArm64/Instructions/InstEmitSimdLogical.cs
index bf80bada3e..a5a9227410 100644
--- a/ChocolArm64/Instructions/InstEmitSimdLogical.cs
+++ b/ChocolArm64/Instructions/InstEmitSimdLogical.cs
@@ -193,12 +193,12 @@ namespace ChocolArm64.Instructions
             {
                 OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
 
-                Type[] typesSav = new Type[] { typeof(byte) };
-                Type[] typesAnt = new Type[] { typeof(Vector128<byte>), typeof(Vector128<byte>) };
+                Type[] typesSav = new Type[] { typeof(long) };
+                Type[] typesAnt = new Type[] { typeof(Vector128<long>), typeof(Vector128<long>) };
 
                 context.EmitLdvec(op.Rn);
 
-                context.EmitLdc_I4(byte.MaxValue);
+                context.EmitLdc_I8(-1L);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav));
 
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAnt));
@@ -222,13 +222,13 @@ namespace ChocolArm64.Instructions
             {
                 OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
 
-                Type[] typesSav   = new Type[] { typeof(byte) };
-                Type[] typesAntOr = new Type[] { typeof(Vector128<byte>), typeof(Vector128<byte>) };
+                Type[] typesSav   = new Type[] { typeof(long) };
+                Type[] typesAntOr = new Type[] { typeof(Vector128<long>), typeof(Vector128<long>) };
 
                 context.EmitLdvec(op.Rn);
                 context.EmitLdvec(op.Rm);
 
-                context.EmitLdc_I4(byte.MaxValue);
+                context.EmitLdc_I8(-1L);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav));
 
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAntOr));