From 1bef70c068f8aeb6a3a518b8ca635de19122da14 Mon Sep 17 00:00:00 2001
From: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>
Date: Wed, 13 Mar 2019 09:23:52 +0100
Subject: [PATCH] Add Rshrn_V & Shrn_V Sse opt.. Add Mla_V, Mls_V & Mul_V Sse
 opt.; add Tests. (#614)

* Update CountLeadingZeros().

* Remove obsolete Tests.

* Follow-up.

* Follow-up.

* Follow-up.

* Add Mla_V, Mls_V & Mul_V Tests.

* Update PackageReferences.

* Remove EmitLd/Stvectmp2().

* Remove Dup. Nits.

* Remove EmitLd/Stvectmp2() & Dup; nits.

* Remove Tmp stuff & Dup; rework Fcvtz() as Fcvtn().

* Remove Tmp stuff, EmitLd/Stvectmp2() & Dup. Nits.

* Add (R)shrn_V Sse opt.; add "Part" & "Shift" opt..

Remove Tmp stuff; remove Dup.
Nits.

* Add Mla/Mls/Mul_V Sse opt.. Add "Part" opt..

Remove EmitLd/Stvectmp2(), remove Dup.
Nits.

* Nits.

* Nits.

* Nit.

* Add "Part" opt.. Nit.

* Nit.

* Nit.

* Add Cmhi_V & Cmhs_V Sse opt..
---
 .../Instructions/InstEmitSimdArithmetic.cs    | 407 ++++++++++++------
 ChocolArm64/Instructions/InstEmitSimdCmp.cs   | 109 +++--
 ChocolArm64/Instructions/InstEmitSimdCvt.cs   | 195 +++------
 .../Instructions/InstEmitSimdHelper.cs        |  83 ++--
 .../Instructions/InstEmitSimdLogical.cs       |  33 +-
 ChocolArm64/Instructions/InstEmitSimdShift.cs | 187 +++++---
 ChocolArm64/Translation/ILEmitterCtx.cs       |   6 +-
 Ryujinx.Common/Utilities/BitUtils.cs          |   4 +-
 Ryujinx.Tests/Cpu/CpuTestSimdArithmetic.cs    |  34 --
 Ryujinx.Tests/Cpu/CpuTestSimdExt.cs           |   7 +-
 Ryujinx.Tests/Cpu/CpuTestSimdIns.cs           |  11 +-
 Ryujinx.Tests/Cpu/CpuTestSimdReg.cs           |  64 +++
 Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs         |  47 +-
 Ryujinx.Tests/Ryujinx.Tests.csproj            |   4 +-
 14 files changed, 707 insertions(+), 484 deletions(-)

diff --git a/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs b/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs
index f7236e9a4a..5ceea77491 100644
--- a/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs
+++ b/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs
@@ -1,4 +1,5 @@
 // https://github.com/intel/ARM_NEON_2_x86_SSE/blob/master/NEON_2_SSE.h
+// https://www.agner.org/optimize/#vectorclass @ vectori128.h
 
 using ChocolArm64.Decoders;
 using ChocolArm64.State;
@@ -184,8 +185,8 @@ namespace ChocolArm64.Instructions
 
                 if (sizeF == 0)
                 {
-                    Type[] typesSsv       = new Type[] { typeof(float) };
-                    Type[] typesSubAndNot = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
+                    Type[] typesSsv    = new Type[] { typeof(float) };
+                    Type[] typesSubAnt = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
 
                     context.EmitLdc_R4(-0f);
                     context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SetScalarVector128), typesSsv));
@@ -193,8 +194,8 @@ namespace ChocolArm64.Instructions
                     context.EmitLdvec(op.Rn);
                     context.EmitLdvec(op.Rm);
 
-                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SubtractScalar), typesSubAndNot));
-                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.AndNot),         typesSubAndNot));
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SubtractScalar), typesSubAnt));
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.AndNot),         typesSubAnt));
 
                     context.EmitStvec(op.Rd);
 
@@ -202,8 +203,8 @@ namespace ChocolArm64.Instructions
                 }
                 else /* if (sizeF == 1) */
                 {
-                    Type[] typesSsv       = new Type[] { typeof(double) };
-                    Type[] typesSubAndNot = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
+                    Type[] typesSsv    = new Type[] { typeof(double) };
+                    Type[] typesSubAnt = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
 
                     context.EmitLdc_R8(-0d);
                     context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetScalarVector128), typesSsv));
@@ -211,8 +212,8 @@ namespace ChocolArm64.Instructions
                     context.EmitLdvec(op.Rn);
                     context.EmitLdvec(op.Rm);
 
-                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SubtractScalar), typesSubAndNot));
-                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot),         typesSubAndNot));
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SubtractScalar), typesSubAnt));
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot),         typesSubAnt));
 
                     context.EmitStvec(op.Rd);
 
@@ -240,8 +241,8 @@ namespace ChocolArm64.Instructions
 
                 if (sizeF == 0)
                 {
-                    Type[] typesSav       = new Type[] { typeof(float) };
-                    Type[] typesSubAndNot = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
+                    Type[] typesSav    = new Type[] { typeof(float) };
+                    Type[] typesSubAnt = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
 
                     context.EmitLdc_R4(-0f);
                     context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SetAllVector128), typesSav));
@@ -249,8 +250,8 @@ namespace ChocolArm64.Instructions
                     context.EmitLdvec(op.Rn);
                     context.EmitLdvec(op.Rm);
 
-                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Subtract), typesSubAndNot));
-                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.AndNot),   typesSubAndNot));
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Subtract), typesSubAnt));
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.AndNot),   typesSubAnt));
 
                     context.EmitStvec(op.Rd);
 
@@ -261,8 +262,8 @@ namespace ChocolArm64.Instructions
                 }
                 else /* if (sizeF == 1) */
                 {
-                    Type[] typesSav       = new Type[] { typeof(double) };
-                    Type[] typesSubAndNot = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
+                    Type[] typesSav    = new Type[] { typeof(double) };
+                    Type[] typesSubAnt = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
 
                     context.EmitLdc_R8(-0d);
                     context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav));
@@ -270,8 +271,8 @@ namespace ChocolArm64.Instructions
                     context.EmitLdvec(op.Rn);
                     context.EmitLdvec(op.Rm);
 
-                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesSubAndNot));
-                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot),   typesSubAndNot));
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesSubAnt));
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot),   typesSubAnt));
 
                     context.EmitStvec(op.Rd);
                 }
@@ -295,15 +296,15 @@ namespace ChocolArm64.Instructions
 
                 if (op.Size == 0)
                 {
-                    Type[] typesSsv    = new Type[] { typeof(float) };
-                    Type[] typesAndNot = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
+                    Type[] typesSsv = new Type[] { typeof(float) };
+                    Type[] typesAnt = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
 
                     context.EmitLdc_R4(-0f);
                     context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SetScalarVector128), typesSsv));
 
                     context.EmitLdvec(op.Rn);
 
-                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.AndNot), typesAndNot));
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.AndNot), typesAnt));
 
                     context.EmitStvec(op.Rd);
 
@@ -311,15 +312,15 @@ namespace ChocolArm64.Instructions
                 }
                 else /* if (op.Size == 1) */
                 {
-                    Type[] typesSsv    = new Type[] { typeof(double) };
-                    Type[] typesAndNot = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
+                    Type[] typesSsv = new Type[] { typeof(double) };
+                    Type[] typesAnt = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
 
                     context.EmitLdc_R8(-0d);
                     context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetScalarVector128), typesSsv));
 
                     context.EmitLdvec(op.Rn);
 
-                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAndNot));
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAnt));
 
                     context.EmitStvec(op.Rd);
 
@@ -345,15 +346,15 @@ namespace ChocolArm64.Instructions
 
                 if (sizeF == 0)
                 {
-                    Type[] typesSav    = new Type[] { typeof(float) };
-                    Type[] typesAndNot = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
+                    Type[] typesSav = new Type[] { typeof(float) };
+                    Type[] typesAnt = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
 
                     context.EmitLdc_R4(-0f);
                     context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SetAllVector128), typesSav));
 
                     context.EmitLdvec(op.Rn);
 
-                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.AndNot), typesAndNot));
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.AndNot), typesAnt));
 
                     context.EmitStvec(op.Rd);
 
@@ -364,15 +365,15 @@ namespace ChocolArm64.Instructions
                 }
                 else /* if (sizeF == 1) */
                 {
-                    Type[] typesSav    = new Type[] { typeof(double) };
-                    Type[] typesAndNot = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
+                    Type[] typesSav = new Type[] { typeof(double) };
+                    Type[] typesAnt = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
 
                     context.EmitLdc_R8(-0d);
                     context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav));
 
                     context.EmitLdvec(op.Rn);
 
-                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAndNot));
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAnt));
 
                     context.EmitStvec(op.Rd);
                 }
@@ -429,7 +430,7 @@ namespace ChocolArm64.Instructions
                     Type[] typesAddH = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
 
                     context.EmitLdvec(op.Rn);
-                    context.Emit(OpCodes.Dup);
+                    context.EmitLdvec(op.Rn);
 
                     context.EmitCall(typeof(Sse3).GetMethod(nameof(Sse3.HorizontalAdd), typesAddH));
 
@@ -442,7 +443,7 @@ namespace ChocolArm64.Instructions
                     Type[] typesAddH = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
 
                     context.EmitLdvec(op.Rn);
-                    context.Emit(OpCodes.Dup);
+                    context.EmitLdvec(op.Rn);
 
                     context.EmitCall(typeof(Sse3).GetMethod(nameof(Sse3.HorizontalAdd), typesAddH));
 
@@ -748,11 +749,13 @@ namespace ChocolArm64.Instructions
 
                     context.EmitLdvec(op.Rd);
                     context.EmitLdvec(op.Rn);
+
+                    context.EmitLdvec(op.Rm);
                     context.EmitLdvec(op.Rm);
-                    context.Emit(OpCodes.Dup);
 
                     context.EmitLdc_I4(op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6);
-                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Shuffle),  typesSfl));
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Shuffle), typesSfl));
+
                     context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Multiply), typesMulAdd));
                     context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Add),      typesMulAdd));
 
@@ -770,11 +773,13 @@ namespace ChocolArm64.Instructions
 
                     context.EmitLdvec(op.Rd);
                     context.EmitLdvec(op.Rn);
+
+                    context.EmitLdvec(op.Rm);
                     context.EmitLdvec(op.Rm);
-                    context.Emit(OpCodes.Dup);
 
                     context.EmitLdc_I4(op.Index | op.Index << 1);
-                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Shuffle),  typesSfl));
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Shuffle), typesSfl));
+
                     context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Multiply), typesMulAdd));
                     context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add),      typesMulAdd));
 
@@ -863,11 +868,13 @@ namespace ChocolArm64.Instructions
 
                     context.EmitLdvec(op.Rd);
                     context.EmitLdvec(op.Rn);
+
+                    context.EmitLdvec(op.Rm);
                     context.EmitLdvec(op.Rm);
-                    context.Emit(OpCodes.Dup);
 
                     context.EmitLdc_I4(op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6);
-                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Shuffle),  typesSfl));
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Shuffle), typesSfl));
+
                     context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Multiply), typesMulSub));
                     context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Subtract), typesMulSub));
 
@@ -885,11 +892,13 @@ namespace ChocolArm64.Instructions
 
                     context.EmitLdvec(op.Rd);
                     context.EmitLdvec(op.Rn);
+
+                    context.EmitLdvec(op.Rm);
                     context.EmitLdvec(op.Rm);
-                    context.Emit(OpCodes.Dup);
 
                     context.EmitLdc_I4(op.Index | op.Index << 1);
-                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Shuffle),  typesSfl));
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Shuffle), typesSfl));
+
                     context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Multiply), typesMulSub));
                     context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesMulSub));
 
@@ -1000,11 +1009,13 @@ namespace ChocolArm64.Instructions
                     Type[] typesMul = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
 
                     context.EmitLdvec(op.Rn);
+
+                    context.EmitLdvec(op.Rm);
                     context.EmitLdvec(op.Rm);
-                    context.Emit(OpCodes.Dup);
 
                     context.EmitLdc_I4(op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6);
-                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Shuffle),  typesSfl));
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Shuffle), typesSfl));
+
                     context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Multiply), typesMul));
 
                     context.EmitStvec(op.Rd);
@@ -1020,11 +1031,13 @@ namespace ChocolArm64.Instructions
                     Type[] typesMul = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
 
                     context.EmitLdvec(op.Rn);
+
+                    context.EmitLdvec(op.Rm);
                     context.EmitLdvec(op.Rm);
-                    context.Emit(OpCodes.Dup);
 
                     context.EmitLdc_I4(op.Index | op.Index << 1);
-                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Shuffle),  typesSfl));
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Shuffle), typesSfl));
+
                     context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Multiply), typesMul));
 
                     context.EmitStvec(op.Rd);
@@ -1772,11 +1785,18 @@ namespace ChocolArm64.Instructions
 
         public static void Mla_V(ILEmitterCtx context)
         {
-            EmitVectorTernaryOpZx(context, () =>
+            if (Optimizations.UseSse41)
             {
-                context.Emit(OpCodes.Mul);
-                context.Emit(OpCodes.Add);
-            });
+                EmitSse41Mul_AddSub(context, nameof(Sse2.Add));
+            }
+            else
+            {
+                EmitVectorTernaryOpZx(context, () =>
+                {
+                    context.Emit(OpCodes.Mul);
+                    context.Emit(OpCodes.Add);
+                });
+            }
         }
 
         public static void Mla_Ve(ILEmitterCtx context)
@@ -1790,11 +1810,18 @@ namespace ChocolArm64.Instructions
 
         public static void Mls_V(ILEmitterCtx context)
         {
-            EmitVectorTernaryOpZx(context, () =>
+            if (Optimizations.UseSse41)
             {
-                context.Emit(OpCodes.Mul);
-                context.Emit(OpCodes.Sub);
-            });
+                EmitSse41Mul_AddSub(context, nameof(Sse2.Subtract));
+            }
+            else
+            {
+                EmitVectorTernaryOpZx(context, () =>
+                {
+                    context.Emit(OpCodes.Mul);
+                    context.Emit(OpCodes.Sub);
+                });
+            }
         }
 
         public static void Mls_Ve(ILEmitterCtx context)
@@ -1808,7 +1835,14 @@ namespace ChocolArm64.Instructions
 
         public static void Mul_V(ILEmitterCtx context)
         {
-            EmitVectorBinaryOpZx(context, () => context.Emit(OpCodes.Mul));
+            if (Optimizations.UseSse41)
+            {
+                EmitSse41Mul_AddSub(context);
+            }
+            else
+            {
+                EmitVectorBinaryOpZx(context, () => context.Emit(OpCodes.Mul));
+            }
         }
 
         public static void Mul_Ve(ILEmitterCtx context)
@@ -1923,19 +1957,23 @@ namespace ChocolArm64.Instructions
                                                    nameof(Sse41.ConvertToVector128Int32),
                                                    nameof(Sse41.ConvertToVector128Int64) };
 
-                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
-
                 context.EmitLdvec(op.Rn);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
 
                 context.EmitLdvec(op.Rm);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
 
@@ -1969,13 +2007,14 @@ namespace ChocolArm64.Instructions
                                                    nameof(Sse41.ConvertToVector128Int32),
                                                    nameof(Sse41.ConvertToVector128Int64) };
 
-                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
-
                 context.EmitLdvec(op.Rn);
                 context.EmitLdvec(op.Rm);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
 
@@ -1999,25 +2038,19 @@ namespace ChocolArm64.Instructions
                 Type[] typesAndXorAdd = new Type[] { VectorIntTypesPerSizeLog2[op.Size], VectorIntTypesPerSizeLog2[op.Size] };
 
                 context.EmitLdvec(op.Rn);
-
-                context.Emit(OpCodes.Dup);
-                context.EmitStvectmp();
-
                 context.EmitLdvec(op.Rm);
 
-                context.Emit(OpCodes.Dup);
-                context.EmitStvectmp2();
-
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.And), typesAndXorAdd));
 
-                context.EmitLdvectmp();
-                context.EmitLdvectmp2();
+                context.EmitLdvec(op.Rn);
+                context.EmitLdvec(op.Rm);
 
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), typesAndXorAdd));
 
-                context.EmitLdc_I4(1);
+                context.Emit(OpCodes.Ldc_I4_1);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightArithmetic), typesSra));
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add),                  typesAndXorAdd));
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesAndXorAdd));
 
                 context.EmitStvec(op.Rd);
 
@@ -2185,20 +2218,24 @@ namespace ChocolArm64.Instructions
                     ? nameof(Sse41.ConvertToVector128Int16)
                     : nameof(Sse41.ConvertToVector128Int32);
 
-                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
-
                 context.EmitLdvec(op.Rd);
                 context.EmitLdvec(op.Rn);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt));
 
                 context.EmitLdvec(op.Rm);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt));
 
@@ -2244,20 +2281,24 @@ namespace ChocolArm64.Instructions
                     ? nameof(Sse41.ConvertToVector128Int16)
                     : nameof(Sse41.ConvertToVector128Int32);
 
-                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
-
                 context.EmitLdvec(op.Rd);
                 context.EmitLdvec(op.Rn);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt));
 
                 context.EmitLdvec(op.Rm);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt));
 
@@ -2441,19 +2482,23 @@ namespace ChocolArm64.Instructions
                                                    nameof(Sse41.ConvertToVector128Int32),
                                                    nameof(Sse41.ConvertToVector128Int64) };
 
-                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
-
                 context.EmitLdvec(op.Rn);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
 
                 context.EmitLdvec(op.Rm);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
 
@@ -2482,13 +2527,14 @@ namespace ChocolArm64.Instructions
                                                    nameof(Sse41.ConvertToVector128Int32),
                                                    nameof(Sse41.ConvertToVector128Int64) };
 
-                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
-
                 context.EmitLdvec(op.Rn);
                 context.EmitLdvec(op.Rm);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
 
@@ -2594,19 +2640,23 @@ namespace ChocolArm64.Instructions
                                                    nameof(Sse41.ConvertToVector128Int32),
                                                    nameof(Sse41.ConvertToVector128Int64) };
 
-                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
-
                 context.EmitLdvec(op.Rn);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
 
                 context.EmitLdvec(op.Rm);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
 
@@ -2659,13 +2709,14 @@ namespace ChocolArm64.Instructions
                                                    nameof(Sse41.ConvertToVector128Int32),
                                                    nameof(Sse41.ConvertToVector128Int64) };
 
-                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
-
                 context.EmitLdvec(op.Rn);
                 context.EmitLdvec(op.Rm);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
 
@@ -2689,25 +2740,19 @@ namespace ChocolArm64.Instructions
                 Type[] typesAndXorAdd = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], VectorUIntTypesPerSizeLog2[op.Size] };
 
                 context.EmitLdvec(op.Rn);
-
-                context.Emit(OpCodes.Dup);
-                context.EmitStvectmp();
-
                 context.EmitLdvec(op.Rm);
 
-                context.Emit(OpCodes.Dup);
-                context.EmitStvectmp2();
-
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.And), typesAndXorAdd));
 
-                context.EmitLdvectmp();
-                context.EmitLdvectmp2();
+                context.EmitLdvec(op.Rn);
+                context.EmitLdvec(op.Rm);
 
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), typesAndXorAdd));
 
-                context.EmitLdc_I4(1);
+                context.Emit(OpCodes.Ldc_I4_1);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesSrl));
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add),               typesAndXorAdd));
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesAndXorAdd));
 
                 context.EmitStvec(op.Rd);
 
@@ -2737,8 +2782,7 @@ namespace ChocolArm64.Instructions
                 Type[] typesAvgSub = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], VectorUIntTypesPerSizeLog2[op.Size] };
 
                 context.EmitLdvec(op.Rn);
-                context.Emit(OpCodes.Dup);
-
+                context.EmitLdvec(op.Rn);
                 context.EmitLdvec(op.Rm);
 
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Average),  typesAvgSub));
@@ -2862,20 +2906,24 @@ namespace ChocolArm64.Instructions
                     ? nameof(Sse41.ConvertToVector128Int16)
                     : nameof(Sse41.ConvertToVector128Int32);
 
-                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
-
                 context.EmitLdvec(op.Rd);
                 context.EmitLdvec(op.Rn);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt));
 
                 context.EmitLdvec(op.Rm);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt));
 
@@ -2921,20 +2969,24 @@ namespace ChocolArm64.Instructions
                     ? nameof(Sse41.ConvertToVector128Int16)
                     : nameof(Sse41.ConvertToVector128Int32);
 
-                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
-
                 context.EmitLdvec(op.Rd);
                 context.EmitLdvec(op.Rn);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt));
 
                 context.EmitLdvec(op.Rm);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt));
 
@@ -3063,19 +3115,23 @@ namespace ChocolArm64.Instructions
                                                    nameof(Sse41.ConvertToVector128Int32),
                                                    nameof(Sse41.ConvertToVector128Int64) };
 
-                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
-
                 context.EmitLdvec(op.Rn);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
 
                 context.EmitLdvec(op.Rm);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
 
@@ -3104,13 +3160,14 @@ namespace ChocolArm64.Instructions
                                                    nameof(Sse41.ConvertToVector128Int32),
                                                    nameof(Sse41.ConvertToVector128Int64) };
 
-                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
-
                 context.EmitLdvec(op.Rn);
                 context.EmitLdvec(op.Rm);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
 
@@ -3253,5 +3310,77 @@ namespace ChocolArm64.Instructions
                 EmitVectorZeroUpper(context, op.Rd);
             }
         }
+
+        // Emits the element-wise low multiply of Rn and Rm (op.Size 0..2), optionally folds it into Rd via the Sse2 method named nameAddSub, and stores to Rd.
+        private static void EmitSse41Mul_AddSub(ILEmitterCtx context, string nameAddSub = null)
+        {
+            OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+            // Rd is loaded first so that it ends up as the first argument of the nameAddSub call emitted below.
+            if (nameAddSub != null)
+            {
+                context.EmitLdvec(op.Rd);
+            }
+
+            if (op.Size == 0)
+            {
+                Type[] typesBle = new Type[] { typeof(Vector128<sbyte>), typeof(Vector128<sbyte>), typeof(Vector128<sbyte>) };
+                Type[] typesMul = new Type[] { typeof(Vector128<short>), typeof(Vector128<short>) };
+                Type[] typesShs = new Type[] { typeof(Vector128<short>), typeof(byte) };
+                Type[] typesSav = new Type[] { typeof(int) };
+                // Odd bytes: move them into the low byte of each 16-bit lane, multiply, then shift the product back up by 8.
+                context.EmitLdvec(op.Rn);
+                context.Emit(OpCodes.Ldc_I4_8);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesShs));
+
+                context.EmitLdvec(op.Rm);
+                context.Emit(OpCodes.Ldc_I4_8);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesShs));
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.MultiplyLow), typesMul));
+                context.Emit(OpCodes.Ldc_I4_8);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesShs));
+                // Even bytes: the low byte of each 16-bit Rn * Rm product depends only on the low bytes of the operands.
+                context.EmitLdvec(op.Rn);
+                context.EmitLdvec(op.Rm);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.MultiplyLow), typesMul));
+                // 0x00FF00FF broadcast as the blend mask: on little-endian x86 the 0xFF bytes are the even-indexed ones.
+                context.EmitLdc_I4(0x00FF00FF);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav));
+                // BlendVariable takes the second operand (the even-byte products) wherever the mask byte has its top bit set.
+                context.EmitCall(typeof(Sse41).GetMethod(nameof(Sse41.BlendVariable), typesBle));
+            }
+            else if (op.Size == 1)
+            {
+                Type[] typesMul = new Type[] { typeof(Vector128<short>), typeof(Vector128<short>) };
+
+                context.EmitLdvec(op.Rn);
+                context.EmitLdvec(op.Rm);
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.MultiplyLow), typesMul));
+            }
+            else /* if (op.Size == 2) */
+            {
+                Type[] typesMul = new Type[] { typeof(Vector128<int>), typeof(Vector128<int>) };
+
+                context.EmitLdvec(op.Rn);
+                context.EmitLdvec(op.Rm);
+
+                context.EmitCall(typeof(Sse41).GetMethod(nameof(Sse41.MultiplyLow), typesMul));
+            }
+
+            if (nameAddSub != null)
+            {
+                Type[] typesAddSub = new Type[] { VectorIntTypesPerSizeLog2[op.Size], VectorIntTypesPerSizeLog2[op.Size] };
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameAddSub, typesAddSub));
+            }
+
+            context.EmitStvec(op.Rd);
+
+            if (op.RegisterSize == RegisterSize.Simd64)
+            {
+                EmitVectorZeroUpper(context, op.Rd);
+            }
+        }
     }
 }
diff --git a/ChocolArm64/Instructions/InstEmitSimdCmp.cs b/ChocolArm64/Instructions/InstEmitSimdCmp.cs
index c29dcd9dc5..62cf772091 100644
--- a/ChocolArm64/Instructions/InstEmitSimdCmp.cs
+++ b/ChocolArm64/Instructions/InstEmitSimdCmp.cs
@@ -86,7 +86,42 @@ namespace ChocolArm64.Instructions
 
         public static void Cmhi_V(ILEmitterCtx context)
         {
-            EmitCmpOp(context, OpCodes.Bgt_Un_S, scalar: false);
+            OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+            // SSE path (sizes 0..2): max(Rm, Rn) == Rm marks the lanes where Rn <= Rm (unsigned); inverting that mask gives Rn > Rm.
+            if (Optimizations.UseSse41 && op.Size < 3)
+            {
+                Type[] typesMax = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], VectorUIntTypesPerSizeLog2[op.Size] };
+                Type[] typesCmp = new Type[] { VectorIntTypesPerSizeLog2 [op.Size], VectorIntTypesPerSizeLog2 [op.Size] };
+                Type[] typesAnt = new Type[] { typeof(Vector128<byte>), typeof(Vector128<byte>) };
+                Type[] typesSav = new Type[] { typeof(byte) };
+                // The Max overload is resolved on Sse2 for op.Size == 0 and on Sse41 for the larger sizes.
+                Type typeSse = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);
+
+                context.EmitLdvec(op.Rm);
+                context.EmitLdvec(op.Rn);
+
+                context.EmitCall(typeSse.GetMethod(nameof(Sse2.Max), typesMax));
+
+                context.EmitLdvec(op.Rm);
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.CompareEqual), typesCmp));
+                // AndNot(mask, all-ones) yields ~mask, i.e. the inverted comparison result.
+                context.EmitLdc_I4(byte.MaxValue);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav));
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAnt));
+
+                context.EmitStvec(op.Rd);
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    EmitVectorZeroUpper(context, op.Rd);
+                }
+            }
+            else
+            {
+                EmitCmpOp(context, OpCodes.Bgt_Un_S, scalar: false);
+            }
         }
 
         public static void Cmhs_S(ILEmitterCtx context)
@@ -96,7 +131,35 @@ namespace ChocolArm64.Instructions
 
         public static void Cmhs_V(ILEmitterCtx context)
         {
-            EmitCmpOp(context, OpCodes.Bge_Un_S, scalar: false);
+            OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+            // SSE path (sizes 0..2): max(Rn, Rm) == Rn holds exactly in the lanes where Rn >= Rm (unsigned).
+            if (Optimizations.UseSse41 && op.Size < 3)
+            {
+                Type[] typesMax = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], VectorUIntTypesPerSizeLog2[op.Size] };
+                Type[] typesCmp = new Type[] { VectorIntTypesPerSizeLog2 [op.Size], VectorIntTypesPerSizeLog2 [op.Size] };
+                // The Max overload is resolved on Sse2 for op.Size == 0 and on Sse41 for the larger sizes.
+                Type typeSse = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);
+
+                context.EmitLdvec(op.Rn);
+                context.EmitLdvec(op.Rm);
+
+                context.EmitCall(typeSse.GetMethod(nameof(Sse2.Max), typesMax));
+
+                context.EmitLdvec(op.Rn);
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.CompareEqual), typesCmp));
+
+                context.EmitStvec(op.Rd);
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    EmitVectorZeroUpper(context, op.Rd);
+                }
+            }
+            else
+            {
+                EmitCmpOp(context, OpCodes.Bge_Un_S, scalar: false);
+            }
         }
 
         public static void Cmle_S(ILEmitterCtx context)
@@ -318,9 +381,6 @@ namespace ChocolArm64.Instructions
 
                     context.EmitLdvec(op.Rn);
 
-                    context.Emit(OpCodes.Dup);
-                    context.EmitStvectmp();
-
                     if (cmpWithZero)
                     {
                         VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
@@ -331,7 +391,7 @@ namespace ChocolArm64.Instructions
                     }
 
                     context.Emit(OpCodes.Dup);
-                    context.EmitStvectmp2();
+                    context.EmitStvectmp();
 
                     context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.CompareOrderedScalar), typesCmp));
                     VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
@@ -340,18 +400,18 @@ namespace ChocolArm64.Instructions
 
                     context.Emit(OpCodes.Brtrue_S, lblNaN);
 
-                    context.EmitLdc_I4(0);
+                    context.Emit(OpCodes.Ldc_I4_0);
 
+                    context.EmitLdvec(op.Rn);
                     context.EmitLdvectmp();
-                    context.EmitLdvectmp2();
                     context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.CompareGreaterThanOrEqualOrderedScalar), typesCmp));
 
+                    context.EmitLdvec(op.Rn);
                     context.EmitLdvectmp();
-                    context.EmitLdvectmp2();
                     context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.CompareEqualOrderedScalar), typesCmp));
 
+                    context.EmitLdvec(op.Rn);
                     context.EmitLdvectmp();
-                    context.EmitLdvectmp2();
                     context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.CompareLessThanOrderedScalar), typesCmp));
 
                     context.EmitStflg((int)PState.NBit);
@@ -363,10 +423,10 @@ namespace ChocolArm64.Instructions
 
                     context.MarkLabel(lblNaN);
 
-                    context.EmitLdc_I4(1);
-                    context.Emit(OpCodes.Dup);
-                    context.EmitLdc_I4(0);
-                    context.Emit(OpCodes.Dup);
+                    context.Emit(OpCodes.Ldc_I4_1);
+                    context.Emit(OpCodes.Ldc_I4_1);
+                    context.Emit(OpCodes.Ldc_I4_0);
+                    context.Emit(OpCodes.Ldc_I4_0);
 
                     context.EmitStflg((int)PState.NBit);
                     context.EmitStflg((int)PState.ZBit);
@@ -384,9 +444,6 @@ namespace ChocolArm64.Instructions
 
                     context.EmitLdvec(op.Rn);
 
-                    context.Emit(OpCodes.Dup);
-                    context.EmitStvectmp();
-
                     if (cmpWithZero)
                     {
                         VectorHelper.EmitCall(context, nameof(VectorHelper.VectorDoubleZero));
@@ -397,7 +454,7 @@ namespace ChocolArm64.Instructions
                     }
 
                     context.Emit(OpCodes.Dup);
-                    context.EmitStvectmp2();
+                    context.EmitStvectmp();
 
                     context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.CompareOrderedScalar), typesCmp));
                     VectorHelper.EmitCall(context, nameof(VectorHelper.VectorDoubleZero));
@@ -406,18 +463,18 @@ namespace ChocolArm64.Instructions
 
                     context.Emit(OpCodes.Brtrue_S, lblNaN);
 
-                    context.EmitLdc_I4(0);
+                    context.Emit(OpCodes.Ldc_I4_0);
 
+                    context.EmitLdvec(op.Rn);
                     context.EmitLdvectmp();
-                    context.EmitLdvectmp2();
                     context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.CompareGreaterThanOrEqualOrderedScalar), typesCmp));
 
+                    context.EmitLdvec(op.Rn);
                     context.EmitLdvectmp();
-                    context.EmitLdvectmp2();
                     context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.CompareEqualOrderedScalar), typesCmp));
 
+                    context.EmitLdvec(op.Rn);
                     context.EmitLdvectmp();
-                    context.EmitLdvectmp2();
                     context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.CompareLessThanOrderedScalar), typesCmp));
 
                     context.EmitStflg((int)PState.NBit);
@@ -429,10 +486,10 @@ namespace ChocolArm64.Instructions
 
                     context.MarkLabel(lblNaN);
 
-                    context.EmitLdc_I4(1);
-                    context.Emit(OpCodes.Dup);
-                    context.EmitLdc_I4(0);
-                    context.Emit(OpCodes.Dup);
+                    context.Emit(OpCodes.Ldc_I4_1);
+                    context.Emit(OpCodes.Ldc_I4_1);
+                    context.Emit(OpCodes.Ldc_I4_0);
+                    context.Emit(OpCodes.Ldc_I4_0);
 
                     context.EmitStflg((int)PState.NBit);
                     context.EmitStflg((int)PState.ZBit);
diff --git a/ChocolArm64/Instructions/InstEmitSimdCvt.cs b/ChocolArm64/Instructions/InstEmitSimdCvt.cs
index 78a86a33eb..c5c61bcca5 100644
--- a/ChocolArm64/Instructions/InstEmitSimdCvt.cs
+++ b/ChocolArm64/Instructions/InstEmitSimdCvt.cs
@@ -21,26 +21,24 @@ namespace ChocolArm64.Instructions
                 if (op.Size == 1 && op.Opc == 0)
                 {
                     //Double -> Single.
-                    VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
+                    Type[] typesCvt = new Type[] { typeof(Vector128<float>), typeof(Vector128<double>) };
 
+                    VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
                     context.EmitLdvec(op.Rn);
 
-                    Type[] types = new Type[] { typeof(Vector128<float>), typeof(Vector128<double>) };
-
-                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertScalarToVector128Single), types));
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertScalarToVector128Single), typesCvt));
 
                     context.EmitStvec(op.Rd);
                 }
                 else if (op.Size == 0 && op.Opc == 1)
                 {
                     //Single -> Double.
-                    VectorHelper.EmitCall(context, nameof(VectorHelper.VectorDoubleZero));
+                    Type[] typesCvt = new Type[] { typeof(Vector128<double>), typeof(Vector128<float>) };
 
+                    VectorHelper.EmitCall(context, nameof(VectorHelper.VectorDoubleZero));
                     context.EmitLdvec(op.Rn);
 
-                    Type[] types = new Type[] { typeof(Vector128<double>), typeof(Vector128<float>) };
-
-                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertScalarToVector128Double), types));
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertScalarToVector128Double), typesCvt));
 
                     context.EmitStvec(op.Rd);
                 }
@@ -80,14 +78,14 @@ namespace ChocolArm64.Instructions
             {
                 Type[] typesCvt = new Type[] { typeof(Vector128<float>) };
 
-                string nameMov = op.RegisterSize == RegisterSize.Simd128
-                    ? nameof(Sse.MoveHighToLow)
-                    : nameof(Sse.MoveLowToHigh);
-
                 context.EmitLdvec(op.Rn);
-                context.Emit(OpCodes.Dup);
 
-                context.EmitCall(typeof(Sse).GetMethod(nameMov));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.EmitLdvec(op.Rn);
+
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveHighToLow)));
+                }
 
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertToVector128Double), typesCvt));
 
@@ -249,12 +247,12 @@ namespace ChocolArm64.Instructions
 
         public static void Fcvtzs_S(ILEmitterCtx context)
         {
-            EmitScalarFcvtzs(context);
+            EmitFcvtz(context, signed: true, scalar: true); // Signed saturating helpers (SatF32ToS32/SatF64ToS64), element 0 only.
         }
 
         public static void Fcvtzs_V(ILEmitterCtx context)
         {
-            EmitVectorFcvtzs(context);
+            EmitFcvtz(context, signed: true, scalar: false); // Signed saturating helpers (SatF32ToS32/SatF64ToS64), one call per element.
         }
 
         public static void Fcvtzu_Gp(ILEmitterCtx context)
@@ -269,12 +267,12 @@ namespace ChocolArm64.Instructions
 
         public static void Fcvtzu_S(ILEmitterCtx context)
         {
-            EmitScalarFcvtzu(context);
+            EmitFcvtz(context, signed: false, scalar: true); // Unsigned saturating helpers (SatF32ToU32/SatF64ToU64), element 0 only.
         }
 
         public static void Fcvtzu_V(ILEmitterCtx context)
         {
-            EmitVectorFcvtzu(context);
+            EmitFcvtz(context, signed: false, scalar: false); // Unsigned saturating helpers (SatF32ToU32/SatF64ToU64), one call per element.
         }
 
         public static void Scvtf_Gp(ILEmitterCtx context)
@@ -415,11 +413,6 @@ namespace ChocolArm64.Instructions
             int bytes = op.GetBitsCount() >> 3;
             int elems = !scalar ? bytes >> sizeI : 1;
 
-            if (scalar && (sizeF == 0))
-            {
-                EmitVectorZeroLowerTmp(context);
-            }
-
             for (int index = 0; index < elems; index++)
             {
                 EmitVectorExtractF(context, op.Rn, index, sizeF);
@@ -441,13 +434,62 @@ namespace ChocolArm64.Instructions
                         : nameof(VectorHelper.SatF64ToU64));
                 }
 
-                EmitVectorInsertTmp(context, index, sizeI);
+                if (scalar)
+                {
+                    EmitVectorZeroAll(context, op.Rd);
+                }
+
+                EmitVectorInsert(context, op.Rd, index, sizeI);
             }
 
-            context.EmitLdvectmp();
-            context.EmitStvec(op.Rd);
+            if (op.RegisterSize == RegisterSize.Simd64)
+            {
+                EmitVectorZeroUpper(context, op.Rd);
+            }
+        }
 
-            if ((op.RegisterSize == RegisterSize.Simd64) || scalar)
+        private static void EmitFcvtz(ILEmitterCtx context, bool signed, bool scalar)
+        {
+            OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+            int sizeF = op.Size & 1;
+            int sizeI = sizeF + 2;
+
+            int fBits = GetFBits(context);
+
+            int bytes = op.GetBitsCount() >> 3;
+            int elems = !scalar ? bytes >> sizeI : 1;
+
+            for (int index = 0; index < elems; index++)
+            {
+                EmitVectorExtractF(context, op.Rn, index, sizeF);
+
+                EmitF2iFBitsMul(context, sizeF, fBits);
+
+                if (sizeF == 0)
+                {
+                    VectorHelper.EmitCall(context, signed
+                        ? nameof(VectorHelper.SatF32ToS32)
+                        : nameof(VectorHelper.SatF32ToU32));
+
+                    context.Emit(OpCodes.Conv_U8);
+                }
+                else /* if (sizeF == 1) */
+                {
+                    VectorHelper.EmitCall(context, signed
+                        ? nameof(VectorHelper.SatF64ToS64)
+                        : nameof(VectorHelper.SatF64ToU64));
+                }
+
+                if (scalar)
+                {
+                    EmitVectorZeroAll(context, op.Rd);
+                }
+
+                EmitVectorInsert(context, op.Rd, index, sizeI);
+            }
+
+            if (op.RegisterSize == RegisterSize.Simd64)
             {
                 EmitVectorZeroUpper(context, op.Rd);
             }
@@ -555,105 +597,6 @@ namespace ChocolArm64.Instructions
             }
         }
 
-        private static void EmitScalarFcvtzs(ILEmitterCtx context)
-        {
-            EmitScalarFcvtz(context, true);
-        }
-
-        private static void EmitScalarFcvtzu(ILEmitterCtx context)
-        {
-            EmitScalarFcvtz(context, false);
-        }
-
-        private static void EmitScalarFcvtz(ILEmitterCtx context, bool signed)
-        {
-            OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
-
-            int sizeF = op.Size & 1;
-            int sizeI = sizeF + 2;
-
-            int fBits = GetFBits(context);
-
-            EmitVectorExtractF(context, op.Rn, 0, sizeF);
-
-            EmitF2iFBitsMul(context, sizeF, fBits);
-
-            if (sizeF == 0)
-            {
-                VectorHelper.EmitCall(context, signed
-                    ? nameof(VectorHelper.SatF32ToS32)
-                    : nameof(VectorHelper.SatF32ToU32));
-            }
-            else /* if (sizeF == 1) */
-            {
-                VectorHelper.EmitCall(context, signed
-                    ? nameof(VectorHelper.SatF64ToS64)
-                    : nameof(VectorHelper.SatF64ToU64));
-            }
-
-            if (sizeF == 0)
-            {
-                context.Emit(OpCodes.Conv_U8);
-            }
-
-            EmitScalarSet(context, op.Rd, sizeI);
-        }
-
-        private static void EmitVectorFcvtzs(ILEmitterCtx context)
-        {
-            EmitVectorFcvtz(context, true);
-        }
-
-        private static void EmitVectorFcvtzu(ILEmitterCtx context)
-        {
-            EmitVectorFcvtz(context, false);
-        }
-
-        private static void EmitVectorFcvtz(ILEmitterCtx context, bool signed)
-        {
-            OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
-
-            int sizeF = op.Size & 1;
-            int sizeI = sizeF + 2;
-
-            int fBits = GetFBits(context);
-
-            int bytes = op.GetBitsCount() >> 3;
-            int elems = bytes >> sizeI;
-
-            for (int index = 0; index < elems; index++)
-            {
-                EmitVectorExtractF(context, op.Rn, index, sizeF);
-
-                EmitF2iFBitsMul(context, sizeF, fBits);
-
-                if (sizeF == 0)
-                {
-                    VectorHelper.EmitCall(context, signed
-                        ? nameof(VectorHelper.SatF32ToS32)
-                        : nameof(VectorHelper.SatF32ToU32));
-                }
-                else /* if (sizeF == 1) */
-                {
-                    VectorHelper.EmitCall(context, signed
-                        ? nameof(VectorHelper.SatF64ToS64)
-                        : nameof(VectorHelper.SatF64ToU64));
-                }
-
-                if (sizeF == 0)
-                {
-                    context.Emit(OpCodes.Conv_U8);
-                }
-
-                EmitVectorInsert(context, op.Rd, index, sizeI);
-            }
-
-            if (op.RegisterSize == RegisterSize.Simd64)
-            {
-                EmitVectorZeroUpper(context, op.Rd);
-            }
-        }
-
         private static int GetFBits(ILEmitterCtx context)
         {
             if (context.CurrOp is OpCodeSimdShImm64 op)
diff --git a/ChocolArm64/Instructions/InstEmitSimdHelper.cs b/ChocolArm64/Instructions/InstEmitSimdHelper.cs
index b7dd09b4bb..10b86a3e17 100644
--- a/ChocolArm64/Instructions/InstEmitSimdHelper.cs
+++ b/ChocolArm64/Instructions/InstEmitSimdHelper.cs
@@ -592,12 +592,9 @@ namespace ChocolArm64.Instructions
 
                 emit();
 
-                EmitVectorInsertTmp(context, index, op.Size);
+                EmitVectorInsert(context, op.Rd, index, op.Size);
             }
 
-            context.EmitLdvectmp();
-            context.EmitStvec(op.Rd);
-
             if (op.RegisterSize == RegisterSize.Simd64)
             {
                 EmitVectorZeroUpper(context, op.Rd);
@@ -898,20 +895,13 @@ namespace ChocolArm64.Instructions
                     Type[] types    = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
 
                     context.EmitLdvec(op.Rn);
-
-                    context.Emit(OpCodes.Dup);
-                    context.EmitStvectmp();
-
                     context.EmitLdvec(op.Rm);
 
-                    context.Emit(OpCodes.Dup);
-                    context.EmitStvectmp2();
-
                     context.EmitLdc_I4(2 << 6 | 0 << 4 | 2 << 2 | 0 << 0);
                     context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Shuffle), typesSfl));
 
-                    context.EmitLdvectmp();
-                    context.EmitLdvectmp2();
+                    context.EmitLdvec(op.Rn);
+                    context.EmitLdvec(op.Rm);
 
                     context.EmitLdc_I4(3 << 6 | 1 << 4 | 3 << 2 | 1 << 0);
                     context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Shuffle), typesSfl));
@@ -926,19 +916,12 @@ namespace ChocolArm64.Instructions
                 Type[] types = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
 
                 context.EmitLdvec(op.Rn);
-
-                context.Emit(OpCodes.Dup);
-                context.EmitStvectmp();
-
                 context.EmitLdvec(op.Rm);
 
-                context.Emit(OpCodes.Dup);
-                context.EmitStvectmp2();
-
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.UnpackLow), types));
 
-                context.EmitLdvectmp();
-                context.EmitLdvectmp2();
+                context.EmitLdvec(op.Rn);
+                context.EmitLdvec(op.Rm);
 
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.UnpackHigh), types));
 
@@ -985,11 +968,6 @@ namespace ChocolArm64.Instructions
             int bytes = op.GetBitsCount() >> 3;
             int elems = !scalar ? bytes >> op.Size : 1;
 
-            if (scalar)
-            {
-                EmitVectorZeroLowerTmp(context);
-            }
-
             for (int index = 0; index < elems; index++)
             {
                 EmitVectorExtractSx(context, op.Rn, index, op.Size);
@@ -1005,13 +983,15 @@ namespace ChocolArm64.Instructions
                     EmitUnarySignedSatQAbsOrNeg(context);
                 }
 
-                EmitVectorInsertTmp(context, index, op.Size);
+                if (scalar)
+                {
+                    EmitVectorZeroAll(context, op.Rd);
+                }
+
+                EmitVectorInsert(context, op.Rd, index, op.Size);
             }
 
-            context.EmitLdvectmp();
-            context.EmitStvec(op.Rd);
-
-            if ((op.RegisterSize == RegisterSize.Simd64) || scalar)
+            if (op.RegisterSize == RegisterSize.Simd64)
             {
                 EmitVectorZeroUpper(context, op.Rd);
             }
@@ -1052,11 +1032,6 @@ namespace ChocolArm64.Instructions
             int bytes = op.GetBitsCount() >> 3;
             int elems = !scalar ? bytes >> op.Size : 1;
 
-            if (scalar)
-            {
-                EmitVectorZeroLowerTmp(context);
-            }
-
             if (add || sub)
             {
                 for (int index = 0; index < elems; index++)
@@ -1082,7 +1057,12 @@ namespace ChocolArm64.Instructions
                         }
                     }
 
-                    EmitVectorInsertTmp(context, index, op.Size);
+                    if (scalar)
+                    {
+                        EmitVectorZeroAll(context, op.Rd);
+                    }
+
+                    EmitVectorInsert(context, op.Rd, index, op.Size);
                 }
             }
             else if (accumulate)
@@ -1103,7 +1083,12 @@ namespace ChocolArm64.Instructions
                         EmitBinarySatQAccumulate(context, signed);
                     }
 
-                    EmitVectorInsertTmp(context, index, op.Size);
+                    if (scalar)
+                    {
+                        EmitVectorZeroAll(context, op.Rd);
+                    }
+
+                    EmitVectorInsert(context, op.Rd, index, op.Size);
                 }
             }
             else
@@ -1117,14 +1102,16 @@ namespace ChocolArm64.Instructions
 
                     EmitSatQ(context, op.Size, true, signed);
 
-                    EmitVectorInsertTmp(context, index, op.Size);
+                    if (scalar)
+                    {
+                        EmitVectorZeroAll(context, op.Rd);
+                    }
+
+                    EmitVectorInsert(context, op.Rd, index, op.Size);
                 }
             }
 
-            context.EmitLdvectmp();
-            context.EmitStvec(op.Rd);
-
-            if ((op.RegisterSize == RegisterSize.Simd64) || scalar)
+            if (op.RegisterSize == RegisterSize.Simd64)
             {
                 EmitVectorZeroUpper(context, op.Rd);
             }
@@ -1190,7 +1177,7 @@ namespace ChocolArm64.Instructions
         // TSrc (16bit, 32bit, 64bit; signed, unsigned) > TDst (8bit, 16bit, 32bit; signed, unsigned).
         public static void EmitSatQ(ILEmitterCtx context, int sizeDst, bool signedSrc, bool signedDst)
         {
-            if ((uint)sizeDst > 2)
+            if ((uint)sizeDst > 2u)
             {
                 throw new ArgumentOutOfRangeException(nameof(sizeDst));
             }
@@ -1381,15 +1368,15 @@ namespace ChocolArm64.Instructions
             if (Optimizations.UseSse)
             {
                 //TODO: Use Sse2.MoveScalar once it is fixed,
-                //as of the time of writing it just crashes the JIT (SDK 2.1.503).
+                //as of the time of writing it just crashes the JIT (SDK 2.1.504).
 
                 /*Type[] typesMov = new Type[] { typeof(Vector128<ulong>) };
 
-                EmitLdvecWithUnsignedCast(context, reg, 3);
+                context.EmitLdvec(reg);
 
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.MoveScalar), typesMov));
 
-                EmitStvecWithUnsignedCast(context, reg, 3);*/
+                context.EmitStvec(reg);*/
 
                 context.EmitLdvec(reg);
                 VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
diff --git a/ChocolArm64/Instructions/InstEmitSimdLogical.cs b/ChocolArm64/Instructions/InstEmitSimdLogical.cs
index 6c718182db..bf80bada3e 100644
--- a/ChocolArm64/Instructions/InstEmitSimdLogical.cs
+++ b/ChocolArm64/Instructions/InstEmitSimdLogical.cs
@@ -30,12 +30,12 @@ namespace ChocolArm64.Instructions
             {
                 OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
 
-                Type[] typesAndNot = new Type[] { typeof(Vector128<byte>), typeof(Vector128<byte>) };
+                Type[] typesAnt = new Type[] { typeof(Vector128<byte>), typeof(Vector128<byte>) };
 
                 context.EmitLdvec(op.Rm);
                 context.EmitLdvec(op.Rn);
 
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAndNot));
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAnt));
 
                 context.EmitStvec(op.Rd);
 
@@ -79,18 +79,18 @@ namespace ChocolArm64.Instructions
 
             if (Optimizations.UseSse2)
             {
-                Type[] typesXorAndNot = new Type[] { typeof(Vector128<byte>), typeof(Vector128<byte>) };
+                Type[] typesXorAnd = new Type[] { typeof(Vector128<byte>), typeof(Vector128<byte>) };
 
-                string nameAndNot = notRm ? nameof(Sse2.AndNot) : nameof(Sse2.And);
+                string nameAnd = notRm ? nameof(Sse2.AndNot) : nameof(Sse2.And);
 
                 context.EmitLdvec(op.Rd);
                 context.EmitLdvec(op.Rm);
                 context.EmitLdvec(op.Rn);
                 context.EmitLdvec(op.Rd);
 
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), typesXorAndNot));
-                context.EmitCall(typeof(Sse2).GetMethod(nameAndNot,       typesXorAndNot));
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), typesXorAndNot));
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), typesXorAnd));
+                context.EmitCall(typeof(Sse2).GetMethod(nameAnd,          typesXorAnd));
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), typesXorAnd));
 
                 context.EmitStvec(op.Rd);
 
@@ -120,7 +120,6 @@ namespace ChocolArm64.Instructions
                     }
 
                     context.Emit(OpCodes.And);
-
                     context.Emit(OpCodes.Xor);
 
                     EmitVectorInsert(context, op.Rd, index, 3);
@@ -142,8 +141,7 @@ namespace ChocolArm64.Instructions
                 Type[] typesXorAnd = new Type[] { typeof(Vector128<byte>), typeof(Vector128<byte>) };
 
                 context.EmitLdvec(op.Rm);
-                context.Emit(OpCodes.Dup);
-
+                context.EmitLdvec(op.Rm);
                 context.EmitLdvec(op.Rn);
 
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), typesXorAnd));
@@ -151,7 +149,6 @@ namespace ChocolArm64.Instructions
                 context.EmitLdvec(op.Rd);
 
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.And), typesXorAnd));
-
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), typesXorAnd));
 
                 context.EmitStvec(op.Rd);
@@ -196,15 +193,15 @@ namespace ChocolArm64.Instructions
             {
                 OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
 
-                Type[] typesSav    = new Type[] { typeof(byte) };
-                Type[] typesAndNot = new Type[] { typeof(Vector128<byte>), typeof(Vector128<byte>) };
+                Type[] typesSav = new Type[] { typeof(byte) };
+                Type[] typesAnt = new Type[] { typeof(Vector128<byte>), typeof(Vector128<byte>) };
 
                 context.EmitLdvec(op.Rn);
 
                 context.EmitLdc_I4(byte.MaxValue);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav));
 
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAndNot));
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAnt));
 
                 context.EmitStvec(op.Rd);
 
@@ -225,8 +222,8 @@ namespace ChocolArm64.Instructions
             {
                 OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
 
-                Type[] typesSav      = new Type[] { typeof(byte) };
-                Type[] typesAndNotOr = new Type[] { typeof(Vector128<byte>), typeof(Vector128<byte>) };
+                Type[] typesSav   = new Type[] { typeof(byte) };
+                Type[] typesAntOr = new Type[] { typeof(Vector128<byte>), typeof(Vector128<byte>) };
 
                 context.EmitLdvec(op.Rn);
                 context.EmitLdvec(op.Rm);
@@ -234,8 +231,8 @@ namespace ChocolArm64.Instructions
                 context.EmitLdc_I4(byte.MaxValue);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav));
 
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAndNotOr));
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Or),     typesAndNotOr));
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAntOr));
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Or),     typesAntOr));
 
                 context.EmitStvec(op.Rd);
 
diff --git a/ChocolArm64/Instructions/InstEmitSimdShift.cs b/ChocolArm64/Instructions/InstEmitSimdShift.cs
index c0b20d7ea6..6865948ae0 100644
--- a/ChocolArm64/Instructions/InstEmitSimdShift.cs
+++ b/ChocolArm64/Instructions/InstEmitSimdShift.cs
@@ -5,6 +5,7 @@ using ChocolArm64.State;
 using ChocolArm64.Translation;
 using System;
 using System.Reflection.Emit;
+using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
 
 using static ChocolArm64.Instructions.InstEmitSimdHelper;
@@ -13,9 +14,65 @@ namespace ChocolArm64.Instructions
 {
     static partial class InstEmit
     {
+#region "Masks"
+        private static readonly long[] _masks_RshrnShrn = new long[] // byte-index patterns fed to Ssse3.Shuffle by Rshrn_V/Shrn_V; indexed by op.Size
+        {
+            14L << 56 | 12L << 48 | 10L << 40 | 08L << 32 | 06L << 24 | 04L << 16 | 02L << 8 | 00L << 0, // Size 0: bytes 0,2,..,14 = low byte of every 16-bit source element
+            13L << 56 | 12L << 48 | 09L << 40 | 08L << 32 | 05L << 24 | 04L << 16 | 01L << 8 | 00L << 0, // Size 1: bytes 0-1,4-5,8-9,12-13 = low 16 bits of every 32-bit source element
+            11L << 56 | 10L << 48 | 09L << 40 | 08L << 32 | 03L << 24 | 02L << 16 | 01L << 8 | 00L << 0  // Size 2: bytes 0-3,8-11 = low 32 bits of every 64-bit source element
+        };
+#endregion
+
         public static void Rshrn_V(ILEmitterCtx context)
         {
-            EmitVectorShrImmNarrowOpZx(context, round: true);
+            if (Optimizations.UseSsse3) // SSSE3 fast path for the rounding, narrowing right shift; the generic emitter below is the fallback
+            {
+                OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
+
+                Type[] typesAdd = new Type[] { VectorUIntTypesPerSizeLog2[op.Size + 1], VectorUIntTypesPerSizeLog2[op.Size + 1] }; // source elements are one size step above op.Size
+                Type[] typesSrl = new Type[] { VectorUIntTypesPerSizeLog2[op.Size + 1], typeof(byte) };
+                Type[] typesSfl = new Type[] { typeof(Vector128<sbyte>), typeof(Vector128<sbyte>) };
+                Type[] typesSav = new Type[] { UIntTypesPerSizeLog2[op.Size + 1] };
+                Type[] typesSve = new Type[] { typeof(long), typeof(long) };
+
+                string nameMov = op.RegisterSize == RegisterSize.Simd128
+                    ? nameof(Sse.MoveLowToHigh)  // Simd128: keep Rd's low half, put the narrowed result in the high half
+                    : nameof(Sse.MoveHighToLow); // otherwise: narrowed result in the low half, zero in the high half
+
+                int shift = GetImmShr(op);
+
+                long roundConst = 1L << (shift - 1); // rounding bias: half of the last bit kept after the shift
+
+                context.EmitLdvec(op.Rd);
+                VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero)); // presumably pushes an all-zero vector - confirm in VectorHelper
+
+                context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh))); // left operand of the final merge: { Rd.lo, zero.lo }
+
+                context.EmitLdvec(op.Rn);
+
+                context.EmitLdc_I8(roundConst); // NOTE(review): an int64 is pushed even when typesSav selects a narrower element type - confirm this is intended
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav)); // broadcast the bias to every source element
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesAdd)); // Rn + roundConst, per source element
+
+                context.EmitLdc_I4(shift);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesSrl)); // value: (Rn + roundConst) >> shift, per source element
+
+                context.EmitLdc_I8(_masks_RshrnShrn[op.Size]); // mask: 8 byte indices picking the low half of each source element
+                context.Emit(OpCodes.Dup); // mask: the same 64-bit pattern is used for both SetVector128 arguments
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
+
+                context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), typesSfl)); // packed narrowed elements, duplicated in the low and high 64 bits
+
+                context.EmitCall(typeof(Sse).GetMethod(nameMov)); // merge { Rd.lo, 0 } with the packed result (see nameMov above)
+
+                context.EmitStvec(op.Rd);
+            }
+            else
+            {
+                EmitVectorShrImmNarrowOpZx(context, round: true);
+            }
         }
 
         public static void Shl_S(ILEmitterCtx context)
@@ -80,12 +137,13 @@ namespace ChocolArm64.Instructions
                                                    nameof(Sse41.ConvertToVector128Int32),
                                                    nameof(Sse41.ConvertToVector128Int64) };
 
-                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
-
                 context.EmitLdvec(op.Rn);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSll));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSll));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
 
@@ -102,7 +160,45 @@ namespace ChocolArm64.Instructions
 
         public static void Shrn_V(ILEmitterCtx context)
         {
-            EmitVectorShrImmNarrowOpZx(context, round: false);
+            if (Optimizations.UseSsse3) // SSSE3 fast path for the (non-rounding) narrowing right shift; the generic emitter below is the fallback
+            {
+                OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
+
+                Type[] typesSrl = new Type[] { VectorUIntTypesPerSizeLog2[op.Size + 1], typeof(byte) }; // source elements are one size step above op.Size
+                Type[] typesSfl = new Type[] { typeof(Vector128<sbyte>), typeof(Vector128<sbyte>) };
+                Type[] typesSve = new Type[] { typeof(long), typeof(long) };
+
+                string nameMov = op.RegisterSize == RegisterSize.Simd128
+                    ? nameof(Sse.MoveLowToHigh)  // Simd128: keep Rd's low half, put the narrowed result in the high half
+                    : nameof(Sse.MoveHighToLow); // otherwise: narrowed result in the low half, zero in the high half
+
+                int shift = GetImmShr(op);
+
+                context.EmitLdvec(op.Rd);
+                VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero)); // presumably pushes an all-zero vector - confirm in VectorHelper
+
+                context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh))); // left operand of the final merge: { Rd.lo, zero.lo }
+
+                context.EmitLdvec(op.Rn);
+
+                context.EmitLdc_I4(shift);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesSrl)); // value: Rn >> shift, per source element
+
+                context.EmitLdc_I8(_masks_RshrnShrn[op.Size]); // mask: 8 byte indices picking the low half of each source element
+                context.Emit(OpCodes.Dup); // mask: the same 64-bit pattern is used for both SetVector128 arguments
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
+
+                context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), typesSfl)); // packed narrowed elements, duplicated in the low and high 64 bits
+
+                context.EmitCall(typeof(Sse).GetMethod(nameMov)); // merge { Rd.lo, 0 } with the packed result (see nameMov above)
+
+                context.EmitStvec(op.Rd);
+            }
+            else
+            {
+                EmitVectorShrImmNarrowOpZx(context, round: false);
+            }
         }
 
         public static void Sli_V(ILEmitterCtx context)
@@ -271,8 +367,7 @@ namespace ChocolArm64.Instructions
         {
             OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
 
-            if (Optimizations.UseSse2 && op.Size > 0
-                                      && op.Size < 3)
+            if (Optimizations.UseSse2 && op.Size > 0 && op.Size < 3)
             {
                 Type[] typesShs = new Type[] { VectorIntTypesPerSizeLog2[op.Size], typeof(byte) };
                 Type[] typesAdd = new Type[] { VectorIntTypesPerSizeLog2[op.Size], VectorIntTypesPerSizeLog2[op.Size] };
@@ -282,16 +377,13 @@ namespace ChocolArm64.Instructions
 
                 context.EmitLdvec(op.Rn);
 
-                context.Emit(OpCodes.Dup);
-                context.EmitStvectmp();
-
                 context.EmitLdc_I4(eSize - shift);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesShs));
 
                 context.EmitLdc_I4(eSize - 1);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesShs));
 
-                context.EmitLdvectmp();
+                context.EmitLdvec(op.Rn);
 
                 context.EmitLdc_I4(shift);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightArithmetic), typesShs));
@@ -320,8 +412,7 @@ namespace ChocolArm64.Instructions
         {
             OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
 
-            if (Optimizations.UseSse2 && op.Size > 0
-                                      && op.Size < 3)
+            if (Optimizations.UseSse2 && op.Size > 0 && op.Size < 3)
             {
                 Type[] typesShs = new Type[] { VectorIntTypesPerSizeLog2[op.Size], typeof(byte) };
                 Type[] typesAdd = new Type[] { VectorIntTypesPerSizeLog2[op.Size], VectorIntTypesPerSizeLog2[op.Size] };
@@ -332,16 +423,13 @@ namespace ChocolArm64.Instructions
                 context.EmitLdvec(op.Rd);
                 context.EmitLdvec(op.Rn);
 
-                context.Emit(OpCodes.Dup);
-                context.EmitStvectmp();
-
                 context.EmitLdc_I4(eSize - shift);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesShs));
 
                 context.EmitLdc_I4(eSize - 1);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesShs));
 
-                context.EmitLdvectmp();
+                context.EmitLdvec(op.Rn);
 
                 context.EmitLdc_I4(shift);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightArithmetic), typesShs));
@@ -403,17 +491,21 @@ namespace ChocolArm64.Instructions
                                                    nameof(Sse41.ConvertToVector128Int32),
                                                    nameof(Sse41.ConvertToVector128Int64) };
 
-                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
-
                 context.EmitLdvec(op.Rn);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSll));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSll));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
 
-                context.EmitLdc_I4(shift);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesSll));
+                if (shift != 0)
+                {
+                    context.EmitLdc_I4(shift);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesSll));
+                }
 
                 context.EmitStvec(op.Rd);
             }
@@ -432,8 +524,7 @@ namespace ChocolArm64.Instructions
         {
             OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
 
-            if (Optimizations.UseSse2 && op.Size > 0
-                                      && op.Size < 3)
+            if (Optimizations.UseSse2 && op.Size > 0 && op.Size < 3)
             {
                 Type[] typesSra = new Type[] { VectorIntTypesPerSizeLog2[op.Size], typeof(byte) };
 
@@ -464,8 +555,7 @@ namespace ChocolArm64.Instructions
         {
             OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
 
-            if (Optimizations.UseSse2 && op.Size > 0
-                                      && op.Size < 3)
+            if (Optimizations.UseSse2 && op.Size > 0 && op.Size < 3)
             {
                 Type[] typesSra = new Type[] { VectorIntTypesPerSizeLog2[op.Size], typeof(byte) };
                 Type[] typesAdd = new Type[] { VectorIntTypesPerSizeLog2[op.Size], VectorIntTypesPerSizeLog2[op.Size] };
@@ -474,8 +564,8 @@ namespace ChocolArm64.Instructions
                 context.EmitLdvec(op.Rn);
 
                 context.EmitLdc_I4(GetImmShr(op));
-
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightArithmetic), typesSra));
+
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesAdd));
 
                 context.EmitStvec(op.Rd);
@@ -612,16 +702,13 @@ namespace ChocolArm64.Instructions
 
                 context.EmitLdvec(op.Rn);
 
-                context.Emit(OpCodes.Dup);
-                context.EmitStvectmp();
-
                 context.EmitLdc_I4(eSize - shift);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesShs));
 
                 context.EmitLdc_I4(eSize - 1);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesShs));
 
-                context.EmitLdvectmp();
+                context.EmitLdvec(op.Rn);
 
                 context.EmitLdc_I4(shift);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesShs));
@@ -661,16 +748,13 @@ namespace ChocolArm64.Instructions
                 context.EmitLdvec(op.Rd);
                 context.EmitLdvec(op.Rn);
 
-                context.Emit(OpCodes.Dup);
-                context.EmitStvectmp();
-
                 context.EmitLdc_I4(eSize - shift);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesShs));
 
                 context.EmitLdc_I4(eSize - 1);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesShs));
 
-                context.EmitLdvectmp();
+                context.EmitLdvec(op.Rn);
 
                 context.EmitLdc_I4(shift);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesShs));
@@ -732,17 +816,21 @@ namespace ChocolArm64.Instructions
                                                    nameof(Sse41.ConvertToVector128Int32),
                                                    nameof(Sse41.ConvertToVector128Int64) };
 
-                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
-
                 context.EmitLdvec(op.Rn);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSll));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSll));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
 
-                context.EmitLdc_I4(shift);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesSll));
+                if (shift != 0)
+                {
+                    context.EmitLdc_I4(shift);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesSll));
+                }
 
                 context.EmitStvec(op.Rd);
             }
@@ -801,8 +889,8 @@ namespace ChocolArm64.Instructions
                 context.EmitLdvec(op.Rn);
 
                 context.EmitLdc_I4(GetImmShr(op));
-
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesSrl));
+
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesAdd));
 
                 context.EmitStvec(op.Rd);
@@ -899,12 +987,9 @@ namespace ChocolArm64.Instructions
                     context.Emit(OpCodes.Add);
                 }
 
-                EmitVectorInsertTmp(context, index, op.Size);
+                EmitVectorInsert(context, op.Rd, index, op.Size);
             }
 
-            context.EmitLdvectmp();
-            context.EmitStvec(op.Rd);
-
             if ((op.RegisterSize == RegisterSize.Simd64) || scalar)
             {
                 EmitVectorZeroUpper(context, op.Rd);
@@ -1044,11 +1129,7 @@ namespace ChocolArm64.Instructions
         }
 
         // dst64 = (Int(src64, signed) + roundConst) >> shift;
-        private static void EmitShrImm64(
-            ILEmitterCtx context,
-            bool signed,
-            long roundConst,
-            int  shift)
+        private static void EmitShrImm64(ILEmitterCtx context, bool signed, long roundConst, int shift)
         {
             context.EmitLdc_I8(roundConst);
             context.EmitLdc_I4(shift);
diff --git a/ChocolArm64/Translation/ILEmitterCtx.cs b/ChocolArm64/Translation/ILEmitterCtx.cs
index 91b72b13ae..f39bd37112 100644
--- a/ChocolArm64/Translation/ILEmitterCtx.cs
+++ b/ChocolArm64/Translation/ILEmitterCtx.cs
@@ -61,8 +61,7 @@ namespace ChocolArm64.Translation
 
         //Vectors are part of another "set" of locals.
         private const int VecGpTmp1Index   = ReservedLocalsCount + 0;
-        private const int VecGpTmp2Index   = ReservedLocalsCount + 1;
-        private const int UserVecTempStart = ReservedLocalsCount + 2;
+        private const int UserVecTempStart = ReservedLocalsCount + 1;
 
         private static int _userIntTempCount;
         private static int _userVecTempCount;
@@ -630,9 +629,6 @@ namespace ChocolArm64.Translation
         public void EmitLdvectmp() => EmitLdvec(VecGpTmp1Index);
         public void EmitStvectmp() => EmitStvec(VecGpTmp1Index);
 
-        public void EmitLdvectmp2() => EmitLdvec(VecGpTmp2Index);
-        public void EmitStvectmp2() => EmitStvec(VecGpTmp2Index);
-
         public void EmitLdint(int index) => Ldloc(index, VarType.Int);
         public void EmitStint(int index) => Stloc(index, VarType.Int);
 
diff --git a/Ryujinx.Common/Utilities/BitUtils.cs b/Ryujinx.Common/Utilities/BitUtils.cs
index b6fba4fba1..5f70f742a0 100644
--- a/Ryujinx.Common/Utilities/BitUtils.cs
+++ b/Ryujinx.Common/Utilities/BitUtils.cs
@@ -100,7 +100,7 @@ namespace Ryujinx.Common
             do
             {
                 nibbleIdx -= 4;
-                preCount = ClzNibbleTbl[(value >> nibbleIdx) & 0b1111];
+                preCount = ClzNibbleTbl[(int)(value >> nibbleIdx) & 0b1111];
                 count += preCount;
             }
             while (preCount == 4);
@@ -136,4 +136,4 @@ namespace Ryujinx.Common
             return (value >> 32) | (value << 32);
         }
     }
-}
\ No newline at end of file
+}
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdArithmetic.cs b/Ryujinx.Tests/Cpu/CpuTestSimdArithmetic.cs
index 2f8604ebac..63e0bda83c 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimdArithmetic.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdArithmetic.cs
@@ -8,26 +8,6 @@ namespace Ryujinx.Tests.Cpu
 {
     public class CpuTestSimdArithmetic : CpuTest
     {
-        [TestCase(0x00000000u, 0x7F800000u)]
-        [TestCase(0x80000000u, 0xFF800000u)]
-        [TestCase(0x00FFF000u, 0x7E000000u)]
-        [TestCase(0x41200000u, 0x3DCC8000u)]
-        [TestCase(0xC1200000u, 0xBDCC8000u)]
-        [TestCase(0x001FFFFFu, 0x7F800000u)]
-        [TestCase(0x007FF000u, 0x7E800000u)]
-        public void Frecpe_S(uint a, uint result)
-        {
-            uint opcode = 0x5EA1D820; // FRECPE S0, S1
-
-            Vector128<float> v1 = MakeVectorE0(a);
-
-            CpuThreadState threadState = SingleOpcode(opcode, v1: v1);
-
-            Assert.That(GetVectorE0(threadState.V0), Is.EqualTo(result));
-
-            CompareAgainstUnicorn();
-        }
-
         [TestCase(0x3FE66666u, false, 0x40000000u)]
         [TestCase(0x3F99999Au, false, 0x3F800000u)]
         [TestCase(0x404CCCCDu, false, 0x40400000u)]
@@ -601,19 +581,5 @@ namespace Ryujinx.Tests.Cpu
 
             CompareAgainstUnicorn();
         }
-
-        [TestCase(0x41200000u, 0x3EA18000u)]
-        public void Frsqrte_S(uint a, uint result)
-        {
-            uint opcode = 0x7EA1D820; // FRSQRTE S0, S1
-
-            Vector128<float> v1 = MakeVectorE0(a);
-
-            CpuThreadState threadState = SingleOpcode(opcode, v1: v1);
-
-            Assert.That(GetVectorE0(threadState.V0), Is.EqualTo(result));
-
-            CompareAgainstUnicorn();
-        }
     }
 }
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdExt.cs b/Ryujinx.Tests/Cpu/CpuTestSimdExt.cs
index f232989f77..b8548169be 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimdExt.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdExt.cs
@@ -19,7 +19,8 @@ namespace Ryujinx.Tests.Cpu
         }
 #endregion
 
-        private const int RndCnt = 2;
+        private const int RndCnt      = 2;
+        private const int RndCntIndex = 2;
 
         [Test, Pairwise, Description("EXT <Vd>.8B, <Vn>.8B, <Vm>.8B, #<index>")]
         public void Ext_V_8B([Values(0u)]     uint rd,
@@ -28,7 +29,7 @@ namespace Ryujinx.Tests.Cpu
                              [ValueSource("_8B_")] [Random(RndCnt)] ulong z,
                              [ValueSource("_8B_")] [Random(RndCnt)] ulong a,
                              [ValueSource("_8B_")] [Random(RndCnt)] ulong b,
-                             [Range(0u, 7u)] uint index)
+                             [Values(0u, 7u)] [Random(1u, 6u, RndCntIndex)] uint index)
         {
             uint imm4 = index & 0x7u;
 
@@ -52,7 +53,7 @@ namespace Ryujinx.Tests.Cpu
                               [ValueSource("_8B_")] [Random(RndCnt)] ulong z,
                               [ValueSource("_8B_")] [Random(RndCnt)] ulong a,
                               [ValueSource("_8B_")] [Random(RndCnt)] ulong b,
-                              [Range(0u, 15u)] uint index)
+                              [Values(0u, 15u)] [Random(1u, 14u, RndCntIndex)] uint index)
         {
             uint imm4 = index & 0xFu;
 
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdIns.cs b/Ryujinx.Tests/Cpu/CpuTestSimdIns.cs
index 4ca54a2b42..fe93f06e37 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimdIns.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdIns.cs
@@ -67,7 +67,8 @@ namespace Ryujinx.Tests.Cpu
         }
 #endregion
 
-        private const int RndCnt = 2;
+        private const int RndCnt      = 2;
+        private const int RndCntIndex = 2;
 
         [Test, Pairwise, Description("DUP <Vd>.<T>, <R><n>")]
         public void Dup_Gp_W([Values(0u)]      uint rd,
@@ -109,7 +110,7 @@ namespace Ryujinx.Tests.Cpu
 
         [Test, Pairwise, Description("DUP B0, V1.B[<index>]")]
         public void Dup_S_B([ValueSource("_8B_")] [Random(RndCnt)] ulong a,
-                            [Range(0u, 15u)] uint index)
+                            [Values(0u, 15u)] [Random(1u, 14u, RndCntIndex)] uint index)
         {
             const int size = 0;
 
@@ -129,7 +130,7 @@ namespace Ryujinx.Tests.Cpu
 
         [Test, Pairwise, Description("DUP H0, V1.H[<index>]")]
         public void Dup_S_H([ValueSource("_4H_")] [Random(RndCnt)] ulong a,
-                            [Range(0u, 7u)] uint index)
+                            [Values(0u, 7u)] [Random(1u, 6u, RndCntIndex)] uint index)
         {
             const int size = 1;
 
@@ -192,7 +193,7 @@ namespace Ryujinx.Tests.Cpu
                                  [Values(1u, 0u)] uint rn,
                                  [ValueSource("_8B_")] [Random(RndCnt)] ulong z,
                                  [ValueSource("_8B_")] [Random(RndCnt)] ulong a,
-                                 [Range(0u, 15u)] uint index,
+                                 [Values(0u, 15u)] [Random(1u, 14u, RndCntIndex)] uint index,
                                  [Values(0b0u, 0b1u)] uint q) // <8B, 16B>
         {
             const int size = 0;
@@ -217,7 +218,7 @@ namespace Ryujinx.Tests.Cpu
                                 [Values(1u, 0u)] uint rn,
                                 [ValueSource("_4H_")] [Random(RndCnt)] ulong z,
                                 [ValueSource("_4H_")] [Random(RndCnt)] ulong a,
-                                [Range(0u, 7u)] uint index,
+                                [Values(0u, 7u)] [Random(1u, 6u, RndCntIndex)] uint index,
                                 [Values(0b0u, 0b1u)] uint q) // <4H, 8H>
         {
             const int size = 1;
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs b/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs
index 8d2f4e9a34..1c418341b6 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs
@@ -434,6 +434,26 @@ namespace Ryujinx.Tests.Cpu
             };
         }
 
+        private static uint[] _Mla_Mls_Mul_V_8B_4H_2S_() // opcode source for the Mla_Mls_Mul_V_8B_4H_2S test
+        {
+            return new uint[] // base encodings with all register fields = V0 and size = 0; the test ORs in the real Rd/Rn/Rm/size
+            {
+                0x0E209400u, // MLA V0.8B, V0.8B, V0.8B
+                0x2E209400u, // MLS V0.8B, V0.8B, V0.8B
+                0x0E209C00u  // MUL V0.8B, V0.8B, V0.8B
+            };
+        }
+
+        private static uint[] _Mla_Mls_Mul_V_16B_8H_4S_() // opcode source for the Mla_Mls_Mul_V_16B_8H_4S test
+        {
+            return new uint[] // same base encodings as the 8B table but with bit 30 set (the 16B forms)
+            {
+                0x4E209400u, // MLA V0.16B, V0.16B, V0.16B
+                0x6E209400u, // MLS V0.16B, V0.16B, V0.16B
+                0x4E209C00u  // MUL V0.16B, V0.16B, V0.16B
+            };
+        }
+
         private static uint[] _Sha1c_Sha1m_Sha1p_Sha1su0_V_()
         {
             return new uint[]
@@ -1786,6 +1806,50 @@ namespace Ryujinx.Tests.Cpu
             CompareAgainstUnicorn(Fpsr.Ioc | Fpsr.Idc, FpSkips.IfUnderflow, FpTolerances.UpToOneUlpsD);
         }
 
+        [Test, Pairwise] // Runs each base opcode from _Mla_Mls_Mul_V_8B_4H_2S_ with pairwise-combined registers, data and size.
+        public void Mla_Mls_Mul_V_8B_4H_2S([ValueSource("_Mla_Mls_Mul_V_8B_4H_2S_")] uint opcodes,
+                                           [Values(0u)]     uint rd,
+                                           [Values(1u, 0u)] uint rn, // 0u makes Rn the same register as Rd.
+                                           [Values(2u, 0u)] uint rm, // 0u makes Rm the same register as Rd.
+                                           [ValueSource("_8B4H2S_")] [Random(RndCnt)] ulong z,
+                                           [ValueSource("_8B4H2S_")] [Random(RndCnt)] ulong a,
+                                           [ValueSource("_8B4H2S_")] [Random(RndCnt)] ulong b,
+                                           [Values(0b00u, 0b01u, 0b10u)] uint size) // <8B, 4H, 2S>
+        {
+            opcodes |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); // Rm -> bits [20:16], Rn -> bits [9:5], Rd -> bits [4:0].
+            opcodes |= ((size & 3) << 22); // Element size -> bits [23:22].
+
+            Vector128<float> v0 = MakeVectorE0E1(z, z); // Initial V0 (the Rd register); presumably z in both 64-bit elements - confirm in MakeVectorE0E1.
+            Vector128<float> v1 = MakeVectorE0(a); // Initial V1; presumably only the low 64-bit element is set - confirm in MakeVectorE0.
+            Vector128<float> v2 = MakeVectorE0(b); // Initial V2; same assumption as for v1.
+
+            SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2); // Executes the assembled opcode with the three vectors as initial state.
+
+            CompareAgainstUnicorn(); // NOTE(review): presumably checks the resulting state against the Unicorn reference - helper not visible here.
+        }
+
+        [Test, Pairwise] // Runs each base opcode from _Mla_Mls_Mul_V_16B_8H_4S_ with pairwise-combined registers, data and size.
+        public void Mla_Mls_Mul_V_16B_8H_4S([ValueSource("_Mla_Mls_Mul_V_16B_8H_4S_")] uint opcodes,
+                                            [Values(0u)]     uint rd,
+                                            [Values(1u, 0u)] uint rn, // 0u makes Rn the same register as Rd.
+                                            [Values(2u, 0u)] uint rm, // 0u makes Rm the same register as Rd.
+                                            [ValueSource("_8B4H2S_")] [Random(RndCnt)] ulong z,
+                                            [ValueSource("_8B4H2S_")] [Random(RndCnt)] ulong a,
+                                            [ValueSource("_8B4H2S_")] [Random(RndCnt)] ulong b,
+                                            [Values(0b00u, 0b01u, 0b10u)] uint size) // <16B, 8H, 4S>
+        {
+            opcodes |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); // Rm -> bits [20:16], Rn -> bits [9:5], Rd -> bits [4:0].
+            opcodes |= ((size & 3) << 22); // Element size -> bits [23:22].
+
+            Vector128<float> v0 = MakeVectorE0E1(z, z); // Initial V0 (the Rd register); presumably z in both 64-bit elements - confirm in MakeVectorE0E1.
+            Vector128<float> v1 = MakeVectorE0E1(a, a); // Initial V1; the same 64-bit pattern is passed for both elements.
+            Vector128<float> v2 = MakeVectorE0E1(b, b); // Initial V2; the same 64-bit pattern is passed for both elements.
+
+            SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2); // Executes the assembled opcode with the three vectors as initial state.
+
+            CompareAgainstUnicorn(); // NOTE(review): presumably checks the resulting state against the Unicorn reference - helper not visible here.
+        }
+
         [Test, Pairwise, Description("ORN <Vd>.<T>, <Vn>.<T>, <Vm>.<T>")]
         public void Orn_V_8B([Values(0u)]     uint rd,
                              [Values(1u, 0u)] uint rn,
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs b/Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs
index f026158cc7..9a295d5ed1 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs
@@ -258,14 +258,15 @@ namespace Ryujinx.Tests.Cpu
         }
 #endregion
 
-        private const int RndCnt = 2;
+        private const int RndCnt      = 2;
+        private const int RndCntShift = 2;
 
         [Test, Pairwise, Description("SHL <V><d>, <V><n>, #<shift>")]
         public void Shl_S_D([Values(0u)]     uint rd,
                             [Values(1u, 0u)] uint rn,
                             [ValueSource("_1D_")] [Random(RndCnt)] ulong z,
                             [ValueSource("_1D_")] [Random(RndCnt)] ulong a,
-                            [Range(0u, 63u)] uint shift)
+                            [Values(0u, 63u)] [Random(1u, 62u, RndCntShift)] uint shift)
         {
             uint immHb = (64 + shift) & 0x7F;
 
@@ -286,7 +287,7 @@ namespace Ryujinx.Tests.Cpu
                                  [Values(1u, 0u)] uint rn,
                                  [ValueSource("_8B_")] [Random(RndCnt)] ulong z,
                                  [ValueSource("_8B_")] [Random(RndCnt)] ulong a,
-                                 [Range(0u, 7u)] uint shift,
+                                 [Values(0u, 7u)] [Random(1u, 6u, RndCntShift)] uint shift,
                                  [Values(0b0u, 0b1u)] uint q) // <8B, 16B>
         {
             uint immHb = (8 + shift) & 0x7F;
@@ -309,7 +310,7 @@ namespace Ryujinx.Tests.Cpu
                                 [Values(1u, 0u)] uint rn,
                                 [ValueSource("_4H_")] [Random(RndCnt)] ulong z,
                                 [ValueSource("_4H_")] [Random(RndCnt)] ulong a,
-                                [Range(0u, 15u)] uint shift,
+                                [Values(0u, 15u)] [Random(1u, 14u, RndCntShift)] uint shift,
                                 [Values(0b0u, 0b1u)] uint q) // <4H, 8H>
         {
             uint immHb = (16 + shift) & 0x7F;
@@ -332,7 +333,7 @@ namespace Ryujinx.Tests.Cpu
                                 [Values(1u, 0u)] uint rn,
                                 [ValueSource("_2S_")] [Random(RndCnt)] ulong z,
                                 [ValueSource("_2S_")] [Random(RndCnt)] ulong a,
-                                [Range(0u, 31u)] uint shift,
+                                [Values(0u, 31u)] [Random(1u, 30u, RndCntShift)] uint shift,
                                 [Values(0b0u, 0b1u)] uint q) // <2S, 4S>
         {
             uint immHb = (32 + shift) & 0x7F;
@@ -355,7 +356,7 @@ namespace Ryujinx.Tests.Cpu
                              [Values(1u, 0u)] uint rn,
                              [ValueSource("_1D_")] [Random(RndCnt)] ulong z,
                              [ValueSource("_1D_")] [Random(RndCnt)] ulong a,
-                             [Range(0u, 63u)] uint shift)
+                             [Values(0u, 63u)] [Random(1u, 62u, RndCntShift)] uint shift)
         {
             uint immHb = (64 + shift) & 0x7F;
 
@@ -377,7 +378,7 @@ namespace Ryujinx.Tests.Cpu
                                          [Values(1u, 0u)] uint rn,
                                          [ValueSource("_8B_")] [Random(RndCnt)] ulong z,
                                          [ValueSource("_8B_")] [Random(RndCnt)] ulong a,
-                                         [Range(0u, 7u)] uint shift,
+                                         [Values(0u, 7u)] [Random(1u, 6u, RndCntShift)] uint shift,
                                          [Values(0b0u, 0b1u)] uint q) // <8B8H, 16B8H>
         {
             uint immHb = (8 + shift) & 0x7F;
@@ -400,7 +401,7 @@ namespace Ryujinx.Tests.Cpu
                                         [Values(1u, 0u)] uint rn,
                                         [ValueSource("_4H_")] [Random(RndCnt)] ulong z,
                                         [ValueSource("_4H_")] [Random(RndCnt)] ulong a,
-                                        [Range(0u, 15u)] uint shift,
+                                        [Values(0u, 15u)] [Random(1u, 14u, RndCntShift)] uint shift,
                                         [Values(0b0u, 0b1u)] uint q) // <4H4S, 8H4S>
         {
             uint immHb = (16 + shift) & 0x7F;
@@ -423,7 +424,7 @@ namespace Ryujinx.Tests.Cpu
                                         [Values(1u, 0u)] uint rn,
                                         [ValueSource("_2S_")] [Random(RndCnt)] ulong z,
                                         [ValueSource("_2S_")] [Random(RndCnt)] ulong a,
-                                        [Range(0u, 31u)] uint shift,
+                                        [Values(0u, 31u)] [Random(1u, 30u, RndCntShift)] uint shift,
                                         [Values(0b0u, 0b1u)] uint q) // <2S2D, 4S2D>
         {
             uint immHb = (32 + shift) & 0x7F;
@@ -446,7 +447,7 @@ namespace Ryujinx.Tests.Cpu
                                [Values(1u, 0u)] uint rn,
                                [ValueSource("_1D_")] [Random(RndCnt)] ulong z,
                                [ValueSource("_1D_")] [Random(RndCnt)] ulong a,
-                               [Range(1u, 64u)] uint shift)
+                               [Values(1u, 64u)] [Random(2u, 63u, RndCntShift)] uint shift)
         {
             uint immHb = (128 - shift) & 0x7F;
 
@@ -467,7 +468,7 @@ namespace Ryujinx.Tests.Cpu
                                     [Values(1u, 0u)] uint rn,
                                     [ValueSource("_8B_")] [Random(RndCnt)] ulong z,
                                     [ValueSource("_8B_")] [Random(RndCnt)] ulong a,
-                                    [Range(1u, 8u)] uint shift,
+                                    [Values(1u, 8u)] [Random(2u, 7u, RndCntShift)] uint shift,
                                     [Values(0b0u, 0b1u)] uint q) // <8B, 16B>
         {
             uint immHb = (16 - shift) & 0x7F;
@@ -490,7 +491,7 @@ namespace Ryujinx.Tests.Cpu
                                    [Values(1u, 0u)] uint rn,
                                    [ValueSource("_4H_")] [Random(RndCnt)] ulong z,
                                    [ValueSource("_4H_")] [Random(RndCnt)] ulong a,
-                                   [Range(1u, 16u)] uint shift,
+                                   [Values(1u, 16u)] [Random(2u, 15u, RndCntShift)] uint shift,
                                    [Values(0b0u, 0b1u)] uint q) // <4H, 8H>
         {
             uint immHb = (32 - shift) & 0x7F;
@@ -513,7 +514,7 @@ namespace Ryujinx.Tests.Cpu
                                    [Values(1u, 0u)] uint rn,
                                    [ValueSource("_2S_")] [Random(RndCnt)] ulong z,
                                    [ValueSource("_2S_")] [Random(RndCnt)] ulong a,
-                                   [Range(1u, 32u)] uint shift,
+                                   [Values(1u, 32u)] [Random(2u, 31u, RndCntShift)] uint shift,
                                    [Values(0b0u, 0b1u)] uint q) // <2S, 4S>
         {
             uint immHb = (64 - shift) & 0x7F;
@@ -536,7 +537,7 @@ namespace Ryujinx.Tests.Cpu
                                 [Values(1u, 0u)] uint rn,
                                 [ValueSource("_1D_")] [Random(RndCnt)] ulong z,
                                 [ValueSource("_1D_")] [Random(RndCnt)] ulong a,
-                                [Range(1u, 64u)] uint shift)
+                                [Values(1u, 64u)] [Random(2u, 63u, RndCntShift)] uint shift)
         {
             uint immHb = (128 - shift) & 0x7F;
 
@@ -557,7 +558,7 @@ namespace Ryujinx.Tests.Cpu
                                               [Values(1u, 0u)] uint rn,
                                               [ValueSource("_4H_")] [Random(RndCnt)] ulong z,
                                               [ValueSource("_4H_")] [Random(RndCnt)] ulong a,
-                                              [Range(1u, 8u)] uint shift,
+                                              [Values(1u, 8u)] [Random(2u, 7u, RndCntShift)] uint shift,
                                               [Values(0b0u, 0b1u)] uint q) // <8H8B, 8H16B>
         {
             uint immHb = (16 - shift) & 0x7F;
@@ -580,7 +581,7 @@ namespace Ryujinx.Tests.Cpu
                                              [Values(1u, 0u)] uint rn,
                                              [ValueSource("_2S_")] [Random(RndCnt)] ulong z,
                                              [ValueSource("_2S_")] [Random(RndCnt)] ulong a,
-                                             [Range(1u, 16u)] uint shift,
+                                             [Values(1u, 16u)] [Random(2u, 15u, RndCntShift)] uint shift,
                                              [Values(0b0u, 0b1u)] uint q) // <4S4H, 4S8H>
         {
             uint immHb = (32 - shift) & 0x7F;
@@ -603,7 +604,7 @@ namespace Ryujinx.Tests.Cpu
                                              [Values(1u, 0u)] uint rn,
                                              [ValueSource("_1D_")] [Random(RndCnt)] ulong z,
                                              [ValueSource("_1D_")] [Random(RndCnt)] ulong a,
-                                             [Range(1u, 32u)] uint shift,
+                                             [Values(1u, 32u)] [Random(2u, 31u, RndCntShift)] uint shift,
                                              [Values(0b0u, 0b1u)] uint q) // <2D2S, 2D4S>
         {
             uint immHb = (64 - shift) & 0x7F;
@@ -626,7 +627,7 @@ namespace Ryujinx.Tests.Cpu
                                                 [Values(1u, 0u)] uint rn,
                                                 [ValueSource("_1H_")] [Random(RndCnt)] ulong z,
                                                 [ValueSource("_1H_")] [Random(RndCnt)] ulong a,
-                                                [Range(1u, 8u)] uint shift)
+                                                [Values(1u, 8u)] [Random(2u, 7u, RndCntShift)] uint shift)
         {
             uint immHb = (16 - shift) & 0x7F;
 
@@ -647,7 +648,7 @@ namespace Ryujinx.Tests.Cpu
                                                 [Values(1u, 0u)] uint rn,
                                                 [ValueSource("_1S_")] [Random(RndCnt)] ulong z,
                                                 [ValueSource("_1S_")] [Random(RndCnt)] ulong a,
-                                                [Range(1u, 16u)] uint shift)
+                                                [Values(1u, 16u)] [Random(2u, 15u, RndCntShift)] uint shift)
         {
             uint immHb = (32 - shift) & 0x7F;
 
@@ -668,7 +669,7 @@ namespace Ryujinx.Tests.Cpu
                                                 [Values(1u, 0u)] uint rn,
                                                 [ValueSource("_1D_")] [Random(RndCnt)] ulong z,
                                                 [ValueSource("_1D_")] [Random(RndCnt)] ulong a,
-                                                [Range(1u, 32u)] uint shift)
+                                                [Values(1u, 32u)] [Random(2u, 31u, RndCntShift)] uint shift)
         {
             uint immHb = (64 - shift) & 0x7F;
 
@@ -689,7 +690,7 @@ namespace Ryujinx.Tests.Cpu
                                                         [Values(1u, 0u)] uint rn,
                                                         [ValueSource("_4H_")] [Random(RndCnt)] ulong z,
                                                         [ValueSource("_4H_")] [Random(RndCnt)] ulong a,
-                                                        [Range(1u, 8u)] uint shift,
+                                                        [Values(1u, 8u)] [Random(2u, 7u, RndCntShift)] uint shift,
                                                         [Values(0b0u, 0b1u)] uint q) // <8H8B, 8H16B>
         {
             uint immHb = (16 - shift) & 0x7F;
@@ -712,7 +713,7 @@ namespace Ryujinx.Tests.Cpu
                                                        [Values(1u, 0u)] uint rn,
                                                        [ValueSource("_2S_")] [Random(RndCnt)] ulong z,
                                                        [ValueSource("_2S_")] [Random(RndCnt)] ulong a,
-                                                       [Range(1u, 16u)] uint shift,
+                                                       [Values(1u, 16u)] [Random(2u, 15u, RndCntShift)] uint shift,
                                                        [Values(0b0u, 0b1u)] uint q) // <4S4H, 4S8H>
         {
             uint immHb = (32 - shift) & 0x7F;
@@ -735,7 +736,7 @@ namespace Ryujinx.Tests.Cpu
                                                        [Values(1u, 0u)] uint rn,
                                                        [ValueSource("_1D_")] [Random(RndCnt)] ulong z,
                                                        [ValueSource("_1D_")] [Random(RndCnt)] ulong a,
-                                                       [Range(1u, 32u)] uint shift,
+                                                       [Values(1u, 32u)] [Random(2u, 31u, RndCntShift)] uint shift,
                                                        [Values(0b0u, 0b1u)] uint q) // <2D2S, 2D4S>
         {
             uint immHb = (64 - shift) & 0x7F;
diff --git a/Ryujinx.Tests/Ryujinx.Tests.csproj b/Ryujinx.Tests/Ryujinx.Tests.csproj
index 35405c769f..ce94326d24 100644
--- a/Ryujinx.Tests/Ryujinx.Tests.csproj
+++ b/Ryujinx.Tests/Ryujinx.Tests.csproj
@@ -16,9 +16,9 @@
   </PropertyGroup>
 
   <ItemGroup>
-    <PackageReference Include="Microsoft.NET.Test.Sdk" Version="15.9.0" />
+    <PackageReference Include="Microsoft.NET.Test.Sdk" Version="16.0.1" />
     <PackageReference Include="NUnit" Version="3.11.0" />
-    <PackageReference Include="NUnit3TestAdapter" Version="3.11.0" />
+    <PackageReference Include="NUnit3TestAdapter" Version="3.13.0" />
     <PackageReference Include="System.Runtime.Intrinsics.Experimental" Version="4.5.0-rc1" />
   </ItemGroup>