diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
index bd6a98bed8..f18b91cfcc 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
@@ -120,24 +120,155 @@ namespace ARMeilleure.Instructions
         {
             OpCodeSimd op = (OpCodeSimd)context.CurrOp;
 
-            Operand res = context.VectorZero();
-
-            int elems = op.GetBytesCount() >> op.Size;
-
             int eSize = 8 << op.Size;
 
-            for (int index = 0; index < elems; index++)
+            Operand res = eSize switch {
+                8  => Clz_V_I8 (context, GetVec(op.Rn)),
+                16 => Clz_V_I16(context, GetVec(op.Rn)),
+                32 => Clz_V_I32(context, GetVec(op.Rn)),
+                _  => null
+            };
+
+            if (res != null)
             {
-                Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    res = context.VectorZeroUpper64(res);
+                }
+            }
+            else
+            {
+                int elems = op.GetBytesCount() >> op.Size;
 
-                Operand de = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountLeadingZeros)), ne, Const(eSize));
+                res = context.VectorZero();
 
-                res = EmitVectorInsert(context, res, de, index, op.Size);
+                for (int index = 0; index < elems; index++)
+                {
+                    Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);
+
+                    Operand de = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountLeadingZeros)), ne, Const(eSize));
+
+                    res = EmitVectorInsert(context, res, de, index, op.Size);
+                }
             }
 
             context.Copy(GetVec(op.Rd), res);
         }
 
+        /// <summary>
+        /// Emits a per-byte count-leading-zeros of <paramref name="arg"/> built from two PSHUFB nibble lookups.
+        /// Returns null when Optimizations.UseSsse3 is false.
+        /// </summary>
+        private static Operand Clz_V_I8(ArmEmitterContext context, Operand arg)
+        {
+            if (!Optimizations.UseSsse3)
+            {
+                return null;
+            }
+
+            // Lookup table: byte i (i = 0..7) holds the 4-bit CLZ of i, i.e. 4, 3, 2, 2, 1, 1, 1, 1.
+            Operand nibbleClzLut  = X86GetScalar(context, 0x01_01_01_01_02_02_03_04);
+            Operand lowNibbleMask = X86GetAllElements(context, 0x0f_0f_0f_0f);
+            Operand allFours      = X86GetAllElements(context, 0x04_04_04_04);
+
+            // Look the table up with the raw bytes (low-nibble count; only used when the high nibble is zero).
+            Operand lowCount = context.AddIntrinsic(Intrinsic.X86Pshufb, nibbleClzLut, arg);
+
+            // Shift the high nibble of every byte down to bits 0..3 and look it up the same way.
+            Operand highNibbles = context.AddIntrinsic(Intrinsic.X86Pand, context.AddIntrinsic(Intrinsic.X86Psrlw, arg, Const(4)), lowNibbleMask);
+            Operand highCount   = context.AddIntrinsic(Intrinsic.X86Pshufb, nibbleClzLut, highNibbles);
+
+            // A high count of 4 means the high nibble is zero; only then does the low count contribute.
+            Operand highIsZero = context.AddIntrinsic(Intrinsic.X86Pcmpeqb, highCount, allFours);
+            lowCount = context.AddIntrinsic(Intrinsic.X86Pand, lowCount, highIsZero);
+
+            return context.AddIntrinsic(Intrinsic.X86Paddb, lowCount, highCount);
+        }
+
+        /// <summary>
+        /// Emits a per-16-bit-element count-leading-zeros of <paramref name="arg"/> by combining the per-byte
+        /// counts produced by Clz_V_I8. Returns null when Optimizations.UseSsse3 is false.
+        /// </summary>
+        private static Operand Clz_V_I16(ArmEmitterContext context, Operand arg)
+        {
+            if (!Optimizations.UseSsse3)
+            {
+                return null;
+            }
+
+            Operand highToLowShuffle = X86GetElements(context, 0x80_0f_80_0d_80_0b_80_09, 0x80_07_80_05_80_03_80_01);
+            Operand lowByteMask      = X86GetAllElements(context, 0x00ff_00ff);
+            Operand allEights        = X86GetAllElements(context, 0x0008_0008);
+
+            // Byte-wise counts: keep the low byte's count in place, shuffle the high byte's count down beside it.
+            Operand byteCounts = Clz_V_I8(context, arg);
+            Operand lowCount   = context.AddIntrinsic(Intrinsic.X86Pand, byteCounts, lowByteMask);
+            Operand highCount  = context.AddIntrinsic(Intrinsic.X86Pshufb, byteCounts, highToLowShuffle);
+
+            // A high count of 8 means the high byte is zero; only then does the low count contribute.
+            Operand highIsZero = context.AddIntrinsic(Intrinsic.X86Pcmpeqw, highCount, allEights);
+            return context.AddIntrinsic(Intrinsic.X86Paddw, context.AddIntrinsic(Intrinsic.X86Pand, lowCount, highIsZero), highCount);
+        }
+
+        /// <summary>
+        /// Emits a per-32-bit-element count-leading-zeros of <paramref name="arg"/> using SSE2 integer ops:
+        /// smear the highest set bit downwards, invert, then population-count what is left.
+        /// Returns null when Optimizations.UseSse2 is false.
+        /// </summary>
+        private static Operand Clz_V_I32(ArmEmitterContext context, Operand arg)
+        {
+            // TODO: Use vplzcntd when AVX-512 is supported.
+            if (!Optimizations.UseSse2)
+            {
+                return null;
+            }
+
+            // Short aliases for the intrinsics used below.
+            Operand Add32(Operand a, Operand b)   => context.AddIntrinsic(Intrinsic.X86Paddd, a, b);
+            Operand Sub32(Operand a, Operand b)   => context.AddIntrinsic(Intrinsic.X86Psubd, a, b);
+            Operand Shr32(Operand a, int shift)   => context.AddIntrinsic(Intrinsic.X86Psrld, a, Const(shift));
+            Operand OrBits(Operand a, Operand b)  => context.AddIntrinsic(Intrinsic.X86Por, a, b);
+            Operand AndBits(Operand a, Operand b) => context.AddIntrinsic(Intrinsic.X86Pand, a, b);
+            Operand NotBits(Operand a)            => context.AddIntrinsic(Intrinsic.X86Pandn, a, context.VectorOne());
+
+            // Masks for the parallel bit count and for the final result.
+            Operand pairMask   = X86GetAllElements(context, 0x55555555);
+            Operand nibbleMask = X86GetAllElements(context, 0x33333333);
+            Operand byteMask   = X86GetAllElements(context, 0x0f0f0f0f);
+            Operand resultMask = X86GetAllElements(context, 0x0000003f);
+
+            // Propagate the highest set bit of each element into every lower bit position.
+            Operand bits = OrBits(Shr32(arg, 1), arg);
+            bits = OrBits(Shr32(bits, 2), bits);
+            bits = OrBits(Shr32(bits, 4), bits);
+            bits = OrBits(Shr32(bits, 8), bits);
+            bits = OrBits(Shr32(bits, 16), bits);
+
+            // Invert, so the set bits are exactly the original leading zeros
+            // (NotBits relies on context.VectorOne() being all ones - TODO confirm).
+            bits = NotBits(bits);
+
+            // Population count, step 1: 2-bit sums, x - ((x >> 1) & 0x55555555).
+            Operand oddBits = AndBits(Shr32(bits, 1), pairMask);
+            Operand count   = Sub32(bits, oddBits);
+
+            // Step 2: 4-bit sums.
+            Operand highPairs = AndBits(Shr32(count, 2), nibbleMask);
+            Operand lowPairs  = AndBits(count, nibbleMask);
+            count = Add32(highPairs, lowPairs);
+
+            // Step 3: 8-bit sums.
+            count = AndBits(Add32(Shr32(count, 4), count), byteMask);
+
+            // Steps 4 and 5: fold the four byte sums into the low byte of each element.
+            count = Add32(Shr32(count, 8), count);
+            count = Add32(Shr32(count, 16), count);
+
+            // Keep the low six bits (enough for 0..32); the upper bytes still hold
+            // partial sums from the folding above.
+            return AndBits(count, resultMask);
+        }
+
         public static void Cnt_V(ArmEmitterContext context)
         {
             OpCodeSimd op = (OpCodeSimd)context.CurrOp;
diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper.cs b/ARMeilleure/Instructions/InstEmitSimdHelper.cs
index e9d5303c78..da8ccae78a 100644
--- a/ARMeilleure/Instructions/InstEmitSimdHelper.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdHelper.cs
@@ -209,6 +209,11 @@ namespace ARMeilleure.Instructions
         }
 
         public static Operand X86GetElements(ArmEmitterContext context, long e1, long e0)
+            // Signed convenience overload: casts both 64-bit halves to ulong and
+            // forwards them, in the same order, to the unsigned overload.
+            => X86GetElements(context, (ulong)e1, (ulong)e0);
+
+        public static Operand X86GetElements(ArmEmitterContext context, ulong e1, ulong e0)
         {
             Operand vector0 = context.VectorCreateScalar(Const(e0));
             Operand vector1 = context.VectorCreateScalar(Const(e1));
diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs
index 8f250a5528..92094e6212 100644
--- a/ARMeilleure/Translation/PTC/Ptc.cs
+++ b/ARMeilleure/Translation/PTC/Ptc.cs
@@ -22,7 +22,7 @@ namespace ARMeilleure.Translation.PTC
     {
         private const string HeaderMagic = "PTChd";
 
-        private const int InternalVersion = 1817; //! To be incremented manually for each change to the ARMeilleure project.
+        private const int InternalVersion = 1917; //! To be incremented manually for each change to the ARMeilleure project.
 
         private const string ActualDir = "0";
         private const string BackupDir = "1";