diff --git a/ARMeilleure/CodeGen/X86/Assembler.cs b/ARMeilleure/CodeGen/X86/Assembler.cs
index 537c746c35..d0ccd6f810 100644
--- a/ARMeilleure/CodeGen/X86/Assembler.cs
+++ b/ARMeilleure/CodeGen/X86/Assembler.cs
@@ -28,10 +28,10 @@ namespace ARMeilleure.CodeGen.X86
             Vex      = 1 << 4,
 
             PrefixBit  = 16,
-            PrefixMask = 3 << PrefixBit,
+            PrefixMask = 7 << PrefixBit,
             Prefix66   = 1 << PrefixBit,
             PrefixF3   = 2 << PrefixBit,
-            PrefixF2   = 3 << PrefixBit
+            PrefixF2   = 4 << PrefixBit
         }
 
         private struct InstructionInfo
@@ -104,6 +104,9 @@ namespace ARMeilleure.CodeGen.X86
             Add(X86Instruction.Comisd,     new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000f2f, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Comiss,     new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000f2f, InstructionFlags.Vex));
             Add(X86Instruction.Cpuid,      new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000fa2, InstructionFlags.RegOnly));
+            Add(X86Instruction.Crc32,      new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38f1, InstructionFlags.PrefixF2));
+            Add(X86Instruction.Crc32_16,   new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38f1, InstructionFlags.PrefixF2 | InstructionFlags.Prefix66));
+            Add(X86Instruction.Crc32_8,    new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38f0, InstructionFlags.PrefixF2 | InstructionFlags.Reg8Src));
             Add(X86Instruction.Cvtdq2pd,   new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000fe6, InstructionFlags.Vex | InstructionFlags.PrefixF3));
             Add(X86Instruction.Cvtdq2ps,   new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000f5b, InstructionFlags.Vex));
             Add(X86Instruction.Cvtpd2dq,   new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000fe6, InstructionFlags.Vex | InstructionFlags.PrefixF2));
@@ -1172,7 +1175,15 @@ namespace ARMeilleure.CodeGen.X86
 
             if ((flags & InstructionFlags.Vex) != 0 && HardwareCapabilities.SupportsVexEncoding)
             {
-                int vexByte2 = (int)(flags & InstructionFlags.PrefixMask) >> (int)InstructionFlags.PrefixBit;
+                // In a vex encoding, only one prefix can be active at a time. The active prefix is encoded in the second byte using two bits.
+
+                int vexByte2 = (flags & InstructionFlags.PrefixMask) switch
+                {
+                    InstructionFlags.Prefix66 => 1,
+                    InstructionFlags.PrefixF3 => 2,
+                    InstructionFlags.PrefixF2 => 3,
+                    _ => 0
+                };
 
                 if (src1 != null)
                 {
@@ -1220,11 +1231,19 @@ namespace ARMeilleure.CodeGen.X86
             }
             else
             {
-                switch (flags & InstructionFlags.PrefixMask)
+                if (flags.HasFlag(InstructionFlags.Prefix66))
                 {
-                    case InstructionFlags.Prefix66: WriteByte(0x66); break;
-                    case InstructionFlags.PrefixF2: WriteByte(0xf2); break;
-                    case InstructionFlags.PrefixF3: WriteByte(0xf3); break;
+                    WriteByte(0x66);
+                }
+
+                if (flags.HasFlag(InstructionFlags.PrefixF2))
+                {
+                    WriteByte(0xf2);
+                }
+
+                if (flags.HasFlag(InstructionFlags.PrefixF3))
+                {
+                    WriteByte(0xf3);
                 }
 
                 if (rexPrefix != 0)
diff --git a/ARMeilleure/CodeGen/X86/CodeGenerator.cs b/ARMeilleure/CodeGen/X86/CodeGenerator.cs
index e7e7553ede..e217a66581 100644
--- a/ARMeilleure/CodeGen/X86/CodeGenerator.cs
+++ b/ARMeilleure/CodeGen/X86/CodeGenerator.cs
@@ -333,6 +333,21 @@ namespace ARMeilleure.CodeGen.X86
                         break;
                     }
 
+                    case IntrinsicType.Crc32:
+                    {
+                        Operand dest = operation.Destination;
+                        Operand src1 = operation.GetSource(0);
+                        Operand src2 = operation.GetSource(1);
+
+                        EnsureSameReg(dest, src1);
+
+                        Debug.Assert(dest.Type.IsInteger() && src1.Type.IsInteger() && src2.Type.IsInteger());
+
+                        context.Assembler.WriteInstruction(info.Inst, dest, src2, dest.Type);
+
+                        break;
+                    }
+
                     case IntrinsicType.BinaryImm:
                     {
                         Operand dest = operation.Destination;
diff --git a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
index bc07c6b09c..f7469badc0 100644
--- a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
+++ b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
@@ -38,6 +38,9 @@ namespace ARMeilleure.CodeGen.X86
             Add(Intrinsic.X86Comisseq,   new IntrinsicInfo(X86Instruction.Comiss,     IntrinsicType.Comis_));
             Add(Intrinsic.X86Comissge,   new IntrinsicInfo(X86Instruction.Comiss,     IntrinsicType.Comis_));
             Add(Intrinsic.X86Comisslt,   new IntrinsicInfo(X86Instruction.Comiss,     IntrinsicType.Comis_));
+            Add(Intrinsic.X86Crc32,      new IntrinsicInfo(X86Instruction.Crc32,      IntrinsicType.Crc32));
+            Add(Intrinsic.X86Crc32_16,   new IntrinsicInfo(X86Instruction.Crc32_16,   IntrinsicType.Crc32));
+            Add(Intrinsic.X86Crc32_8,    new IntrinsicInfo(X86Instruction.Crc32_8,    IntrinsicType.Crc32));
             Add(Intrinsic.X86Cvtdq2pd,   new IntrinsicInfo(X86Instruction.Cvtdq2pd,   IntrinsicType.Unary));
             Add(Intrinsic.X86Cvtdq2ps,   new IntrinsicInfo(X86Instruction.Cvtdq2ps,   IntrinsicType.Unary));
             Add(Intrinsic.X86Cvtpd2dq,   new IntrinsicInfo(X86Instruction.Cvtpd2dq,   IntrinsicType.Unary));
diff --git a/ARMeilleure/CodeGen/X86/IntrinsicType.cs b/ARMeilleure/CodeGen/X86/IntrinsicType.cs
index 41c52b59db..fe0f66ed7b 100644
--- a/ARMeilleure/CodeGen/X86/IntrinsicType.cs
+++ b/ARMeilleure/CodeGen/X86/IntrinsicType.cs
@@ -9,6 +9,7 @@ namespace ARMeilleure.CodeGen.X86
         Binary,
         BinaryGpr,
         BinaryImm,
+        Crc32,
         Ternary,
         TernaryImm
     }
diff --git a/ARMeilleure/CodeGen/X86/PreAllocator.cs b/ARMeilleure/CodeGen/X86/PreAllocator.cs
index d1794b555d..dc7f3a75a4 100644
--- a/ARMeilleure/CodeGen/X86/PreAllocator.cs
+++ b/ARMeilleure/CodeGen/X86/PreAllocator.cs
@@ -1294,11 +1294,22 @@ namespace ARMeilleure.CodeGen.X86
                 case Instruction.VectorInsert16:
                 case Instruction.VectorInsert8:
                     return !HardwareCapabilities.SupportsVexEncoding;
+
+                case Instruction.Extended:
+                    return IsIntrinsicSameOperandDestSrc1(operation);
             }
 
             return IsVexSameOperandDestSrc1(operation);
         }
 
+        private static bool IsIntrinsicSameOperandDestSrc1(Operation operation) // Same-register (dest == src1) test for Extended operations.
+        {
+            IntrinsicOperation intrinOp = (IntrinsicOperation)operation; // Assumes an intrinsic operation; the visible caller is the Instruction.Extended case.
+            IntrinsicInfo info = IntrinsicTable.GetInfo(intrinOp.Intrinsic);
+            // Crc32-type intrinsics qualify unconditionally; all other intrinsics defer to the VEX-dependent check.
+            return info.Type == IntrinsicType.Crc32 || IsVexSameOperandDestSrc1(operation);
+        }
+
         private static bool IsVexSameOperandDestSrc1(Operation operation)
         {
             if (IsIntrinsic(operation.Instruction))
diff --git a/ARMeilleure/CodeGen/X86/X86Instruction.cs b/ARMeilleure/CodeGen/X86/X86Instruction.cs
index c3dffc62c6..9ac17e5bf3 100644
--- a/ARMeilleure/CodeGen/X86/X86Instruction.cs
+++ b/ARMeilleure/CodeGen/X86/X86Instruction.cs
@@ -33,6 +33,9 @@ namespace ARMeilleure.CodeGen.X86
         Comisd,
         Comiss,
         Cpuid,
+        Crc32,
+        Crc32_16,
+        Crc32_8,
         Cvtdq2pd,
         Cvtdq2ps,
         Cvtpd2dq,
diff --git a/ARMeilleure/Decoders/OpCodeTable.cs b/ARMeilleure/Decoders/OpCodeTable.cs
index 8567e1ce69..c1632d461c 100644
--- a/ARMeilleure/Decoders/OpCodeTable.cs
+++ b/ARMeilleure/Decoders/OpCodeTable.cs
@@ -659,6 +659,12 @@ namespace ARMeilleure.Decoders
             SetA32("<<<<00110101xxxx0000xxxxxxxxxxxx", InstName.Cmp,     InstEmit32.Cmp,     typeof(OpCode32AluImm));
             SetA32("<<<<00010101xxxx0000xxxxxxx0xxxx", InstName.Cmp,     InstEmit32.Cmp,     typeof(OpCode32AluRsImm));
             SetA32("<<<<00010101xxxx0000xxxx0xx1xxxx", InstName.Cmp,     InstEmit32.Cmp,     typeof(OpCode32AluRsReg));
+            SetA32("<<<<00010000xxxxxxxx00000100xxxx", InstName.Crc32b,  InstEmit32.Crc32b,  typeof(OpCode32AluReg));
+            SetA32("<<<<00010000xxxxxxxx00100100xxxx", InstName.Crc32cb, InstEmit32.Crc32cb, typeof(OpCode32AluReg));
+            SetA32("<<<<00010010xxxxxxxx00100100xxxx", InstName.Crc32ch, InstEmit32.Crc32ch, typeof(OpCode32AluReg));
+            SetA32("<<<<00010100xxxxxxxx00100100xxxx", InstName.Crc32cw, InstEmit32.Crc32cw, typeof(OpCode32AluReg));
+            SetA32("<<<<00010010xxxxxxxx00000100xxxx", InstName.Crc32h,  InstEmit32.Crc32h,  typeof(OpCode32AluReg));
+            SetA32("<<<<00010100xxxxxxxx00000100xxxx", InstName.Crc32w,  InstEmit32.Crc32w,  typeof(OpCode32AluReg));
             SetA32("1111010101111111111100000101xxxx", InstName.Dmb,     InstEmit32.Dmb,     typeof(OpCode32));
             SetA32("1111010101111111111100000100xxxx", InstName.Dsb,     InstEmit32.Dsb,     typeof(OpCode32));
             SetA32("<<<<0010001xxxxxxxxxxxxxxxxxxxxx", InstName.Eor,     InstEmit32.Eor,     typeof(OpCode32AluImm));
diff --git a/ARMeilleure/Instructions/InstEmitHash.cs b/ARMeilleure/Instructions/InstEmitHash.cs
index 2a8b348889..82b3e3534f 100644
--- a/ARMeilleure/Instructions/InstEmitHash.cs
+++ b/ARMeilleure/Instructions/InstEmitHash.cs
@@ -1,182 +1,67 @@
-// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
-
 using ARMeilleure.Decoders;
 using ARMeilleure.IntermediateRepresentation;
 using ARMeilleure.Translation;
 
+using static ARMeilleure.Instructions.InstEmitHashHelper;
 using static ARMeilleure.Instructions.InstEmitHelper;
-using static ARMeilleure.Instructions.InstEmitSimdHelper;
-using static ARMeilleure.IntermediateRepresentation.OperandHelper;
 
 namespace ARMeilleure.Instructions
 {
     static partial class InstEmit
     {
+        private const int ByteSizeLog2 = 0;
+        private const int HWordSizeLog2 = 1;
+        private const int WordSizeLog2 = 2;
+        private const int DWordSizeLog2 = 3;
+
         public static void Crc32b(ArmEmitterContext context)
         {
-            if (Optimizations.UsePclmulqdq)
-            {
-                EmitCrc32Optimized(context, false, 8);
-            }
-            else
-            {
-                EmitCrc32Call(context, nameof(SoftFallback.Crc32b));
-            }
+            EmitCrc32Call(context, ByteSizeLog2, false);
         }
 
         public static void Crc32h(ArmEmitterContext context)
         {
-            if (Optimizations.UsePclmulqdq)
-            {
-                EmitCrc32Optimized(context, false, 16);
-            }
-            else
-            {
-                EmitCrc32Call(context, nameof(SoftFallback.Crc32h));
-            }
+            EmitCrc32Call(context, HWordSizeLog2, false);
         }
 
         public static void Crc32w(ArmEmitterContext context)
         {
-            if (Optimizations.UsePclmulqdq)
-            {
-                EmitCrc32Optimized(context, false, 32);
-            }
-            else
-            {
-                EmitCrc32Call(context, nameof(SoftFallback.Crc32w));
-            }
+            EmitCrc32Call(context, WordSizeLog2, false);
         }
 
         public static void Crc32x(ArmEmitterContext context)
         {
-            if (Optimizations.UsePclmulqdq)
-            {
-                EmitCrc32Optimized64(context, false);
-            }
-            else
-            {
-                EmitCrc32Call(context, nameof(SoftFallback.Crc32x));
-            }
+            EmitCrc32Call(context, DWordSizeLog2, false);
         }
 
         public static void Crc32cb(ArmEmitterContext context)
         {
-            if (Optimizations.UsePclmulqdq)
-            {
-                EmitCrc32Optimized(context, true, 8);
-            }
-            else
-            {
-                EmitCrc32Call(context, nameof(SoftFallback.Crc32cb));
-            }
+            EmitCrc32Call(context, ByteSizeLog2, true);
         }
 
         public static void Crc32ch(ArmEmitterContext context)
         {
-            if (Optimizations.UsePclmulqdq)
-            {
-                EmitCrc32Optimized(context, true, 16);
-            }
-            else
-            {
-                EmitCrc32Call(context, nameof(SoftFallback.Crc32ch));
-            }
+            EmitCrc32Call(context, HWordSizeLog2, true);
         }
 
         public static void Crc32cw(ArmEmitterContext context)
         {
-            if (Optimizations.UsePclmulqdq)
-            {
-                EmitCrc32Optimized(context, true, 32);
-            }
-            else
-            {
-                EmitCrc32Call(context, nameof(SoftFallback.Crc32cw));
-            }
+            EmitCrc32Call(context, WordSizeLog2, true);
         }
 
         public static void Crc32cx(ArmEmitterContext context)
         {
-            if (Optimizations.UsePclmulqdq)
-            {
-                EmitCrc32Optimized64(context, true);
-            }
-            else
-            {
-                EmitCrc32Call(context, nameof(SoftFallback.Crc32cx));
-            }
+            EmitCrc32Call(context, DWordSizeLog2, true);
         }
 
-        private static void EmitCrc32Optimized(ArmEmitterContext context, bool castagnoli, int bitsize)
-        {
-            OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp;
-
-            long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))'
-            long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1
-
-            Operand crc = GetIntOrZR(context, op.Rn);
-            Operand data = GetIntOrZR(context, op.Rm);
-
-            crc = context.VectorInsert(context.VectorZero(), crc, 0);
-
-            switch (bitsize)
-            {
-                case 8: data = context.VectorInsert8(context.VectorZero(), data, 0); break;
-                case 16: data = context.VectorInsert16(context.VectorZero(), data, 0); break;
-                case 32: data = context.VectorInsert(context.VectorZero(), data, 0); break;
-            }
-
-            Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data);
-            tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(64 - bitsize));
-            tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(0));
-            tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
-
-            if (bitsize < 32)
-            {
-                crc = context.AddIntrinsic(Intrinsic.X86Pslldq, crc, Const((64 - bitsize) / 8));
-                tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, crc);
-            }
-
-            SetIntOrZR(context, op.Rd, context.VectorExtract(OperandType.I32, tmp, 2));
-        }
-
-        private static void EmitCrc32Optimized64(ArmEmitterContext context, bool castagnoli)
-        {
-            OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp;
-
-            long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))'
-            long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1
-
-            Operand crc = GetIntOrZR(context, op.Rn);
-            Operand data = GetIntOrZR(context, op.Rm);
-
-            crc = context.VectorInsert(context.VectorZero(), crc, 0);
-            data = context.VectorInsert(context.VectorZero(), data, 0);
-
-            Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data);
-            Operand res = context.AddIntrinsic(Intrinsic.X86Pslldq, tmp, Const(4));
-
-            tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, res, X86GetScalar(context, mu), Const(0));
-            tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
-
-            tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, res);
-            tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(32));
-
-            tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(1));
-            tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
-
-            SetIntOrZR(context, op.Rd, context.VectorExtract(OperandType.I32, tmp, 2));
-        }
-
-        private static void EmitCrc32Call(ArmEmitterContext context, string name)
+        private static void EmitCrc32Call(ArmEmitterContext context, int size, bool c)
         {
             OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp;
 
             Operand n = GetIntOrZR(context, op.Rn);
             Operand m = GetIntOrZR(context, op.Rm);
 
-            Operand d = context.Call(typeof(SoftFallback).GetMethod(name), n, m);
+            Operand d = EmitCrc32(context, n, m, size, c);
 
             SetIntOrZR(context, op.Rd, d);
         }
diff --git a/ARMeilleure/Instructions/InstEmitHash32.cs b/ARMeilleure/Instructions/InstEmitHash32.cs
new file mode 100644
index 0000000000..fec782dd82
--- /dev/null
+++ b/ARMeilleure/Instructions/InstEmitHash32.cs
@@ -0,0 +1,54 @@
+using ARMeilleure.Decoders;
+using ARMeilleure.IntermediateRepresentation;
+using ARMeilleure.Translation;
+
+using static ARMeilleure.Instructions.InstEmitHelper;
+using static ARMeilleure.Instructions.InstEmitHashHelper;
+
+namespace ARMeilleure.Instructions
+{
+    static partial class InstEmit32 // A32 emitters for CRC32{B,H,W} and CRC32C{B,H,W}; the *SizeLog2 constants are declared outside this file.
+    {
+        public static void Crc32b(ArmEmitterContext context)
+        {
+            EmitCrc32Call(context, ByteSizeLog2, false);
+        }
+
+        public static void Crc32h(ArmEmitterContext context)
+        {
+            EmitCrc32Call(context, HWordSizeLog2, false);
+        }
+
+        public static void Crc32w(ArmEmitterContext context)
+        {
+            EmitCrc32Call(context, WordSizeLog2, false);
+        }
+
+        public static void Crc32cb(ArmEmitterContext context)
+        {
+            EmitCrc32Call(context, ByteSizeLog2, true);
+        }
+
+        public static void Crc32ch(ArmEmitterContext context)
+        {
+            EmitCrc32Call(context, HWordSizeLog2, true);
+        }
+
+        public static void Crc32cw(ArmEmitterContext context)
+        {
+            EmitCrc32Call(context, WordSizeLog2, true);
+        }
+
+        private static void EmitCrc32Call(ArmEmitterContext context, int size, bool c) // Shared body of all six emitters above.
+        {
+            IOpCode32AluReg op = (IOpCode32AluReg)context.CurrOp;
+            // Rn is passed as EmitCrc32's 'crc' argument and Rm as its 'value' argument.
+            Operand n = GetIntA32(context, op.Rn);
+            Operand m = GetIntA32(context, op.Rm);
+            // 'c' is forwarded as EmitCrc32's 'castagnoli' flag, so true selects the CRC32C variants.
+            Operand d = EmitCrc32(context, n, m, size, c);
+            // EmitAluStore is defined elsewhere; presumably it writes the result to the opcode's destination register.
+            EmitAluStore(context, d);
+        }
+    }
+}
diff --git a/ARMeilleure/Instructions/InstEmitHashHelper.cs b/ARMeilleure/Instructions/InstEmitHashHelper.cs
new file mode 100644
index 0000000000..9206e6d5b9
--- /dev/null
+++ b/ARMeilleure/Instructions/InstEmitHashHelper.cs
@@ -0,0 +1,119 @@
+// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+
+using ARMeilleure.IntermediateRepresentation;
+using ARMeilleure.Translation;
+using System;
+using System.Diagnostics;
+
+using static ARMeilleure.IntermediateRepresentation.OperandHelper;
+using static ARMeilleure.Instructions.InstEmitSimdHelper;
+
+namespace ARMeilleure.Instructions
+{
+    static class InstEmitHashHelper // CRC32 IR emission shared by the A64 (InstEmit) and A32 (InstEmit32) emitters.
+    {
+        public const uint Crc32RevPoly = 0xedb88320; // Bit-reversed CRC-32 polynomial; not referenced inside this class.
+        public const uint Crc32cRevPoly = 0x82f63b78; // Bit-reversed CRC-32C (Castagnoli) polynomial; not referenced inside this class.
+        // Emits one CRC32 step folding 'value' into 'crc', picking the SSE4.2, PCLMULQDQ or SoftFallback implementation.
+        public static Operand EmitCrc32(ArmEmitterContext context, Operand crc, Operand value, int size, bool castagnoli)
+        {
+            Debug.Assert(crc.Type.IsInteger() && value.Type.IsInteger());
+            Debug.Assert(size >= 0 && size < 4); // size is log2 of the data width in bytes: 0 = 8-bit ... 3 = 64-bit.
+            Debug.Assert((size < 3) || (value.Type == OperandType.I64)); // 64-bit data must arrive in a 64-bit operand.
+
+            if (castagnoli && Optimizations.UseSse42) // The x86 CRC32 instruction is only used for the Castagnoli variants.
+            {
+                // The CRC32 instruction does not have an immediate variant, so ensure both inputs are in registers.
+                value = (value.Kind == OperandKind.Constant) ? context.Copy(value) : value;
+                crc = (crc.Kind == OperandKind.Constant) ? context.Copy(crc) : crc;
+                // 8- and 16-bit data have dedicated intrinsics; 32- and 64-bit data share X86Crc32.
+                Intrinsic op = size switch
+                {
+                    0 => Intrinsic.X86Crc32_8,
+                    1 => Intrinsic.X86Crc32_16,
+                    _ => Intrinsic.X86Crc32,
+                };
+                // 64-bit data uses the long intrinsic form, whose result is then narrowed back to I32.
+                return (size == 3) ? context.ConvertI64ToI32(context.AddIntrinsicLong(op, crc, value)) : context.AddIntrinsicInt(op, crc, value);
+            }
+            else if (Optimizations.UsePclmulqdq) // Carry-less multiply path, for either polynomial.
+            {
+                return size switch
+                {
+                    3 => EmitCrc32Optimized64(context, crc, value, castagnoli),
+                    _ => EmitCrc32Optimized(context, crc, value, castagnoli, size),
+                };
+            }
+            else // Neither fast path applies: call the matching SoftFallback method, looked up by name.
+            {
+                string name = (size, castagnoli) switch 
+                {
+                    (0, false) => nameof(SoftFallback.Crc32b),
+                    (1, false) => nameof(SoftFallback.Crc32h),
+                    (2, false) => nameof(SoftFallback.Crc32w),
+                    (3, false) => nameof(SoftFallback.Crc32x),
+                    (0, true) => nameof(SoftFallback.Crc32cb),
+                    (1, true) => nameof(SoftFallback.Crc32ch),
+                    (2, true) => nameof(SoftFallback.Crc32cw),
+                    (3, true) => nameof(SoftFallback.Crc32cx),
+                    _ => throw new ArgumentOutOfRangeException(nameof(size))
+                };
+
+                return context.Call(typeof(SoftFallback).GetMethod(name), crc, value);
+            }
+        }
+        // PCLMULQDQ implementation for 8/16/32-bit data (size 0..2); returns an I32 extracted from the result vector.
+        private static Operand EmitCrc32Optimized(ArmEmitterContext context, Operand crc, Operand data, bool castagnoli, int size)
+        {
+            long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))'
+            long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1
+
+            crc = context.VectorInsert(context.VectorZero(), crc, 0);
+            // Insert the data at index 0 of a zeroed vector, using the insert that matches its width.
+            switch (size)
+            {
+                case 0: data = context.VectorInsert8(context.VectorZero(), data, 0); break;
+                case 1: data = context.VectorInsert16(context.VectorZero(), data, 0); break;
+                case 2: data = context.VectorInsert(context.VectorZero(), data, 0); break;
+            }
+
+            int bitsize = 8 << size;
+
+            Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data);
+            tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(64 - bitsize));
+            tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(0));
+            tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
+            // For data narrower than 32 bits, the byte-shifted CRC is XORed back into the result.
+            if (bitsize < 32)
+            {
+                crc = context.AddIntrinsic(Intrinsic.X86Pslldq, crc, Const((64 - bitsize) / 8));
+                tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, crc);
+            }
+
+            return context.VectorExtract(OperandType.I32, tmp, 2);
+        }
+        // PCLMULQDQ implementation for 64-bit data (size 3); applies the mu/polynomial multiply pair twice.
+        private static Operand EmitCrc32Optimized64(ArmEmitterContext context, Operand crc, Operand data, bool castagnoli)
+        {
+            long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))'
+            long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1
+
+            crc = context.VectorInsert(context.VectorZero(), crc, 0);
+            data = context.VectorInsert(context.VectorZero(), data, 0);
+
+            Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data);
+            Operand res = context.AddIntrinsic(Intrinsic.X86Pslldq, tmp, Const(4));
+
+            tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, res, X86GetScalar(context, mu), Const(0));
+            tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
+
+            tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, res);
+            tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(32));
+
+            tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(1));
+            tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
+
+            return context.VectorExtract(OperandType.I32, tmp, 2);
+        }
+    }
+}
diff --git a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
index 639ba7f926..7f89117005 100644
--- a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
+++ b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
@@ -27,6 +27,9 @@ namespace ARMeilleure.IntermediateRepresentation
         X86Comisseq,
         X86Comissge,
         X86Comisslt,
+        X86Crc32,
+        X86Crc32_16,
+        X86Crc32_8,
         X86Cvtdq2pd,
         X86Cvtdq2ps,
         X86Cvtpd2dq,
diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs
index 0051d25adb..deffabe1e1 100644
--- a/ARMeilleure/Translation/PTC/Ptc.cs
+++ b/ARMeilleure/Translation/PTC/Ptc.cs
@@ -20,7 +20,7 @@ namespace ARMeilleure.Translation.PTC
     {
         private const string HeaderMagic = "PTChd";
 
-        private const int InternalVersion = 8; //! To be incremented manually for each change to the ARMeilleure project.
+        private const int InternalVersion = 9; //! To be incremented manually for each change to the ARMeilleure project.
 
         private const string BaseDir = "Ryujinx";
 
diff --git a/Ryujinx.Tests/Cpu/CpuTest.cs b/Ryujinx.Tests/Cpu/CpuTest.cs
index 9e37c2114c..2c04724816 100644
--- a/Ryujinx.Tests/Cpu/CpuTest.cs
+++ b/Ryujinx.Tests/Cpu/CpuTest.cs
@@ -167,41 +167,42 @@ namespace Ryujinx.Tests.Cpu
             }
         }
 
-        protected void ExecuteOpcodes()
+        protected void ExecuteOpcodes(bool runUnicorn = true)
         {
             _cpuContext.Execute(_context, _entryPoint);
 
-            if (_unicornAvailable)
+            if (_unicornAvailable && runUnicorn)
             {
                 _unicornEmu.RunForCount((_currAddress - _entryPoint - 4) / 4);
             }
         }
 
         protected ExecutionContext SingleOpcode(uint  opcode,
-                                                ulong x0       = 0,
-                                                ulong x1       = 0,
-                                                ulong x2       = 0,
-                                                ulong x3       = 0,
-                                                ulong x31      = 0,
-                                                V128  v0       = default,
-                                                V128  v1       = default,
-                                                V128  v2       = default,
-                                                V128  v3       = default,
-                                                V128  v4       = default,
-                                                V128  v5       = default,
-                                                V128  v30      = default,
-                                                V128  v31      = default,
-                                                bool  overflow = false,
-                                                bool  carry    = false,
-                                                bool  zero     = false,
-                                                bool  negative = false,
-                                                int   fpcr     = 0,
-                                                int   fpsr     = 0)
+                                                ulong x0         = 0,
+                                                ulong x1         = 0,
+                                                ulong x2         = 0,
+                                                ulong x3         = 0,
+                                                ulong x31        = 0,
+                                                V128  v0         = default,
+                                                V128  v1         = default,
+                                                V128  v2         = default,
+                                                V128  v3         = default,
+                                                V128  v4         = default,
+                                                V128  v5         = default,
+                                                V128  v30        = default,
+                                                V128  v31        = default,
+                                                bool  overflow   = false,
+                                                bool  carry      = false,
+                                                bool  zero       = false,
+                                                bool  negative   = false,
+                                                int   fpcr       = 0,
+                                                int   fpsr       = 0,
+                                                bool  runUnicorn = true)
         {
             Opcode(opcode);
             Opcode(0xD65F03C0); // RET
             SetContext(x0, x1, x2, x3, x31, v0, v1, v2, v3, v4, v5, v30, v31, overflow, carry, zero, negative, fpcr, fpsr);
-            ExecuteOpcodes();
+            ExecuteOpcodes(runUnicorn);
 
             return GetContext();
         }
diff --git a/Ryujinx.Tests/Cpu/CpuTestAluBinary.cs b/Ryujinx.Tests/Cpu/CpuTestAluBinary.cs
index 2823477fc5..eb94b84502 100644
--- a/Ryujinx.Tests/Cpu/CpuTestAluBinary.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestAluBinary.cs
@@ -1,5 +1,6 @@
 #define AluBinary
 
+using ARMeilleure.State;
 using NUnit.Framework;
 
 namespace Ryujinx.Tests.Cpu
@@ -8,8 +9,78 @@ namespace Ryujinx.Tests.Cpu
     public sealed class CpuTestAluBinary : CpuTest
     {
 #if AluBinary
+        public struct CrcTest // Known-answer vector consumed by Crc32_b_h_w_x.
+        {
+            public uint Crc; // Passed to SingleOpcode as x1 by the test.
+            public ulong Value; // Passed to SingleOpcode as x2 by the test.
+            public bool C; // Stored by the constructor; Crc32_b_h_w_x does not read it.
+
+            public uint[] Results; // One result for each CRC variant (8, 16, 32, 64), indexed by the test's size value
+
+            public CrcTest(uint crc, ulong value, bool c, params uint[] results)
+            {
+                Crc = crc;
+                Value = value;
+                C = c;
+                Results = results;
+            }
+        }
+
+#region "ValueSource (CRC32)"
+        private static CrcTest[] _CRC32_Test_Values_()
+        {
+            // Expected results created with http://www.sunshine2k.de/coding/javascript/crc/crc_js.html, with:
+            //  - non-reflected polynomials
+            //  - input reflected, result reflected
+            //  - bytes in order of increasing significance
+            //  - xor 0
+            // Only includes the non-C variant (every entry has c = false), as the other can be tested with unicorn.
+
+            return new CrcTest[] // Each entry: crc, value, c, then one expected result per size (0..3).
+            {
+                new CrcTest(0x00000000u, 0x00_00_00_00_00_00_00_00u, false, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
+                new CrcTest(0x00000000u, 0x7f_ff_ff_ff_ff_ff_ff_ffu, false, 0x2d02ef8d, 0xbe2612ff, 0xdebb20e3, 0xa9de8355),
+                new CrcTest(0x00000000u, 0x80_00_00_00_00_00_00_00u, false, 0x00000000, 0x00000000, 0x00000000, 0xedb88320),
+                new CrcTest(0x00000000u, 0xff_ff_ff_ff_ff_ff_ff_ffu, false, 0x2d02ef8d, 0xbe2612ff, 0xdebb20e3, 0x44660075),
+                new CrcTest(0x00000000u, 0xa0_02_f1_ca_52_78_8c_1cu, false, 0x14015c4f, 0x02799256, 0x9063c9e5, 0x8816610a),
+
+                new CrcTest(0xffffffffu, 0x00_00_00_00_00_00_00_00u, false, 0x2dfd1072, 0xbe26ed00, 0xdebb20e3, 0x9add2096),
+                new CrcTest(0xffffffffu, 0x7f_ff_ff_ff_ff_ff_ff_ffu, false, 0x00ffffff, 0x0000ffff, 0x00000000, 0x3303a3c3),
+                new CrcTest(0xffffffffu, 0x80_00_00_00_00_00_00_00u, false, 0x2dfd1072, 0xbe26ed00, 0xdebb20e3, 0x7765a3b6),
+                new CrcTest(0xffffffffu, 0xff_ff_ff_ff_ff_ff_ff_ffu, false, 0x00ffffff, 0x0000ffff, 0x00000000, 0xdebb20e3),
+                new CrcTest(0xffffffffu, 0xa0_02_f1_ca_52_78_8c_1cu, false, 0x39fc4c3d, 0xbc5f7f56, 0x4ed8e906, 0x12cb419c)
+            };
+        }
+#endregion
+
         private const int RndCnt = 2;
 
+        [Test, Combinatorial, Description("CRC32B/CRC32H/CRC32W/CRC32X <Wd>, <Wn>, <Wm|Xm>")]
+        public void Crc32_b_h_w_x([Values(0u)] uint rd,
+                                  [Values(1u)] uint rn,
+                                  [Values(2u)] uint rm,
+                                  [Range(0u, 3u)] uint size,
+                                  [ValueSource("_CRC32_Test_Values_")] CrcTest test)
+        {
+            uint opcode = 0x1AC04000; // CRC32B W0, W0, W0
+
+            opcode |= size << 10; // Selects the variant; the same size value indexes test.Results below.
+            opcode |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0);
+
+            if (size == 3) // Only the last variant (X, per the test name) sets the top opcode bit.
+            {
+                opcode |= 0x80000000;
+            }
+
+            uint w31 = TestContext.CurrentContext.Random.NextUInt();
+
+            SingleOpcode(opcode, x1: test.Crc, x2: test.Value, x31: w31, runUnicorn: false); // Checked against the table, so Unicorn is skipped.
+
+            ExecutionContext context = GetContext();
+            ulong result = context.GetX((int)rd);
+            Assert.That(result, Is.EqualTo((ulong)test.Results[size])); // Constraint form reports expected vs. actual on failure.
+        }
+
         [Test, Pairwise, Description("CRC32X <Wd>, <Wn>, <Xm>"), Ignore("Unicorn fails.")]
         public void Crc32x([Values(0u, 31u)] uint rd,
                            [Values(1u, 31u)] uint rn,
diff --git a/Ryujinx.Tests/Cpu/CpuTestAluBinary32.cs b/Ryujinx.Tests/Cpu/CpuTestAluBinary32.cs
new file mode 100644
index 0000000000..0a0302c269
--- /dev/null
+++ b/Ryujinx.Tests/Cpu/CpuTestAluBinary32.cs
@@ -0,0 +1,96 @@
+#define AluBinary32
+
+using ARMeilleure.State;
+using NUnit.Framework;
+using System;
+
+namespace Ryujinx.Tests.Cpu
+{
+
+    [Category("AluBinary32")]
+    public sealed class CpuTestAluBinary32 : CpuTest32 // Table-driven tests for the 32-bit CRC32/CRC32C opcodes.
+    {
+#if AluBinary32
+        public struct CrcTest32 // Known-answer vector consumed by Crc32_Crc32c_b_h_w.
+        {
+            public uint Crc; // Passed to SingleOpcode as r1 by the test.
+            public uint Value; // Passed to SingleOpcode as r2 by the test.
+            public bool C; // When true, the test sets opcode bit 9 (the C variant).
+
+            public uint[] Results; // One result for each CRC variant (8, 16, 32), indexed by the test's size value
+
+            public CrcTest32(uint crc, uint value, bool c, params uint[] results)
+            {
+                Crc = crc;
+                Value = value;
+                C = c;
+                Results = results;
+            }
+        }
+
+#region "ValueSource (CRC32/CRC32C)"
+        private static CrcTest32[] _CRC32_Test_Values_()
+        {
+            // Expected results created with http://www.sunshine2k.de/coding/javascript/crc/crc_js.html, with:
+            //  - non-reflected polynomials
+            //  - input reflected, result reflected
+            //  - bytes in order of increasing significance
+            //  - xor 0
+
+            return new CrcTest32[] // Each entry: crc, value, c, then one expected result per size (0..2).
+            {
+                new CrcTest32(0x00000000u, 0x00_00_00_00u, false, 0x00000000, 0x00000000, 0x00000000),
+                new CrcTest32(0x00000000u, 0x7f_ff_ff_ffu, false, 0x2d02ef8d, 0xbe2612ff, 0x3303a3c3),
+                new CrcTest32(0x00000000u, 0x80_00_00_00u, false, 0x00000000, 0x00000000, 0xedb88320),
+                new CrcTest32(0x00000000u, 0xff_ff_ff_ffu, false, 0x2d02ef8d, 0xbe2612ff, 0xdebb20e3),
+                new CrcTest32(0x00000000u, 0x9d_cb_12_f0u, false, 0xbdbdf21c, 0xe70590f5, 0x3f7480c5),
+
+                new CrcTest32(0xffffffffu, 0x00_00_00_00u, false, 0x2dfd1072, 0xbe26ed00, 0xdebb20e3),
+                new CrcTest32(0xffffffffu, 0x7f_ff_ff_ffu, false, 0x00ffffff, 0x0000ffff, 0xedb88320),
+                new CrcTest32(0xffffffffu, 0x80_00_00_00u, false, 0x2dfd1072, 0xbe26ed00, 0x3303a3c3),
+                new CrcTest32(0xffffffffu, 0xff_ff_ff_ffu, false, 0x00ffffff, 0x0000ffff, 0x00000000),
+                new CrcTest32(0xffffffffu, 0x9d_cb_12_f0u, false, 0x9040e26e, 0x59237df5, 0xe1cfa026),
+
+                new CrcTest32(0x00000000u, 0x00_00_00_00u, true, 0x00000000, 0x00000000, 0x00000000),
+                new CrcTest32(0x00000000u, 0x7f_ff_ff_ffu, true, 0xad7d5351, 0x0e9e77d2, 0x356e8f40),
+                new CrcTest32(0x00000000u, 0x80_00_00_00u, true, 0x00000000, 0x00000000, 0x82f63b78),
+                new CrcTest32(0x00000000u, 0xff_ff_ff_ffu, true, 0xad7d5351, 0x0e9e77d2, 0xb798b438),
+                new CrcTest32(0x00000000u, 0x9d_cb_12_f0u, true, 0xf36e6f75, 0xb5ff99e6, 0x782dfbf1),
+
+                new CrcTest32(0xffffffffu, 0x00_00_00_00u, true, 0xad82acae, 0x0e9e882d, 0xb798b438),
+                new CrcTest32(0xffffffffu, 0x7f_ff_ff_ffu, true, 0x00ffffff, 0x0000ffff, 0x82f63b78),
+                new CrcTest32(0xffffffffu, 0x80_00_00_00u, true, 0xad82acae, 0x0e9e882d, 0x356e8f40),
+                new CrcTest32(0xffffffffu, 0xff_ff_ff_ffu, true, 0x00ffffff, 0x0000ffff, 0x00000000),
+                new CrcTest32(0xffffffffu, 0x9d_cb_12_f0u, true, 0x5eecc3db, 0xbb6111cb, 0xcfb54fc9)
+            };
+        }
+#endregion
+
+        [Test, Combinatorial, Description("CRC32B/H/W, CRC32CB/H/W <Rd>, <Rn>, <Rm>")]
+        public void Crc32_Crc32c_b_h_w([Values(0u)] uint rd,
+                                       [Values(1u)] uint rn,
+                                       [Values(2u)] uint rm,
+                                       [Range(0u, 2u)] uint size,
+                                       [ValueSource("_CRC32_Test_Values_")] CrcTest32 test)
+        {
+            // Unicorn does not yet support 32bit crc instructions, so test against a known table of results/values.
+
+            uint opcode = 0xe1000040; // CRC32B R0, R0, R0
+            opcode |= ((rm & 15) << 0) | ((rd & 15) << 12) | ((rn & 15) << 16);
+            opcode |= size << 21; // Selects the variant; the same size value indexes test.Results below.
+            if (test.C) // Table rows with C = true exercise the C variant of the opcode.
+            {
+                opcode |= 1 << 9;
+            }
+
+            uint sp = TestContext.CurrentContext.Random.NextUInt();
+
+            SingleOpcode(opcode, r1: test.Crc, r2: test.Value, sp: sp, runUnicorn: false);
+
+            ExecutionContext context = GetContext();
+            ulong result = context.GetX((int)rd);
+            Assert.That(result, Is.EqualTo((ulong)test.Results[size])); // Constraint form reports expected vs. actual on failure.
+        }
+#endif
+    }
+}