From f8cd072b62808c8da06549807cc263003f0049b7 Mon Sep 17 00:00:00 2001
From: merry <MerryMage@users.noreply.github.com>
Date: Fri, 5 Jun 2020 11:58:27 +0100
Subject: [PATCH] Faster crc32 implementation (#1294)

* Add Pclmulqdq intrinsic

* Implement crc32 in terms of pclmulqdq

* Address PR comments
---
 ARMeilleure/CodeGen/X86/Assembler.cs          |   8 +
 ARMeilleure/CodeGen/X86/IntrinsicTable.cs     |   1 +
 ARMeilleure/CodeGen/X86/X86Instruction.cs     |   1 +
 ARMeilleure/Instructions/InstEmitHash.cs      | 137 +++++++++++++++++-
 .../IntermediateRepresentation/Intrinsic.cs   |   1 +
 ARMeilleure/Optimizations.cs                  |  38 ++---
 6 files changed, 160 insertions(+), 26 deletions(-)

diff --git a/ARMeilleure/CodeGen/X86/Assembler.cs b/ARMeilleure/CodeGen/X86/Assembler.cs
index de361677bf..5ad54289ce 100644
--- a/ARMeilleure/CodeGen/X86/Assembler.cs
+++ b/ARMeilleure/CodeGen/X86/Assembler.cs
@@ -165,6 +165,7 @@ namespace ARMeilleure.CodeGen.X86
             Add(X86Instruction.Pavgb,      new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000fe0, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Pavgw,      new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000fe3, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Pblendvb,   new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f3810, InstructionFlags.Prefix66));
+            Add(X86Instruction.Pclmulqdq,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f3a44, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Pcmpeqb,    new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000f74, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Pcmpeqd,    new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000f76, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Pcmpeqq,    new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f3829, InstructionFlags.Vex | InstructionFlags.Prefix66));
@@ -633,6 +634,13 @@ namespace ARMeilleure.CodeGen.X86
             WriteInstruction(dest, source, type, X86Instruction.Or);
         }
 
+        // Writes the Pclmulqdq instruction for dest/source, then imm as a single byte right after it.
+        public void Pclmulqdq(Operand dest, Operand source, byte imm)
+        {
+            WriteInstruction(dest, null, source, X86Instruction.Pclmulqdq);
+            WriteByte(imm);
+        }
+
         public void Pcmpeqw(Operand dest, Operand src1, Operand src2)
         {
             WriteInstruction(dest, src1, src2, X86Instruction.Pcmpeqw);
diff --git a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
index 5382e3ead8..bc07c6b09c 100644
--- a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
+++ b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
@@ -82,6 +82,7 @@ namespace ARMeilleure.CodeGen.X86
             Add(Intrinsic.X86Pavgb,      new IntrinsicInfo(X86Instruction.Pavgb,      IntrinsicType.Binary));
             Add(Intrinsic.X86Pavgw,      new IntrinsicInfo(X86Instruction.Pavgw,      IntrinsicType.Binary));
             Add(Intrinsic.X86Pblendvb,   new IntrinsicInfo(X86Instruction.Pblendvb,   IntrinsicType.Ternary));
+            Add(Intrinsic.X86Pclmulqdq,  new IntrinsicInfo(X86Instruction.Pclmulqdq,  IntrinsicType.TernaryImm));
             Add(Intrinsic.X86Pcmpeqb,    new IntrinsicInfo(X86Instruction.Pcmpeqb,    IntrinsicType.Binary));
             Add(Intrinsic.X86Pcmpeqd,    new IntrinsicInfo(X86Instruction.Pcmpeqd,    IntrinsicType.Binary));
             Add(Intrinsic.X86Pcmpeqq,    new IntrinsicInfo(X86Instruction.Pcmpeqq,    IntrinsicType.Binary));
diff --git a/ARMeilleure/CodeGen/X86/X86Instruction.cs b/ARMeilleure/CodeGen/X86/X86Instruction.cs
index e4682595e8..c3dffc62c6 100644
--- a/ARMeilleure/CodeGen/X86/X86Instruction.cs
+++ b/ARMeilleure/CodeGen/X86/X86Instruction.cs
@@ -98,6 +98,7 @@ namespace ARMeilleure.CodeGen.X86
         Pavgb,
         Pavgw,
         Pblendvb,
+        Pclmulqdq,
         Pcmpeqb,
         Pcmpeqd,
         Pcmpeqq,
diff --git a/ARMeilleure/Instructions/InstEmitHash.cs b/ARMeilleure/Instructions/InstEmitHash.cs
index 0be8458e20..8a539666e9 100644
--- a/ARMeilleure/Instructions/InstEmitHash.cs
+++ b/ARMeilleure/Instructions/InstEmitHash.cs
@@ -1,9 +1,13 @@
+// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+
 using ARMeilleure.Decoders;
 using ARMeilleure.IntermediateRepresentation;
 using ARMeilleure.Translation;
 using System;
 
 using static ARMeilleure.Instructions.InstEmitHelper;
+using static ARMeilleure.Instructions.InstEmitSimdHelper;
+using static ARMeilleure.IntermediateRepresentation.OperandHelper;
 
 namespace ARMeilleure.Instructions
 {
@@ -11,42 +15,159 @@ namespace ARMeilleure.Instructions
     {
         public static void Crc32b(ArmEmitterContext context)
         {
-            EmitCrc32Call(context, new _U32_U32_U8(SoftFallback.Crc32b));
+            if (!Optimizations.UsePclmulqdq)
+            {
+                // Software fallback when the PCLMULQDQ path is disabled or unsupported.
+                EmitCrc32Call(context, new _U32_U32_U8(SoftFallback.Crc32b));
+                return;
+            }
+
+            EmitCrc32Optimized(context, castagnoli: false, bitsize: 8);
         }
 
         public static void Crc32h(ArmEmitterContext context)
         {
-            EmitCrc32Call(context, new _U32_U32_U16(SoftFallback.Crc32h));
+            if (!Optimizations.UsePclmulqdq)
+            {
+                // Software fallback when the PCLMULQDQ path is disabled or unsupported.
+                EmitCrc32Call(context, new _U32_U32_U16(SoftFallback.Crc32h));
+                return;
+            }
+
+            EmitCrc32Optimized(context, castagnoli: false, bitsize: 16);
         }
 
         public static void Crc32w(ArmEmitterContext context)
         {
-            EmitCrc32Call(context, new _U32_U32_U32(SoftFallback.Crc32w));
+            if (!Optimizations.UsePclmulqdq)
+            {
+                // Software fallback when the PCLMULQDQ path is disabled or unsupported.
+                EmitCrc32Call(context, new _U32_U32_U32(SoftFallback.Crc32w));
+                return;
+            }
+
+            EmitCrc32Optimized(context, castagnoli: false, bitsize: 32);
         }
 
         public static void Crc32x(ArmEmitterContext context)
         {
-            EmitCrc32Call(context, new _U32_U32_U64(SoftFallback.Crc32x));
+            if (!Optimizations.UsePclmulqdq)
+            {
+                // Software fallback when the PCLMULQDQ path is disabled or unsupported.
+                EmitCrc32Call(context, new _U32_U32_U64(SoftFallback.Crc32x));
+                return;
+            }
+
+            EmitCrc32Optimized64(context, castagnoli: false);
         }
 
         public static void Crc32cb(ArmEmitterContext context)
         {
-            EmitCrc32Call(context, new _U32_U32_U8(SoftFallback.Crc32cb));
+            if (!Optimizations.UsePclmulqdq)
+            {
+                // Software fallback when the PCLMULQDQ path is disabled or unsupported.
+                EmitCrc32Call(context, new _U32_U32_U8(SoftFallback.Crc32cb));
+                return;
+            }
+
+            EmitCrc32Optimized(context, castagnoli: true, bitsize: 8);
         }
 
         public static void Crc32ch(ArmEmitterContext context)
         {
-            EmitCrc32Call(context, new _U32_U32_U16(SoftFallback.Crc32ch));
+            if (!Optimizations.UsePclmulqdq)
+            {
+                // Software fallback when the PCLMULQDQ path is disabled or unsupported.
+                EmitCrc32Call(context, new _U32_U32_U16(SoftFallback.Crc32ch));
+                return;
+            }
+
+            EmitCrc32Optimized(context, castagnoli: true, bitsize: 16);
         }
 
         public static void Crc32cw(ArmEmitterContext context)
         {
-            EmitCrc32Call(context, new _U32_U32_U32(SoftFallback.Crc32cw));
+            if (!Optimizations.UsePclmulqdq)
+            {
+                // Software fallback when the PCLMULQDQ path is disabled or unsupported.
+                EmitCrc32Call(context, new _U32_U32_U32(SoftFallback.Crc32cw));
+                return;
+            }
+
+            EmitCrc32Optimized(context, castagnoli: true, bitsize: 32);
         }
 
         public static void Crc32cx(ArmEmitterContext context)
         {
-            EmitCrc32Call(context, new _U32_U32_U64(SoftFallback.Crc32cx));
+            if (!Optimizations.UsePclmulqdq)
+            {
+                // Software fallback when the PCLMULQDQ path is disabled or unsupported.
+                EmitCrc32Call(context, new _U32_U32_U64(SoftFallback.Crc32cx));
+                return;
+            }
+
+            EmitCrc32Optimized64(context, castagnoli: true);
+        }
+
+        // Emits the CRC32 (CRC32C when castagnoli is true) update of Rn with 8, 16 or 32 bits of Rm via two PCLMULQDQ; result goes to Rd.
+        private static void EmitCrc32Optimized(ArmEmitterContext context, bool castagnoli, int bitsize)
+        {
+            OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp;
+            long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))'
+            long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1
+
+            Operand crc = GetIntOrZR(context, op.Rn);
+            Operand data = GetIntOrZR(context, op.Rm);
+            crc = context.VectorInsert(context.VectorZero(), crc, 0);
+
+            switch (bitsize)
+            {
+                case 8: data = context.VectorInsert8(context.VectorZero(), data, 0); break;
+                case 16: data = context.VectorInsert16(context.VectorZero(), data, 0); break;
+                case 32: data = context.VectorInsert(context.VectorZero(), data, 0); break;
+                default: throw new ArgumentOutOfRangeException(nameof(bitsize)); // Else data stays a scalar operand for the Pxor below.
+            }
+
+            Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data);
+            tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(64 - bitsize));
+            tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(0));
+            tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
+
+            if (bitsize < 32) // Shift crc so that (crc >> bitsize) lines up with the extracted element, then xor it in.
+            {
+                crc = context.AddIntrinsic(Intrinsic.X86Pslldq, crc, Const((64 - bitsize) / 8));
+                tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, crc);
+            }
+
+            SetIntOrZR(context, op.Rd, context.VectorExtract(OperandType.I32, tmp, 2));
+        }
+
+        // Emits the CRC32 (CRC32C when castagnoli is true) update of Rn with Rm via four PCLMULQDQ; the 32-bit result goes to Rd.
+        private static void EmitCrc32Optimized64(ArmEmitterContext context, bool castagnoli)
+        {
+            OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp;
+
+            long polyConst = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1
+            long muConst = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))'
+
+            // NOTE(review): if Rn is read as a 64-bit operand here, bits 32-63 of Xn get xored into the data - confirm they are zero.
+            Operand crcVec = GetIntOrZR(context, op.Rn);
+            Operand dataVec = GetIntOrZR(context, op.Rm);
+            crcVec = context.VectorInsert(context.VectorZero(), crcVec, 0);
+            dataVec = context.VectorInsert(context.VectorZero(), dataVec, 0);
+
+            Operand mixed = context.AddIntrinsic(Intrinsic.X86Pxor, crcVec, dataVec);
+            Operand shifted = context.AddIntrinsic(Intrinsic.X86Pslldq, mixed, Const(4));
+
+            Operand acc = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, shifted, X86GetScalar(context, muConst), Const(0));
+            acc = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, acc, X86GetScalar(context, polyConst), Const(0));
+            acc = context.AddIntrinsic(Intrinsic.X86Pxor, acc, shifted);
+            acc = context.AddIntrinsic(Intrinsic.X86Psllq, acc, Const(32));
+
+            acc = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, acc, X86GetScalar(context, muConst), Const(1));
+            acc = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, acc, X86GetScalar(context, polyConst), Const(0));
+
+            SetIntOrZR(context, op.Rd, context.VectorExtract(OperandType.I32, acc, 2));
         }
 
         private static void EmitCrc32Call(ArmEmitterContext context, Delegate dlg)
diff --git a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
index 28ec9f32d1..639ba7f926 100644
--- a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
+++ b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
@@ -71,6 +71,7 @@ namespace ARMeilleure.IntermediateRepresentation
         X86Pavgb,
         X86Pavgw,
         X86Pblendvb,
+        X86Pclmulqdq,
         X86Pcmpeqb,
         X86Pcmpeqd,
         X86Pcmpeqq,
diff --git a/ARMeilleure/Optimizations.cs b/ARMeilleure/Optimizations.cs
index b486c5d200..fa06a41096 100644
--- a/ARMeilleure/Optimizations.cs
+++ b/ARMeilleure/Optimizations.cs
@@ -8,15 +8,16 @@ namespace ARMeilleure
 
         public static bool FastFP { get; set; } = true;
 
-        public static bool UseSseIfAvailable    { get; set; } = true;
-        public static bool UseSse2IfAvailable   { get; set; } = true;
-        public static bool UseSse3IfAvailable   { get; set; } = true;
-        public static bool UseSsse3IfAvailable  { get; set; } = true;
-        public static bool UseSse41IfAvailable  { get; set; } = true;
-        public static bool UseSse42IfAvailable  { get; set; } = true;
-        public static bool UsePopCntIfAvailable { get; set; } = true;
-        public static bool UseAvxIfAvailable    { get; set; } = true;
-        public static bool UseAesniIfAvailable  { get; set; } = true;
+        public static bool UseSseIfAvailable       { get; set; } = true;
+        public static bool UseSse2IfAvailable      { get; set; } = true;
+        public static bool UseSse3IfAvailable      { get; set; } = true;
+        public static bool UseSsse3IfAvailable     { get; set; } = true;
+        public static bool UseSse41IfAvailable     { get; set; } = true;
+        public static bool UseSse42IfAvailable     { get; set; } = true;
+        public static bool UsePopCntIfAvailable    { get; set; } = true;
+        public static bool UseAvxIfAvailable       { get; set; } = true;
+        public static bool UseAesniIfAvailable     { get; set; } = true;
+        public static bool UsePclmulqdqIfAvailable { get; set; } = true;
 
         public static bool ForceLegacySse
         {
@@ -24,14 +25,15 @@ namespace ARMeilleure
             set => HardwareCapabilities.ForceLegacySse = value;
         }
 
-        internal static bool UseSse    => UseSseIfAvailable    && HardwareCapabilities.SupportsSse;
-        internal static bool UseSse2   => UseSse2IfAvailable   && HardwareCapabilities.SupportsSse2;
-        internal static bool UseSse3   => UseSse3IfAvailable   && HardwareCapabilities.SupportsSse3;
-        internal static bool UseSsse3  => UseSsse3IfAvailable  && HardwareCapabilities.SupportsSsse3;
-        internal static bool UseSse41  => UseSse41IfAvailable  && HardwareCapabilities.SupportsSse41;
-        internal static bool UseSse42  => UseSse42IfAvailable  && HardwareCapabilities.SupportsSse42;
-        internal static bool UsePopCnt => UsePopCntIfAvailable && HardwareCapabilities.SupportsPopcnt;
-        internal static bool UseAvx    => UseAvxIfAvailable    && HardwareCapabilities.SupportsAvx && !ForceLegacySse;
-        internal static bool UseAesni  => UseAesniIfAvailable  && HardwareCapabilities.SupportsAesni;
+        internal static bool UseSse       => UseSseIfAvailable       && HardwareCapabilities.SupportsSse;
+        internal static bool UseSse2      => UseSse2IfAvailable      && HardwareCapabilities.SupportsSse2;
+        internal static bool UseSse3      => UseSse3IfAvailable      && HardwareCapabilities.SupportsSse3;
+        internal static bool UseSsse3     => UseSsse3IfAvailable     && HardwareCapabilities.SupportsSsse3;
+        internal static bool UseSse41     => UseSse41IfAvailable     && HardwareCapabilities.SupportsSse41;
+        internal static bool UseSse42     => UseSse42IfAvailable     && HardwareCapabilities.SupportsSse42;
+        internal static bool UsePopCnt    => UsePopCntIfAvailable    && HardwareCapabilities.SupportsPopcnt;
+        internal static bool UseAvx       => UseAvxIfAvailable       && HardwareCapabilities.SupportsAvx && !ForceLegacySse;
+        internal static bool UseAesni     => UseAesniIfAvailable     && HardwareCapabilities.SupportsAesni;
+        internal static bool UsePclmulqdq => UsePclmulqdqIfAvailable && HardwareCapabilities.SupportsPclmulqdq;
     }
 }
\ No newline at end of file