From d7044b10a253dae31b9a0041a432e3a7adce59f6 Mon Sep 17 00:00:00 2001 From: riperiperi Date: Mon, 13 Jul 2020 11:48:14 +0100 Subject: [PATCH] Add SSE4.2 Path for CRC32, add A32 variant, add tests for non-castagnoli variants. (#1328) * Add CRC32 A32 instructions. * Fix CRC32 instructions. * Add CRC intrinsic and fast path. Loop is currently unrolled, will look into adding temp vars after tests are added. * Begin work on Crc tests * Fix SSE4.2 path for CRC32C, finalize tests. * Remove unused IR path. * Fix spacing between prefix checks. * This should be Src. * PTC Version * OpCodeTable Order * Integer check improvement. Value and Crc can be either 32 or 64 size. * This wasn't necessary... * If size is 3, value type must be I64. * Fix same src+dest handling for non crc intrinsics. * Pre-fix (ha) issue with vex encodings --- ARMeilleure/CodeGen/X86/Assembler.cs | 33 +++- ARMeilleure/CodeGen/X86/CodeGenerator.cs | 15 ++ ARMeilleure/CodeGen/X86/IntrinsicTable.cs | 3 + ARMeilleure/CodeGen/X86/IntrinsicType.cs | 1 + ARMeilleure/CodeGen/X86/PreAllocator.cs | 11 ++ ARMeilleure/CodeGen/X86/X86Instruction.cs | 3 + ARMeilleure/Decoders/OpCodeTable.cs | 6 + ARMeilleure/Instructions/InstEmitHash.cs | 147 ++---------------- ARMeilleure/Instructions/InstEmitHash32.cs | 54 +++++++ .../Instructions/InstEmitHashHelper.cs | 119 ++++++++++++++ .../IntermediateRepresentation/Intrinsic.cs | 3 + ARMeilleure/Translation/PTC/Ptc.cs | 2 +- Ryujinx.Tests/Cpu/CpuTest.cs | 45 +++--- Ryujinx.Tests/Cpu/CpuTestAluBinary.cs | 71 +++++++++ Ryujinx.Tests/Cpu/CpuTestAluBinary32.cs | 96 ++++++++++++ 15 files changed, 448 insertions(+), 161 deletions(-) create mode 100644 ARMeilleure/Instructions/InstEmitHash32.cs create mode 100644 ARMeilleure/Instructions/InstEmitHashHelper.cs create mode 100644 Ryujinx.Tests/Cpu/CpuTestAluBinary32.cs diff --git a/ARMeilleure/CodeGen/X86/Assembler.cs b/ARMeilleure/CodeGen/X86/Assembler.cs index 537c746c35..d0ccd6f810 100644 --- a/ARMeilleure/CodeGen/X86/Assembler.cs 
+++ b/ARMeilleure/CodeGen/X86/Assembler.cs @@ -28,10 +28,10 @@ namespace ARMeilleure.CodeGen.X86 Vex = 1 << 4, PrefixBit = 16, - PrefixMask = 3 << PrefixBit, + PrefixMask = 7 << PrefixBit, Prefix66 = 1 << PrefixBit, PrefixF3 = 2 << PrefixBit, - PrefixF2 = 3 << PrefixBit + PrefixF2 = 4 << PrefixBit } private struct InstructionInfo @@ -104,6 +104,9 @@ namespace ARMeilleure.CodeGen.X86 Add(X86Instruction.Comisd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f2f, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Comiss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f2f, InstructionFlags.Vex)); Add(X86Instruction.Cpuid, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fa2, InstructionFlags.RegOnly)); + Add(X86Instruction.Crc32, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38f1, InstructionFlags.PrefixF2)); + Add(X86Instruction.Crc32_16, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38f1, InstructionFlags.PrefixF2 | InstructionFlags.Prefix66)); + Add(X86Instruction.Crc32_8, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38f0, InstructionFlags.PrefixF2 | InstructionFlags.Reg8Src)); Add(X86Instruction.Cvtdq2pd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fe6, InstructionFlags.Vex | InstructionFlags.PrefixF3)); Add(X86Instruction.Cvtdq2ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f5b, InstructionFlags.Vex)); Add(X86Instruction.Cvtpd2dq, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fe6, InstructionFlags.Vex | InstructionFlags.PrefixF2)); @@ -1172,7 +1175,15 @@ namespace ARMeilleure.CodeGen.X86 if ((flags & InstructionFlags.Vex) != 0 && HardwareCapabilities.SupportsVexEncoding) { - int vexByte2 = (int)(flags & InstructionFlags.PrefixMask) >> (int)InstructionFlags.PrefixBit; + // In a vex encoding, only one prefix can be active at a time. The active prefix is encoded in the second byte using two bits. 
+ + int vexByte2 = (flags & InstructionFlags.PrefixMask) switch + { + InstructionFlags.Prefix66 => 1, + InstructionFlags.PrefixF3 => 2, + InstructionFlags.PrefixF2 => 3, + _ => 0 + }; if (src1 != null) { @@ -1220,11 +1231,19 @@ namespace ARMeilleure.CodeGen.X86 } else { - switch (flags & InstructionFlags.PrefixMask) + if (flags.HasFlag(InstructionFlags.Prefix66)) { - case InstructionFlags.Prefix66: WriteByte(0x66); break; - case InstructionFlags.PrefixF2: WriteByte(0xf2); break; - case InstructionFlags.PrefixF3: WriteByte(0xf3); break; + WriteByte(0x66); + } + + if (flags.HasFlag(InstructionFlags.PrefixF2)) + { + WriteByte(0xf2); + } + + if (flags.HasFlag(InstructionFlags.PrefixF3)) + { + WriteByte(0xf3); } if (rexPrefix != 0) diff --git a/ARMeilleure/CodeGen/X86/CodeGenerator.cs b/ARMeilleure/CodeGen/X86/CodeGenerator.cs index e7e7553ede..e217a66581 100644 --- a/ARMeilleure/CodeGen/X86/CodeGenerator.cs +++ b/ARMeilleure/CodeGen/X86/CodeGenerator.cs @@ -333,6 +333,21 @@ namespace ARMeilleure.CodeGen.X86 break; } + case IntrinsicType.Crc32: + { + Operand dest = operation.Destination; + Operand src1 = operation.GetSource(0); + Operand src2 = operation.GetSource(1); + + EnsureSameReg(dest, src1); + + Debug.Assert(dest.Type.IsInteger() && src1.Type.IsInteger() && src2.Type.IsInteger()); + + context.Assembler.WriteInstruction(info.Inst, dest, src2, dest.Type); + + break; + } + case IntrinsicType.BinaryImm: { Operand dest = operation.Destination; diff --git a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs index bc07c6b09c..f7469badc0 100644 --- a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs +++ b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs @@ -38,6 +38,9 @@ namespace ARMeilleure.CodeGen.X86 Add(Intrinsic.X86Comisseq, new IntrinsicInfo(X86Instruction.Comiss, IntrinsicType.Comis_)); Add(Intrinsic.X86Comissge, new IntrinsicInfo(X86Instruction.Comiss, IntrinsicType.Comis_)); Add(Intrinsic.X86Comisslt, new IntrinsicInfo(X86Instruction.Comiss, 
IntrinsicType.Comis_)); + Add(Intrinsic.X86Crc32, new IntrinsicInfo(X86Instruction.Crc32, IntrinsicType.Crc32)); + Add(Intrinsic.X86Crc32_16, new IntrinsicInfo(X86Instruction.Crc32_16, IntrinsicType.Crc32)); + Add(Intrinsic.X86Crc32_8, new IntrinsicInfo(X86Instruction.Crc32_8, IntrinsicType.Crc32)); Add(Intrinsic.X86Cvtdq2pd, new IntrinsicInfo(X86Instruction.Cvtdq2pd, IntrinsicType.Unary)); Add(Intrinsic.X86Cvtdq2ps, new IntrinsicInfo(X86Instruction.Cvtdq2ps, IntrinsicType.Unary)); Add(Intrinsic.X86Cvtpd2dq, new IntrinsicInfo(X86Instruction.Cvtpd2dq, IntrinsicType.Unary)); diff --git a/ARMeilleure/CodeGen/X86/IntrinsicType.cs b/ARMeilleure/CodeGen/X86/IntrinsicType.cs index 41c52b59db..fe0f66ed7b 100644 --- a/ARMeilleure/CodeGen/X86/IntrinsicType.cs +++ b/ARMeilleure/CodeGen/X86/IntrinsicType.cs @@ -9,6 +9,7 @@ namespace ARMeilleure.CodeGen.X86 Binary, BinaryGpr, BinaryImm, + Crc32, Ternary, TernaryImm } diff --git a/ARMeilleure/CodeGen/X86/PreAllocator.cs b/ARMeilleure/CodeGen/X86/PreAllocator.cs index d1794b555d..dc7f3a75a4 100644 --- a/ARMeilleure/CodeGen/X86/PreAllocator.cs +++ b/ARMeilleure/CodeGen/X86/PreAllocator.cs @@ -1294,11 +1294,22 @@ namespace ARMeilleure.CodeGen.X86 case Instruction.VectorInsert16: case Instruction.VectorInsert8: return !HardwareCapabilities.SupportsVexEncoding; + + case Instruction.Extended: + return IsIntrinsicSameOperandDestSrc1(operation); } return IsVexSameOperandDestSrc1(operation); } + private static bool IsIntrinsicSameOperandDestSrc1(Operation operation) + { + IntrinsicOperation intrinOp = (IntrinsicOperation)operation; + IntrinsicInfo info = IntrinsicTable.GetInfo(intrinOp.Intrinsic); + + return info.Type == IntrinsicType.Crc32 || IsVexSameOperandDestSrc1(operation); + } + private static bool IsVexSameOperandDestSrc1(Operation operation) { if (IsIntrinsic(operation.Instruction)) diff --git a/ARMeilleure/CodeGen/X86/X86Instruction.cs b/ARMeilleure/CodeGen/X86/X86Instruction.cs index c3dffc62c6..9ac17e5bf3 100644 --- 
a/ARMeilleure/CodeGen/X86/X86Instruction.cs +++ b/ARMeilleure/CodeGen/X86/X86Instruction.cs @@ -33,6 +33,9 @@ namespace ARMeilleure.CodeGen.X86 Comisd, Comiss, Cpuid, + Crc32, + Crc32_16, + Crc32_8, Cvtdq2pd, Cvtdq2ps, Cvtpd2dq, diff --git a/ARMeilleure/Decoders/OpCodeTable.cs b/ARMeilleure/Decoders/OpCodeTable.cs index 8567e1ce69..c1632d461c 100644 --- a/ARMeilleure/Decoders/OpCodeTable.cs +++ b/ARMeilleure/Decoders/OpCodeTable.cs @@ -659,6 +659,12 @@ namespace ARMeilleure.Decoders SetA32("<<<<00110101xxxx0000xxxxxxxxxxxx", InstName.Cmp, InstEmit32.Cmp, typeof(OpCode32AluImm)); SetA32("<<<<00010101xxxx0000xxxxxxx0xxxx", InstName.Cmp, InstEmit32.Cmp, typeof(OpCode32AluRsImm)); SetA32("<<<<00010101xxxx0000xxxx0xx1xxxx", InstName.Cmp, InstEmit32.Cmp, typeof(OpCode32AluRsReg)); + SetA32("<<<<00010000xxxxxxxx00000100xxxx", InstName.Crc32b, InstEmit32.Crc32b, typeof(OpCode32AluReg)); + SetA32("<<<<00010000xxxxxxxx00100100xxxx", InstName.Crc32cb, InstEmit32.Crc32cb, typeof(OpCode32AluReg)); + SetA32("<<<<00010010xxxxxxxx00100100xxxx", InstName.Crc32ch, InstEmit32.Crc32ch, typeof(OpCode32AluReg)); + SetA32("<<<<00010100xxxxxxxx00100100xxxx", InstName.Crc32cw, InstEmit32.Crc32cw, typeof(OpCode32AluReg)); + SetA32("<<<<00010010xxxxxxxx00000100xxxx", InstName.Crc32h, InstEmit32.Crc32h, typeof(OpCode32AluReg)); + SetA32("<<<<00010100xxxxxxxx00000100xxxx", InstName.Crc32w, InstEmit32.Crc32w, typeof(OpCode32AluReg)); SetA32("1111010101111111111100000101xxxx", InstName.Dmb, InstEmit32.Dmb, typeof(OpCode32)); SetA32("1111010101111111111100000100xxxx", InstName.Dsb, InstEmit32.Dsb, typeof(OpCode32)); SetA32("<<<<0010001xxxxxxxxxxxxxxxxxxxxx", InstName.Eor, InstEmit32.Eor, typeof(OpCode32AluImm)); diff --git a/ARMeilleure/Instructions/InstEmitHash.cs b/ARMeilleure/Instructions/InstEmitHash.cs index 2a8b348889..82b3e3534f 100644 --- a/ARMeilleure/Instructions/InstEmitHash.cs +++ b/ARMeilleure/Instructions/InstEmitHash.cs @@ -1,182 +1,67 @@ -// 
https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf - using ARMeilleure.Decoders; using ARMeilleure.IntermediateRepresentation; using ARMeilleure.Translation; +using static ARMeilleure.Instructions.InstEmitHashHelper; using static ARMeilleure.Instructions.InstEmitHelper; -using static ARMeilleure.Instructions.InstEmitSimdHelper; -using static ARMeilleure.IntermediateRepresentation.OperandHelper; namespace ARMeilleure.Instructions { static partial class InstEmit { + private const int ByteSizeLog2 = 0; + private const int HWordSizeLog2 = 1; + private const int WordSizeLog2 = 2; + private const int DWordSizeLog2 = 3; + public static void Crc32b(ArmEmitterContext context) { - if (Optimizations.UsePclmulqdq) - { - EmitCrc32Optimized(context, false, 8); - } - else - { - EmitCrc32Call(context, nameof(SoftFallback.Crc32b)); - } + EmitCrc32Call(context, ByteSizeLog2, false); } public static void Crc32h(ArmEmitterContext context) { - if (Optimizations.UsePclmulqdq) - { - EmitCrc32Optimized(context, false, 16); - } - else - { - EmitCrc32Call(context, nameof(SoftFallback.Crc32h)); - } + EmitCrc32Call(context, HWordSizeLog2, false); } public static void Crc32w(ArmEmitterContext context) { - if (Optimizations.UsePclmulqdq) - { - EmitCrc32Optimized(context, false, 32); - } - else - { - EmitCrc32Call(context, nameof(SoftFallback.Crc32w)); - } + EmitCrc32Call(context, WordSizeLog2, false); } public static void Crc32x(ArmEmitterContext context) { - if (Optimizations.UsePclmulqdq) - { - EmitCrc32Optimized64(context, false); - } - else - { - EmitCrc32Call(context, nameof(SoftFallback.Crc32x)); - } + EmitCrc32Call(context, DWordSizeLog2, false); } public static void Crc32cb(ArmEmitterContext context) { - if (Optimizations.UsePclmulqdq) - { - EmitCrc32Optimized(context, true, 8); - } - else - { - EmitCrc32Call(context, nameof(SoftFallback.Crc32cb)); - } + EmitCrc32Call(context, ByteSizeLog2, true); } 
public static void Crc32ch(ArmEmitterContext context) { - if (Optimizations.UsePclmulqdq) - { - EmitCrc32Optimized(context, true, 16); - } - else - { - EmitCrc32Call(context, nameof(SoftFallback.Crc32ch)); - } + EmitCrc32Call(context, HWordSizeLog2, true); } public static void Crc32cw(ArmEmitterContext context) { - if (Optimizations.UsePclmulqdq) - { - EmitCrc32Optimized(context, true, 32); - } - else - { - EmitCrc32Call(context, nameof(SoftFallback.Crc32cw)); - } + EmitCrc32Call(context, WordSizeLog2, true); } public static void Crc32cx(ArmEmitterContext context) { - if (Optimizations.UsePclmulqdq) - { - EmitCrc32Optimized64(context, true); - } - else - { - EmitCrc32Call(context, nameof(SoftFallback.Crc32cx)); - } + EmitCrc32Call(context, DWordSizeLog2, true); } - private static void EmitCrc32Optimized(ArmEmitterContext context, bool castagnoli, int bitsize) - { - OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp; - - long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))' - long polynomial = castagnoli ? 
0x105EC76F0 : 0x1DB710641; // P'(x) << 1 - - Operand crc = GetIntOrZR(context, op.Rn); - Operand data = GetIntOrZR(context, op.Rm); - - crc = context.VectorInsert(context.VectorZero(), crc, 0); - - switch (bitsize) - { - case 8: data = context.VectorInsert8(context.VectorZero(), data, 0); break; - case 16: data = context.VectorInsert16(context.VectorZero(), data, 0); break; - case 32: data = context.VectorInsert(context.VectorZero(), data, 0); break; - } - - Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data); - tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(64 - bitsize)); - tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(0)); - tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0)); - - if (bitsize < 32) - { - crc = context.AddIntrinsic(Intrinsic.X86Pslldq, crc, Const((64 - bitsize) / 8)); - tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, crc); - } - - SetIntOrZR(context, op.Rd, context.VectorExtract(OperandType.I32, tmp, 2)); - } - - private static void EmitCrc32Optimized64(ArmEmitterContext context, bool castagnoli) - { - OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp; - - long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))' - long polynomial = castagnoli ? 
0x105EC76F0 : 0x1DB710641; // P'(x) << 1 - - Operand crc = GetIntOrZR(context, op.Rn); - Operand data = GetIntOrZR(context, op.Rm); - - crc = context.VectorInsert(context.VectorZero(), crc, 0); - data = context.VectorInsert(context.VectorZero(), data, 0); - - Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data); - Operand res = context.AddIntrinsic(Intrinsic.X86Pslldq, tmp, Const(4)); - - tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, res, X86GetScalar(context, mu), Const(0)); - tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0)); - - tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, res); - tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(32)); - - tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(1)); - tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0)); - - SetIntOrZR(context, op.Rd, context.VectorExtract(OperandType.I32, tmp, 2)); - } - - private static void EmitCrc32Call(ArmEmitterContext context, string name) + private static void EmitCrc32Call(ArmEmitterContext context, int size, bool c) { OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp; Operand n = GetIntOrZR(context, op.Rn); Operand m = GetIntOrZR(context, op.Rm); - Operand d = context.Call(typeof(SoftFallback).GetMethod(name), n, m); + Operand d = EmitCrc32(context, n, m, size, c); SetIntOrZR(context, op.Rd, d); } diff --git a/ARMeilleure/Instructions/InstEmitHash32.cs b/ARMeilleure/Instructions/InstEmitHash32.cs new file mode 100644 index 0000000000..fec782dd82 --- /dev/null +++ b/ARMeilleure/Instructions/InstEmitHash32.cs @@ -0,0 +1,54 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.Instructions.InstEmitHashHelper; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit32 + { + 
public static void Crc32b(ArmEmitterContext context) + { + EmitCrc32Call(context, ByteSizeLog2, false); + } + + public static void Crc32h(ArmEmitterContext context) + { + EmitCrc32Call(context, HWordSizeLog2, false); + } + + public static void Crc32w(ArmEmitterContext context) + { + EmitCrc32Call(context, WordSizeLog2, false); + } + + public static void Crc32cb(ArmEmitterContext context) + { + EmitCrc32Call(context, ByteSizeLog2, true); + } + + public static void Crc32ch(ArmEmitterContext context) + { + EmitCrc32Call(context, HWordSizeLog2, true); + } + + public static void Crc32cw(ArmEmitterContext context) + { + EmitCrc32Call(context, WordSizeLog2, true); + } + + private static void EmitCrc32Call(ArmEmitterContext context, int size, bool c) + { + IOpCode32AluReg op = (IOpCode32AluReg)context.CurrOp; + + Operand n = GetIntA32(context, op.Rn); + Operand m = GetIntA32(context, op.Rm); + + Operand d = EmitCrc32(context, n, m, size, c); + + EmitAluStore(context, d); + } + } +} diff --git a/ARMeilleure/Instructions/InstEmitHashHelper.cs b/ARMeilleure/Instructions/InstEmitHashHelper.cs new file mode 100644 index 0000000000..9206e6d5b9 --- /dev/null +++ b/ARMeilleure/Instructions/InstEmitHashHelper.cs @@ -0,0 +1,119 @@ +// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf + +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; +using System; +using System.Diagnostics; + +using static ARMeilleure.IntermediateRepresentation.OperandHelper; +using static ARMeilleure.Instructions.InstEmitSimdHelper; + +namespace ARMeilleure.Instructions +{ + static class InstEmitHashHelper + { + public const uint Crc32RevPoly = 0xedb88320; + public const uint Crc32cRevPoly = 0x82f63b78; + + public static Operand EmitCrc32(ArmEmitterContext context, Operand crc, Operand value, int size, bool castagnoli) + { + Debug.Assert(crc.Type.IsInteger() && value.Type.IsInteger()); + 
Debug.Assert(size >= 0 && size < 4); + Debug.Assert((size < 3) || (value.Type == OperandType.I64)); + + if (castagnoli && Optimizations.UseSse42) + { + // The CRC32 instruction does not have an immediate variant, so ensure both inputs are in registers. + value = (value.Kind == OperandKind.Constant) ? context.Copy(value) : value; + crc = (crc.Kind == OperandKind.Constant) ? context.Copy(crc) : crc; + + Intrinsic op = size switch + { + 0 => Intrinsic.X86Crc32_8, + 1 => Intrinsic.X86Crc32_16, + _ => Intrinsic.X86Crc32, + }; + + return (size == 3) ? context.ConvertI64ToI32(context.AddIntrinsicLong(op, crc, value)) : context.AddIntrinsicInt(op, crc, value); + } + else if (Optimizations.UsePclmulqdq) + { + return size switch + { + 3 => EmitCrc32Optimized64(context, crc, value, castagnoli), + _ => EmitCrc32Optimized(context, crc, value, castagnoli, size), + }; + } + else + { + string name = (size, castagnoli) switch + { + (0, false) => nameof(SoftFallback.Crc32b), + (1, false) => nameof(SoftFallback.Crc32h), + (2, false) => nameof(SoftFallback.Crc32w), + (3, false) => nameof(SoftFallback.Crc32x), + (0, true) => nameof(SoftFallback.Crc32cb), + (1, true) => nameof(SoftFallback.Crc32ch), + (2, true) => nameof(SoftFallback.Crc32cw), + (3, true) => nameof(SoftFallback.Crc32cx), + _ => throw new ArgumentOutOfRangeException(nameof(size)) + }; + + return context.Call(typeof(SoftFallback).GetMethod(name), crc, value); + } + } + + private static Operand EmitCrc32Optimized(ArmEmitterContext context, Operand crc, Operand data, bool castagnoli, int size) + { + long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))' + long polynomial = castagnoli ? 
0x105EC76F0 : 0x1DB710641; // P'(x) << 1 + + crc = context.VectorInsert(context.VectorZero(), crc, 0); + + switch (size) + { + case 0: data = context.VectorInsert8(context.VectorZero(), data, 0); break; + case 1: data = context.VectorInsert16(context.VectorZero(), data, 0); break; + case 2: data = context.VectorInsert(context.VectorZero(), data, 0); break; + } + + int bitsize = 8 << size; + + Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data); + tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(64 - bitsize)); + tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(0)); + tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0)); + + if (bitsize < 32) + { + crc = context.AddIntrinsic(Intrinsic.X86Pslldq, crc, Const((64 - bitsize) / 8)); + tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, crc); + } + + return context.VectorExtract(OperandType.I32, tmp, 2); + } + + private static Operand EmitCrc32Optimized64(ArmEmitterContext context, Operand crc, Operand data, bool castagnoli) + { + long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))' + long polynomial = castagnoli ? 
0x105EC76F0 : 0x1DB710641; // P'(x) << 1 + + crc = context.VectorInsert(context.VectorZero(), crc, 0); + data = context.VectorInsert(context.VectorZero(), data, 0); + + Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data); + Operand res = context.AddIntrinsic(Intrinsic.X86Pslldq, tmp, Const(4)); + + tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, res, X86GetScalar(context, mu), Const(0)); + tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0)); + + tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, res); + tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(32)); + + tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(1)); + tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0)); + + return context.VectorExtract(OperandType.I32, tmp, 2); + } + } +} diff --git a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs index 639ba7f926..7f89117005 100644 --- a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs +++ b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs @@ -27,6 +27,9 @@ namespace ARMeilleure.IntermediateRepresentation X86Comisseq, X86Comissge, X86Comisslt, + X86Crc32, + X86Crc32_16, + X86Crc32_8, X86Cvtdq2pd, X86Cvtdq2ps, X86Cvtpd2dq, diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs index 0051d25adb..deffabe1e1 100644 --- a/ARMeilleure/Translation/PTC/Ptc.cs +++ b/ARMeilleure/Translation/PTC/Ptc.cs @@ -20,7 +20,7 @@ namespace ARMeilleure.Translation.PTC { private const string HeaderMagic = "PTChd"; - private const int InternalVersion = 8; //! To be incremented manually for each change to the ARMeilleure project. + private const int InternalVersion = 9; //! To be incremented manually for each change to the ARMeilleure project. 
private const string BaseDir = "Ryujinx"; diff --git a/Ryujinx.Tests/Cpu/CpuTest.cs b/Ryujinx.Tests/Cpu/CpuTest.cs index 9e37c2114c..2c04724816 100644 --- a/Ryujinx.Tests/Cpu/CpuTest.cs +++ b/Ryujinx.Tests/Cpu/CpuTest.cs @@ -167,41 +167,42 @@ namespace Ryujinx.Tests.Cpu } } - protected void ExecuteOpcodes() + protected void ExecuteOpcodes(bool runUnicorn = true) { _cpuContext.Execute(_context, _entryPoint); - if (_unicornAvailable) + if (_unicornAvailable && runUnicorn) { _unicornEmu.RunForCount((_currAddress - _entryPoint - 4) / 4); } } protected ExecutionContext SingleOpcode(uint opcode, - ulong x0 = 0, - ulong x1 = 0, - ulong x2 = 0, - ulong x3 = 0, - ulong x31 = 0, - V128 v0 = default, - V128 v1 = default, - V128 v2 = default, - V128 v3 = default, - V128 v4 = default, - V128 v5 = default, - V128 v30 = default, - V128 v31 = default, - bool overflow = false, - bool carry = false, - bool zero = false, - bool negative = false, - int fpcr = 0, - int fpsr = 0) + ulong x0 = 0, + ulong x1 = 0, + ulong x2 = 0, + ulong x3 = 0, + ulong x31 = 0, + V128 v0 = default, + V128 v1 = default, + V128 v2 = default, + V128 v3 = default, + V128 v4 = default, + V128 v5 = default, + V128 v30 = default, + V128 v31 = default, + bool overflow = false, + bool carry = false, + bool zero = false, + bool negative = false, + int fpcr = 0, + int fpsr = 0, + bool runUnicorn = true) { Opcode(opcode); Opcode(0xD65F03C0); // RET SetContext(x0, x1, x2, x3, x31, v0, v1, v2, v3, v4, v5, v30, v31, overflow, carry, zero, negative, fpcr, fpsr); - ExecuteOpcodes(); + ExecuteOpcodes(runUnicorn); return GetContext(); } diff --git a/Ryujinx.Tests/Cpu/CpuTestAluBinary.cs b/Ryujinx.Tests/Cpu/CpuTestAluBinary.cs index 2823477fc5..eb94b84502 100644 --- a/Ryujinx.Tests/Cpu/CpuTestAluBinary.cs +++ b/Ryujinx.Tests/Cpu/CpuTestAluBinary.cs @@ -1,5 +1,6 @@ #define AluBinary +using ARMeilleure.State; using NUnit.Framework; namespace Ryujinx.Tests.Cpu @@ -8,8 +9,78 @@ namespace Ryujinx.Tests.Cpu public sealed class 
CpuTestAluBinary : CpuTest { #if AluBinary + public struct CrcTest + { + public uint Crc; + public ulong Value; + public bool C; + + public uint[] Results; // One result for each CRC variant (8, 16, 32, 64) + + public CrcTest(uint crc, ulong value, bool c, params uint[] results) + { + Crc = crc; + Value = value; + C = c; + Results = results; + } + } + +#region "ValueSource (CRC32)" + private static CrcTest[] _CRC32_Test_Values_() + { + // Created with http://www.sunshine2k.de/coding/javascript/crc/crc_js.html, with: + // - non-reflected polynomials + // - input reflected, result reflected + // - bytes in order of increasing significance + // - xor 0 + // Only includes non-C variant, as the other can be tested with unicorn. + + return new CrcTest[] + { + new CrcTest(0x00000000u, 0x00_00_00_00_00_00_00_00u, false, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + new CrcTest(0x00000000u, 0x7f_ff_ff_ff_ff_ff_ff_ffu, false, 0x2d02ef8d, 0xbe2612ff, 0xdebb20e3, 0xa9de8355), + new CrcTest(0x00000000u, 0x80_00_00_00_00_00_00_00u, false, 0x00000000, 0x00000000, 0x00000000, 0xedb88320), + new CrcTest(0x00000000u, 0xff_ff_ff_ff_ff_ff_ff_ffu, false, 0x2d02ef8d, 0xbe2612ff, 0xdebb20e3, 0x44660075), + new CrcTest(0x00000000u, 0xa0_02_f1_ca_52_78_8c_1cu, false, 0x14015c4f, 0x02799256, 0x9063c9e5, 0x8816610a), + + new CrcTest(0xffffffffu, 0x00_00_00_00_00_00_00_00u, false, 0x2dfd1072, 0xbe26ed00, 0xdebb20e3, 0x9add2096), + new CrcTest(0xffffffffu, 0x7f_ff_ff_ff_ff_ff_ff_ffu, false, 0x00ffffff, 0x0000ffff, 0x00000000, 0x3303a3c3), + new CrcTest(0xffffffffu, 0x80_00_00_00_00_00_00_00u, false, 0x2dfd1072, 0xbe26ed00, 0xdebb20e3, 0x7765a3b6), + new CrcTest(0xffffffffu, 0xff_ff_ff_ff_ff_ff_ff_ffu, false, 0x00ffffff, 0x0000ffff, 0x00000000, 0xdebb20e3), + new CrcTest(0xffffffffu, 0xa0_02_f1_ca_52_78_8c_1cu, false, 0x39fc4c3d, 0xbc5f7f56, 0x4ed8e906, 0x12cb419c) + }; + } +#endregion + private const int RndCnt = 2; + [Test, Combinatorial] + public void Crc32_b_h_w_x([Values(0u)] uint rd, 
[Values(1u)] uint rn, + [Values(2u)] uint rm, + [Range(0u, 3u)] uint size, + [ValueSource("_CRC32_Test_Values_")] CrcTest test) + { + uint opcode = 0x1AC04000; // CRC32B W0, W0, W0 + + opcode |= size << 10; + opcode |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); + + if (size == 3) + { + opcode |= 0x80000000; + } + + uint w31 = TestContext.CurrentContext.Random.NextUInt(); + + SingleOpcode(opcode, x1: test.Crc, x2: test.Value, x31: w31, runUnicorn: false); + + ExecutionContext context = GetContext(); + ulong result = context.GetX((int)rd); + Assert.That(result == test.Results[size]); + } + [Test, Pairwise, Description("CRC32X , , "), Ignore("Unicorn fails.")] public void Crc32x([Values(0u, 31u)] uint rd, [Values(1u, 31u)] uint rn, diff --git a/Ryujinx.Tests/Cpu/CpuTestAluBinary32.cs b/Ryujinx.Tests/Cpu/CpuTestAluBinary32.cs new file mode 100644 index 0000000000..0a0302c269 --- /dev/null +++ b/Ryujinx.Tests/Cpu/CpuTestAluBinary32.cs @@ -0,0 +1,96 @@ +#define AluBinary32 + +using ARMeilleure.State; +using NUnit.Framework; +using System; + +namespace Ryujinx.Tests.Cpu +{ + + [Category("AluBinary32")] + public sealed class CpuTestAluBinary32 : CpuTest32 + { +#if AluBinary32 + public struct CrcTest32 + { + public uint Crc; + public uint Value; + public bool C; + + public uint[] Results; // One result for each CRC variant (8, 16, 32) + + public CrcTest32(uint crc, uint value, bool c, params uint[] results) + { + Crc = crc; + Value = value; + C = c; + Results = results; + } + } + +#region "ValueSource (CRC32/CRC32C)" + private static CrcTest32[] _CRC32_Test_Values_() + { + // Created with http://www.sunshine2k.de/coding/javascript/crc/crc_js.html, with: + // - non-reflected polynomials + // - input reflected, result reflected + // - bytes in order of increasing significance + // - xor 0 + + return new CrcTest32[] + { + new CrcTest32(0x00000000u, 0x00_00_00_00u, false, 0x00000000, 0x00000000, 0x00000000), + new CrcTest32(0x00000000u, 0x7f_ff_ff_ffu, false, 
0x2d02ef8d, 0xbe2612ff, 0x3303a3c3), + new CrcTest32(0x00000000u, 0x80_00_00_00u, false, 0x00000000, 0x00000000, 0xedb88320), + new CrcTest32(0x00000000u, 0xff_ff_ff_ffu, false, 0x2d02ef8d, 0xbe2612ff, 0xdebb20e3), + new CrcTest32(0x00000000u, 0x9d_cb_12_f0u, false, 0xbdbdf21c, 0xe70590f5, 0x3f7480c5), + + new CrcTest32(0xffffffffu, 0x00_00_00_00u, false, 0x2dfd1072, 0xbe26ed00, 0xdebb20e3), + new CrcTest32(0xffffffffu, 0x7f_ff_ff_ffu, false, 0x00ffffff, 0x0000ffff, 0xedb88320), + new CrcTest32(0xffffffffu, 0x80_00_00_00u, false, 0x2dfd1072, 0xbe26ed00, 0x3303a3c3), + new CrcTest32(0xffffffffu, 0xff_ff_ff_ffu, false, 0x00ffffff, 0x0000ffff, 0x00000000), + new CrcTest32(0xffffffffu, 0x9d_cb_12_f0u, false, 0x9040e26e, 0x59237df5, 0xe1cfa026), + + new CrcTest32(0x00000000u, 0x00_00_00_00u, true, 0x00000000, 0x00000000, 0x00000000), + new CrcTest32(0x00000000u, 0x7f_ff_ff_ffu, true, 0xad7d5351, 0x0e9e77d2, 0x356e8f40), + new CrcTest32(0x00000000u, 0x80_00_00_00u, true, 0x00000000, 0x00000000, 0x82f63b78), + new CrcTest32(0x00000000u, 0xff_ff_ff_ffu, true, 0xad7d5351, 0x0e9e77d2, 0xb798b438), + new CrcTest32(0x00000000u, 0x9d_cb_12_f0u, true, 0xf36e6f75, 0xb5ff99e6, 0x782dfbf1), + + new CrcTest32(0xffffffffu, 0x00_00_00_00u, true, 0xad82acae, 0x0e9e882d, 0xb798b438), + new CrcTest32(0xffffffffu, 0x7f_ff_ff_ffu, true, 0x00ffffff, 0x0000ffff, 0x82f63b78), + new CrcTest32(0xffffffffu, 0x80_00_00_00u, true, 0xad82acae, 0x0e9e882d, 0x356e8f40), + new CrcTest32(0xffffffffu, 0xff_ff_ff_ffu, true, 0x00ffffff, 0x0000ffff, 0x00000000), + new CrcTest32(0xffffffffu, 0x9d_cb_12_f0u, true, 0x5eecc3db, 0xbb6111cb, 0xcfb54fc9) + }; + } +#endregion + + [Test, Combinatorial] + public void Crc32_Crc32c_b_h_w([Values(0u)] uint rd, + [Values(1u)] uint rn, + [Values(2u)] uint rm, + [Range(0u, 2u)] uint size, + [ValueSource("_CRC32_Test_Values_")] CrcTest32 test) + { + // Unicorn does not yet support 32bit crc instructions, so test against a known table of results/values. 
+ + uint opcode = 0xe1000040; // CRC32B R0, R0, R0 + opcode |= ((rm & 15) << 0) | ((rd & 15) << 12) | ((rn & 15) << 16); + opcode |= size << 21; + if (test.C) + { + opcode |= 1 << 9; + } + + uint sp = TestContext.CurrentContext.Random.NextUInt(); + + SingleOpcode(opcode, r1: test.Crc, r2: test.Value, sp: sp, runUnicorn: false); + + ExecutionContext context = GetContext(); + ulong result = context.GetX((int)rd); + Assert.That(result == test.Results[size]); + } +#endif + } +}