diff --git a/ARMeilleure/Decoders/OpCodeTable.cs b/ARMeilleure/Decoders/OpCodeTable.cs
index 665e71290d..b19124851a 100644
--- a/ARMeilleure/Decoders/OpCodeTable.cs
+++ b/ARMeilleure/Decoders/OpCodeTable.cs
@@ -413,6 +413,8 @@ namespace ARMeilleure.Decoders
             SetA64("0x001110101xxxxx000111xxxxxxxxxx", InstName.Orr_V, InstEmit.Orr_V, OpCodeSimdReg.Create);
             SetA64("0x00111100000xxx0xx101xxxxxxxxxx", InstName.Orr_Vi, InstEmit.Orr_Vi, OpCodeSimdImm.Create);
             SetA64("0x00111100000xxx10x101xxxxxxxxxx", InstName.Orr_Vi, InstEmit.Orr_Vi, OpCodeSimdImm.Create);
+            SetA64("0x001110001xxxxx111000xxxxxxxxxx", InstName.Pmull_V, InstEmit.Pmull_V, OpCodeSimdReg.Create);
+            SetA64("0x001110111xxxxx111000xxxxxxxxxx", InstName.Pmull_V, InstEmit.Pmull_V, OpCodeSimdReg.Create);
             SetA64("0x101110<<1xxxxx010000xxxxxxxxxx", InstName.Raddhn_V, InstEmit.Raddhn_V, OpCodeSimdReg.Create);
             SetA64("0x10111001100000010110xxxxxxxxxx", InstName.Rbit_V, InstEmit.Rbit_V, OpCodeSimd.Create);
             SetA64("0x00111000100000000110xxxxxxxxxx", InstName.Rev16_V, InstEmit.Rev16_V, OpCodeSimd.Create);
@@ -886,7 +888,7 @@ namespace ARMeilleure.Decoders
             SetA32("111100110x00xxxxxxxx1101xxx1xxxx", InstName.Vmul, InstEmit32.Vmul_V, OpCode32SimdReg.Create);
             SetA32("1111001x1x<<xxxxxxx01010x1x0xxxx", InstName.Vmull, InstEmit32.Vmull_1, OpCode32SimdRegElemLong.Create);
             SetA32("1111001x1x<<xxxxxxx01100x0x0xxxx", InstName.Vmull, InstEmit32.Vmull_I, OpCode32SimdRegLong.Create);
-            SetA32("111100101x00xxxxxxx01110x0x0xxxx", InstName.Vmull, InstEmit32.Vmull_I, OpCode32SimdRegLong.Create); // Polynomial
+            SetA32("111100101xx0xxxxxxx01110x0x0xxxx", InstName.Vmull, InstEmit32.Vmull_I, OpCode32SimdRegLong.Create); // P8/P64
             SetA32("111100111x110000xxxx01011xx0xxxx", InstName.Vmvn, InstEmit32.Vmvn_I, OpCode32SimdBinary.Create);
             SetA32("1111001x1x000xxxxxxx0xx00x11xxxx", InstName.Vmvn, InstEmit32.Vmvn_II, OpCode32SimdImm.Create); // D/Q vector I32.
             SetA32("1111001x1x000xxxxxxx10x00x11xxxx", InstName.Vmvn, InstEmit32.Vmvn_II, OpCode32SimdImm.Create);
diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
index 3a97bc526a..88be07bdd3 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
@@ -10,6 +10,7 @@ using System.Diagnostics;
 
 using static ARMeilleure.Instructions.InstEmitHelper;
 using static ARMeilleure.Instructions.InstEmitSimdHelper;
+using static ARMeilleure.Instructions.InstEmitSimdHelper32;
 using static ARMeilleure.IntermediateRepresentation.OperandHelper;
 
 namespace ARMeilleure.Instructions
@@ -1928,6 +1929,112 @@ namespace ARMeilleure.Instructions
             }
         }
 
+        public static void Pmull_V(ArmEmitterContext context)
+        {
+            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
+
+            if (Optimizations.UsePclmulqdq && op.Size == 3)
+            {
+                Operand n = GetVec(op.Rn);
+                Operand m = GetVec(op.Rm);
+
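+                // PCLMULQDQ imm8 selects the source quadwords: bit 0 picks the low or high
+                // half of the first operand, bit 4 that of the second. 0b0000_0000 multiplies
+                // the low halves (PMULL); 0b0001_0001 the high halves (PMULL2).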
+                int imm8 = op.RegisterSize == RegisterSize.Simd64 ? 0b0000_0000 : 0b0001_0001;
+
+                Operand res = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, n, m, Const(imm8));
+
+                context.Copy(GetVec(op.Rd), res);
+            }
+            else if (Optimizations.UseSse41)
+            {
+                Operand n = GetVec(op.Rn);
+                Operand m = GetVec(op.Rm);
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    n = context.VectorZeroUpper64(n);
+                    m = context.VectorZeroUpper64(m);
+                }
+                else /* if (op.RegisterSize == RegisterSize.Simd128) */
+                {
+                    n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8));
+                    m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8));
+                }
+
+                Operand res = context.VectorZero();
+
+                if (op.Size == 0)
+                {
+                    n = context.AddIntrinsic(Intrinsic.X86Pmovzxbw, n);
+                    m = context.AddIntrinsic(Intrinsic.X86Pmovzxbw, m);
+
+                    for (int i = 0; i < 8; i++)
+                    {
+                        Operand mask = context.AddIntrinsic(Intrinsic.X86Psllw, n, Const(15 - i));
+                        mask = context.AddIntrinsic(Intrinsic.X86Psraw, mask, Const(15));
+
+                        Operand tmp = context.AddIntrinsic(Intrinsic.X86Psllw, m, Const(i));
+                        tmp = context.AddIntrinsic(Intrinsic.X86Pand, tmp, mask);
+
+                        res = context.AddIntrinsic(Intrinsic.X86Pxor, res, tmp);
+                    }
+                }
+                else /* if (op.Size == 3) */
+                {
+                    Operand zero = context.VectorZero();
+
+                    for (int i = 0; i < 64; i++)
+                    {
+                        Operand mask = context.AddIntrinsic(Intrinsic.X86Movlhps, n, n);
+                        mask = context.AddIntrinsic(Intrinsic.X86Psllq, mask, Const(63 - i));
+                        mask = context.AddIntrinsic(Intrinsic.X86Psrlq, mask, Const(63));
+                        mask = context.AddIntrinsic(Intrinsic.X86Psubq, zero, mask);
+
+                        Operand tmp = EmitSse2Sll_128(context, m, i);
+                        tmp = context.AddIntrinsic(Intrinsic.X86Pand, tmp, mask);
+
+                        res = context.AddIntrinsic(Intrinsic.X86Pxor, res, tmp);
+                    }
+                }
+
+                context.Copy(GetVec(op.Rd), res);
+            }
+            else
+            {
+                Operand n = GetVec(op.Rn);
+                Operand m = GetVec(op.Rm);
+
+                Operand res;
+
+                if (op.Size == 0)
+                {
+                    res = context.VectorZero();
+
+                    int part = op.RegisterSize == RegisterSize.Simd64 ? 0 : 8;
+
+                    for (int index = 0; index < 8; index++)
+                    {
+                        Operand ne = context.VectorExtract8(n, part + index);
+                        Operand me = context.VectorExtract8(m, part + index);
+
+                        Operand de = EmitPolynomialMultiply(context, ne, me, 8);
+
+                        res = EmitVectorInsert(context, res, de, index, 1);
+                    }
+                }
+                else /* if (op.Size == 3) */
+                {
+                    int part = op.RegisterSize == RegisterSize.Simd64 ? 0 : 1;
+
+                    Operand ne = context.VectorExtract(OperandType.I64, n, part);
+                    Operand me = context.VectorExtract(OperandType.I64, m, part);
+
+                    res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.PolynomialMult64_128)), ne, me);
+                }
+
+                context.Copy(GetVec(op.Rd), res);
+            }
+        }
+
         public static void Raddhn_V(ArmEmitterContext context)
         {
             EmitHighNarrow(context, (op1, op2) => context.Add(op1, op2), round: true);
@@ -3690,5 +3797,23 @@ namespace ARMeilleure.Instructions
 
             context.Copy(GetVec(op.Rd), res);
         }
+
+        private static Operand EmitSse2Sll_128(ArmEmitterContext context, Operand op, int shift)
+        {
+            // The upper part of op is assumed to be zero.
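+            // A 128-bit left shift composed from 64-bit SSE2 ops: Pslldq moves the low
+            // quadword into the high lane, Psrlq keeps just the bits that cross the lane
+            // boundary, Psllq shifts the low quadword in place, and Por merges the halves.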
+            Debug.Assert(shift >= 0 && shift < 64);
+
+            if (shift == 0)
+            {
+                return op;
+            }
+
+            Operand high = context.AddIntrinsic(Intrinsic.X86Pslldq, op, Const(8));
+            high = context.AddIntrinsic(Intrinsic.X86Psrlq, high, Const(64 - shift));
+
+            Operand low = context.AddIntrinsic(Intrinsic.X86Psllq, op, Const(shift));
+
+            return context.AddIntrinsic(Intrinsic.X86Por, high, low);
+        }
     }
 }
diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
index d35af20912..0fc8c39114 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
@@ -920,7 +920,19 @@ namespace ARMeilleure.Instructions
 
             if (op.Polynomial)
             {
-                EmitVectorBinaryLongOpI32(context, (op1, op2) => EmitPolynomialMultiply(context, op1, op2, 8 << op.Size), false);
+                if (op.Size == 0) // P8
+                {
+                    EmitVectorBinaryLongOpI32(context, (op1, op2) => EmitPolynomialMultiply(context, op1, op2, 8 << op.Size), false);
+                }
+                else /* if (op.Size == 2) // P64 */
+                {
+                    Operand ne = context.VectorExtract(OperandType.I64, GetVec(op.Qn), op.Vn & 1);
+                    Operand me = context.VectorExtract(OperandType.I64, GetVec(op.Qm), op.Vm & 1);
+
+                    Operand res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.PolynomialMult64_128)), ne, me);
+
+                    context.Copy(GetVecA32(op.Qd), res);
+                }
             }
             else
             {
@@ -1366,27 +1378,5 @@ namespace ARMeilleure.Instructions
                 EmitVectorBinaryOpSimd32(context, genericEmit);
             }
         }
-
-        private static Operand EmitPolynomialMultiply(ArmEmitterContext context, Operand op1, Operand op2, int eSize)
-        {
-            Debug.Assert(eSize <= 32);
-
-            Operand result = eSize == 32 ? Const(0L) : Const(0);
-
-            if (eSize == 32)
-            {
-                op1 = context.ZeroExtend32(OperandType.I64, op1);
-                op2 = context.ZeroExtend32(OperandType.I64, op2);
-            }
-
-            for (int i = 0; i < eSize; i++)
-            {
-                Operand mask = context.BitwiseAnd(op1, Const(op1.Type, 1L << i));
-
-                result = context.BitwiseExclusiveOr(result, context.Multiply(op2, mask));
-            }
-
-            return result;
-        }
     }
 }
diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
index 3919505779..59d3dc294f 100644
--- a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
@@ -1167,5 +1167,27 @@ namespace ARMeilleure.Instructions
 
             return res;
         }
+
+        public static Operand EmitPolynomialMultiply(ArmEmitterContext context, Operand op1, Operand op2, int eSize)
+        {
+            Debug.Assert(eSize <= 32);
+
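+            // Carry-less (GF(2)) multiply: op2 * (op1 & (1 << i)) equals op2 << i when bit i
+            // of op1 is set and 0 otherwise, so XOR-accumulating it over all i gives the
+            // polynomial product. 32-bit elements are widened first, since the product of
+            // two eSize-bit values is up to 2 * eSize bits wide.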
+            Operand result = eSize == 32 ? Const(0L) : Const(0);
+
+            if (eSize == 32)
+            {
+                op1 = context.ZeroExtend32(OperandType.I64, op1);
+                op2 = context.ZeroExtend32(OperandType.I64, op2);
+            }
+
+            for (int i = 0; i < eSize; i++)
+            {
+                Operand mask = context.BitwiseAnd(op1, Const(op1.Type, 1L << i));
+
+                result = context.BitwiseExclusiveOr(result, context.Multiply(op2, mask));
+            }
+
+            return result;
+        }
     }
 }
diff --git a/ARMeilleure/Instructions/InstName.cs b/ARMeilleure/Instructions/InstName.cs
index 41bb51f356..a0ec9dc394 100644
--- a/ARMeilleure/Instructions/InstName.cs
+++ b/ARMeilleure/Instructions/InstName.cs
@@ -296,6 +296,7 @@ namespace ARMeilleure.Instructions
         Orn_V,
         Orr_V,
         Orr_Vi,
+        Pmull_V,
         Raddhn_V,
         Rbit_V,
         Rev16_V,
diff --git a/ARMeilleure/Instructions/SoftFallback.cs b/ARMeilleure/Instructions/SoftFallback.cs
index ef00fd9d74..1d8fa2e238 100644
--- a/ARMeilleure/Instructions/SoftFallback.cs
+++ b/ARMeilleure/Instructions/SoftFallback.cs
@@ -1260,5 +1260,22 @@ namespace ARMeilleure.Instructions
                 : (uint)(value >> 32);
         }
         #endregion
+
+        public static V128 PolynomialMult64_128(ulong op1, ulong op2)
+        {
+            V128 result = V128.Zero;
+
+            V128 op2_128 = new V128(op2, 0);
+
+            for (int i = 0; i < 64; i++)
+            {
+                if (((op1 >> i) & 1) == 1)
+                {
+                    result ^= op2_128 << i;
+                }
+            }
+
+            return result;
+        }
     }
 }
diff --git a/ARMeilleure/State/V128.cs b/ARMeilleure/State/V128.cs
index 399cea1398..3fa9f9a999 100644
--- a/ARMeilleure/State/V128.cs
+++ b/ARMeilleure/State/V128.cs
@@ -189,6 +189,11 @@ namespace ARMeilleure.State
        /// </remarks>
        public static V128 operator <<(V128 x, int shift)
        {
+            if (shift == 0)
+            {
+                return new V128(x._e0, x._e1);
+            }
+
            ulong shiftOut = x._e0 >> (64 - shift);
 
            return new V128(x._e0 << shift, (x._e1 << shift) | shiftOut);
@@ -205,6 +210,11 @@ namespace ARMeilleure.State
        /// </remarks>
        public static V128 operator >>(V128 x, int shift)
        {
+            if (shift == 0)
+            {
+                return new V128(x._e0, x._e1);
+            }
+
            ulong shiftOut = x._e1 & ((1UL << shift) - 1);
 
            return new V128((x._e0 >> shift) | (shiftOut << (64 - shift)), x._e1 >> shift);
diff --git a/ARMeilleure/Translation/Delegates.cs b/ARMeilleure/Translation/Delegates.cs
index a45192df39..1097ea5896 100644
--- a/ARMeilleure/Translation/Delegates.cs
+++ b/ARMeilleure/Translation/Delegates.cs
@@ -171,6 +171,7 @@ namespace ARMeilleure.Translation
             SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.HashUpper)));
             SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.InverseMixColumns)));
             SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.MixColumns)));
+            SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.PolynomialMult64_128)));
             SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.Round)));
             SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.RoundF)));
             SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.SatF32ToS32)));
diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs
index 5bb8cf6dde..f382cc637e 100644
--- a/ARMeilleure/Translation/PTC/Ptc.cs
+++ b/ARMeilleure/Translation/PTC/Ptc.cs
@@ -22,7 +22,7 @@ namespace ARMeilleure.Translation.PTC
     {
         private const string HeaderMagic = "PTChd";
 
-        private const int InternalVersion = 1814; //! To be incremented manually for each change to the ARMeilleure project.
+        private const int InternalVersion = 1817; //! To be incremented manually for each change to the ARMeilleure project.
 
         private const string ActualDir = "0";
         private const string BackupDir = "1";
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs b/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs
index 828c1bf960..0daeb1d103 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs
@@ -60,6 +60,13 @@ namespace Ryujinx.Tests.Cpu
                                  0x8080808080808080ul, 0xFFFFFFFFFFFFFFFFul };
         }
 
+        private static ulong[] _8B1D_()
+        {
+            return new ulong[] { 0x0000000000000000ul, 0x7F7F7F7F7F7F7F7Ful,
+                                 0x8080808080808080ul, 0x7FFFFFFFFFFFFFFFul,
+                                 0x8000000000000000ul, 0xFFFFFFFFFFFFFFFFul };
+        }
+
         private static ulong[] _8B4H2S_()
         {
             return new ulong[] { 0x0000000000000000ul, 0x7F7F7F7F7F7F7F7Ful,
@@ -1977,6 +1984,33 @@ namespace Ryujinx.Tests.Cpu
             CompareAgainstUnicorn();
         }
 
+        [Test, Pairwise, Description("PMULL{2} <Vd>.<Ta>, <Vn>.<Tb>, <Vm>.<Tb>")]
+        public void Pmull_V([Values(0u)] uint rd,
+                            [Values(1u, 0u)] uint rn,
+                            [Values(2u, 0u)] uint rm,
+                            [ValueSource(nameof(_8B1D_))] [Random(RndCnt)] ulong z0,
+                            [ValueSource(nameof(_8B1D_))] [Random(RndCnt)] ulong z1,
+                            [ValueSource(nameof(_8B1D_))] [Random(RndCnt)] ulong a0,
+                            [ValueSource(nameof(_8B1D_))] [Random(RndCnt)] ulong a1,
+                            [ValueSource(nameof(_8B1D_))] [Random(RndCnt)] ulong b0,
+                            [ValueSource(nameof(_8B1D_))] [Random(RndCnt)] ulong b1,
+                            [Values(0b00u, 0b11u)] uint size, // Q0: <8B, 1D> => <8H, 1Q>
+                            [Values(0b0u, 0b1u)] uint q)      // Q1: <16B, 2D> => <8H, 1Q>
+        {
+            uint opcode = 0x0E20E000; // PMULL V0.8H, V0.8B, V0.8B
+            opcode |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0);
+            opcode |= ((size & 3) << 22);
+            opcode |= ((q & 1) << 30);
+
+            V128 v0 = MakeVectorE0E1(z0, z1);
+            V128 v1 = MakeVectorE0E1(a0, a1);
+            V128 v2 = MakeVectorE0E1(b0, b1);
+
+            SingleOpcode(opcode, v0: v0, v1: v1, v2: v2);
+
+            CompareAgainstUnicorn();
+        }
+
         [Test, Pairwise, Description("RADDHN{2} <Vd>.<Tb>, <Vn>.<Ta>, <Vm>.<Ta>")]
         public void Raddhn_V_8H8B_4S4H_2D2S([Values(0u)] uint rd,
                                             [Values(1u, 0u)] uint rn,
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs b/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs
index e8298521a7..ed1c0f74c7 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs
@@ -100,6 +100,13 @@ namespace Ryujinx.Tests.Cpu
         #endregion
 
         #region "ValueSource (Types)"
+        private static ulong[] _8B1D_()
+        {
+            return new ulong[] { 0x0000000000000000ul, 0x7F7F7F7F7F7F7F7Ful,
+                                 0x8080808080808080ul, 0x7FFFFFFFFFFFFFFFul,
+                                 0x8000000000000000ul, 0xFFFFFFFFFFFFFFFFul };
+        }
+
         private static ulong[] _8B4H2S1D_()
         {
             return new ulong[] { 0x0000000000000000ul, 0x7F7F7F7F7F7F7F7Ful,
@@ -530,6 +537,36 @@ namespace Ryujinx.Tests.Cpu
             CompareAgainstUnicorn();
         }
 
+        [Test, Pairwise, Description("VMULL.<P8, P64> <Qd>, <Dn>, <Dm>")]
+        public void Vmull_I_P8_P64([Values(0u, 1u)] uint rd,
+                                   [Values(0u, 1u)] uint rn,
+                                   [Values(0u, 1u)] uint rm,
+                                   [ValueSource(nameof(_8B1D_))] [Random(RndCnt)] ulong d0,
+                                   [ValueSource(nameof(_8B1D_))] [Random(RndCnt)] ulong d1,
+                                   [Values(0u/*, 2u*/)] uint size) // <P8, P64>
+        {
+            /*if (size == 2u)
+            {
+                Assert.Ignore("Ryujinx.Tests.Unicorn.UnicornException : Invalid instruction (UC_ERR_INSN_INVALID)");
+            }*/
+
+            uint opcode = 0xf2800e00u; // VMULL.P8 Q0, D0, D0
+
+            rd >>= 1; rd <<= 1;
+
+            opcode |= (((rd & 0x10) << 18) | (rd & 0xf) << 12);
+            opcode |= (((rn & 0x10) << 3) | (rn & 0xf) << 16);
+            opcode |= (((rm & 0x10) << 1) | (rm & 0xf) << 0);
+
+            opcode |= (size & 0x3) << 20;
+
+            V128 v0 = MakeVectorE0E1(d0, d1);
+
+            SingleOpcode(opcode, v0: v0);
+
+            CompareAgainstUnicorn();
+        }
+
         [Test, Pairwise, Description("VSHL.<size> {<Vd>}, <Vm>, <Vn>")]
         public void Vshl([Values(0u)] uint rd,
                          [Values(1u, 0u)] uint rn,