From 430ba6da65a781196db7d723cc88710bb7f5caf8 Mon Sep 17 00:00:00 2001
From: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>
Date: Mon, 4 Jan 2021 23:45:54 +0100
Subject: [PATCH] CPU (A64): Add Pmull_V Inst. with Clmul fast path for the
 "1/2D -> 1Q" variant & Sse fast path and slow path for both the "8/16B -> 8H"
 and "1/2D -> 1Q" variants; with Test. (#1817)

* Add Pmull_V Sse fast path only, both "8/16B -> 8H" and "1/2D -> 1Q" variants; with Test.

* Add Clmul fast path for the 128-bit variant.

* Small optimisation (saves 60 instructions) in the Sse fast path for the 128-bit variant.

* Add slow path, both variants. Fix V128 Shl/Shr when shift = 0.

* A32: Add Vmull_I P64 variant (slow path); not tested.

* A32: Add Vmull_I_P8_P64 Test and fix P64 variant.
---
 ARMeilleure/Decoders/OpCodeTable.cs           |   4 +-
 .../Instructions/InstEmitSimdArithmetic.cs    | 125 ++++++++++++++++++
 .../Instructions/InstEmitSimdArithmetic32.cs  |  36 ++---
 .../Instructions/InstEmitSimdHelper32.cs      |  22 +++
 ARMeilleure/Instructions/InstName.cs          |   1 +
 ARMeilleure/Instructions/SoftFallback.cs      |  17 +++
 ARMeilleure/State/V128.cs                     |  10 ++
 ARMeilleure/Translation/Delegates.cs          |   1 +
 ARMeilleure/Translation/PTC/Ptc.cs            |   2 +-
 Ryujinx.Tests/Cpu/CpuTestSimdReg.cs           |  34 +++++
 Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs         |  37 ++++++
 11 files changed, 264 insertions(+), 25 deletions(-)

diff --git a/ARMeilleure/Decoders/OpCodeTable.cs b/ARMeilleure/Decoders/OpCodeTable.cs
index 665e71290d..b19124851a 100644
--- a/ARMeilleure/Decoders/OpCodeTable.cs
+++ b/ARMeilleure/Decoders/OpCodeTable.cs
@@ -413,6 +413,8 @@ namespace ARMeilleure.Decoders
             SetA64("0x001110101xxxxx000111xxxxxxxxxx", InstName.Orr_V,           InstEmit.Orr_V,           OpCodeSimdReg.Create);
             SetA64("0x00111100000xxx0xx101xxxxxxxxxx", InstName.Orr_Vi,          InstEmit.Orr_Vi,          OpCodeSimdImm.Create);
             SetA64("0x00111100000xxx10x101xxxxxxxxxx", InstName.Orr_Vi,          InstEmit.Orr_Vi,          OpCodeSimdImm.Create);
+            SetA64("0x001110001xxxxx111000xxxxxxxxxx", InstName.Pmull_V,         InstEmit.Pmull_V,         OpCodeSimdReg.Create);
+            SetA64("0x001110111xxxxx111000xxxxxxxxxx", InstName.Pmull_V,         InstEmit.Pmull_V,         OpCodeSimdReg.Create);
             SetA64("0x101110<<1xxxxx010000xxxxxxxxxx", InstName.Raddhn_V,        InstEmit.Raddhn_V,        OpCodeSimdReg.Create);
             SetA64("0x10111001100000010110xxxxxxxxxx", InstName.Rbit_V,          InstEmit.Rbit_V,          OpCodeSimd.Create);
             SetA64("0x00111000100000000110xxxxxxxxxx", InstName.Rev16_V,         InstEmit.Rev16_V,         OpCodeSimd.Create);
@@ -886,7 +888,7 @@ namespace ARMeilleure.Decoders
             SetA32("111100110x00xxxxxxxx1101xxx1xxxx", InstName.Vmul,     InstEmit32.Vmul_V,   OpCode32SimdReg.Create);
             SetA32("1111001x1x<<xxxxxxx01010x1x0xxxx", InstName.Vmull,    InstEmit32.Vmull_1,  OpCode32SimdRegElemLong.Create);
             SetA32("1111001x1x<<xxxxxxx01100x0x0xxxx", InstName.Vmull,    InstEmit32.Vmull_I,  OpCode32SimdRegLong.Create);
-            SetA32("111100101x00xxxxxxx01110x0x0xxxx", InstName.Vmull,    InstEmit32.Vmull_I,  OpCode32SimdRegLong.Create); // Polynomial
+            SetA32("111100101xx0xxxxxxx01110x0x0xxxx", InstName.Vmull,    InstEmit32.Vmull_I,  OpCode32SimdRegLong.Create); // P8/P64
             SetA32("111100111x110000xxxx01011xx0xxxx", InstName.Vmvn,     InstEmit32.Vmvn_I,   OpCode32SimdBinary.Create);
             SetA32("1111001x1x000xxxxxxx0xx00x11xxxx", InstName.Vmvn,     InstEmit32.Vmvn_II,  OpCode32SimdImm.Create); // D/Q vector I32.
             SetA32("1111001x1x000xxxxxxx10x00x11xxxx", InstName.Vmvn,     InstEmit32.Vmvn_II,  OpCode32SimdImm.Create);
diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
index 3a97bc526a..88be07bdd3 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
@@ -10,6 +10,7 @@ using System.Diagnostics;
 
 using static ARMeilleure.Instructions.InstEmitHelper;
 using static ARMeilleure.Instructions.InstEmitSimdHelper;
+using static ARMeilleure.Instructions.InstEmitSimdHelper32;
 using static ARMeilleure.IntermediateRepresentation.OperandHelper;
 
 namespace ARMeilleure.Instructions
@@ -1928,6 +1929,112 @@ namespace ARMeilleure.Instructions
             }
         }
 
+        public static void Pmull_V(ArmEmitterContext context) // Emits PMULL{2}: polynomial (carry-less) multiply long, "8/16B -> 8H" (Size == 0) and "1/2D -> 1Q" (Size == 3).
+        {
+            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
+
+            if (Optimizations.UsePclmulqdq && op.Size == 3) // Fast path: a single PCLMULQDQ, 64-bit element variant only.
+            {
+                Operand n = GetVec(op.Rn);
+                Operand m = GetVec(op.Rm);
+
+                int imm8 = op.RegisterSize == RegisterSize.Simd64 ? 0b0000_0000 : 0b0001_0001; // imm8 bits 0 and 4 pick the qword of each source: low qwords for Simd64, high qwords for Simd128.
+
+                Operand res = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, n, m, Const(imm8));
+
+                context.Copy(GetVec(op.Rd), res);
+            }
+            else if (Optimizations.UseSse41) // Sse fast path: shift-and-xor multiplication built from vector shifts and masks.
+            {
+                Operand n = GetVec(op.Rn);
+                Operand m = GetVec(op.Rm);
+
+                if (op.RegisterSize == RegisterSize.Simd64) // Either way the source 64 bits end up in the low half with the upper half zeroed.
+                {
+                    n = context.VectorZeroUpper64(n);
+                    m = context.VectorZeroUpper64(m);
+                }
+                else /* if (op.RegisterSize == RegisterSize.Simd128) */
+                {
+                    n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8)); // Byte shift right by 8: the upper half moves down, zeros shift in.
+                    m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8));
+                }
+
+                Operand res = context.VectorZero(); // Xor accumulator for the partial products.
+
+                if (op.Size == 0)
+                {
+                    n = context.AddIntrinsic(Intrinsic.X86Pmovzxbw, n); // Zero-extend the 8 low bytes to 8 words so each product has a 16-bit lane.
+                    m = context.AddIntrinsic(Intrinsic.X86Pmovzxbw, m);
+
+                    for (int i = 0; i < 8; i++) // One iteration per multiplier bit.
+                    {
+                        Operand mask = context.AddIntrinsic(Intrinsic.X86Psllw, n, Const(15 - i)); // Move bit i of each n lane into the lane's sign bit.
+                                mask = context.AddIntrinsic(Intrinsic.X86Psraw, mask, Const(15)); // Arithmetic shift broadcasts it: all-ones if the bit was set, else zero.
+
+                        Operand tmp = context.AddIntrinsic(Intrinsic.X86Psllw, m, Const(i)); // Partial product: m shifted left by i in every lane.
+                                tmp = context.AddIntrinsic(Intrinsic.X86Pand, tmp, mask); // Kept only in lanes where bit i of n is set.
+
+                        res = context.AddIntrinsic(Intrinsic.X86Pxor, res, tmp);
+                    }
+                }
+                else /* if (op.Size == 3) */
+                {
+                    Operand zero = context.VectorZero();
+
+                    for (int i = 0; i < 64; i++) // Fully unrolled at emit time: one partial product per bit of n.
+                    {
+                        Operand mask = context.AddIntrinsic(Intrinsic.X86Movlhps, n, n); // Duplicate the low qword of n into both halves.
+                                mask = context.AddIntrinsic(Intrinsic.X86Psllq, mask, Const(63 - i)); // Bit i becomes the top bit of each qword...
+                                mask = context.AddIntrinsic(Intrinsic.X86Psrlq, mask, Const(63)); // ...then is isolated as 0 or 1.
+                                mask = context.AddIntrinsic(Intrinsic.X86Psubq, zero, mask); // 0 - bit: all-ones per qword if the bit was set, else zero.
+
+                        Operand tmp = EmitSse2Sll_128(context, m, i); // 128-bit left shift of m by i (the upper half of m is zero here).
+                                tmp = context.AddIntrinsic(Intrinsic.X86Pand, tmp, mask);
+
+                        res = context.AddIntrinsic(Intrinsic.X86Pxor, res, tmp);
+                    }
+                }
+
+                context.Copy(GetVec(op.Rd), res);
+            }
+            else // Slow path: per-element IR for the byte variant, managed call for the 64-bit variant.
+            {
+                Operand n = GetVec(op.Rn);
+                Operand m = GetVec(op.Rm);
+
+                Operand res;
+
+                if (op.Size == 0)
+                {
+                    res = context.VectorZero();
+
+                    int part = op.RegisterSize == RegisterSize.Simd64 ? 0 : 8; // Index of the first source byte: lower or upper half of the vector.
+
+                    for (int index = 0; index < 8; index++)
+                    {
+                        Operand ne = context.VectorExtract8(n, part + index);
+                        Operand me = context.VectorExtract8(m, part + index);
+
+                        Operand de = EmitPolynomialMultiply(context, ne, me, 8); // Helper reached through the InstEmitSimdHelper32 static import.
+
+                        res = EmitVectorInsert(context, res, de, index, 1); // Presumably size 1 selects 16-bit destination elements (8H) - confirm against EmitVectorInsert.
+                    }
+                }
+                else /* if (op.Size == 3) */
+                {
+                    int part = op.RegisterSize == RegisterSize.Simd64 ? 0 : 1; // Index of the source qword: lower or upper.
+
+                    Operand ne = context.VectorExtract(OperandType.I64, n, part);
+                    Operand me = context.VectorExtract(OperandType.I64, m, part);
+
+                    res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.PolynomialMult64_128)), ne, me); // The 64x64 -> 128 bit product is computed in managed code.
+                }
+
+                context.Copy(GetVec(op.Rd), res);
+            }
+        }
+
         public static void Raddhn_V(ArmEmitterContext context)
         {
             EmitHighNarrow(context, (op1, op2) => context.Add(op1, op2), round: true);
@@ -3690,5 +3797,23 @@ namespace ARMeilleure.Instructions
 
             context.Copy(GetVec(op.Rd), res);
         }
+
+        private static Operand EmitSse2Sll_128(ArmEmitterContext context, Operand op, int shift) // Emits a left shift of the 128-bit vector op by "shift" bits (0..63) and returns the result.
+        {
+            // The upper part of op is assumed to be zero.
+            Debug.Assert(shift >= 0 && shift < 64);
+
+            if (shift == 0)
+            {
+                return op; // Nothing to shift: reuse the operand without emitting any instruction.
+            }
+
+            Operand high = context.AddIntrinsic(Intrinsic.X86Pslldq, op, Const(8)); // Byte shift left by 8: the low qword moves into the upper qword.
+                    high = context.AddIntrinsic(Intrinsic.X86Psrlq, high, Const(64 - shift)); // Keeps only the bits that carry out of the low qword.
+
+            Operand low = context.AddIntrinsic(Intrinsic.X86Psllq, op, Const(shift)); // Per-qword shift; the upper qword stays zero given the assumption above.
+
+            return context.AddIntrinsic(Intrinsic.X86Por, high, low);
+        }
     }
 }
diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
index d35af20912..0fc8c39114 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
@@ -920,7 +920,19 @@ namespace ARMeilleure.Instructions
 
             if (op.Polynomial)
             {
-                EmitVectorBinaryLongOpI32(context, (op1, op2) => EmitPolynomialMultiply(context, op1, op2, 8 << op.Size), false);
+                if (op.Size == 0) // P8
+                {
+                    EmitVectorBinaryLongOpI32(context, (op1, op2) => EmitPolynomialMultiply(context, op1, op2, 8 << op.Size), false);
+                }
+                else /* if (op.Size == 2) // P64 */
+                {
+                    Operand ne = context.VectorExtract(OperandType.I64, GetVec(op.Qn), op.Vn & 1);
+                    Operand me = context.VectorExtract(OperandType.I64, GetVec(op.Qm), op.Vm & 1);
+
+                    Operand res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.PolynomialMult64_128)), ne, me);
+
+                    context.Copy(GetVecA32(op.Qd), res);
+                }
             }
             else
             {
@@ -1366,27 +1378,5 @@ namespace ARMeilleure.Instructions
                 EmitVectorBinaryOpSimd32(context, genericEmit);
             }
         }
-
-        private static Operand EmitPolynomialMultiply(ArmEmitterContext context, Operand op1, Operand op2, int eSize)
-        {
-            Debug.Assert(eSize <= 32);
-
-            Operand result = eSize == 32 ? Const(0L) : Const(0);
-
-            if (eSize == 32)
-            {
-                op1 = context.ZeroExtend32(OperandType.I64, op1);
-                op2 = context.ZeroExtend32(OperandType.I64, op2);
-            }
-
-            for (int i = 0; i < eSize; i++)
-            {
-                Operand mask = context.BitwiseAnd(op1, Const(op1.Type, 1L << i));
-
-                result = context.BitwiseExclusiveOr(result, context.Multiply(op2, mask));
-            }
-
-            return result;
-        }
     }
 }
diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
index 3919505779..59d3dc294f 100644
--- a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
@@ -1167,5 +1167,27 @@ namespace ARMeilleure.Instructions
 
             return res;
         }
+
+        public static Operand EmitPolynomialMultiply(ArmEmitterContext context, Operand op1, Operand op2, int eSize) // Emits a carry-less (polynomial) multiply of op2 by the low eSize bits of op1 and returns the product operand.
+        {
+            Debug.Assert(eSize <= 32);
+
+            Operand result = eSize == 32 ? Const(0L) : Const(0); // A long constant is used for 32-bit elements, matching the I64 operands produced below.
+
+            if (eSize == 32)
+            {
+                op1 = context.ZeroExtend32(OperandType.I64, op1);
+                op2 = context.ZeroExtend32(OperandType.I64, op2);
+            }
+
+            for (int i = 0; i < eSize; i++) // Unrolled at emit time: one and/multiply/xor per multiplier bit.
+            {
+                Operand mask = context.BitwiseAnd(op1, Const(op1.Type, 1L << i)); // Isolates bit i of op1: either 0 or 1 << i.
+
+                result = context.BitwiseExclusiveOr(result, context.Multiply(op2, mask)); // op2 * (1 << i) equals op2 << i, so each set bit xors in a shifted copy of op2.
+            }
+
+            return result;
+        }
     }
 }
diff --git a/ARMeilleure/Instructions/InstName.cs b/ARMeilleure/Instructions/InstName.cs
index 41bb51f356..a0ec9dc394 100644
--- a/ARMeilleure/Instructions/InstName.cs
+++ b/ARMeilleure/Instructions/InstName.cs
@@ -296,6 +296,7 @@ namespace ARMeilleure.Instructions
         Orn_V,
         Orr_V,
         Orr_Vi,
+        Pmull_V,
         Raddhn_V,
         Rbit_V,
         Rev16_V,
diff --git a/ARMeilleure/Instructions/SoftFallback.cs b/ARMeilleure/Instructions/SoftFallback.cs
index ef00fd9d74..1d8fa2e238 100644
--- a/ARMeilleure/Instructions/SoftFallback.cs
+++ b/ARMeilleure/Instructions/SoftFallback.cs
@@ -1260,5 +1260,22 @@ namespace ARMeilleure.Instructions
                 : (uint)(value >> 32);
         }
 #endregion
+
+        public static V128 PolynomialMult64_128(ulong op1, ulong op2) // Carry-less (polynomial) multiply of two 64-bit values, returned as a 128-bit result.
+        {
+            V128 result = V128.Zero;
+
+            V128 op2_128 = new V128(op2, 0); // op2 placed in the first (low) element so shifted copies keep the bits that leave the low 64.
+
+            for (int i = 0; i < 64; i++)
+            {
+                if (((op1 >> i) & 1) == 1) // Bit i of op1 is set.
+                {
+                    result ^= op2_128 << i; // Xor in op2 shifted left by i (i == 0 relies on the shift-by-zero handling of the V128 operator).
+                }
+            }
+
+            return result;
+        }
     }
 }
diff --git a/ARMeilleure/State/V128.cs b/ARMeilleure/State/V128.cs
index 399cea1398..3fa9f9a999 100644
--- a/ARMeilleure/State/V128.cs
+++ b/ARMeilleure/State/V128.cs
@@ -189,6 +189,11 @@ namespace ARMeilleure.State
         /// </remarks>
         public static V128 operator <<(V128 x, int shift)
         {
+            if (shift == 0)
+            {
+                return new V128(x._e0, x._e1);
+            }
+
             ulong shiftOut = x._e0 >> (64 - shift);
 
             return new V128(x._e0 << shift, (x._e1 << shift) | shiftOut);
@@ -205,6 +210,11 @@ namespace ARMeilleure.State
         /// </remarks>
         public static V128 operator >>(V128 x, int shift)
         {
+            if (shift == 0)
+            {
+                return new V128(x._e0, x._e1);
+            }
+
             ulong shiftOut = x._e1 & ((1UL << shift) - 1);
 
             return new V128((x._e0 >> shift) | (shiftOut << (64 - shift)), x._e1 >> shift);
diff --git a/ARMeilleure/Translation/Delegates.cs b/ARMeilleure/Translation/Delegates.cs
index a45192df39..1097ea5896 100644
--- a/ARMeilleure/Translation/Delegates.cs
+++ b/ARMeilleure/Translation/Delegates.cs
@@ -171,6 +171,7 @@ namespace ARMeilleure.Translation
             SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.HashUpper)));
             SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.InverseMixColumns)));
             SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.MixColumns)));
+            SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.PolynomialMult64_128)));
             SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.Round)));
             SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.RoundF)));
             SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.SatF32ToS32)));
diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs
index 5bb8cf6dde..f382cc637e 100644
--- a/ARMeilleure/Translation/PTC/Ptc.cs
+++ b/ARMeilleure/Translation/PTC/Ptc.cs
@@ -22,7 +22,7 @@ namespace ARMeilleure.Translation.PTC
     {
         private const string HeaderMagic = "PTChd";
 
-        private const int InternalVersion = 1814; //! To be incremented manually for each change to the ARMeilleure project.
+        private const int InternalVersion = 1817; //! To be incremented manually for each change to the ARMeilleure project.
 
         private const string ActualDir = "0";
         private const string BackupDir = "1";
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs b/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs
index 828c1bf960..0daeb1d103 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs
@@ -60,6 +60,13 @@ namespace Ryujinx.Tests.Cpu
                                  0x8080808080808080ul, 0xFFFFFFFFFFFFFFFFul };
         }
 
+        private static ulong[] _8B1D_() // Value source of fixed 64-bit patterns: all zeros, all ones, 0x7F/0x80 in every byte, and 0x7FFF.../0x8000... as whole 64-bit values.
+        {
+            return new ulong[] { 0x0000000000000000ul, 0x7F7F7F7F7F7F7F7Ful,
+                                 0x8080808080808080ul, 0x7FFFFFFFFFFFFFFFul,
+                                 0x8000000000000000ul, 0xFFFFFFFFFFFFFFFFul };
+        }
+
         private static ulong[] _8B4H2S_()
         {
             return new ulong[] { 0x0000000000000000ul, 0x7F7F7F7F7F7F7F7Ful,
@@ -1977,6 +1984,33 @@ namespace Ryujinx.Tests.Cpu
             CompareAgainstUnicorn();
         }
 
+        [Test, Pairwise, Description("PMULL{2} <Vd>.<Ta>, <Vn>.<Tb>, <Vm>.<Tb>")] // Executes one PMULL/PMULL2 opcode and compares the resulting state against Unicorn.
+        public void Pmull_V([Values(0u)]     uint rd,
+                            [Values(1u, 0u)] uint rn,
+                            [Values(2u, 0u)] uint rm,
+                            [ValueSource(nameof(_8B1D_))] [Random(RndCnt)] ulong z0,
+                            [ValueSource(nameof(_8B1D_))] [Random(RndCnt)] ulong z1,
+                            [ValueSource(nameof(_8B1D_))] [Random(RndCnt)] ulong a0,
+                            [ValueSource(nameof(_8B1D_))] [Random(RndCnt)] ulong a1,
+                            [ValueSource(nameof(_8B1D_))] [Random(RndCnt)] ulong b0,
+                            [ValueSource(nameof(_8B1D_))] [Random(RndCnt)] ulong b1,
+                            [Values(0b00u, 0b11u)] uint size, // Q0: <8B,  1D> => <8H, 1Q>
+                            [Values(0b0u, 0b1u)]   uint q)    // Q1: <16B, 2D> => <8H, 1Q>
+        {
+            uint opcode = 0x0E20E000; // PMULL V0.8H, V0.8B, V0.8B
+            opcode |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); // Register fields: Rm at bit 16, Rn at bit 5, Rd at bit 0.
+            opcode |= ((size & 3) << 22); // Two-bit size field at bits 23:22.
+            opcode |= ((q & 1) << 30); // Q bit at bit 30.
+
+            V128 v0 = MakeVectorE0E1(z0, z1); // Initial contents of V0 (the destination, rd is always 0).
+            V128 v1 = MakeVectorE0E1(a0, a1);
+            V128 v2 = MakeVectorE0E1(b0, b1);
+
+            SingleOpcode(opcode, v0: v0, v1: v1, v2: v2);
+
+            CompareAgainstUnicorn();
+        }
+
         [Test, Pairwise, Description("RADDHN{2} <Vd>.<Tb>, <Vn>.<Ta>, <Vm>.<Ta>")]
         public void Raddhn_V_8H8B_4S4H_2D2S([Values(0u)]     uint rd,
                                             [Values(1u, 0u)] uint rn,
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs b/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs
index e8298521a7..ed1c0f74c7 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs
@@ -100,6 +100,13 @@ namespace Ryujinx.Tests.Cpu
 #endregion
 
 #region "ValueSource (Types)"
+        private static ulong[] _8B1D_() // Value source of fixed 64-bit patterns: all zeros, all ones, 0x7F/0x80 in every byte, and 0x7FFF.../0x8000... as whole 64-bit values.
+        {
+            return new ulong[] { 0x0000000000000000ul, 0x7F7F7F7F7F7F7F7Ful,
+                                 0x8080808080808080ul, 0x7FFFFFFFFFFFFFFFul,
+                                 0x8000000000000000ul, 0xFFFFFFFFFFFFFFFFul };
+        }
+
         private static ulong[] _8B4H2S1D_()
         {
             return new ulong[] { 0x0000000000000000ul, 0x7F7F7F7F7F7F7F7Ful,
@@ -530,6 +537,36 @@ namespace Ryujinx.Tests.Cpu
             CompareAgainstUnicorn();
         }
 
+        [Test, Pairwise, Description("VMULL.<P8, P64> <Qd>, <Dn>, <Dm>")] // Executes one polynomial VMULL opcode and compares the resulting state against Unicorn; only size 0 (P8) is currently enabled.
+        public void Vmull_I_P8_P64([Values(0u, 1u)] uint rd,
+                                   [Values(0u, 1u)] uint rn,
+                                   [Values(0u, 1u)] uint rm,
+                                   [ValueSource(nameof(_8B1D_))] [Random(RndCnt)] ulong d0,
+                                   [ValueSource(nameof(_8B1D_))] [Random(RndCnt)] ulong d1,
+                                   [Values(0u/*, 2u*/)] uint size) // <P8, P64>
+        {
+            /*if (size == 2u)
+            {
+                Assert.Ignore("Ryujinx.Tests.Unicorn.UnicornException : Invalid instruction (UC_ERR_INSN_INVALID)");
+            }*/
+
+            uint opcode = 0xf2800e00u; // VMULL.P8 Q0, D0, D0
+
+            rd >>= 1; rd <<= 1; // Clears bit 0 so the destination register number is even.
+
+            opcode |= (((rd & 0x10) << 18) | (rd & 0xf) << 12); // Destination: bit 4 of rd goes to opcode bit 22, low four bits to bits 15:12.
+            opcode |= (((rn & 0x10) << 3)  | (rn & 0xf) << 16); // First source: bit 4 of rn goes to opcode bit 7, low four bits to bits 19:16.
+            opcode |= (((rm & 0x10) << 1)  | (rm & 0xf) << 0); // Second source: bit 4 of rm goes to opcode bit 5, low four bits to bits 3:0.
+
+            opcode |= (size & 0x3) << 20; // Two-bit size field at bits 21:20.
+
+            V128 v0 = MakeVectorE0E1(d0, d1); // Only v0 is initialised; rn and rm (0 or 1) select its lower or upper 64 bits as D0 / D1.
+
+            SingleOpcode(opcode, v0: v0);
+
+            CompareAgainstUnicorn();
+        }
+
         [Test, Pairwise, Description("VSHL.<size> {<Vd>}, <Vm>, <Vn>")]
         public void Vshl([Values(0u)] uint rd,
                          [Values(1u, 0u)] uint rn,