Add VCLZ.* fast path (#1917)

* Add VCLZ fast path

* Add VCLZ.8B/16B SSSE3 fast path
* Add VCLZ.4H/8H SSSE3 fast path
* Add VCLZ.2S/4S SSE2 fast path

* Improve CLZ.4H/8H fast path

* Improve CLZ.2S/4S fast path

* Set PPTC version
This commit is contained in:
FICTURE7 2021-01-25 03:01:25 +04:00 committed by GitHub
parent f94acdb4ef
commit ddf1105bcb
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 145 additions and 9 deletions

View file

@ -120,24 +120,155 @@ namespace ARMeilleure.Instructions
{
OpCodeSimd op = (OpCodeSimd)context.CurrOp;
Operand res = context.VectorZero();
int elems = op.GetBytesCount() >> op.Size;
int eSize = 8 << op.Size;
for (int index = 0; index < elems; index++)
Operand res = eSize switch {
8 => Clz_V_I8 (context, GetVec(op.Rn)),
16 => Clz_V_I16(context, GetVec(op.Rn)),
32 => Clz_V_I32(context, GetVec(op.Rn)),
_ => null
};
if (res != null)
{
Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);
if (op.RegisterSize == RegisterSize.Simd64)
{
res = context.VectorZeroUpper64(res);
}
}
else
{
int elems = op.GetBytesCount() >> op.Size;
Operand de = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountLeadingZeros)), ne, Const(eSize));
res = context.VectorZero();
res = EmitVectorInsert(context, res, de, index, op.Size);
for (int index = 0; index < elems; index++)
{
Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);
Operand de = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountLeadingZeros)), ne, Const(eSize));
res = EmitVectorInsert(context, res, de, index, op.Size);
}
}
context.Copy(GetVec(op.Rd), res);
}
/// <summary>
/// Emits a per-byte count-leading-zeros sequence for <paramref name="arg"/> built around PSHUFB nibble lookups.
/// </summary>
/// <param name="context">Emitter context that receives the generated intrinsics.</param>
/// <param name="arg">Vector operand whose bytes are counted.</param>
/// <returns>
/// The operand holding the per-byte counts, or <c>null</c> when <c>Optimizations.UseSsse3</c> is false.
/// </returns>
private static Operand Clz_V_I8(ArmEmitterContext context, Operand arg)
{
    if (!Optimizations.UseSsse3)
    {
        return null;
    }

    Operand Emit(Intrinsic inst, Operand lhs, Operand rhs) => context.AddIntrinsic(inst, lhs, rhs);

    // Lookup table indexed by a nibble value: entries 0..7 are 4, 3, 2, 2, 1, 1, 1, 1.
    // NOTE(review): entries 8..15 must read as 0; this assumes X86GetScalar zero-fills the upper half - confirm.
    Operand nibbleClzLut = X86GetScalar(context, 0x01_01_01_01_02_02_03_04);

    Operand nibbleMask = X86GetAllElements(context, 0x0f_0f_0f_0f);
    Operand allFours   = X86GetAllElements(context, 0x04_04_04_04);

    // Leading-zero count of the low nibble of every byte.
    Operand lowCount = Emit(Intrinsic.X86Pshufb, nibbleClzLut, arg);

    // Bring the high nibble of every byte down into the low nibble position;
    // the mask clears the bits the 16-bit shift drags in from the neighbouring byte.
    Operand highNibble = Emit(Intrinsic.X86Pand, Emit(Intrinsic.X86Psrlw, arg, Const(4)), nibbleMask);

    // Leading-zero count of the high nibble of every byte.
    Operand highCount = Emit(Intrinsic.X86Pshufb, nibbleClzLut, highNibble);

    // The low-nibble count only contributes when the high nibble is entirely zero (its count is 4).
    Operand highIsZero = Emit(Intrinsic.X86Pcmpeqb, highCount, allFours);
    Operand keptLow    = Emit(Intrinsic.X86Pand, lowCount, highIsZero);

    return Emit(Intrinsic.X86Paddb, keptLow, highCount);
}
/// <summary>
/// Emits a per-halfword count-leading-zeros sequence for <paramref name="arg"/>,
/// combining the per-byte counts produced by <c>Clz_V_I8</c>.
/// </summary>
/// <param name="context">Emitter context that receives the generated intrinsics.</param>
/// <param name="arg">Vector operand whose 16-bit elements are counted.</param>
/// <returns>
/// The operand holding the per-halfword counts, or <c>null</c> when <c>Optimizations.UseSsse3</c> is false.
/// </returns>
private static Operand Clz_V_I16(ArmEmitterContext context, Operand arg)
{
    if (!Optimizations.UseSsse3)
    {
        return null;
    }

    // Shuffle control: for each 16-bit element, select its odd (high) byte into the low byte
    // and use index 0x80 for the high byte.
    Operand highByteSelector = X86GetElements(context, 0x80_0f_80_0d_80_0b_80_09, 0x80_07_80_05_80_03_80_01);
    Operand lowByteMask      = X86GetAllElements(context, 0x00ff_00ff);
    Operand allEights        = X86GetAllElements(context, 0x0008_0008);

    // Per-byte counts: each 16-bit element now holds the counts of its high and low byte.
    Operand byteCounts = Clz_V_I8(context, arg);

    // Count belonging to the low byte of each element.
    Operand lowCount = context.AddIntrinsic(Intrinsic.X86Pand, byteCounts, lowByteMask);

    // Count belonging to the high byte of each element, moved into the low byte.
    Operand highCount = context.AddIntrinsic(Intrinsic.X86Pshufb, byteCounts, highByteSelector);

    // The low-byte count only contributes when the high byte is entirely zero (its count is 8).
    Operand highIsZero = context.AddIntrinsic(Intrinsic.X86Pcmpeqw, highCount, allEights);
    Operand keptLow    = context.AddIntrinsic(Intrinsic.X86Pand, lowCount, highIsZero);

    return context.AddIntrinsic(Intrinsic.X86Paddw, keptLow, highCount);
}
/// <summary>
/// Emits a per-word count-leading-zeros sequence for <paramref name="arg"/> using shift, logic and add
/// intrinsics only: the highest set bit is smeared downwards, the result is inverted, and the remaining
/// set bits are counted.
/// </summary>
/// <param name="context">Emitter context that receives the generated intrinsics.</param>
/// <param name="arg">Vector operand whose 32-bit elements are counted.</param>
/// <returns>
/// The operand holding the per-word counts, or <c>null</c> when <c>Optimizations.UseSse2</c> is false.
/// </returns>
private static Operand Clz_V_I32(ArmEmitterContext context, Operand arg)
{
    // TODO: Use vplzcntd when AVX-512 is supported.

    if (!Optimizations.UseSse2)
    {
        return null;
    }

    Operand Emit(Intrinsic inst, Operand lhs, Operand rhs) => context.AddIntrinsic(inst, lhs, rhs);
    Operand ShiftRight(Operand value, int count) => Emit(Intrinsic.X86Psrld, value, Const(count));

    Operand mask55 = X86GetAllElements(context, 0x55555555);
    Operand mask33 = X86GetAllElements(context, 0x33333333);
    Operand mask0f = X86GetAllElements(context, 0x0f0f0f0f);
    Operand mask3f = X86GetAllElements(context, 0x0000003f);

    // Propagate the highest set bit of each element into every lower bit position
    // (shift distances 1, 2, 4, 8, 16).
    Operand smeared = arg;

    for (int distance = 1; distance <= 16; distance <<= 1)
    {
        smeared = Emit(Intrinsic.X86Por, ShiftRight(smeared, distance), smeared);
    }

    // Invert, so that only the former leading zeros remain set.
    // NOTE(review): relies on context.VectorOne() yielding an all-bits-set vector - confirm.
    Operand leading = Emit(Intrinsic.X86Pandn, smeared, context.VectorOne());

    // Population count of the inverted value, widening the partial sums step by step.
    Operand pairSums   = Emit(Intrinsic.X86Psubd, leading, Emit(Intrinsic.X86Pand, ShiftRight(leading, 1), mask55));
    Operand nibbleSums = Emit(Intrinsic.X86Paddd,
                              Emit(Intrinsic.X86Pand, ShiftRight(pairSums, 2), mask33),
                              Emit(Intrinsic.X86Pand, pairSums, mask33));
    Operand byteSums   = Emit(Intrinsic.X86Pand, Emit(Intrinsic.X86Paddd, ShiftRight(nibbleSums, 4), nibbleSums), mask0f);
    Operand halfSums   = Emit(Intrinsic.X86Paddd, ShiftRight(byteSums, 8), byteSums);
    Operand wordSums   = Emit(Intrinsic.X86Paddd, ShiftRight(halfSums, 16), halfSums);

    // Keep only the low six bits, which hold the final count; the upper bytes carry partial sums.
    return Emit(Intrinsic.X86Pand, wordSums, mask3f);
}
public static void Cnt_V(ArmEmitterContext context)
{
OpCodeSimd op = (OpCodeSimd)context.CurrOp;

View file

@ -209,6 +209,11 @@ namespace ARMeilleure.Instructions
}
/// <summary>
/// Signed convenience overload: casts both 64-bit values to <c>ulong</c> and forwards to the
/// <c>ulong</c> overload of <c>X86GetElements</c>.
/// </summary>
/// <param name="context">Emitter context passed through to the unsigned overload.</param>
/// <param name="e1">First 64-bit element value, forwarded as <c>e1</c>.</param>
/// <param name="e0">Second 64-bit element value, forwarded as <c>e0</c>.</param>
/// <returns>The operand returned by the unsigned overload.</returns>
public static Operand X86GetElements(ArmEmitterContext context, long e1, long e0)
    => X86GetElements(context, (ulong)e1, (ulong)e0);
public static Operand X86GetElements(ArmEmitterContext context, ulong e1, ulong e0)
{
Operand vector0 = context.VectorCreateScalar(Const(e0));
Operand vector1 = context.VectorCreateScalar(Const(e1));

View file

@ -22,7 +22,7 @@ namespace ARMeilleure.Translation.PTC
{
private const string HeaderMagic = "PTChd";
private const int InternalVersion = 1817; //! To be incremented manually for each change to the ARMeilleure project.
private const int InternalVersion = 1917; //! To be incremented manually for each change to the ARMeilleure project.
private const string ActualDir = "0";
private const string BackupDir = "1";