forked from Mirror/Ryujinx
17620d18db
* ARMeilleure: Add AVX512{F,VL,DQ,BW} detection Add `UseAvx512Ortho` and `UseAvx512OrthoFloat` optimization flags as short-hands for `F+VL` and `F+VL+DQ`. * ARMeilleure: Add initial support for EVEX instruction encoding Does not implement rounding, or exception controls. * ARMeilleure: Add `X86Vpternlogd` Accelerates the vector-`Not` instruction. * ARMeilleure: Add check for `OSXSAVE` for AVX{2,512} * ARMeilleure: Add check for `XCR0` flags Add XCR0 register checks for AVX and AVX512F, following the guidelines from section 14.3 and 15.2 from the Intel Architecture Software Developer's Manual. * ARMeilleure: Remove redundant `ReProtect` and `Dispose`, formatting * ARMeilleure: Move XCR0 procedure to GetXcr0Eax * ARMeilleure: Add `XCR0` to `FeatureInfo` structure * ARMeilleure: Utilize `ReadOnlySpan` for Xcr0 assembly Avoids an additional allocation * ARMeilleure: Formatting fixes * ARMeilleure: Fix EVEX encoding src2 register index > Just like in VEX prefix, vvvv is provided in inverted form. * ARMeilleure: Add `X86Vpternlogd` acceleration to `Vmvn_I` Passes unit tests, verified instruction utilization * ARMeilleure: Fix EVEX register operand designations Operand 2 was being sourced improperly. EVEX encoded instructions source their operands like so: Operand 1: ModRM:reg Operand 2: EVEX.vvvvv Operand 3: ModRM:r/m Operand 4: Imm This fixes the improper register designations when emitting vpternlog. Now "dest", "src1", "src2" arguments emit in the proper order in EVEX instructions. * ARMeilleure: Add `X86Vpternlogd` acceleration to `Orn_V` * ARMeilleure: PTC version bump * ARMeilleure: Update EVEX encoding Debug.Assert to Debug.Fail * ARMeilleure: Update EVEX encoding comment capitalization
266 lines
9.5 KiB
C#
266 lines
9.5 KiB
C#
using ARMeilleure.Decoders;
|
|
using ARMeilleure.IntermediateRepresentation;
|
|
using ARMeilleure.Translation;
|
|
|
|
using static ARMeilleure.Instructions.InstEmitHelper;
|
|
using static ARMeilleure.Instructions.InstEmitSimdHelper;
|
|
using static ARMeilleure.Instructions.InstEmitSimdHelper32;
|
|
using static ARMeilleure.IntermediateRepresentation.Operand.Factory;
|
|
|
|
namespace ARMeilleure.Instructions
|
|
{
|
|
static partial class InstEmit32
|
|
{
|
|
public static void Vand_I(ArmEmitterContext context)
|
|
{
|
|
if (Optimizations.UseAdvSimd)
|
|
{
|
|
InstEmitSimdHelper32Arm64.EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(Intrinsic.Arm64AndV | Intrinsic.Arm64V128, n, m));
|
|
}
|
|
else if (Optimizations.UseSse2)
|
|
{
|
|
EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(Intrinsic.X86Pand, n, m));
|
|
}
|
|
else
|
|
{
|
|
EmitVectorBinaryOpZx32(context, (op1, op2) => context.BitwiseAnd(op1, op2));
|
|
}
|
|
}
|
|
|
|
public static void Vbic_I(ArmEmitterContext context)
|
|
{
|
|
if (Optimizations.UseAdvSimd)
|
|
{
|
|
InstEmitSimdHelper32Arm64.EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(Intrinsic.Arm64BicV | Intrinsic.Arm64V128, n, m));
|
|
}
|
|
else if (Optimizations.UseSse2)
|
|
{
|
|
EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(Intrinsic.X86Pandn, m, n));
|
|
}
|
|
else
|
|
{
|
|
EmitVectorBinaryOpZx32(context, (op1, op2) => context.BitwiseAnd(op1, context.BitwiseNot(op2)));
|
|
}
|
|
}
|
|
|
|
public static void Vbic_II(ArmEmitterContext context)
|
|
{
|
|
OpCode32SimdImm op = (OpCode32SimdImm)context.CurrOp;
|
|
|
|
long immediate = op.Immediate;
|
|
|
|
// Replicate fields to fill the 64-bits, if size is < 64-bits.
|
|
switch (op.Size)
|
|
{
|
|
case 0: immediate *= 0x0101010101010101L; break;
|
|
case 1: immediate *= 0x0001000100010001L; break;
|
|
case 2: immediate *= 0x0000000100000001L; break;
|
|
}
|
|
|
|
Operand imm = Const(immediate);
|
|
Operand res = GetVecA32(op.Qd);
|
|
|
|
if (op.Q)
|
|
{
|
|
for (int elem = 0; elem < 2; elem++)
|
|
{
|
|
Operand de = EmitVectorExtractZx(context, op.Qd, elem, 3);
|
|
|
|
res = EmitVectorInsert(context, res, context.BitwiseAnd(de, context.BitwiseNot(imm)), elem, 3);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
Operand de = EmitVectorExtractZx(context, op.Qd, op.Vd & 1, 3);
|
|
|
|
res = EmitVectorInsert(context, res, context.BitwiseAnd(de, context.BitwiseNot(imm)), op.Vd & 1, 3);
|
|
}
|
|
|
|
context.Copy(GetVecA32(op.Qd), res);
|
|
}
|
|
|
|
public static void Vbif(ArmEmitterContext context)
|
|
{
|
|
if (Optimizations.UseAdvSimd)
|
|
{
|
|
InstEmitSimdHelper32Arm64.EmitVectorTernaryOpSimd32(context, (d, n, m) => context.AddIntrinsic(Intrinsic.Arm64BifV | Intrinsic.Arm64V128, d, n, m));
|
|
}
|
|
else
|
|
{
|
|
EmitBifBit(context, true);
|
|
}
|
|
}
|
|
|
|
public static void Vbit(ArmEmitterContext context)
|
|
{
|
|
if (Optimizations.UseAdvSimd)
|
|
{
|
|
InstEmitSimdHelper32Arm64.EmitVectorTernaryOpSimd32(context, (d, n, m) => context.AddIntrinsic(Intrinsic.Arm64BitV | Intrinsic.Arm64V128, d, n, m));
|
|
}
|
|
else
|
|
{
|
|
EmitBifBit(context, false);
|
|
}
|
|
}
|
|
|
|
public static void Vbsl(ArmEmitterContext context)
|
|
{
|
|
if (Optimizations.UseAdvSimd)
|
|
{
|
|
InstEmitSimdHelper32Arm64.EmitVectorTernaryOpSimd32(context, (d, n, m) => context.AddIntrinsic(Intrinsic.Arm64BslV | Intrinsic.Arm64V128, d, n, m));
|
|
}
|
|
else if (Optimizations.UseSse2)
|
|
{
|
|
EmitVectorTernaryOpSimd32(context, (d, n, m) =>
|
|
{
|
|
Operand res = context.AddIntrinsic(Intrinsic.X86Pxor, n, m);
|
|
res = context.AddIntrinsic(Intrinsic.X86Pand, res, d);
|
|
return context.AddIntrinsic(Intrinsic.X86Pxor, res, m);
|
|
});
|
|
}
|
|
else
|
|
{
|
|
EmitVectorTernaryOpZx32(context, (op1, op2, op3) =>
|
|
{
|
|
return context.BitwiseExclusiveOr(
|
|
context.BitwiseAnd(op1,
|
|
context.BitwiseExclusiveOr(op2, op3)), op3);
|
|
});
|
|
}
|
|
}
|
|
|
|
public static void Veor_I(ArmEmitterContext context)
|
|
{
|
|
if (Optimizations.UseAdvSimd)
|
|
{
|
|
InstEmitSimdHelper32Arm64.EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(Intrinsic.Arm64EorV | Intrinsic.Arm64V128, n, m));
|
|
}
|
|
else if (Optimizations.UseSse2)
|
|
{
|
|
EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(Intrinsic.X86Pxor, n, m));
|
|
}
|
|
else
|
|
{
|
|
EmitVectorBinaryOpZx32(context, (op1, op2) => context.BitwiseExclusiveOr(op1, op2));
|
|
}
|
|
}
|
|
|
|
public static void Vorn_I(ArmEmitterContext context)
|
|
{
|
|
if (Optimizations.UseAdvSimd)
|
|
{
|
|
InstEmitSimdHelper32Arm64.EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(Intrinsic.Arm64OrnV | Intrinsic.Arm64V128, n, m));
|
|
}
|
|
else if (Optimizations.UseAvx512Ortho)
|
|
{
|
|
EmitVectorBinaryOpSimd32(context, (n, m) =>
|
|
{
|
|
return context.AddIntrinsic(Intrinsic.X86Vpternlogd, n, m, Const(0b11001100 | ~0b10101010));
|
|
});
|
|
}
|
|
else if (Optimizations.UseSse2)
|
|
{
|
|
Operand mask = context.VectorOne();
|
|
|
|
EmitVectorBinaryOpSimd32(context, (n, m) =>
|
|
{
|
|
m = context.AddIntrinsic(Intrinsic.X86Pandn, m, mask);
|
|
return context.AddIntrinsic(Intrinsic.X86Por, n, m);
|
|
});
|
|
}
|
|
else
|
|
{
|
|
EmitVectorBinaryOpZx32(context, (op1, op2) => context.BitwiseOr(op1, context.BitwiseNot(op2)));
|
|
}
|
|
}
|
|
|
|
public static void Vorr_I(ArmEmitterContext context)
|
|
{
|
|
if (Optimizations.UseAdvSimd)
|
|
{
|
|
InstEmitSimdHelper32Arm64.EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(Intrinsic.Arm64OrrV | Intrinsic.Arm64V128, n, m));
|
|
}
|
|
else if (Optimizations.UseSse2)
|
|
{
|
|
EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(Intrinsic.X86Por, n, m));
|
|
}
|
|
else
|
|
{
|
|
EmitVectorBinaryOpZx32(context, (op1, op2) => context.BitwiseOr(op1, op2));
|
|
}
|
|
}
|
|
|
|
public static void Vorr_II(ArmEmitterContext context)
|
|
{
|
|
OpCode32SimdImm op = (OpCode32SimdImm)context.CurrOp;
|
|
|
|
long immediate = op.Immediate;
|
|
|
|
// Replicate fields to fill the 64-bits, if size is < 64-bits.
|
|
switch (op.Size)
|
|
{
|
|
case 0: immediate *= 0x0101010101010101L; break;
|
|
case 1: immediate *= 0x0001000100010001L; break;
|
|
case 2: immediate *= 0x0000000100000001L; break;
|
|
}
|
|
|
|
Operand imm = Const(immediate);
|
|
Operand res = GetVecA32(op.Qd);
|
|
|
|
if (op.Q)
|
|
{
|
|
for (int elem = 0; elem < 2; elem++)
|
|
{
|
|
Operand de = EmitVectorExtractZx(context, op.Qd, elem, 3);
|
|
|
|
res = EmitVectorInsert(context, res, context.BitwiseOr(de, imm), elem, 3);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
Operand de = EmitVectorExtractZx(context, op.Qd, op.Vd & 1, 3);
|
|
|
|
res = EmitVectorInsert(context, res, context.BitwiseOr(de, imm), op.Vd & 1, 3);
|
|
}
|
|
|
|
context.Copy(GetVecA32(op.Qd), res);
|
|
}
|
|
|
|
public static void Vtst(ArmEmitterContext context)
|
|
{
|
|
EmitVectorBinaryOpZx32(context, (op1, op2) =>
|
|
{
|
|
Operand isZero = context.ICompareEqual(context.BitwiseAnd(op1, op2), Const(0));
|
|
return context.ConditionalSelect(isZero, Const(0), Const(-1));
|
|
});
|
|
}
|
|
|
|
private static void EmitBifBit(ArmEmitterContext context, bool notRm)
|
|
{
|
|
OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
|
|
|
|
if (Optimizations.UseSse2)
|
|
{
|
|
EmitVectorTernaryOpSimd32(context, (d, n, m) =>
|
|
{
|
|
Operand res = context.AddIntrinsic(Intrinsic.X86Pxor, n, d);
|
|
res = context.AddIntrinsic((notRm) ? Intrinsic.X86Pandn : Intrinsic.X86Pand, m, res);
|
|
return context.AddIntrinsic(Intrinsic.X86Pxor, d, res);
|
|
});
|
|
}
|
|
else
|
|
{
|
|
EmitVectorTernaryOpZx32(context, (d, n, m) =>
|
|
{
|
|
if (notRm)
|
|
{
|
|
m = context.BitwiseNot(m);
|
|
}
|
|
return context.BitwiseExclusiveOr(
|
|
context.BitwiseAnd(m,
|
|
context.BitwiseExclusiveOr(d, n)), d);
|
|
});
|
|
}
|
|
}
|
|
}
|
|
}
|