forked from Mirror/Ryujinx
CPU: Implement VFNMS.F32/64 (#1758)
* Add necessary methods / op-code * Enable Support for FMA Instruction Set * Add Intrinsics / Assembly Opcodes for VFMSUB231XX. * Add X86 Instructions for VFMSUB231XX * Implement VFNMS * Implement VFNMS Tests * Add special cases for FMA instructions. * Update PPTC Version * Remove unused Op * Move Check into Assert / Cleanup * Rename and cleanup * Whitespace * Whitespace / Rename * Re-sort * Address final requests * Implement VFMA.F64 * Simplify switch * Simplify FMA Instructions into their own IntrinsicType. * Remove whitespace * Fix indentation * Change tests for Vfnms -- disable inf / nan * Move args up, not description ;) * Undo vfma * Completely remove vfms code. * Fix order of instruction in assembler
This commit is contained in:
parent
c00d39b675
commit
b479a43939
14 changed files with 462 additions and 363 deletions
|
@ -271,6 +271,10 @@ namespace ARMeilleure.CodeGen.X86
|
|||
Add(X86Instruction.Vblendvps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4a, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||
Add(X86Instruction.Vcvtph2ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3813, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||
Add(X86Instruction.Vcvtps2ph, new InstructionInfo(0x000f3a1d, BadOp, BadOp, BadOp, BadOp, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||
Add(X86Instruction.Vfmsub231ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38ba, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||
Add(X86Instruction.Vfmsub231pd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38ba, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
|
||||
Add(X86Instruction.Vfmsub231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bb, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||
Add(X86Instruction.Vfmsub231sd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bb, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
|
||||
Add(X86Instruction.Vpblendvb, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4c, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||
Add(X86Instruction.Xor, new InstructionInfo(0x00000031, 0x06000083, 0x06000081, BadOp, 0x00000033, InstructionFlags.None));
|
||||
Add(X86Instruction.Xorpd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f57, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||
|
|
|
@ -406,12 +406,9 @@ namespace ARMeilleure.CodeGen.X86
|
|||
else
|
||||
{
|
||||
EnsureSameReg(dest, src1);
|
||||
|
||||
Debug.Assert(src3.GetRegister().Index == 0);
|
||||
|
||||
context.Assembler.WriteInstruction(info.Inst, dest, src1, src2);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -435,6 +432,23 @@ namespace ARMeilleure.CodeGen.X86
|
|||
|
||||
break;
|
||||
}
|
||||
|
||||
case IntrinsicType.Fma:
|
||||
{
|
||||
Operand dest = operation.Destination;
|
||||
Operand src1 = operation.GetSource(0);
|
||||
Operand src2 = operation.GetSource(1);
|
||||
Operand src3 = operation.GetSource(2);
|
||||
|
||||
EnsureSameType(dest, src1, src2, src3);
|
||||
EnsureSameReg(dest, src1);
|
||||
Debug.Assert(!dest.Type.IsInteger());
|
||||
Debug.Assert(HardwareCapabilities.SupportsVexEncoding);
|
||||
|
||||
context.Assembler.WriteInstruction(info.Inst, dest, src2, src3);
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
|
|
|
@ -164,6 +164,10 @@ namespace ARMeilleure.CodeGen.X86
|
|||
Add(Intrinsic.X86Unpcklps, new IntrinsicInfo(X86Instruction.Unpcklps, IntrinsicType.Binary));
|
||||
Add(Intrinsic.X86Vcvtph2ps, new IntrinsicInfo(X86Instruction.Vcvtph2ps, IntrinsicType.Unary));
|
||||
Add(Intrinsic.X86Vcvtps2ph, new IntrinsicInfo(X86Instruction.Vcvtps2ph, IntrinsicType.BinaryImm));
|
||||
Add(Intrinsic.X86Vfmsub231pd, new IntrinsicInfo(X86Instruction.Vfmsub231pd, IntrinsicType.Fma));
|
||||
Add(Intrinsic.X86Vfmsub231ps, new IntrinsicInfo(X86Instruction.Vfmsub231ps, IntrinsicType.Fma));
|
||||
Add(Intrinsic.X86Vfmsub231sd, new IntrinsicInfo(X86Instruction.Vfmsub231sd, IntrinsicType.Fma));
|
||||
Add(Intrinsic.X86Vfmsub231ss, new IntrinsicInfo(X86Instruction.Vfmsub231ss, IntrinsicType.Fma));
|
||||
Add(Intrinsic.X86Xorpd, new IntrinsicInfo(X86Instruction.Xorpd, IntrinsicType.Binary));
|
||||
Add(Intrinsic.X86Xorps, new IntrinsicInfo(X86Instruction.Xorps, IntrinsicType.Binary));
|
||||
}
|
||||
|
|
|
@ -11,6 +11,7 @@ namespace ARMeilleure.CodeGen.X86
|
|||
BinaryImm,
|
||||
Crc32,
|
||||
Ternary,
|
||||
TernaryImm
|
||||
TernaryImm,
|
||||
Fma
|
||||
}
|
||||
}
|
|
@ -1309,7 +1309,7 @@ namespace ARMeilleure.CodeGen.X86
|
|||
IntrinsicOperation intrinOp = (IntrinsicOperation)operation;
|
||||
IntrinsicInfo info = IntrinsicTable.GetInfo(intrinOp.Intrinsic);
|
||||
|
||||
return info.Type == IntrinsicType.Crc32 || IsVexSameOperandDestSrc1(operation);
|
||||
return info.Type == IntrinsicType.Crc32 || info.Type == IntrinsicType.Fma || IsVexSameOperandDestSrc1(operation);
|
||||
}
|
||||
|
||||
private static bool IsVexSameOperandDestSrc1(Operation operation)
|
||||
|
|
|
@ -200,6 +200,10 @@ namespace ARMeilleure.CodeGen.X86
|
|||
Vblendvps,
|
||||
Vcvtph2ps,
|
||||
Vcvtps2ph,
|
||||
Vfmsub231ps,
|
||||
Vfmsub231pd,
|
||||
Vfmsub231ss,
|
||||
Vfmsub231sd,
|
||||
Vpblendvb,
|
||||
Xor,
|
||||
Xorpd,
|
||||
|
|
|
@ -821,6 +821,7 @@ namespace ARMeilleure.Decoders
|
|||
SetA32("111100101x11xxxxxxxxxxxxxxx0xxxx", InstName.Vext, InstEmit32.Vext, OpCode32SimdExt.Create);
|
||||
SetA32("<<<<11101x10xxxxxxxx101xx0x0xxxx", InstName.Vfma, InstEmit32.Vfma_S, OpCode32SimdRegS.Create);
|
||||
SetA32("<<<<11101x10xxxxxxxx101xx1x0xxxx", InstName.Vfms, InstEmit32.Vfms_S, OpCode32SimdRegS.Create);
|
||||
SetA32("<<<<11101x01xxxxxxxx101xx0x0xxxx", InstName.Vfnms, InstEmit32.Vfnms_S, OpCode32SimdRegS.Create);
|
||||
SetA32("1111001x0x<<xxxxxxxx0000xxx0xxxx", InstName.Vhadd, InstEmit32.Vhadd, OpCode32SimdReg.Create);
|
||||
SetA32("111101001x10xxxxxxxxxx00xxxxxxxx", InstName.Vld1, InstEmit32.Vld1, OpCode32SimdMemSingle.Create);
|
||||
SetA32("111101000x10xxxxxxxx0111xxxxxxxx", InstName.Vld1, InstEmit32.Vld1, OpCode32SimdMemPair.Create); // Regs = 1.
|
||||
|
|
|
@ -284,6 +284,21 @@ namespace ARMeilleure.Instructions
|
|||
}
|
||||
}
|
||||
|
||||
// Emits code for the A32 VFNMS (scalar, fused) instruction.
// When both FastFP and hardware FMA are enabled, a single x86 VFMSUB231SS/SD
// intrinsic is emitted; otherwise the operation falls back to the soft-float
// FPMulAdd routine with the first operand negated.
public static void Vfnms_S(ArmEmitterContext context) // Fused.
{
    bool useHardwareFma = Optimizations.FastFP && Optimizations.UseFma;

    if (useHardwareFma)
    {
        EmitScalarTernaryOpF32(context, Intrinsic.X86Vfmsub231ss, Intrinsic.X86Vfmsub231sd);

        return;
    }

    // Soft-float path: FPMulAdd(-op1, op2, op3).
    EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
        EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), context.Negate(op1), op2, op3));
}
|
||||
|
||||
public static void Vhadd(ArmEmitterContext context)
|
||||
{
|
||||
OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
|
||||
|
|
|
@ -901,6 +901,20 @@ namespace ARMeilleure.Instructions
|
|||
context.Copy(initialD, res);
|
||||
}
|
||||
|
||||
// Emits a scalar three-operand SIMD operation through a single x86 intrinsic.
// inst32 is used when bit 0 of the current opcode's Size is clear, inst64 when
// it is set. The chosen intrinsic is applied to the (d, n, m) operands supplied
// by EmitScalarTernaryOpSimd32.
public static void EmitScalarTernaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
{
    OpCode32SimdRegS currentOp = (OpCode32SimdRegS)context.CurrOp;

    Intrinsic selected;

    if ((currentOp.Size & 1) == 0)
    {
        selected = inst32;
    }
    else
    {
        selected = inst64;
    }

    EmitScalarTernaryOpSimd32(context, (d, n, m) => context.AddIntrinsic(selected, d, n, m));
}
|
||||
|
||||
public static void EmitScalarTernaryOpF32(ArmEmitterContext context, Intrinsic inst32pt1, Intrinsic inst64pt1, Intrinsic inst32pt2, Intrinsic inst64pt2)
|
||||
{
|
||||
OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;
|
||||
|
|
|
@ -571,6 +571,7 @@ namespace ARMeilleure.Instructions
|
|||
Vext,
|
||||
Vfma,
|
||||
Vfms,
|
||||
Vfnms,
|
||||
Vhadd,
|
||||
Vld1,
|
||||
Vld2,
|
||||
|
|
|
@ -153,6 +153,10 @@ namespace ARMeilleure.IntermediateRepresentation
|
|||
X86Unpcklps,
|
||||
X86Vcvtph2ps,
|
||||
X86Vcvtps2ph,
|
||||
X86Vfmsub231pd,
|
||||
X86Vfmsub231ps,
|
||||
X86Vfmsub231sd,
|
||||
X86Vfmsub231ss,
|
||||
X86Xorpd,
|
||||
X86Xorps
|
||||
}
|
||||
|
|
|
@ -15,6 +15,7 @@ namespace ARMeilleure
|
|||
public static bool UsePopCntIfAvailable { get; set; } = true;
|
||||
public static bool UseAvxIfAvailable { get; set; } = true;
|
||||
public static bool UseF16cIfAvailable { get; set; } = true;
|
||||
public static bool UseFmaIfAvailable { get; set; } = true;
|
||||
public static bool UseAesniIfAvailable { get; set; } = true;
|
||||
public static bool UsePclmulqdqIfAvailable { get; set; } = true;
|
||||
|
||||
|
@ -33,6 +34,7 @@ namespace ARMeilleure
|
|||
internal static bool UsePopCnt => UsePopCntIfAvailable && HardwareCapabilities.SupportsPopcnt;
|
||||
internal static bool UseAvx => UseAvxIfAvailable && HardwareCapabilities.SupportsAvx && !ForceLegacySse;
|
||||
internal static bool UseF16c => UseF16cIfAvailable && HardwareCapabilities.SupportsF16c;
|
||||
internal static bool UseFma => UseFmaIfAvailable && HardwareCapabilities.SupportsFma;
|
||||
internal static bool UseAesni => UseAesniIfAvailable && HardwareCapabilities.SupportsAesni;
|
||||
internal static bool UsePclmulqdq => UsePclmulqdqIfAvailable && HardwareCapabilities.SupportsPclmulqdq;
|
||||
}
|
||||
|
|
|
@ -21,7 +21,7 @@ namespace ARMeilleure.Translation.PTC
|
|||
{
|
||||
private const string HeaderMagic = "PTChd";
|
||||
|
||||
private const int InternalVersion = 1650; //! To be incremented manually for each change to the ARMeilleure project.
|
||||
private const int InternalVersion = 1758; //! To be incremented manually for each change to the ARMeilleure project.
|
||||
|
||||
private const string ActualDir = "0";
|
||||
private const string BackupDir = "1";
|
||||
|
|
|
@ -184,8 +184,8 @@ namespace Ryujinx.Tests.Cpu
|
|||
private const int RndCnt = 2;
|
||||
|
||||
private static readonly bool NoZeros = false;
|
||||
private static readonly bool NoInfs = false;
|
||||
private static readonly bool NoNaNs = false;
|
||||
private static readonly bool NoInfs = true;
|
||||
private static readonly bool NoNaNs = true;
|
||||
|
||||
[Explicit]
|
||||
[Test, Pairwise, Description("VADD.f32 V0, V0, V0")]
|
||||
|
@ -293,6 +293,41 @@ namespace Ryujinx.Tests.Cpu
|
|||
CompareAgainstUnicorn(fpsrMask: Fpsr.Nzcv);
|
||||
}
|
||||
|
||||
// Builds a VFNMS.F32 / VFNMS.F64 opcode for every pairwise combination of the
// register indices and operand values, executes it, and compares the resulting
// state against Unicorn.
//   rd, rn, rm : destination / first source / second source register index.
//   size       : 2 selects the single-precision encoding, 3 the double-precision one.
//   z, a, b    : raw bit patterns used to fill V0, V1 and V2 respectively.
[Test, Pairwise, Description("VFNMS.F<size> <Vd>, <Vn>, <Vm>")]
public void Vfnms([Values(0u, 1u)] uint rd,
                  [Values(0u, 1u)] uint rn,
                  [Values(0u, 1u)] uint rm,
                  [Values(2u, 3u)] uint size,
                  [ValueSource("_2S_F_")] ulong z,
                  [ValueSource("_2S_F_")] ulong a,
                  [ValueSource("_2S_F_")] ulong b)
{
    // Base encoding with every register field cleared.
    uint opcode = 0xee900a00;

    if (size == 2)
    {
        // Single precision: the lowest index bit goes to the extension bit
        // (M = bit 5, D = bit 22, N = bit 7) and the upper four bits go to the
        // 4-bit register field (Vm = bits 3:0, Vd = bits 15:12, Vn = bits 19:16).
        opcode |= (((rm & 0x1) << 5) | (rm & 0x1e) >> 1);
        opcode |= (((rd & 0x1) << 22) | (rd & 0x1e) << 11);
        // Fixed: the upper bits of rn must be shifted LEFT by 15 to land in
        // bits 19:16; the previous right shift discarded them entirely.
        opcode |= (((rn & 0x1) << 7) | (rn & 0x1e) << 15);
    }
    else
    {
        // Double precision: the highest index bit goes to the extension bit and
        // the lower four bits go to the 4-bit register field.
        opcode |= (((rm & 0x10) << 1) | (rm & 0xf) << 0);
        opcode |= (((rd & 0x10) << 18) | (rd & 0xf) << 12);
        opcode |= (((rn & 0x10) << 3) | (rn & 0xf) << 16);
    }

    // Bit 9 is already set in the base opcode, so this only toggles bit 8
    // (clear for size == 2, set for size == 3).
    opcode |= ((size & 3) << 8);

    V128 v0 = MakeVectorE0E1(z, z);
    V128 v1 = MakeVectorE0E1(a, z);
    V128 v2 = MakeVectorE0E1(b, z);

    SingleOpcode(opcode, v0: v0, v1: v1, v2: v2);

    CompareAgainstUnicorn();
}
|
||||
|
||||
[Test, Pairwise, Description("VMLSL.<type><size> <Vd>, <Vn>, <Vm>")]
|
||||
public void Vmlsl_I([Values(0u)] uint rd,
|
||||
[Values(1u, 0u)] uint rn,
|
||||
|
|
Loading…
Reference in a new issue