From d904706fc0a14d17072f7235d73c80c4f01b1041 Mon Sep 17 00:00:00 2001 From: riperiperi Date: Thu, 12 Mar 2020 03:20:55 +0000 Subject: [PATCH] Use a Jump Table for direct and indirect calls/jumps, removing transitions to managed (#975) * Implement Jump Table for Native Calls NOTE: this slows down rejit considerably! Not recommended to be used without codegen optimisation or AOT. - Does not work on Linux - A32 needs an additional commit. * A32 Support (WIP) * Actually write Direct Call pointers to the table That would help. * Direct Calls: Rather than returning to the translator, attempt to keep within the native stack frame. A return to the translator can still happen, but only by exceptionally bubbling up to it. Also: - Always translate lowCq as a function. Faster interop with the direct jumps, and this will be useful in future if we want to do speculative translation. - Tail Call Detection: after the decoding stage, detect if we do a tail call, and avoid translating into it. Detected if a jump is made to an address outwith the contiguous sequence of blocks surrounding the entry point. The goal is to reduce code touched by jit and rejit. * A32 Support * Use smaller max function size for lowCq, fix exceptional returns When a return has an unexpected value and there is no code block following this one, we now return the value rather than continuing. * CompareAndSwap (buggy) * Ensure CompareAndSwap does not get optimized away. * Use CompareAndSwap to make the dynamic table thread safe. * Tail call for linux, throw on too many arguments. * Combine CompareAndSwap 128 and 32/64. They emit different IR instructions since their PreAllocator behaviour is different, but now they just have one function on EmitterContext. * Fix issues separating from optimisations. * Use a stub to find and execute missing functions. This allows us to skip doing many runtime comparisons and branches, and reduces the amount of code we need to emit significantly. 
For the indirect call table, this stub also does the work of moving in the highCq address to the table when one is found. * Make Jump Tables and Jit Cache dynamically resize Reserve virtual memory, commit as needed. * Move TailCallRemover to its own class. * Multithreaded Translation (based on heuristic) A poor one, at that. Need to get core count for a better one, which means a lot of OS specific garbage. * Better priority management for background threads. * Bound core limit a bit more Past a certain point the load is not parallelizable and starts stealing from the main thread. Likely due to GC, memory, heap allocation thread contention. Reduce by one core until optimisations come to improve the situation. * Fix memory management on linux. * Temporary solution to some sync problems. This will make sure threads exit correctly, most of the time. There is a potential race where setting the sync counter to 0 does nothing (counter stays at what it was before, thread could take too long to exit), but we need to find a better way to do this anyways. Synchronization frequency has been tightened as we never enter blockwise segments of code. Essentially this means, check every x functions or loop iterations, before lowcq blocks existed and were worth just as much. Ideally it should be done in a better way, since functions can be anywhere from 1 to 5000 instructions. (maybe based on host timer, or an interrupt flag from a scheduler thread) * Address feedback minus CompareAndSwap change. * Use default ReservedRegion granularity. * Merge CompareAndSwap with its V128 variant. * We already got the source, no need to do it again. * Make sure all background translation threads exit. * Fix CompareAndSwap128 Detection criteria was a bit scuffed. * Address Comments. 
--- .../CodeGen/Optimizations/Optimizer.cs | 4 +- ARMeilleure/CodeGen/X86/Assembler.cs | 14 ++ ARMeilleure/CodeGen/X86/CodeGenerator.cs | 32 ++- ARMeilleure/CodeGen/X86/PreAllocator.cs | 188 ++++++++++++++-- ARMeilleure/CodeGen/X86/X86Instruction.cs | 2 + ARMeilleure/Decoders/Block.cs | 2 + ARMeilleure/Decoders/Decoder.cs | 16 +- .../Decoders/Optimizations/TailCallRemover.cs | 75 +++++++ ARMeilleure/Instructions/DelegateTypes.cs | 2 + ARMeilleure/Instructions/InstEmitAluHelper.cs | 22 +- ARMeilleure/Instructions/InstEmitException.cs | 5 +- .../Instructions/InstEmitException32.cs | 3 +- ARMeilleure/Instructions/InstEmitFlow.cs | 18 +- ARMeilleure/Instructions/InstEmitFlow32.cs | 14 +- .../Instructions/InstEmitFlowHelper.cs | 208 ++++++++++++++++-- ARMeilleure/Instructions/InstEmitHelper.cs | 32 ++- .../Instructions/InstEmitMemoryHelper.cs | 2 +- ARMeilleure/Instructions/NativeInterface.cs | 43 +++- .../IntermediateRepresentation/Instruction.cs | 3 +- ARMeilleure/Memory/MemoryManagement.cs | 38 ++++ ARMeilleure/Memory/MemoryManagementUnix.cs | 23 ++ ARMeilleure/Memory/MemoryManagementWindows.cs | 23 ++ ARMeilleure/Memory/MemoryManagerPal.cs | 2 +- ARMeilleure/Memory/ReservedRegion.cs | 53 +++++ ARMeilleure/State/ExecutionContext.cs | 10 +- ARMeilleure/State/NativeContext.cs | 10 +- ARMeilleure/Translation/ArmEmitterContext.cs | 15 +- ARMeilleure/Translation/DirectCallStubs.cs | 131 +++++++++++ ARMeilleure/Translation/EmitterContext.cs | 17 +- ARMeilleure/Translation/JitCache.cs | 11 +- ARMeilleure/Translation/JumpTable.cs | 149 +++++++++++++ ARMeilleure/Translation/TranslatedFunction.cs | 9 + ARMeilleure/Translation/Translator.cs | 48 +++- .../HOS/Kernel/Threading/HleScheduler.cs | 2 +- Ryujinx.HLE/HOS/Kernel/Threading/KThread.cs | 4 +- 35 files changed, 1094 insertions(+), 136 deletions(-) create mode 100644 ARMeilleure/Decoders/Optimizations/TailCallRemover.cs create mode 100644 ARMeilleure/Memory/ReservedRegion.cs create mode 100644 
ARMeilleure/Translation/DirectCallStubs.cs create mode 100644 ARMeilleure/Translation/JumpTable.cs diff --git a/ARMeilleure/CodeGen/Optimizations/Optimizer.cs b/ARMeilleure/CodeGen/Optimizations/Optimizer.cs index d3ffd185e1..8b0c75fd6b 100644 --- a/ARMeilleure/CodeGen/Optimizations/Optimizer.cs +++ b/ARMeilleure/CodeGen/Optimizations/Optimizer.cs @@ -136,7 +136,9 @@ namespace ARMeilleure.CodeGen.Optimizations private static bool HasSideEffects(Node node) { - return (node is Operation operation) && operation.Instruction == Instruction.Call; + return (node is Operation operation) && (operation.Instruction == Instruction.Call + || operation.Instruction == Instruction.Tailcall + || operation.Instruction == Instruction.CompareAndSwap); } private static bool IsPropagableCopy(Operation operation) diff --git a/ARMeilleure/CodeGen/X86/Assembler.cs b/ARMeilleure/CodeGen/X86/Assembler.cs index 70130d90e4..5088e6f0c7 100644 --- a/ARMeilleure/CodeGen/X86/Assembler.cs +++ b/ARMeilleure/CodeGen/X86/Assembler.cs @@ -90,6 +90,7 @@ namespace ARMeilleure.CodeGen.X86 Add(X86Instruction.Cmpps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fc2, InstructionFlags.Vex)); Add(X86Instruction.Cmpsd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fc2, InstructionFlags.Vex | InstructionFlags.PrefixF2)); Add(X86Instruction.Cmpss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fc2, InstructionFlags.Vex | InstructionFlags.PrefixF3)); + Add(X86Instruction.Cmpxchg, new InstructionInfo(0x00000fb1, BadOp, BadOp, BadOp, BadOp, InstructionFlags.None)); Add(X86Instruction.Cmpxchg16b, new InstructionInfo(0x01000fc7, BadOp, BadOp, BadOp, BadOp, InstructionFlags.RexW)); Add(X86Instruction.Comisd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f2f, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Comiss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f2f, InstructionFlags.Vex)); @@ -117,6 +118,7 @@ namespace ARMeilleure.CodeGen.X86 
Add(X86Instruction.Imul, new InstructionInfo(BadOp, 0x0000006b, 0x00000069, BadOp, 0x00000faf, InstructionFlags.None)); Add(X86Instruction.Imul128, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x050000f7, InstructionFlags.None)); Add(X86Instruction.Insertps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a21, InstructionFlags.Vex | InstructionFlags.Prefix66)); + Add(X86Instruction.Jmp, new InstructionInfo(0x040000ff, BadOp, BadOp, BadOp, BadOp, InstructionFlags.None)); Add(X86Instruction.Lea, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x0000008d, InstructionFlags.None)); Add(X86Instruction.Maxpd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f5f, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Maxps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f5f, InstructionFlags.Vex)); @@ -328,6 +330,13 @@ namespace ARMeilleure.CodeGen.X86 WriteByte(0x99); } + public void Cmpxchg(MemoryOperand memOp, Operand src) + { + WriteByte(LockPrefix); + + WriteInstruction(memOp, src, src.Type, X86Instruction.Cmpxchg); + } + public void Cmpxchg16b(MemoryOperand memOp) { WriteByte(LockPrefix); @@ -480,6 +489,11 @@ namespace ARMeilleure.CodeGen.X86 } } + public void Jmp(Operand dest) + { + WriteInstruction(dest, null, OperandType.None, X86Instruction.Jmp); + } + public void Lea(Operand dest, Operand source, OperandType type) { WriteInstruction(dest, source, type, X86Instruction.Lea); diff --git a/ARMeilleure/CodeGen/X86/CodeGenerator.cs b/ARMeilleure/CodeGen/X86/CodeGenerator.cs index 32ca6a7812..1d0a4c12ff 100644 --- a/ARMeilleure/CodeGen/X86/CodeGenerator.cs +++ b/ARMeilleure/CodeGen/X86/CodeGenerator.cs @@ -34,7 +34,7 @@ namespace ARMeilleure.CodeGen.X86 Add(Instruction.ByteSwap, GenerateByteSwap); Add(Instruction.Call, GenerateCall); Add(Instruction.Clobber, GenerateClobber); - Add(Instruction.CompareAndSwap128, GenerateCompareAndSwap128); + Add(Instruction.CompareAndSwap, GenerateCompareAndSwap); Add(Instruction.CompareEqual, 
GenerateCompareEqual); Add(Instruction.CompareGreater, GenerateCompareGreater); Add(Instruction.CompareGreaterOrEqual, GenerateCompareGreaterOrEqual); @@ -76,6 +76,7 @@ namespace ARMeilleure.CodeGen.X86 Add(Instruction.Store16, GenerateStore16); Add(Instruction.Store8, GenerateStore8); Add(Instruction.Subtract, GenerateSubtract); + Add(Instruction.Tailcall, GenerateTailcall); Add(Instruction.VectorCreateScalar, GenerateVectorCreateScalar); Add(Instruction.VectorExtract, GenerateVectorExtract); Add(Instruction.VectorExtract16, GenerateVectorExtract16); @@ -543,13 +544,27 @@ namespace ARMeilleure.CodeGen.X86 // register allocator, we don't need to produce any code. } - private static void GenerateCompareAndSwap128(CodeGenContext context, Operation operation) + private static void GenerateCompareAndSwap(CodeGenContext context, Operation operation) { - Operand source = operation.GetSource(0); + Operand src1 = operation.GetSource(0); - MemoryOperand memOp = new MemoryOperand(OperandType.I64, source); + if (operation.SourcesCount == 5) // CompareAndSwap128 has 5 sources, compared to CompareAndSwap64/32's 3. 
+ { + MemoryOperand memOp = new MemoryOperand(OperandType.I64, src1); - context.Assembler.Cmpxchg16b(memOp); + context.Assembler.Cmpxchg16b(memOp); + } + else + { + Operand src2 = operation.GetSource(1); + Operand src3 = operation.GetSource(2); + + EnsureSameType(src2, src3); + + MemoryOperand memOp = new MemoryOperand(src3.Type, src1); + + context.Assembler.Cmpxchg(memOp, src3); + } } private static void GenerateCompareEqual(CodeGenContext context, Operation operation) @@ -1083,6 +1098,13 @@ namespace ARMeilleure.CodeGen.X86 } } + private static void GenerateTailcall(CodeGenContext context, Operation operation) + { + WriteEpilogue(context); + + context.Assembler.Jmp(operation.GetSource(0)); + } + private static void GenerateVectorCreateScalar(CodeGenContext context, Operation operation) { Operand dest = operation.Destination; diff --git a/ARMeilleure/CodeGen/X86/PreAllocator.cs b/ARMeilleure/CodeGen/X86/PreAllocator.cs index 75844b099b..e20fca9d60 100644 --- a/ARMeilleure/CodeGen/X86/PreAllocator.cs +++ b/ARMeilleure/CodeGen/X86/PreAllocator.cs @@ -1,6 +1,7 @@ using ARMeilleure.CodeGen.RegisterAllocators; using ARMeilleure.IntermediateRepresentation; using ARMeilleure.Translation; +using System; using System.Collections.Generic; using System.Diagnostics; @@ -101,6 +102,17 @@ namespace ARMeilleure.CodeGen.X86 } break; + case Instruction.Tailcall: + if (callConv == CallConvName.Windows) + { + HandleTailcallWindowsAbi(block.Operations, stackAlloc, node, operation); + } + else + { + HandleTailcallSystemVAbi(block.Operations, stackAlloc, node, operation); + } + break; + case Instruction.VectorInsert8: if (!HardwareCapabilities.SupportsSse41) { @@ -199,32 +211,55 @@ namespace ARMeilleure.CodeGen.X86 switch (operation.Instruction) { - case Instruction.CompareAndSwap128: + case Instruction.CompareAndSwap: { - // Handle the many restrictions of the compare and exchange (16 bytes) instruction: - // - The expected value should be in RDX:RAX. 
- // - The new value to be written should be in RCX:RBX. - // - The value at the memory location is loaded to RDX:RAX. - void SplitOperand(Operand source, Operand lr, Operand hr) + OperandType type = operation.GetSource(1).Type; + + if (type == OperandType.V128) { - nodes.AddBefore(node, new Operation(Instruction.VectorExtract, lr, source, Const(0))); - nodes.AddBefore(node, new Operation(Instruction.VectorExtract, hr, source, Const(1))); + // Handle the many restrictions of the compare and exchange (16 bytes) instruction: + // - The expected value should be in RDX:RAX. + // - The new value to be written should be in RCX:RBX. + // - The value at the memory location is loaded to RDX:RAX. + void SplitOperand(Operand source, Operand lr, Operand hr) + { + nodes.AddBefore(node, new Operation(Instruction.VectorExtract, lr, source, Const(0))); + nodes.AddBefore(node, new Operation(Instruction.VectorExtract, hr, source, Const(1))); + } + + Operand rax = Gpr(X86Register.Rax, OperandType.I64); + Operand rbx = Gpr(X86Register.Rbx, OperandType.I64); + Operand rcx = Gpr(X86Register.Rcx, OperandType.I64); + Operand rdx = Gpr(X86Register.Rdx, OperandType.I64); + + SplitOperand(operation.GetSource(1), rax, rdx); + SplitOperand(operation.GetSource(2), rbx, rcx); + + node = nodes.AddAfter(node, new Operation(Instruction.VectorCreateScalar, dest, rax)); + node = nodes.AddAfter(node, new Operation(Instruction.VectorInsert, dest, dest, rdx, Const(1))); + + operation.SetDestinations(new Operand[] { rdx, rax }); + + operation.SetSources(new Operand[] { operation.GetSource(0), rdx, rax, rcx, rbx }); } + else + { + // Handle the many restrictions of the compare and exchange (32/64) instruction: + // - The expected value should be in (E/R)AX. + // - The value at the memory location is loaded to (E/R)AX. 
- Operand rax = Gpr(X86Register.Rax, OperandType.I64); - Operand rbx = Gpr(X86Register.Rbx, OperandType.I64); - Operand rcx = Gpr(X86Register.Rcx, OperandType.I64); - Operand rdx = Gpr(X86Register.Rdx, OperandType.I64); + Operand expected = operation.GetSource(1); - SplitOperand(operation.GetSource(1), rax, rdx); - SplitOperand(operation.GetSource(2), rbx, rcx); + Operand rax = Gpr(X86Register.Rax, expected.Type); - node = nodes.AddAfter(node, new Operation(Instruction.VectorCreateScalar, dest, rax)); - node = nodes.AddAfter(node, new Operation(Instruction.VectorInsert, dest, dest, rdx, Const(1))); + nodes.AddBefore(node, new Operation(Instruction.Copy, rax, expected)); - operation.SetDestinations(new Operand[] { rdx, rax }); + operation.SetSources(new Operand[] { operation.GetSource(0), rax, operation.GetSource(2) }); - operation.SetSources(new Operand[] { operation.GetSource(0), rdx, rax, rcx, rbx }); + node = nodes.AddAfter(node, new Operation(Instruction.Copy, dest, rax)); + + operation.Destination = rax; + } break; } @@ -829,6 +864,123 @@ namespace ARMeilleure.CodeGen.X86 return node; } + private static void HandleTailcallSystemVAbi(IntrusiveList nodes, StackAllocator stackAlloc, Node node, Operation operation) + { + List sources = new List(); + + sources.Add(operation.GetSource(0)); + + int argsCount = operation.SourcesCount - 1; + + int intMax = CallingConvention.GetIntArgumentsOnRegsCount(); + int vecMax = CallingConvention.GetVecArgumentsOnRegsCount(); + + int intCount = 0; + int vecCount = 0; + + // Handle arguments passed on registers. + for (int index = 0; index < argsCount; index++) + { + Operand source = operation.GetSource(1 + index); + + bool passOnReg; + + if (source.Type.IsInteger()) + { + passOnReg = intCount + 1 < intMax; + } + else + { + passOnReg = vecCount < vecMax; + } + + if (source.Type == OperandType.V128 && passOnReg) + { + // V128 is a struct, we pass each half on a GPR if possible. 
+ Operand argReg = Gpr(CallingConvention.GetIntArgumentRegister(intCount++), OperandType.I64); + Operand argReg2 = Gpr(CallingConvention.GetIntArgumentRegister(intCount++), OperandType.I64); + + nodes.AddBefore(node, new Operation(Instruction.VectorExtract, argReg, source, Const(0))); + nodes.AddBefore(node, new Operation(Instruction.VectorExtract, argReg2, source, Const(1))); + + continue; + } + + if (passOnReg) + { + Operand argReg = source.Type.IsInteger() + ? Gpr(CallingConvention.GetIntArgumentRegister(intCount++), source.Type) + : Xmm(CallingConvention.GetVecArgumentRegister(vecCount++), source.Type); + + Operation copyOp = new Operation(Instruction.Copy, argReg, source); + + HandleConstantCopy(nodes, nodes.AddBefore(node, copyOp), copyOp); + + sources.Add(argReg); + } + else + { + throw new NotImplementedException("Spilling is not currently supported for tail calls. (too many arguments)"); + } + } + + // The target address must be on the return registers, since we + // don't return anything and it is guaranteed to not be a + // callee saved register (which would be trashed on the epilogue). + Operand retReg = Gpr(CallingConvention.GetIntReturnRegister(), OperandType.I64); + + Operation addrCopyOp = new Operation(Instruction.Copy, retReg, operation.GetSource(0)); + + nodes.AddBefore(node, addrCopyOp); + + sources[0] = retReg; + + operation.SetSources(sources.ToArray()); + } + + private static void HandleTailcallWindowsAbi(IntrusiveList nodes, StackAllocator stackAlloc, Node node, Operation operation) + { + int argsCount = operation.SourcesCount - 1; + + int maxArgs = CallingConvention.GetArgumentsOnRegsCount(); + + if (argsCount > maxArgs) + { + throw new NotImplementedException("Spilling is not currently supported for tail calls. (too many arguments)"); + } + + Operand[] sources = new Operand[1 + argsCount]; + + // Handle arguments passed on registers. 
+ for (int index = 0; index < argsCount; index++) + { + Operand source = operation.GetSource(1 + index); + + Operand argReg = source.Type.IsInteger() + ? Gpr(CallingConvention.GetIntArgumentRegister(index), source.Type) + : Xmm(CallingConvention.GetVecArgumentRegister(index), source.Type); + + Operation copyOp = new Operation(Instruction.Copy, argReg, source); + + HandleConstantCopy(nodes, nodes.AddBefore(node, copyOp), copyOp); + + sources[1 + index] = argReg; + } + + // The target address must be on the return registers, since we + // don't return anything and it is guaranteed to not be a + // callee saved register (which would be trashed on the epilogue). + Operand retReg = Gpr(CallingConvention.GetIntReturnRegister(), OperandType.I64); + + Operation addrCopyOp = new Operation(Instruction.Copy, retReg, operation.GetSource(0)); + + nodes.AddBefore(node, addrCopyOp); + + sources[0] = retReg; + + operation.SetSources(sources); + } + private static void HandleLoadArgumentWindowsAbi( CompilerContext cctx, IntrusiveList nodes, diff --git a/ARMeilleure/CodeGen/X86/X86Instruction.cs b/ARMeilleure/CodeGen/X86/X86Instruction.cs index 813730f2a3..a6dbf1a5b7 100644 --- a/ARMeilleure/CodeGen/X86/X86Instruction.cs +++ b/ARMeilleure/CodeGen/X86/X86Instruction.cs @@ -23,6 +23,7 @@ namespace ARMeilleure.CodeGen.X86 Cmpps, Cmpsd, Cmpss, + Cmpxchg, Cmpxchg16b, Comisd, Comiss, @@ -50,6 +51,7 @@ namespace ARMeilleure.CodeGen.X86 Imul, Imul128, Insertps, + Jmp, Lea, Maxpd, Maxps, diff --git a/ARMeilleure/Decoders/Block.cs b/ARMeilleure/Decoders/Block.cs index 3d13c2d5e4..d38b5a8ec4 100644 --- a/ARMeilleure/Decoders/Block.cs +++ b/ARMeilleure/Decoders/Block.cs @@ -11,6 +11,8 @@ namespace ARMeilleure.Decoders public Block Next { get; set; } public Block Branch { get; set; } + public bool TailCall { get; set; } + public List OpCodes { get; private set; } public Block() diff --git a/ARMeilleure/Decoders/Decoder.cs b/ARMeilleure/Decoders/Decoder.cs index 7cbb62e6c5..9675dc8db9 100644 --- 
a/ARMeilleure/Decoders/Decoder.cs +++ b/ARMeilleure/Decoders/Decoder.cs @@ -1,3 +1,4 @@ +using ARMeilleure.Decoders.Optimizations; using ARMeilleure.Instructions; using ARMeilleure.Memory; using ARMeilleure.State; @@ -15,6 +16,9 @@ namespace ARMeilleure.Decoders // take too long to compile and use too much memory. private const int MaxInstsPerFunction = 5000; + // For lower code quality translation, we set a lower limit since we're blocking execution. + private const int MaxInstsPerFunctionLowCq = 500; + private delegate object MakeOp(InstDescriptor inst, ulong address, int opCode); private static ConcurrentDictionary _opActivators; @@ -33,7 +37,7 @@ namespace ARMeilleure.Decoders return new Block[] { block }; } - public static Block[] DecodeFunction(MemoryManager memory, ulong address, ExecutionMode mode) + public static Block[] DecodeFunction(MemoryManager memory, ulong address, ExecutionMode mode, bool highCq) { List blocks = new List(); @@ -43,11 +47,13 @@ namespace ARMeilleure.Decoders int opsCount = 0; + int instructionLimit = highCq ? 
MaxInstsPerFunction : MaxInstsPerFunctionLowCq; + Block GetBlock(ulong blkAddress) { if (!visited.TryGetValue(blkAddress, out Block block)) { - if (opsCount > MaxInstsPerFunction || !memory.IsMapped((long)blkAddress)) + if (opsCount > instructionLimit || !memory.IsMapped((long)blkAddress)) { return null; } @@ -121,7 +127,7 @@ namespace ARMeilleure.Decoders currBlock.Branch = GetBlock((ulong)op.Immediate); } - if (!IsUnconditionalBranch(lastOp) /*|| isCall*/) + if (!IsUnconditionalBranch(lastOp) || isCall) { currBlock.Next = GetBlock(currBlock.EndAddress); } @@ -140,10 +146,12 @@ namespace ARMeilleure.Decoders } } + TailCallRemover.RunPass(address, blocks); + return blocks.ToArray(); } - private static bool BinarySearch(List blocks, ulong address, out int index) + public static bool BinarySearch(List blocks, ulong address, out int index) { index = 0; diff --git a/ARMeilleure/Decoders/Optimizations/TailCallRemover.cs b/ARMeilleure/Decoders/Optimizations/TailCallRemover.cs new file mode 100644 index 0000000000..2d6439bac0 --- /dev/null +++ b/ARMeilleure/Decoders/Optimizations/TailCallRemover.cs @@ -0,0 +1,75 @@ +using ARMeilleure.Decoders; +using System; +using System.Collections.Generic; + +namespace ARMeilleure.Decoders.Optimizations +{ + static class TailCallRemover + { + public static void RunPass(ulong entryAddress, List blocks) + { + // Detect tail calls: + // - Assume this function spans the space covered by contiguous code blocks surrounding the entry address. + // - Unconditional jump to an area outside this contiguous region will be treated as a tail call. + // - Include a small allowance for jumps outside the contiguous range. 
+ + if (!Decoder.BinarySearch(blocks, entryAddress, out int entryBlockId)) + { + throw new InvalidOperationException("Function entry point is not contained in a block."); + } + + const ulong allowance = 4; + Block entryBlock = blocks[entryBlockId]; + int startBlockIndex = entryBlockId; + Block startBlock = entryBlock; + int endBlockIndex = entryBlockId; + Block endBlock = entryBlock; + + for (int i = entryBlockId + 1; i < blocks.Count; i++) // Search forwards. + { + Block block = blocks[i]; + if (endBlock.EndAddress < block.Address - allowance) + { + break; // End of contiguous function. + } + + endBlock = block; + endBlockIndex = i; + } + + for (int i = entryBlockId - 1; i >= 0; i--) // Search backwards. + { + Block block = blocks[i]; + if (startBlock.Address > block.EndAddress + allowance) + { + break; // End of contiguous function. + } + + startBlock = block; + startBlockIndex = i; + } + + if (startBlockIndex == 0 && endBlockIndex == blocks.Count - 1) + { + return; // Nothing to do here. + } + + // Replace all branches to blocks outside the range with null, and force a tail call. + + for (int i = startBlockIndex; i <= endBlockIndex; i++) + { + Block block = blocks[i]; + if (block.Branch != null && (block.Branch.Address > endBlock.EndAddress || block.Branch.EndAddress < startBlock.Address)) + { + block.Branch = null; + block.TailCall = true; + } + } + + // Finally, delete all blocks outside the contiguous range. 
+ + blocks.RemoveRange(endBlockIndex + 1, (blocks.Count - endBlockIndex) - 1); + blocks.RemoveRange(0, startBlockIndex); + } + } +} diff --git a/ARMeilleure/Instructions/DelegateTypes.cs b/ARMeilleure/Instructions/DelegateTypes.cs index b65149cb81..41614f88e5 100644 --- a/ARMeilleure/Instructions/DelegateTypes.cs +++ b/ARMeilleure/Instructions/DelegateTypes.cs @@ -3,6 +3,8 @@ using System; namespace ARMeilleure.Instructions { + delegate bool _Bool(); + delegate double _F64_F64(double a1); delegate double _F64_F64_Bool(double a1, bool a2); delegate double _F64_F64_F64(double a1, double a2); diff --git a/ARMeilleure/Instructions/InstEmitAluHelper.cs b/ARMeilleure/Instructions/InstEmitAluHelper.cs index 916a1da5a9..12fa1bf1b5 100644 --- a/ARMeilleure/Instructions/InstEmitAluHelper.cs +++ b/ARMeilleure/Instructions/InstEmitAluHelper.cs @@ -116,12 +116,14 @@ namespace ARMeilleure.Instructions { Debug.Assert(value.Type == OperandType.I32); - context.StoreToContext(); - if (IsThumb(context.CurrOp)) { - // Make this count as a call, the translator will ignore the low bit for the address. - context.Return(context.ZeroExtend32(OperandType.I64, context.BitwiseOr(value, Const(1)))); + context.StoreToContext(); + bool isReturn = IsA32Return(context); + + Operand addr = context.BitwiseOr(value, Const(1)); + + InstEmitFlowHelper.EmitVirtualJump(context, addr, isReturn); } else { @@ -138,18 +140,8 @@ namespace ARMeilleure.Instructions if (setFlags) { // TODO: Load SPSR etc. - Operand isThumb = GetFlag(PState.TFlag); - Operand lblThumb = Label(); - - context.BranchIfTrue(lblThumb, isThumb); - - // Make this count as a call, the translator will ignore the low bit for the address. 
- context.Return(context.ZeroExtend32(OperandType.I64, context.BitwiseOr(context.BitwiseAnd(value, Const(~3)), Const(1)))); - - context.MarkLabel(lblThumb); - - context.Return(context.ZeroExtend32(OperandType.I64, context.BitwiseOr(value, Const(1)))); + EmitBxWritePc(context, value); } else { diff --git a/ARMeilleure/Instructions/InstEmitException.cs b/ARMeilleure/Instructions/InstEmitException.cs index 6f7b6fd51f..f0bde242a6 100644 --- a/ARMeilleure/Instructions/InstEmitException.cs +++ b/ARMeilleure/Instructions/InstEmitException.cs @@ -2,6 +2,7 @@ using ARMeilleure.Decoders; using ARMeilleure.Translation; using System; +using static ARMeilleure.Instructions.InstEmitFlowHelper; using static ARMeilleure.IntermediateRepresentation.OperandHelper; namespace ARMeilleure.Instructions @@ -30,7 +31,7 @@ namespace ARMeilleure.Instructions if (context.CurrBlock.Next == null) { - context.Return(Const(op.Address + 4)); + EmitTailContinue(context, Const(op.Address + 4)); } } @@ -48,7 +49,7 @@ namespace ARMeilleure.Instructions if (context.CurrBlock.Next == null) { - context.Return(Const(op.Address + 4)); + EmitTailContinue(context, Const(op.Address + 4)); } } } diff --git a/ARMeilleure/Instructions/InstEmitException32.cs b/ARMeilleure/Instructions/InstEmitException32.cs index a73f0dec77..8ffad1d1fc 100644 --- a/ARMeilleure/Instructions/InstEmitException32.cs +++ b/ARMeilleure/Instructions/InstEmitException32.cs @@ -1,6 +1,7 @@ using ARMeilleure.Decoders; using ARMeilleure.Translation; +using static ARMeilleure.Instructions.InstEmitFlowHelper; using static ARMeilleure.IntermediateRepresentation.OperandHelper; namespace ARMeilleure.Instructions @@ -29,7 +30,7 @@ namespace ARMeilleure.Instructions if (context.CurrBlock.Next == null) { - context.Return(Const(op.Address + 4)); + EmitTailContinue(context, Const(op.Address + 4)); } } } diff --git a/ARMeilleure/Instructions/InstEmitFlow.cs b/ARMeilleure/Instructions/InstEmitFlow.cs index 93d36e1b94..bac9ec588c 100644 --- 
a/ARMeilleure/Instructions/InstEmitFlow.cs +++ b/ARMeilleure/Instructions/InstEmitFlow.cs @@ -21,7 +21,7 @@ namespace ARMeilleure.Instructions } else { - context.Return(Const(op.Immediate)); + EmitTailContinue(context, Const(op.Immediate), context.CurrBlock.TailCall); } } @@ -56,7 +56,7 @@ namespace ARMeilleure.Instructions { OpCodeBReg op = (OpCodeBReg)context.CurrOp; - EmitVirtualJump(context, GetIntOrZR(context, op.Rn)); + EmitVirtualJump(context, GetIntOrZR(context, op.Rn), op.Rn == RegisterAlias.Lr); } public static void Cbnz(ArmEmitterContext context) => EmitCb(context, onNotZero: true); @@ -71,7 +71,7 @@ namespace ARMeilleure.Instructions public static void Ret(ArmEmitterContext context) { - context.Return(context.BitwiseOr(GetIntOrZR(context, RegisterAlias.Lr), Const(CallFlag))); + context.Return(GetIntOrZR(context, RegisterAlias.Lr)); } public static void Tbnz(ArmEmitterContext context) => EmitTb(context, onNotZero: true); @@ -96,7 +96,7 @@ namespace ARMeilleure.Instructions if (context.CurrBlock.Next == null) { - context.Return(Const(op.Address + 4)); + EmitTailContinue(context, Const(op.Address + 4)); } } else @@ -105,11 +105,11 @@ namespace ARMeilleure.Instructions EmitCondBranch(context, lblTaken, cond); - context.Return(Const(op.Address + 4)); + EmitTailContinue(context, Const(op.Address + 4)); context.MarkLabel(lblTaken); - context.Return(Const(op.Immediate)); + EmitTailContinue(context, Const(op.Immediate)); } } @@ -132,7 +132,7 @@ namespace ARMeilleure.Instructions if (context.CurrBlock.Next == null) { - context.Return(Const(op.Address + 4)); + EmitTailContinue(context, Const(op.Address + 4)); } } else @@ -148,11 +148,11 @@ namespace ARMeilleure.Instructions context.BranchIfFalse(lblTaken, value); } - context.Return(Const(op.Address + 4)); + EmitTailContinue(context, Const(op.Address + 4)); context.MarkLabel(lblTaken); - context.Return(Const(op.Immediate)); + EmitTailContinue(context, Const(op.Immediate)); } } } diff --git 
a/ARMeilleure/Instructions/InstEmitFlow32.cs b/ARMeilleure/Instructions/InstEmitFlow32.cs index cbb9ad5b26..47233eb99a 100644 --- a/ARMeilleure/Instructions/InstEmitFlow32.cs +++ b/ARMeilleure/Instructions/InstEmitFlow32.cs @@ -21,8 +21,7 @@ namespace ARMeilleure.Instructions } else { - context.StoreToContext(); - context.Return(Const(op.Immediate)); + EmitTailContinue(context, Const(op.Immediate)); } } @@ -57,7 +56,7 @@ namespace ARMeilleure.Instructions SetFlag(context, PState.TFlag, Const(isThumb ? 0 : 1)); } - InstEmitFlowHelper.EmitCall(context, (ulong)op.Immediate); + EmitCall(context, (ulong)op.Immediate); } public static void Blxr(ArmEmitterContext context) @@ -66,9 +65,8 @@ namespace ARMeilleure.Instructions uint pc = op.GetPc(); - Operand addr = GetIntA32(context, op.Rm); + Operand addr = context.Copy(GetIntA32(context, op.Rm)); Operand bitOne = context.BitwiseAnd(addr, Const(1)); - addr = context.BitwiseOr(addr, Const((int)CallFlag)); // Set call flag. bool isThumb = IsThumb(context.CurrOp); @@ -80,16 +78,14 @@ namespace ARMeilleure.Instructions SetFlag(context, PState.TFlag, bitOne); - context.Return(addr); // Call. 
+ EmitVirtualCall(context, addr); } public static void Bx(ArmEmitterContext context) { IOpCode32BReg op = (IOpCode32BReg)context.CurrOp; - context.StoreToContext(); - - EmitBxWritePc(context, GetIntA32(context, op.Rm)); + EmitBxWritePc(context, GetIntA32(context, op.Rm), op.Rm); } } } \ No newline at end of file diff --git a/ARMeilleure/Instructions/InstEmitFlowHelper.cs b/ARMeilleure/Instructions/InstEmitFlowHelper.cs index a8eb21d33f..f0a81e8557 100644 --- a/ARMeilleure/Instructions/InstEmitFlowHelper.cs +++ b/ARMeilleure/Instructions/InstEmitFlowHelper.cs @@ -2,6 +2,7 @@ using ARMeilleure.Decoders; using ARMeilleure.IntermediateRepresentation; using ARMeilleure.State; using ARMeilleure.Translation; +using System; using static ARMeilleure.Instructions.InstEmitHelper; using static ARMeilleure.IntermediateRepresentation.OperandHelper; @@ -142,7 +143,29 @@ namespace ARMeilleure.Instructions public static void EmitCall(ArmEmitterContext context, ulong immediate) { - context.Return(Const(immediate | CallFlag)); + EmitJumpTableBranch(context, Const(immediate)); + } + + private static void EmitNativeCall(ArmEmitterContext context, Operand nativeContextPtr, Operand funcAddr, bool isJump = false) + { + context.StoreToContext(); + Operand returnAddress; + if (isJump) + { + context.Tailcall(funcAddr, nativeContextPtr); + } + else + { + returnAddress = context.Call(funcAddr, OperandType.I64, nativeContextPtr); + context.LoadFromContext(); + + EmitContinueOrReturnCheck(context, returnAddress); + } + } + + private static void EmitNativeCall(ArmEmitterContext context, Operand funcAddr, bool isJump = false) + { + EmitNativeCall(context, context.LoadArgument(OperandType.I64, 0), funcAddr, isJump); } public static void EmitVirtualCall(ArmEmitterContext context, Operand target) @@ -150,37 +173,45 @@ namespace ARMeilleure.Instructions EmitVirtualCallOrJump(context, target, isJump: false); } - public static void EmitVirtualJump(ArmEmitterContext context, Operand target) + public 
static void EmitVirtualJump(ArmEmitterContext context, Operand target, bool isReturn) { - EmitVirtualCallOrJump(context, target, isJump: true); + EmitVirtualCallOrJump(context, target, isJump: true, isReturn: isReturn); } - private static void EmitVirtualCallOrJump(ArmEmitterContext context, Operand target, bool isJump) + private static void EmitVirtualCallOrJump(ArmEmitterContext context, Operand target, bool isJump, bool isReturn = false) { - context.Return(context.BitwiseOr(target, Const(target.Type, (long)CallFlag))); - } - - private static void EmitContinueOrReturnCheck(ArmEmitterContext context, Operand retVal) - { - // Note: The return value of the called method will be placed - // at the Stack, the return value is always a Int64 with the - // return address of the function. We check if the address is - // correct, if it isn't we keep returning until we reach the dispatcher. - ulong nextAddr = GetNextOpAddress(context.CurrOp); - - if (context.CurrBlock.Next != null) + if (isReturn) { - Operand lblContinue = Label(); - - context.BranchIfTrue(lblContinue, context.ICompareEqual(retVal, Const(nextAddr))); - - context.Return(Const(nextAddr)); - - context.MarkLabel(lblContinue); + context.Return(target); } else { - context.Return(Const(nextAddr)); + EmitJumpTableBranch(context, target, isJump); + } + } + + private static void EmitContinueOrReturnCheck(ArmEmitterContext context, Operand returnAddress) + { + // Note: The return value of a translated function is always an Int64 with the + // address execution has returned to. We expect this address to be immediately after the + // current instruction, if it isn't we keep returning until we reach the dispatcher. + Operand nextAddr = Const(GetNextOpAddress(context.CurrOp)); + + // Try to continue within this block. + // If the return address isn't to our next instruction, we need to return so the JIT can figure out what to do. 
+ Operand lblContinue = Label(); + + // We need to clear out the call flag for the return address before comparing it. + context.BranchIfTrue(lblContinue, context.ICompareEqual(context.BitwiseAnd(returnAddress, Const(~CallFlag)), nextAddr)); + + context.Return(returnAddress); + + context.MarkLabel(lblContinue); + + if (context.CurrBlock.Next == null) + { + // No code following this instruction, try and find the next block and jump to it. + EmitTailContinue(context, nextAddr); } } @@ -188,5 +219,134 @@ namespace ARMeilleure.Instructions { return op.Address + (ulong)op.OpCodeSizeInBytes; } + + public static void EmitTailContinue(ArmEmitterContext context, Operand address, bool allowRejit = false) + { + bool useTailContinue = true; // Left option here as it may be useful if we need to return to managed rather than tail call in future. (eg. for debug) + if (useTailContinue) + { + if (allowRejit) + { + address = context.BitwiseOr(address, Const(1L)); + } + + Operand fallbackAddr = context.Call(new _U64_U64(NativeInterface.GetFunctionAddress), address); + + EmitNativeCall(context, fallbackAddr, true); + } + else + { + context.Return(address); + } + } + + private static void EmitNativeCallWithGuestAddress(ArmEmitterContext context, Operand funcAddr, Operand guestAddress, bool isJump) + { + Operand nativeContextPtr = context.LoadArgument(OperandType.I64, 0); + context.Store(context.Add(nativeContextPtr, Const(NativeContext.GetCallAddressOffset())), guestAddress); + + EmitNativeCall(context, nativeContextPtr, funcAddr, isJump); + } + + private static void EmitBranchFallback(ArmEmitterContext context, Operand address, bool isJump) + { + address = context.BitwiseOr(address, Const(address.Type, (long)CallFlag)); // Set call flag. 
+ Operand fallbackAddr = context.Call(new _U64_U64(NativeInterface.GetFunctionAddress), address); + EmitNativeCall(context, fallbackAddr, isJump); + } + + public static void EmitDynamicTableCall(ArmEmitterContext context, Operand tableAddress, Operand address, bool isJump) + { + // Loop over elements of the dynamic table. Unrolled loop. + + Operand endLabel = Label(); + Operand fallbackLabel = Label(); + + Action emitTableEntry = (Operand entrySkipLabel) => + { + // Try to take this entry in the table if its guest address equals 0. + Operand gotResult = context.CompareAndSwap(tableAddress, Const(0L), address); + + // Is the address ours? (either taken via CompareAndSwap (0), or what was already here) + context.BranchIfFalse(entrySkipLabel, context.BitwiseOr(context.ICompareEqual(gotResult, address), context.ICompareEqual(gotResult, Const(0L)))); + + // It's ours, so what function is it pointing to? + Operand targetFunctionPtr = context.Add(tableAddress, Const(8L)); + Operand targetFunction = context.Load(OperandType.I64, targetFunctionPtr); + + // Call the function. + // We pass in the entry address as the guest address, as the entry may need to be updated by the indirect call stub. + EmitNativeCallWithGuestAddress(context, targetFunction, tableAddress, isJump); + context.Branch(endLabel); + }; + + // Currently this uses a size of 1, as higher values inflate code size for no real benefit. + for (int i = 0; i < JumpTable.DynamicTableElems; i++) + { + if (i == JumpTable.DynamicTableElems - 1) + { + emitTableEntry(fallbackLabel); // If this is the last entry, avoid emitting the additional label and add. + } + else + { + Operand nextLabel = Label(); + + emitTableEntry(nextLabel); + + context.MarkLabel(nextLabel); + tableAddress = context.Add(tableAddress, Const((long)JumpTable.JumpTableStride)); // Move to the next table entry. 
+ } + } + + context.MarkLabel(fallbackLabel); + + EmitBranchFallback(context, address, isJump); + + context.MarkLabel(endLabel); + } + + public static void EmitJumpTableBranch(ArmEmitterContext context, Operand address, bool isJump = false) + { + if (address.Type == OperandType.I32) + { + address = context.ZeroExtend32(OperandType.I64, address); + } + + // TODO: Constant folding. Indirect calls are slower in the best case and emit more code so we want to avoid them when possible. + bool isConst = address.Kind == OperandKind.Constant; + long constAddr = (long)address.Value; + + if (!context.HighCq) + { + // Don't emit indirect calls or jumps if we're compiling in lowCq mode. + // This avoids wasting space on the jump and indirect tables. + // Just ask the translator for the function address. + + EmitBranchFallback(context, address, isJump); + } + else if (!isConst) + { + // Virtual branch/call - store first used addresses on a small table for fast lookup. + int entry = context.JumpTable.ReserveDynamicEntry(isJump); + + int jumpOffset = entry * JumpTable.JumpTableStride * JumpTable.DynamicTableElems; + Operand dynTablePtr = Const(context.JumpTable.DynamicPointer.ToInt64() + jumpOffset); + + EmitDynamicTableCall(context, dynTablePtr, address, isJump); + } + else + { + int entry = context.JumpTable.ReserveTableEntry(context.BaseAddress & (~3L), constAddr, isJump); + + int jumpOffset = entry * JumpTable.JumpTableStride + 8; // Offset directly to the host address. + + // TODO: Relocatable jump table ptr for AOT. Would prefer a solution to patch this constant into functions as they are loaded rather than calculate at runtime. + Operand tableEntryPtr = Const(context.JumpTable.JumpPointer.ToInt64() + jumpOffset); + + Operand funcAddr = context.Load(OperandType.I64, tableEntryPtr); + + EmitNativeCallWithGuestAddress(context, funcAddr, address, isJump); // Call the function directly. If it's not present yet, this will call the direct call stub. 
+ } + } } } diff --git a/ARMeilleure/Instructions/InstEmitHelper.cs b/ARMeilleure/Instructions/InstEmitHelper.cs index f5495c6600..a4227543fa 100644 --- a/ARMeilleure/Instructions/InstEmitHelper.cs +++ b/ARMeilleure/Instructions/InstEmitHelper.cs @@ -144,22 +144,34 @@ namespace ARMeilleure.Instructions } } - public static void EmitBxWritePc(ArmEmitterContext context, Operand pc) + public static bool IsA32Return(ArmEmitterContext context) { + switch (context.CurrOp) + { + case IOpCode32MemMult op: + return true; // Setting PC using LDM is nearly always a return. + case OpCode32AluRsImm op: + return op.Rm == RegisterAlias.Aarch32Lr; + case OpCode32AluRsReg op: + return op.Rm == RegisterAlias.Aarch32Lr; + case OpCode32AluReg op: + return op.Rm == RegisterAlias.Aarch32Lr; + case OpCode32Mem op: + return op.Rn == RegisterAlias.Aarch32Sp && op.WBack && !op.Index; // Setting PC to an address stored on the stack is nearly always a return. + } + return false; + } + + public static void EmitBxWritePc(ArmEmitterContext context, Operand pc, int sourceRegister = 0) + { + bool isReturn = sourceRegister == RegisterAlias.Aarch32Lr || IsA32Return(context); Operand mode = context.BitwiseAnd(pc, Const(1)); SetFlag(context, PState.TFlag, mode); - Operand lblArmMode = Label(); + Operand addr = context.ConditionalSelect(mode, context.BitwiseOr(pc, Const((int)InstEmitFlowHelper.CallFlag)), context.BitwiseAnd(pc, Const(~3))); - context.BranchIfTrue(lblArmMode, mode); - - // Make this count as a call, the translator will ignore the low bit for the address. 
- context.Return(context.ZeroExtend32(OperandType.I64, context.BitwiseOr(pc, Const((int)InstEmitFlowHelper.CallFlag)))); - - context.MarkLabel(lblArmMode); - - context.Return(context.ZeroExtend32(OperandType.I64, context.BitwiseOr(context.BitwiseAnd(pc, Const(~3)), Const((int)InstEmitFlowHelper.CallFlag)))); + InstEmitFlowHelper.EmitVirtualJump(context, addr, isReturn); } public static Operand GetIntOrZR(ArmEmitterContext context, int regIndex) diff --git a/ARMeilleure/Instructions/InstEmitMemoryHelper.cs b/ARMeilleure/Instructions/InstEmitMemoryHelper.cs index 70861d1634..e1dec3313a 100644 --- a/ARMeilleure/Instructions/InstEmitMemoryHelper.cs +++ b/ARMeilleure/Instructions/InstEmitMemoryHelper.cs @@ -51,7 +51,7 @@ namespace ARMeilleure.Instructions EmitReadInt(context, address, rt, size); } - if (!isSimd) + if (!isSimd && !(context.CurrOp is OpCode32 && rt == State.RegisterAlias.Aarch32Pc)) { Operand value = GetInt(context, rt); diff --git a/ARMeilleure/Instructions/NativeInterface.cs b/ARMeilleure/Instructions/NativeInterface.cs index 988e86bd77..4514c0da49 100644 --- a/ARMeilleure/Instructions/NativeInterface.cs +++ b/ARMeilleure/Instructions/NativeInterface.cs @@ -1,6 +1,8 @@ using ARMeilleure.Memory; using ARMeilleure.State; +using ARMeilleure.Translation; using System; +using System.Runtime.InteropServices; namespace ARMeilleure.Instructions { @@ -10,17 +12,19 @@ namespace ARMeilleure.Instructions private class ThreadContext { - public ExecutionContext Context { get; } - public MemoryManager Memory { get; } + public ExecutionContext Context { get; } + public MemoryManager Memory { get; } + public Translator Translator { get; } public ulong ExclusiveAddress { get; set; } public ulong ExclusiveValueLow { get; set; } public ulong ExclusiveValueHigh { get; set; } - public ThreadContext(ExecutionContext context, MemoryManager memory) + public ThreadContext(ExecutionContext context, MemoryManager memory, Translator translator) { - Context = context; - Memory = 
memory; + Context = context; + Memory = memory; + Translator = translator; ExclusiveAddress = ulong.MaxValue; } @@ -29,9 +33,9 @@ namespace ARMeilleure.Instructions [ThreadStatic] private static ThreadContext _context; - public static void RegisterThread(ExecutionContext context, MemoryManager memory) + public static void RegisterThread(ExecutionContext context, MemoryManager memory, Translator translator) { - _context = new ThreadContext(context, memory); + _context = new ThreadContext(context, memory, translator); } public static void UnregisterThread() @@ -381,18 +385,39 @@ namespace ARMeilleure.Instructions return address & ~((4UL << ErgSizeLog2) - 1); } + public static ulong GetFunctionAddress(ulong address) + { + TranslatedFunction function = _context.Translator.GetOrTranslate(address, GetContext().ExecutionMode); + return (ulong)function.GetPointer().ToInt64(); + } + + public static ulong GetIndirectFunctionAddress(ulong address, ulong entryAddress) + { + TranslatedFunction function = _context.Translator.GetOrTranslate(address, GetContext().ExecutionMode); + ulong ptr = (ulong)function.GetPointer().ToInt64(); + if (function.HighCq) + { + // Rewrite the host function address in the table to point to the highCq function. 
+ Marshal.WriteInt64((IntPtr)entryAddress, 8, (long)ptr); + } + return ptr; + } + public static void ClearExclusive() { _context.ExclusiveAddress = ulong.MaxValue; } - public static void CheckSynchronization() + public static bool CheckSynchronization() { Statistics.PauseTimer(); - GetContext().CheckInterrupt(); + ExecutionContext context = GetContext(); + context.CheckInterrupt(); Statistics.ResumeTimer(); + + return context.Running; } public static ExecutionContext GetContext() diff --git a/ARMeilleure/IntermediateRepresentation/Instruction.cs b/ARMeilleure/IntermediateRepresentation/Instruction.cs index 4c4ecb8f2d..d1ce1aa379 100644 --- a/ARMeilleure/IntermediateRepresentation/Instruction.cs +++ b/ARMeilleure/IntermediateRepresentation/Instruction.cs @@ -12,7 +12,7 @@ namespace ARMeilleure.IntermediateRepresentation BranchIfTrue, ByteSwap, Call, - CompareAndSwap128, + CompareAndSwap, CompareEqual, CompareGreater, CompareGreaterOrEqual, @@ -52,6 +52,7 @@ namespace ARMeilleure.IntermediateRepresentation Store16, Store8, Subtract, + Tailcall, VectorCreateScalar, VectorExtract, VectorExtract16, diff --git a/ARMeilleure/Memory/MemoryManagement.cs b/ARMeilleure/Memory/MemoryManagement.cs index e299ae49da..ba62f8e73f 100644 --- a/ARMeilleure/Memory/MemoryManagement.cs +++ b/ARMeilleure/Memory/MemoryManagement.cs @@ -44,6 +44,25 @@ namespace ARMeilleure.Memory } } + public static bool Commit(IntPtr address, ulong size) + { + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + { + IntPtr sizeNint = new IntPtr((long)size); + + return MemoryManagementWindows.Commit(address, sizeNint); + } + else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux) || + RuntimeInformation.IsOSPlatform(OSPlatform.OSX)) + { + return MemoryManagementUnix.Commit(address, size); + } + else + { + throw new PlatformNotSupportedException(); + } + } + public static void Reprotect(IntPtr address, ulong size, MemoryProtection permission) { bool result; @@ -70,6 +89,25 @@ namespace 
ARMeilleure.Memory } } + public static IntPtr Reserve(ulong size) + { + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + { + IntPtr sizeNint = new IntPtr((long)size); + + return MemoryManagementWindows.Reserve(sizeNint); + } + else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux) || + RuntimeInformation.IsOSPlatform(OSPlatform.OSX)) + { + return MemoryManagementUnix.Reserve(size); + } + else + { + throw new PlatformNotSupportedException(); + } + } + public static bool Free(IntPtr address) { if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) diff --git a/ARMeilleure/Memory/MemoryManagementUnix.cs b/ARMeilleure/Memory/MemoryManagementUnix.cs index 3331fb428f..e9b296081e 100644 --- a/ARMeilleure/Memory/MemoryManagementUnix.cs +++ b/ARMeilleure/Memory/MemoryManagementUnix.cs @@ -30,6 +30,11 @@ namespace ARMeilleure.Memory return ptr; } + public static bool Commit(IntPtr address, ulong size) + { + return Syscall.mprotect(address, size, MmapProts.PROT_READ | MmapProts.PROT_WRITE) == 0; + } + public static bool Reprotect(IntPtr address, ulong size, Memory.MemoryProtection protection) { MmapProts prot = GetProtection(protection); @@ -37,6 +42,24 @@ namespace ARMeilleure.Memory return Syscall.mprotect(address, size, prot) == 0; } + public static IntPtr Reserve(ulong size) + { + ulong pageSize = (ulong)Syscall.sysconf(SysconfName._SC_PAGESIZE); + + const MmapProts prot = MmapProts.PROT_NONE; + + const MmapFlags flags = MmapFlags.MAP_PRIVATE | MmapFlags.MAP_ANONYMOUS; + + IntPtr ptr = Syscall.mmap(IntPtr.Zero, size + pageSize, prot, flags, -1, 0); + + if (ptr == IntPtr.Zero) + { + throw new OutOfMemoryException(); + } + + return ptr; + } + private static MmapProts GetProtection(Memory.MemoryProtection protection) { switch (protection) diff --git a/ARMeilleure/Memory/MemoryManagementWindows.cs b/ARMeilleure/Memory/MemoryManagementWindows.cs index ae64b5c62b..a945506317 100644 --- a/ARMeilleure/Memory/MemoryManagementWindows.cs +++ 
b/ARMeilleure/Memory/MemoryManagementWindows.cs @@ -89,6 +89,15 @@ namespace ARMeilleure.Memory return ptr; } + public static bool Commit(IntPtr location, IntPtr size) + { + const AllocationType flags = AllocationType.Commit; + + IntPtr ptr = VirtualAlloc(location, size, flags, MemoryProtection.ReadWrite); + + return ptr != IntPtr.Zero; + } + public static bool Reprotect(IntPtr address, IntPtr size, Memory.MemoryProtection protection) { MemoryProtection prot = GetProtection(protection); @@ -96,6 +105,20 @@ namespace ARMeilleure.Memory return VirtualProtect(address, size, prot, out _); } + public static IntPtr Reserve(IntPtr size) + { + const AllocationType flags = AllocationType.Reserve; + + IntPtr ptr = VirtualAlloc(IntPtr.Zero, size, flags, MemoryProtection.ReadWrite); + + if (ptr == IntPtr.Zero) + { + throw new OutOfMemoryException(); + } + + return ptr; + } + private static MemoryProtection GetProtection(Memory.MemoryProtection protection) { switch (protection) diff --git a/ARMeilleure/Memory/MemoryManagerPal.cs b/ARMeilleure/Memory/MemoryManagerPal.cs index 64191a0acb..66c436424a 100644 --- a/ARMeilleure/Memory/MemoryManagerPal.cs +++ b/ARMeilleure/Memory/MemoryManagerPal.cs @@ -53,7 +53,7 @@ namespace ARMeilleure.Memory Operand expected = context.LoadArgument(OperandType.V128, 1); Operand desired = context.LoadArgument(OperandType.V128, 2); - Operand result = context.CompareAndSwap128(address, expected, desired); + Operand result = context.CompareAndSwap(address, expected, desired); context.Return(result); diff --git a/ARMeilleure/Memory/ReservedRegion.cs b/ARMeilleure/Memory/ReservedRegion.cs new file mode 100644 index 0000000000..521019adeb --- /dev/null +++ b/ARMeilleure/Memory/ReservedRegion.cs @@ -0,0 +1,53 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace ARMeilleure.Memory +{ + class ReservedRegion + { + private const int DefaultGranularity = 65536; // Mapping granularity in Windows. 
+ + public IntPtr Pointer { get; } + + private ulong _maxSize; + private ulong _sizeGranularity; + private ulong _currentSize; + + public ReservedRegion(ulong maxSize, ulong granularity = 0) + { + if (granularity == 0) + { + granularity = DefaultGranularity; + } + + Pointer = MemoryManagement.Reserve(maxSize); + _maxSize = maxSize; + _sizeGranularity = granularity; + _currentSize = 0; + } + + public void ExpandIfNeeded(ulong desiredSize) + { + if (desiredSize > _maxSize) + { + throw new OutOfMemoryException(); + } + + if (desiredSize > _currentSize) + { + // Lock, and then check again. We only want to commit once. + lock (this) + { + if (desiredSize >= _currentSize) + { + ulong overflowBytes = desiredSize - _currentSize; + ulong moreToCommit = (((_sizeGranularity - 1) + overflowBytes) / _sizeGranularity) * _sizeGranularity; // Round up. + MemoryManagement.Commit(new IntPtr((long)Pointer + (long)_currentSize), moreToCommit); + _currentSize += moreToCommit; + } + } + } + } + } +} diff --git a/ARMeilleure/State/ExecutionContext.cs b/ARMeilleure/State/ExecutionContext.cs index 482665dbfe..57a05dbfd0 100644 --- a/ARMeilleure/State/ExecutionContext.cs +++ b/ARMeilleure/State/ExecutionContext.cs @@ -5,7 +5,7 @@ namespace ARMeilleure.State { public class ExecutionContext { - private const int MinCountForCheck = 40000; + private const int MinCountForCheck = 4000; private NativeContext _nativeContext; @@ -57,7 +57,7 @@ namespace ARMeilleure.State } } - public bool Running { get; set; } + internal bool Running { get; private set; } public event EventHandler Interrupt; public event EventHandler Break; @@ -126,6 +126,12 @@ namespace ARMeilleure.State Undefined?.Invoke(this, new InstUndefinedEventArgs(address, opCode)); } + public void StopRunning() + { + Running = false; + _nativeContext.SetCounter(0); + } + public void Dispose() { _nativeContext.Dispose(); diff --git a/ARMeilleure/State/NativeContext.cs b/ARMeilleure/State/NativeContext.cs index eb54505c6a..0ab9a3fd2c 100644 
--- a/ARMeilleure/State/NativeContext.cs +++ b/ARMeilleure/State/NativeContext.cs @@ -10,7 +10,7 @@ namespace ARMeilleure.State private const int IntSize = 8; private const int VecSize = 16; private const int FlagSize = 4; - private const int ExtraSize = 4; + private const int ExtraSize = 8; private const int TotalSize = RegisterConsts.IntRegsCount * IntSize + RegisterConsts.VecRegsCount * VecSize + @@ -183,6 +183,14 @@ namespace ARMeilleure.State RegisterConsts.FpFlagsCount * FlagSize; } + public static int GetCallAddressOffset() + { + return RegisterConsts.IntRegsCount * IntSize + + RegisterConsts.VecRegsCount * VecSize + + RegisterConsts.FlagsCount * FlagSize + + RegisterConsts.FpFlagsCount * FlagSize + 4; + } + public void Dispose() { MemoryManagement.Free(BasePtr); diff --git a/ARMeilleure/Translation/ArmEmitterContext.cs b/ARMeilleure/Translation/ArmEmitterContext.cs index d35e985e6c..d1a2c92db5 100644 --- a/ARMeilleure/Translation/ArmEmitterContext.cs +++ b/ARMeilleure/Translation/ArmEmitterContext.cs @@ -41,10 +41,19 @@ namespace ARMeilleure.Translation public Aarch32Mode Mode { get; } - public ArmEmitterContext(MemoryManager memory, Aarch32Mode mode) + public JumpTable JumpTable { get; } + + public long BaseAddress { get; } + + public bool HighCq { get; } + + public ArmEmitterContext(MemoryManager memory, JumpTable jumpTable, long baseAddress, bool highCq, Aarch32Mode mode) { - Memory = memory; - Mode = mode; + Memory = memory; + JumpTable = jumpTable; + BaseAddress = baseAddress; + HighCq = highCq; + Mode = mode; _labels = new Dictionary(); } diff --git a/ARMeilleure/Translation/DirectCallStubs.cs b/ARMeilleure/Translation/DirectCallStubs.cs new file mode 100644 index 0000000000..e6e87b2b61 --- /dev/null +++ b/ARMeilleure/Translation/DirectCallStubs.cs @@ -0,0 +1,131 @@ +using ARMeilleure.Instructions; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using System; +using System.Runtime.InteropServices; + +using static 
ARMeilleure.IntermediateRepresentation.OperandHelper; + +namespace ARMeilleure.Translation +{ + static class DirectCallStubs + { + private delegate long GuestFunction(IntPtr nativeContextPtr); + + private static GuestFunction _directCallStub; + private static GuestFunction _directTailCallStub; + private static GuestFunction _indirectCallStub; + private static GuestFunction _indirectTailCallStub; + + private static object _lock; + private static bool _initialized; + + static DirectCallStubs() + { + _lock = new object(); + } + + public static void InitializeStubs() + { + if (_initialized) return; + lock (_lock) + { + if (_initialized) return; + _directCallStub = GenerateDirectCallStub(false); + _directTailCallStub = GenerateDirectCallStub(true); + _indirectCallStub = GenerateIndirectCallStub(false); + _indirectTailCallStub = GenerateIndirectCallStub(true); + _initialized = true; + } + } + + public static IntPtr DirectCallStub(bool tailCall) + { + return Marshal.GetFunctionPointerForDelegate(tailCall ? _directTailCallStub : _directCallStub); + } + + public static IntPtr IndirectCallStub(bool tailCall) + { + return Marshal.GetFunctionPointerForDelegate(tailCall ? _indirectTailCallStub : _indirectCallStub); + } + + private static void EmitCall(EmitterContext context, Operand address, bool tailCall) + { + if (tailCall) + { + context.Tailcall(address, context.LoadArgument(OperandType.I64, 0)); + } + else + { + context.Return(context.Call(address, OperandType.I64, context.LoadArgument(OperandType.I64, 0))); + } + } + + /// + /// Generates a stub that is used to find function addresses. Used for direct calls when their jump table does not have the host address yet. + /// Takes a NativeContext like a translated guest function, and extracts the target address from the NativeContext. + /// When the target function is compiled in highCq, all table entries are updated to point to that function instead of this stub by the translator. 
+ /// + private static GuestFunction GenerateDirectCallStub(bool tailCall) + { + EmitterContext context = new EmitterContext(); + + Operand nativeContextPtr = context.LoadArgument(OperandType.I64, 0); + + Operand address = context.Load(OperandType.I64, context.Add(nativeContextPtr, Const((long)NativeContext.GetCallAddressOffset()))); + + address = context.BitwiseOr(address, Const(address.Type, 1)); // Set call flag. + Operand functionAddr = context.Call(new _U64_U64(NativeInterface.GetFunctionAddress), address); + EmitCall(context, functionAddr, tailCall); + + ControlFlowGraph cfg = context.GetControlFlowGraph(); + + OperandType[] argTypes = new OperandType[] + { + OperandType.I64 + }; + + return Compiler.Compile( + cfg, + argTypes, + OperandType.I64, + CompilerOptions.HighCq); + } + + /// + /// Generates a stub that is used to find function addresses and add them to an indirect table. + /// Used for indirect calls entries (already claimed) when their jump table does not have the host address yet. + /// Takes a NativeContext like a translated guest function, and extracts the target indirect table entry from the NativeContext. + /// If the function we find is highCq, the entry in the table is updated to point to that function rather than this stub. + /// + private static GuestFunction GenerateIndirectCallStub(bool tailCall) + { + EmitterContext context = new EmitterContext(); + + Operand nativeContextPtr = context.LoadArgument(OperandType.I64, 0); + + Operand entryAddress = context.Load(OperandType.I64, context.Add(nativeContextPtr, Const((long)NativeContext.GetCallAddressOffset()))); + Operand address = context.Load(OperandType.I64, entryAddress); + + // We need to find the missing function. If the function is HighCq, then it replaces this stub in the indirect table. + // Either way, we call it afterwards. + Operand functionAddr = context.Call(new _U64_U64_U64(NativeInterface.GetIndirectFunctionAddress), address, entryAddress); + + // Call and save the function. 
+ EmitCall(context, functionAddr, tailCall); + + ControlFlowGraph cfg = context.GetControlFlowGraph(); + + OperandType[] argTypes = new OperandType[] + { + OperandType.I64 + }; + + return Compiler.Compile( + cfg, + argTypes, + OperandType.I64, + CompilerOptions.HighCq); + } + } +} diff --git a/ARMeilleure/Translation/EmitterContext.cs b/ARMeilleure/Translation/EmitterContext.cs index a125a715da..a11d25a6db 100644 --- a/ARMeilleure/Translation/EmitterContext.cs +++ b/ARMeilleure/Translation/EmitterContext.cs @@ -143,9 +143,22 @@ namespace ARMeilleure.Translation } } - public Operand CompareAndSwap128(Operand address, Operand expected, Operand desired) + public void Tailcall(Operand address, params Operand[] callArgs) { - return Add(Instruction.CompareAndSwap128, Local(OperandType.V128), address, expected, desired); + Operand[] args = new Operand[callArgs.Length + 1]; + + args[0] = address; + + Array.Copy(callArgs, 0, args, 1, callArgs.Length); + + Add(Instruction.Tailcall, null, args); + + _needsNewBlock = true; + } + + public Operand CompareAndSwap(Operand address, Operand expected, Operand desired) + { + return Add(Instruction.CompareAndSwap, Local(desired.Type), address, expected, desired); } public Operand ConditionalSelect(Operand op1, Operand op2, Operand op3) diff --git a/ARMeilleure/Translation/JitCache.cs b/ARMeilleure/Translation/JitCache.cs index 73f04a966d..b004cc22aa 100644 --- a/ARMeilleure/Translation/JitCache.cs +++ b/ARMeilleure/Translation/JitCache.cs @@ -13,9 +13,11 @@ namespace ARMeilleure.Translation private const int CodeAlignment = 4; // Bytes - private const int CacheSize = 512 * 1024 * 1024; + private const int CacheSize = 2047 * 1024 * 1024; - private static IntPtr _basePointer; + private static ReservedRegion _jitRegion; + + private static IntPtr _basePointer => _jitRegion.Pointer; private static int _offset; @@ -25,10 +27,11 @@ namespace ARMeilleure.Translation static JitCache() { - _basePointer = MemoryManagement.Allocate(CacheSize); + 
_jitRegion = new ReservedRegion(CacheSize); if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { + _jitRegion.ExpandIfNeeded(PageSize); JitUnwindWindows.InstallFunctionTableHandler(_basePointer, CacheSize); // The first page is used for the table based SEH structs. @@ -97,6 +100,8 @@ namespace ARMeilleure.Translation _offset += codeSize; + _jitRegion.ExpandIfNeeded((ulong)_offset); + if ((ulong)(uint)_offset > CacheSize) { throw new OutOfMemoryException(); diff --git a/ARMeilleure/Translation/JumpTable.cs b/ARMeilleure/Translation/JumpTable.cs new file mode 100644 index 0000000000..5cad294480 --- /dev/null +++ b/ARMeilleure/Translation/JumpTable.cs @@ -0,0 +1,149 @@ +using ARMeilleure.Memory; +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Runtime.InteropServices; +using System.Threading; + +namespace ARMeilleure.Translation +{ + class JumpTable + { + public static JumpTable Instance { get; } + + static JumpTable() + { + Instance = new JumpTable(); + } + + // The jump table is a block of (guestAddress, hostAddress) function mappings. + // Each entry corresponds to one branch in a JIT compiled function. The entries are + // reserved specifically for each call. + // The _dependants dictionary can be used to update the hostAddress for any functions that change. + + public const int JumpTableStride = 16; // 8 byte guest address, 8 byte host address + + private const int JumpTableSize = 1048576; + + private const int JumpTableByteSize = JumpTableSize * JumpTableStride; + + // The dynamic table is also a block of (guestAddress, hostAddress) function mappings. + // The main difference is that indirect calls and jumps reserve _multiple_ entries on the table. + // These start out as all 0. When an indirect call is made, it tries to find the guest address on the table. + + // If we get to an empty address, the guestAddress is set to the call that we want. 
+
+        // If we get to a guestAddress that matches our own (or we just claimed it), the hostAddress is read.
+        // If it is non-zero, we immediately branch or call the host function.
+        // If it is 0, NativeInterface is called to find the rejited address of the call.
+        // If none is found, the hostAddress entry stays at 0. Otherwise, the new address is placed in the entry.
+
+        // If the table size is exhausted and we didn't find our desired address, we fall back to requesting
+        // the function from the JIT.
+
+        private const int DynamicTableSize = 1048576;
+
+        public const int DynamicTableElems = 1;
+
+        public const int DynamicTableStride = DynamicTableElems * JumpTableStride;
+
+        private const int DynamicTableByteSize = DynamicTableSize * JumpTableStride * DynamicTableElems;
+
+        // Last index handed out for each table. Interlocked.Increment returns the incremented value,
+        // so the first index returned by the Reserve* methods is 1.
+        private int _tableEnd = 0;
+        private int _dynTableEnd = 0;
+
+        // Translated functions keyed by guest address with the low two bits cleared (see RegisterFunction).
+        private ConcurrentDictionary<ulong, TranslatedFunction> _targets;
+
+        // Direct jump table entry indices keyed by the guest address they target.
+        // Each list is guarded by locking the list instance itself.
+        private ConcurrentDictionary<ulong, LinkedList<int>> _dependants; // TODO: Attach to TranslatedFunction or a wrapper class.
+
+        private ReservedRegion _jumpRegion;
+        private ReservedRegion _dynamicRegion;
+        public IntPtr JumpPointer => _jumpRegion.Pointer;
+        public IntPtr DynamicPointer => _dynamicRegion.Pointer;
+
+        public JumpTable()
+        {
+            _jumpRegion = new ReservedRegion(JumpTableByteSize);
+            _dynamicRegion = new ReservedRegion(DynamicTableByteSize);
+
+            _targets = new ConcurrentDictionary<ulong, TranslatedFunction>();
+            _dependants = new ConcurrentDictionary<ulong, LinkedList<int>>();
+        }
+
+        /// <summary>
+        /// Records <paramref name="func"/> as the translation for <paramref name="address"/> and rewrites the
+        /// host pointer of every direct jump table entry that targets that address.
+        /// </summary>
+        /// <param name="address">Guest address of the function; the low two bits are cleared before use.</param>
+        /// <param name="func">Translated function whose host pointer is written into the table.</param>
+        public void RegisterFunction(ulong address, TranslatedFunction func)
+        {
+            address &= ~3UL;
+            _targets.AddOrUpdate(address, func, (key, oldFunc) => func);
+            long funcPtr = func.GetPointer().ToInt64();
+
+            // Update all jump table entries that target this address.
+            LinkedList<int> myDependants;
+            if (_dependants.TryGetValue(address, out myDependants))
+            {
+                lock (myDependants)
+                {
+                    foreach (var entry in myDependants)
+                    {
+                        // The host pointer lives 8 bytes into the entry, after the guest address.
+                        IntPtr addr = _jumpRegion.Pointer + entry * JumpTableStride;
+                        Marshal.WriteInt64(addr, 8, funcPtr);
+                    }
+                }
+            }
+        }
+
+        /// <summary>
+        /// Reserves the next entry of the dynamic table and points each of its host pointers at the
+        /// indirect call stub.
+        /// </summary>
+        /// <param name="isJump">Forwarded to <c>DirectCallStubs.IndirectCallStub</c> to select the stub.</param>
+        /// <returns>Index of the reserved entry.</returns>
+        /// <exception cref="OutOfMemoryException">The dynamic table has no entries left.</exception>
+        public int ReserveDynamicEntry(bool isJump)
+        {
+            int entry = Interlocked.Increment(ref _dynTableEnd);
+            if (entry >= DynamicTableSize)
+            {
+                throw new OutOfMemoryException("JIT Dynamic Jump Table exhausted.");
+            }
+
+            _dynamicRegion.ExpandIfNeeded((ulong)((entry + 1) * DynamicTableStride));
+
+            // Initialize all host function pointers to the indirect call stub.
+
+            IntPtr addr = _dynamicRegion.Pointer + entry * DynamicTableStride;
+            long stubPtr = (long)DirectCallStubs.IndirectCallStub(isJump);
+
+            for (int i = 0; i < DynamicTableElems; i++)
+            {
+                Marshal.WriteInt64(addr, i * JumpTableStride + 8, stubPtr);
+            }
+
+            return entry;
+        }
+
+        /// <summary>
+        /// Reserves the next entry of the direct jump table for a call or jump to <paramref name="address"/>.
+        /// The entry holds the guest address followed by the host pointer, which is the registered function
+        /// if one exists and the direct call stub otherwise.
+        /// </summary>
+        /// <param name="ownerAddress">Not used by this method.</param>
+        /// <param name="address">Guest address targeted by the entry.</param>
+        /// <param name="isJump">Forwarded to <c>DirectCallStubs.DirectCallStub</c> to select the stub.</param>
+        /// <returns>Index of the reserved entry.</returns>
+        /// <exception cref="OutOfMemoryException">The direct jump table has no entries left.</exception>
+        public int ReserveTableEntry(long ownerAddress, long address, bool isJump)
+        {
+            int entry = Interlocked.Increment(ref _tableEnd);
+            if (entry >= JumpTableSize)
+            {
+                throw new OutOfMemoryException("JIT Direct Jump Table exhausted.");
+            }
+
+            _jumpRegion.ExpandIfNeeded((ulong)((entry + 1) * JumpTableStride));
+
+            IntPtr entryPtr = _jumpRegion.Pointer + entry * JumpTableStride;
+
+            // Make sure changes to the function at the target address update this jump table entry.
+            // NOTE(review): RegisterFunction keys by the address with its low two bits cleared, while the raw
+            // address is used here - confirm callers never pass an address with those bits set.
+            LinkedList<int> targetDependants = _dependants.GetOrAdd((ulong)address, (key) => new LinkedList<int>());
+
+            // RegisterFunction patches dependants while holding this same lock. Looking up the target and
+            // writing the entry inside it means a concurrent registration can neither miss this entry nor
+            // have its pointer overwritten by a stale stub pointer.
+            lock (targetDependants)
+            {
+                targetDependants.AddLast(entry);
+
+                // Is the address we have already registered? If so, put the function address in the jump table.
+                // If not, it will point to the direct call stub.
+                long value = (long)DirectCallStubs.DirectCallStub(isJump);
+                TranslatedFunction func;
+                if (_targets.TryGetValue((ulong)address, out func))
+                {
+                    value = func.GetPointer().ToInt64();
+                }
+
+                Marshal.WriteInt64(entryPtr, 0, address);
+                Marshal.WriteInt64(entryPtr, 8, value);
+            }
+
+            return entry;
+        }
+    }
+}
diff --git a/ARMeilleure/Translation/TranslatedFunction.cs b/ARMeilleure/Translation/TranslatedFunction.cs
index 06069cf8fe..af01aaab31 100644
--- a/ARMeilleure/Translation/TranslatedFunction.cs
+++ b/ARMeilleure/Translation/TranslatedFunction.cs
@@ -1,3 +1,5 @@
+using System;
+using System.Runtime.InteropServices;
 using System.Threading;
 
 namespace ARMeilleure.Translation
@@ -11,6 +13,8 @@ namespace ARMeilleure.Translation
         private bool _rejit;
         private int _callCount;
 
+        public bool HighCq => !_rejit;
+
         public TranslatedFunction(GuestFunction func, bool rejit)
         {
             _func = func;
@@ -26,5 +30,10 @@ namespace ARMeilleure.Translation
         {
             return _rejit && Interlocked.Increment(ref _callCount) == MinCallsForRejit;
         }
+
+        public IntPtr GetPointer()
+        {
+            return Marshal.GetFunctionPointerForDelegate(_func);
+        }
     }
 }
\ No newline at end of file
diff --git a/ARMeilleure/Translation/Translator.cs b/ARMeilleure/Translation/Translator.cs
index 3008303e76..9d534d58dd 100644
--- a/ARMeilleure/Translation/Translator.cs
+++ b/ARMeilleure/Translation/Translator.cs
@@ -16,10 +16,14 @@ namespace ARMeilleure.Translation
     {
         private const ulong CallFlag = InstEmitFlowHelper.CallFlag;
 
+        private const bool AlwaysTranslateFunctions = true; // If false, only translates a single block for lowCq.
+
         private MemoryManager _memory;
 
         private ConcurrentDictionary _funcs;
 
+        private JumpTable _jumpTable;
+
         private PriorityQueue _backgroundQueue;
 
         private AutoResetEvent _backgroundTranslatorEvent;
@@ -32,9 +36,13 @@ namespace ARMeilleure.Translation
 
             _funcs = new ConcurrentDictionary();
 
+            _jumpTable = JumpTable.Instance;
+
             _backgroundQueue = new PriorityQueue(2);
 
             _backgroundTranslatorEvent = new AutoResetEvent(false);
+
+            DirectCallStubs.InitializeStubs();
         }
 
         private void TranslateQueuedSubs()
@@ -46,30 +54,42 @@ namespace ARMeilleure.Translation
                     TranslatedFunction func = Translate(request.Address, request.Mode, highCq: true);
 
                     _funcs.AddOrUpdate(request.Address, func, (key, oldFunc) => func);
+                    _jumpTable.RegisterFunction(request.Address, func);
                 }
                 else
                 {
                     _backgroundTranslatorEvent.WaitOne();
                 }
             }
+            _backgroundTranslatorEvent.Set(); // Wake up any other background translator threads, to encourage them to exit.
         }
 
         public void Execute(State.ExecutionContext context, ulong address)
         {
             if (Interlocked.Increment(ref _threadCount) == 1)
             {
-                Thread backgroundTranslatorThread = new Thread(TranslateQueuedSubs)
+                // Simple heuristic, should be user configurable in future: one thread per 3 logical processors beyond the first 6, clamped to the range 1-3.
+                // All threads are normal priority except the last one actually started, which runs at the lowest priority and only fills whatever time the OS leaves it.
+                // If we only have one rejit thread, it stays at normal priority as highCq code is performance critical.
+                // TODO: Use physical cores rather than logical. This only really makes sense for processors with hyperthreading. Requires OS specific code.
+                int unboundedThreadCount = Math.Max(1, (Environment.ProcessorCount - 6) / 3);
+                int threadCount = Math.Min(3, unboundedThreadCount);
+                for (int i = 0; i < threadCount; i++)
                 {
-                    Name = "CPU.BackgroundTranslatorThread",
-                    Priority = ThreadPriority.Lowest
-                };
+                    bool last = i != 0 && i == threadCount - 1;
+                    Thread backgroundTranslatorThread = new Thread(TranslateQueuedSubs)
+                    {
+                        Name = "CPU.BackgroundTranslatorThread." + i,
+                        Priority = last ? ThreadPriority.Lowest : ThreadPriority.Normal
+                    };
 
-                backgroundTranslatorThread.Start();
+                    backgroundTranslatorThread.Start();
+                }
             }
 
             Statistics.InitializeTimer();
 
-            NativeInterface.RegisterThread(context, _memory);
+            NativeInterface.RegisterThread(context, _memory, this);
 
             do
             {
@@ -98,7 +118,7 @@ namespace ARMeilleure.Translation
             return nextAddr;
         }
 
-        private TranslatedFunction GetOrTranslate(ulong address, ExecutionMode mode)
+        internal TranslatedFunction GetOrTranslate(ulong address, ExecutionMode mode)
         {
             // TODO: Investigate how we should handle code at unaligned addresses.
             // Currently, those low bits are used to store special flags.
@@ -124,12 +144,12 @@ namespace ARMeilleure.Translation
 
         private TranslatedFunction Translate(ulong address, ExecutionMode mode, bool highCq)
         {
-            ArmEmitterContext context = new ArmEmitterContext(_memory, Aarch32Mode.User);
+            ArmEmitterContext context = new ArmEmitterContext(_memory, _jumpTable, (long)address, highCq, Aarch32Mode.User);
 
             Logger.StartPass(PassName.Decoding);
 
-            Block[] blocks = highCq
-                ? Decoder.DecodeFunction (_memory, address, mode)
+            Block[] blocks = AlwaysTranslateFunctions
+                ? Decoder.DecodeFunction (_memory, address, mode, highCq)
                 : Decoder.DecodeBasicBlock(_memory, address, mode);
 
             Logger.EndPass(PassName.Decoding);
@@ -216,7 +236,7 @@ namespace ARMeilleure.Translation
                         // with some kind of branch).
                         if (isLastOp && block.Next == null)
                         {
-                            context.Return(Const(opCode.Address + (ulong)opCode.OpCodeSizeInBytes));
+                            InstEmitFlowHelper.EmitTailContinue(context, Const(opCode.Address + (ulong)opCode.OpCodeSizeInBytes));
                         }
                     }
                 }
@@ -238,7 +258,11 @@ namespace ARMeilleure.Translation
 
             context.BranchIfTrue(lblNonZero, count);
 
-            context.Call(new _Void(NativeInterface.CheckSynchronization));
+            Operand running = context.Call(new _Bool(NativeInterface.CheckSynchronization));
+
+            context.BranchIfTrue(lblExit, running);
+
+            context.Return(Const(0L));
 
             context.Branch(lblExit);
 
diff --git a/Ryujinx.HLE/HOS/Kernel/Threading/HleScheduler.cs b/Ryujinx.HLE/HOS/Kernel/Threading/HleScheduler.cs
index 1a213b924f..c4161d5420 100644
--- a/Ryujinx.HLE/HOS/Kernel/Threading/HleScheduler.cs
+++ b/Ryujinx.HLE/HOS/Kernel/Threading/HleScheduler.cs
@@ -137,7 +137,7 @@ namespace Ryujinx.HLE.HOS.Kernel.Threading
 
         public void ExitThread(KThread thread)
         {
-            thread.Context.Running = false;
+            thread.Context.StopRunning();
 
             CoreManager.Exit(thread.HostThread);
         }
diff --git a/Ryujinx.HLE/HOS/Kernel/Threading/KThread.cs b/Ryujinx.HLE/HOS/Kernel/Threading/KThread.cs
index 53eb5bdc9a..cd60c95504 100644
--- a/Ryujinx.HLE/HOS/Kernel/Threading/KThread.cs
+++ b/Ryujinx.HLE/HOS/Kernel/Threading/KThread.cs
@@ -1141,9 +1141,9 @@ namespace Ryujinx.HLE.HOS.Kernel.Threading
         {
             Owner.Translator.Execute(Context, entrypoint);
 
-            Context.Dispose();
-
             ThreadExit();
+
+            Context.Dispose();
         }
 
         private void ThreadExit()