diff --git a/ARMeilleure/CodeGen/Optimizations/Optimizer.cs b/ARMeilleure/CodeGen/Optimizations/Optimizer.cs
index d3ffd185e1..8b0c75fd6b 100644
--- a/ARMeilleure/CodeGen/Optimizations/Optimizer.cs
+++ b/ARMeilleure/CodeGen/Optimizations/Optimizer.cs
@@ -136,7 +136,9 @@ namespace ARMeilleure.CodeGen.Optimizations
 
         private static bool HasSideEffects(Node node)
         {
-            return (node is Operation operation) && operation.Instruction == Instruction.Call;
+            return (node is Operation operation) && (operation.Instruction == Instruction.Call
+                || operation.Instruction == Instruction.Tailcall
+                || operation.Instruction == Instruction.CompareAndSwap);
         }
 
         private static bool IsPropagableCopy(Operation operation)
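
Context for the Optimizer change above: dead-code elimination removes any node whose results are unused unless HasSideEffects says otherwise, so Tailcall and CompareAndSwap must now be listed even though their destinations are often ignored. A minimal standalone sketch of why, using .NET's Interlocked as a stand-in for the IR-level CompareAndSwap (this demo is not ARMeilleure code):

    using System;
    using System.Threading;

    class CasSideEffectDemo
    {
        private static int _guestWord;

        static void Main()
        {
            // The result (the old value) is discarded, but the store still happens,
            // so an optimizer must never treat the operation as dead code.
            Interlocked.CompareExchange(ref _guestWord, 42, 0);

            Console.WriteLine(_guestWord); // Prints 42; removing the "dead" CAS would print 0.
        }
    }
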
diff --git a/ARMeilleure/CodeGen/X86/Assembler.cs b/ARMeilleure/CodeGen/X86/Assembler.cs
index 70130d90e4..5088e6f0c7 100644
--- a/ARMeilleure/CodeGen/X86/Assembler.cs
+++ b/ARMeilleure/CodeGen/X86/Assembler.cs
@@ -90,6 +90,7 @@ namespace ARMeilleure.CodeGen.X86
             Add(X86Instruction.Cmpps,      new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000fc2, InstructionFlags.Vex));
             Add(X86Instruction.Cmpsd,      new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000fc2, InstructionFlags.Vex | InstructionFlags.PrefixF2));
             Add(X86Instruction.Cmpss,      new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000fc2, InstructionFlags.Vex | InstructionFlags.PrefixF3));
+            Add(X86Instruction.Cmpxchg,    new InstructionInfo(0x00000fb1, BadOp,      BadOp,      BadOp,      BadOp,      InstructionFlags.None));
             Add(X86Instruction.Cmpxchg16b, new InstructionInfo(0x01000fc7, BadOp,      BadOp,      BadOp,      BadOp,      InstructionFlags.RexW));
             Add(X86Instruction.Comisd,     new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000f2f, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Comiss,     new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000f2f, InstructionFlags.Vex));
@@ -117,6 +118,7 @@ namespace ARMeilleure.CodeGen.X86
             Add(X86Instruction.Imul,       new InstructionInfo(BadOp,      0x0000006b, 0x00000069, BadOp,      0x00000faf, InstructionFlags.None));
             Add(X86Instruction.Imul128,    new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x050000f7, InstructionFlags.None));
             Add(X86Instruction.Insertps,   new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f3a21, InstructionFlags.Vex | InstructionFlags.Prefix66));
+            Add(X86Instruction.Jmp,        new InstructionInfo(0x040000ff, BadOp,      BadOp,      BadOp,      BadOp,      InstructionFlags.None));
             Add(X86Instruction.Lea,        new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x0000008d, InstructionFlags.None));
             Add(X86Instruction.Maxpd,      new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000f5f, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Maxps,      new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000f5f, InstructionFlags.Vex));
@@ -328,6 +330,13 @@ namespace ARMeilleure.CodeGen.X86
             WriteByte(0x99);
         }
 
+        public void Cmpxchg(MemoryOperand memOp, Operand src)
+        {
+            WriteByte(LockPrefix);
+
+            WriteInstruction(memOp, src, src.Type, X86Instruction.Cmpxchg);
+        }
+
         public void Cmpxchg16b(MemoryOperand memOp)
         {
             WriteByte(LockPrefix);
@@ -480,6 +489,11 @@ namespace ARMeilleure.CodeGen.X86
             }
         }
 
+        public void Jmp(Operand dest)
+        {
+            WriteInstruction(dest, null, OperandType.None, X86Instruction.Jmp);
+        }
+
         public void Lea(Operand dest, Operand source, OperandType type)
         {
             WriteInstruction(dest, source, type, X86Instruction.Lea);
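
A note on the two new opcode-table entries: the packing convention can be inferred from the existing Cmpxchg16b entry (0x01000fc7, which is 0F C7 /1), where the low three bytes hold the opcode and the fourth byte holds the ModRM.reg opcode extension. A small sketch under that assumption (the helper below is illustrative, not part of InstructionInfo):

    static class OpcodeTableConvention
    {
        // Bits 0-23 hold the opcode bytes, bits 24-26 the ModRM.reg extension.
        public static (int Opcode, int RegExtension) Unpack(uint tableValue)
        {
            return ((int)(tableValue & 0xFFFFFF), (int)((tableValue >> 24) & 7));
        }
    }

    // Unpack(0x040000ff) => (0xFF, 4): JMP r/m64 is FF /4 in the Intel SDM.
    // Unpack(0x00000fb1) => (0x0FB1, 0): CMPXCHG r/m, r is 0F B1 /r; the new
    // Cmpxchg emitter prefixes it with LOCK (F0), so for example
    // `lock cmpxchg [rdi], rsi` encodes as F0 48 0F B1 37.
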
diff --git a/ARMeilleure/CodeGen/X86/CodeGenerator.cs b/ARMeilleure/CodeGen/X86/CodeGenerator.cs
index 32ca6a7812..1d0a4c12ff 100644
--- a/ARMeilleure/CodeGen/X86/CodeGenerator.cs
+++ b/ARMeilleure/CodeGen/X86/CodeGenerator.cs
@@ -34,7 +34,7 @@ namespace ARMeilleure.CodeGen.X86
             Add(Instruction.ByteSwap,                GenerateByteSwap);
             Add(Instruction.Call,                    GenerateCall);
             Add(Instruction.Clobber,                 GenerateClobber);
-            Add(Instruction.CompareAndSwap128,       GenerateCompareAndSwap128);
+            Add(Instruction.CompareAndSwap,          GenerateCompareAndSwap);
             Add(Instruction.CompareEqual,            GenerateCompareEqual);
             Add(Instruction.CompareGreater,          GenerateCompareGreater);
             Add(Instruction.CompareGreaterOrEqual,   GenerateCompareGreaterOrEqual);
@@ -76,6 +76,7 @@ namespace ARMeilleure.CodeGen.X86
             Add(Instruction.Store16,                 GenerateStore16);
             Add(Instruction.Store8,                  GenerateStore8);
             Add(Instruction.Subtract,                GenerateSubtract);
+            Add(Instruction.Tailcall,                GenerateTailcall);
             Add(Instruction.VectorCreateScalar,      GenerateVectorCreateScalar);
             Add(Instruction.VectorExtract,           GenerateVectorExtract);
             Add(Instruction.VectorExtract16,         GenerateVectorExtract16);
@@ -543,13 +544,27 @@ namespace ARMeilleure.CodeGen.X86
             // register allocator, we don't need to produce any code.
         }
 
-        private static void GenerateCompareAndSwap128(CodeGenContext context, Operation operation)
+        private static void GenerateCompareAndSwap(CodeGenContext context, Operation operation)
         {
-            Operand source = operation.GetSource(0);
+            Operand src1 = operation.GetSource(0);
 
-            MemoryOperand memOp = new MemoryOperand(OperandType.I64, source);
+            if (operation.SourcesCount == 5) // CompareAndSwap128 has 5 sources, compared to CompareAndSwap64/32's 3.
+            {
+                MemoryOperand memOp = new MemoryOperand(OperandType.I64, src1);
 
-            context.Assembler.Cmpxchg16b(memOp);
+                context.Assembler.Cmpxchg16b(memOp);
+            }
+            else
+            {
+                Operand src2 = operation.GetSource(1);
+                Operand src3 = operation.GetSource(2);
+
+                EnsureSameType(src2, src3);
+
+                MemoryOperand memOp = new MemoryOperand(src3.Type, src1);
+
+                context.Assembler.Cmpxchg(memOp, src3);
+            }
         }
 
         private static void GenerateCompareEqual(CodeGenContext context, Operation operation)
@@ -1083,6 +1098,13 @@ namespace ARMeilleure.CodeGen.X86
             }
         }
 
+        private static void GenerateTailcall(CodeGenContext context, Operation operation)
+        {
+            WriteEpilogue(context);
+
+            context.Assembler.Jmp(operation.GetSource(0));
+        }
+
         private static void GenerateVectorCreateScalar(CodeGenContext context, Operation operation)
         {
             Operand dest   = operation.Destination;
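
For readers unfamiliar with the instruction the new GenerateCompareAndSwap path emits: `lock cmpxchg [mem], src` compares the accumulator with the memory operand, stores src on a match, and always leaves the observed value in the accumulator. A behavioral model per the Intel SDM (standalone C#, not ARMeilleure code):

    static class CmpxchgModel
    {
        // The accumulator (E/R)AX carries the expected value in and the observed
        // value out, which is why the PreAllocator change below pins both source 1
        // and the destination to RAX.
        public static ulong Cmpxchg(ref ulong mem, ulong rax, ulong src)
        {
            ulong observed = mem;   // Compare and store are a single atomic step under LOCK.
            if (observed == rax)
            {
                mem = src;          // Match: store the new value (ZF=1 on hardware).
            }
            return observed;        // Always left in (E/R)AX.
        }
    }
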
diff --git a/ARMeilleure/CodeGen/X86/PreAllocator.cs b/ARMeilleure/CodeGen/X86/PreAllocator.cs
index 75844b099b..e20fca9d60 100644
--- a/ARMeilleure/CodeGen/X86/PreAllocator.cs
+++ b/ARMeilleure/CodeGen/X86/PreAllocator.cs
@@ -1,6 +1,7 @@
 using ARMeilleure.CodeGen.RegisterAllocators;
 using ARMeilleure.IntermediateRepresentation;
 using ARMeilleure.Translation;
+using System;
 using System.Collections.Generic;
 using System.Diagnostics;
 
@@ -101,6 +102,17 @@ namespace ARMeilleure.CodeGen.X86
                             }
                             break;
 
+                        case Instruction.Tailcall:
+                            if (callConv == CallConvName.Windows)
+                            {
+                                HandleTailcallWindowsAbi(block.Operations, stackAlloc, node, operation);
+                            }
+                            else
+                            {
+                                HandleTailcallSystemVAbi(block.Operations, stackAlloc, node, operation);
+                            }
+                            break;
+
                         case Instruction.VectorInsert8:
                             if (!HardwareCapabilities.SupportsSse41)
                             {
@@ -199,32 +211,55 @@ namespace ARMeilleure.CodeGen.X86
 
             switch (operation.Instruction)
             {
-                case Instruction.CompareAndSwap128:
+                case Instruction.CompareAndSwap:
                 {
-                    // Handle the many restrictions of the compare and exchange (16 bytes) instruction:
-                    // - The expected value should be in RDX:RAX.
-                    // - The new value to be written should be in RCX:RBX.
-                    // - The value at the memory location is loaded to RDX:RAX.
-                    void SplitOperand(Operand source, Operand lr, Operand hr)
+                    OperandType type = operation.GetSource(1).Type;
+
+                    if (type == OperandType.V128)
                     {
-                        nodes.AddBefore(node, new Operation(Instruction.VectorExtract, lr, source, Const(0)));
-                        nodes.AddBefore(node, new Operation(Instruction.VectorExtract, hr, source, Const(1)));
+                        // Handle the many restrictions of the compare and exchange (16 bytes) instruction:
+                        // - The expected value should be in RDX:RAX.
+                        // - The new value to be written should be in RCX:RBX.
+                        // - The value at the memory location is loaded to RDX:RAX.
+                        void SplitOperand(Operand source, Operand lr, Operand hr)
+                        {
+                            nodes.AddBefore(node, new Operation(Instruction.VectorExtract, lr, source, Const(0)));
+                            nodes.AddBefore(node, new Operation(Instruction.VectorExtract, hr, source, Const(1)));
+                        }
+
+                        Operand rax = Gpr(X86Register.Rax, OperandType.I64);
+                        Operand rbx = Gpr(X86Register.Rbx, OperandType.I64);
+                        Operand rcx = Gpr(X86Register.Rcx, OperandType.I64);
+                        Operand rdx = Gpr(X86Register.Rdx, OperandType.I64);
+
+                        SplitOperand(operation.GetSource(1), rax, rdx);
+                        SplitOperand(operation.GetSource(2), rbx, rcx);
+
+                        node = nodes.AddAfter(node, new Operation(Instruction.VectorCreateScalar, dest, rax));
+                        node = nodes.AddAfter(node, new Operation(Instruction.VectorInsert,       dest, dest, rdx, Const(1)));
+
+                        operation.SetDestinations(new Operand[] { rdx, rax });
+
+                        operation.SetSources(new Operand[] { operation.GetSource(0), rdx, rax, rcx, rbx });
                     }
+                    else
+                    {
+                        // Handle the restrictions of the 32/64-bit compare and exchange instruction:
+                        // - The expected value should be in (E/R)AX.
+                        // - The value at the memory location is loaded to (E/R)AX.
 
-                    Operand rax = Gpr(X86Register.Rax, OperandType.I64);
-                    Operand rbx = Gpr(X86Register.Rbx, OperandType.I64);
-                    Operand rcx = Gpr(X86Register.Rcx, OperandType.I64);
-                    Operand rdx = Gpr(X86Register.Rdx, OperandType.I64);
+                        Operand expected = operation.GetSource(1);
 
-                    SplitOperand(operation.GetSource(1), rax, rdx);
-                    SplitOperand(operation.GetSource(2), rbx, rcx);
+                        Operand rax = Gpr(X86Register.Rax, expected.Type);
 
-                    node = nodes.AddAfter(node, new Operation(Instruction.VectorCreateScalar, dest, rax));
-                    node = nodes.AddAfter(node, new Operation(Instruction.VectorInsert,       dest, dest, rdx, Const(1)));
+                        nodes.AddBefore(node, new Operation(Instruction.Copy, rax, expected));
 
-                    operation.SetDestinations(new Operand[] { rdx, rax });
+                        operation.SetSources(new Operand[] { operation.GetSource(0), rax, operation.GetSource(2) });
 
-                    operation.SetSources(new Operand[] { operation.GetSource(0), rdx, rax, rcx, rbx });
+                        node = nodes.AddAfter(node, new Operation(Instruction.Copy, dest, rax));
+
+                        operation.Destination = rax;
+                    }
 
                     break;
                 }
@@ -829,6 +864,123 @@ namespace ARMeilleure.CodeGen.X86
             return node;
         }
 
+        private static void HandleTailcallSystemVAbi(IntrusiveList<Node> nodes, StackAllocator stackAlloc, Node node, Operation operation)
+        {
+            List<Operand> sources = new List<Operand>();
+
+            sources.Add(operation.GetSource(0));
+
+            int argsCount = operation.SourcesCount - 1;
+
+            int intMax = CallingConvention.GetIntArgumentsOnRegsCount();
+            int vecMax = CallingConvention.GetVecArgumentsOnRegsCount();
+
+            int intCount = 0;
+            int vecCount = 0;
+
+            // Handle arguments passed on registers.
+            for (int index = 0; index < argsCount; index++)
+            {
+                Operand source = operation.GetSource(1 + index);
+
+                bool passOnReg;
+
+                if (source.Type.IsInteger())
+                {
+                    passOnReg = intCount + 1 < intMax;
+                }
+                else
+                {
+                    passOnReg = vecCount < vecMax;
+                }
+
+                if (source.Type == OperandType.V128 && passOnReg)
+                {
+                    // V128 is a struct; we pass each half in a GPR when possible.
+                    Operand argReg = Gpr(CallingConvention.GetIntArgumentRegister(intCount++), OperandType.I64);
+                    Operand argReg2 = Gpr(CallingConvention.GetIntArgumentRegister(intCount++), OperandType.I64);
+
+                    nodes.AddBefore(node, new Operation(Instruction.VectorExtract, argReg, source, Const(0)));
+                    nodes.AddBefore(node, new Operation(Instruction.VectorExtract, argReg2, source, Const(1)));
+
+                    continue;
+                }
+
+                if (passOnReg)
+                {
+                    Operand argReg = source.Type.IsInteger()
+                        ? Gpr(CallingConvention.GetIntArgumentRegister(intCount++), source.Type)
+                        : Xmm(CallingConvention.GetVecArgumentRegister(vecCount++), source.Type);
+
+                    Operation copyOp = new Operation(Instruction.Copy, argReg, source);
+
+                    HandleConstantCopy(nodes, nodes.AddBefore(node, copyOp), copyOp);
+
+                    sources.Add(argReg);
+                }
+                else
+                {
+                    throw new NotImplementedException("Spilling is not currently supported for tail calls. (too many arguments)");
+                }
+            }
+
+            // The target address must go in the return register: we don't
+            // return anything, and it is guaranteed not to be a callee-saved
+            // register (which would be trashed by the epilogue).
+            Operand retReg = Gpr(CallingConvention.GetIntReturnRegister(), OperandType.I64);
+
+            Operation addrCopyOp = new Operation(Instruction.Copy, retReg, operation.GetSource(0));
+
+            nodes.AddBefore(node, addrCopyOp);
+
+            sources[0] = retReg;
+
+            operation.SetSources(sources.ToArray());
+        }
+
+        private static void HandleTailcallWindowsAbi(IntrusiveList<Node> nodes, StackAllocator stackAlloc, Node node, Operation operation)
+        {
+            int argsCount = operation.SourcesCount - 1;
+
+            int maxArgs = CallingConvention.GetArgumentsOnRegsCount();
+
+            if (argsCount > maxArgs)
+            {
+                throw new NotImplementedException("Spilling is not currently supported for tail calls. (too many arguments)");
+            }
+
+            Operand[] sources = new Operand[1 + argsCount];
+
+            // Handle arguments passed on registers.
+            for (int index = 0; index < argsCount; index++)
+            {
+                Operand source = operation.GetSource(1 + index);
+
+                Operand argReg = source.Type.IsInteger()
+                    ? Gpr(CallingConvention.GetIntArgumentRegister(index), source.Type)
+                    : Xmm(CallingConvention.GetVecArgumentRegister(index), source.Type);
+
+                Operation copyOp = new Operation(Instruction.Copy, argReg, source);
+
+                HandleConstantCopy(nodes, nodes.AddBefore(node, copyOp), copyOp);
+
+                sources[1 + index] = argReg;
+            }
+
+            // The target address must go in the return register: we don't
+            // return anything, and it is guaranteed not to be a callee-saved
+            // register (which would be trashed by the epilogue).
+            Operand retReg = Gpr(CallingConvention.GetIntReturnRegister(), OperandType.I64);
+
+            Operation addrCopyOp = new Operation(Instruction.Copy, retReg, operation.GetSource(0));
+
+            nodes.AddBefore(node, addrCopyOp);
+
+            sources[0] = retReg;
+
+            operation.SetSources(sources);
+        }
+
         private static void HandleLoadArgumentWindowsAbi(
             CompilerContext cctx,
             IntrusiveList<Node> nodes,
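
On the choice of scratch register in both tail-call handlers above: the argument registers are occupied by the outgoing arguments, and callee-saved registers are restored by the epilogue emitted just before the jump, so the integer return register is the one register guaranteed to survive both. A standalone sketch, with the SysV register sets written out by hand rather than taken from the CallingConvention API:

    using System;
    using System.Linq;

    class TailcallScratchDemo
    {
        static void Main()
        {
            string target = "rax"; // Integer return register on both SysV and Windows x64.

            string[] sysVArgRegs     = { "rdi", "rsi", "rdx", "rcx", "r8", "r9" };
            string[] sysVCalleeSaved = { "rbx", "rbp", "r12", "r13", "r14", "r15" };

            // The target register must belong to neither set: argument registers hold
            // the outgoing arguments, and callee-saved ones are rewritten by the epilogue.
            Console.WriteLine(!sysVArgRegs.Contains(target) &&
                              !sysVCalleeSaved.Contains(target)); // True
        }
    }
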
diff --git a/ARMeilleure/CodeGen/X86/X86Instruction.cs b/ARMeilleure/CodeGen/X86/X86Instruction.cs
index 813730f2a3..a6dbf1a5b7 100644
--- a/ARMeilleure/CodeGen/X86/X86Instruction.cs
+++ b/ARMeilleure/CodeGen/X86/X86Instruction.cs
@@ -23,6 +23,7 @@ namespace ARMeilleure.CodeGen.X86
         Cmpps,
         Cmpsd,
         Cmpss,
+        Cmpxchg,
         Cmpxchg16b,
         Comisd,
         Comiss,
@@ -50,6 +51,7 @@ namespace ARMeilleure.CodeGen.X86
         Imul,
         Imul128,
         Insertps,
+        Jmp,
         Lea,
         Maxpd,
         Maxps,
diff --git a/ARMeilleure/Decoders/Block.cs b/ARMeilleure/Decoders/Block.cs
index 3d13c2d5e4..d38b5a8ec4 100644
--- a/ARMeilleure/Decoders/Block.cs
+++ b/ARMeilleure/Decoders/Block.cs
@@ -11,6 +11,8 @@ namespace ARMeilleure.Decoders
         public Block Next   { get; set; }
         public Block Branch { get; set; }
 
+        public bool TailCall { get; set; }
+
         public List<OpCode> OpCodes { get; private set; }
 
         public Block()
diff --git a/ARMeilleure/Decoders/Decoder.cs b/ARMeilleure/Decoders/Decoder.cs
index 7cbb62e6c5..9675dc8db9 100644
--- a/ARMeilleure/Decoders/Decoder.cs
+++ b/ARMeilleure/Decoders/Decoder.cs
@@ -1,3 +1,4 @@
+using ARMeilleure.Decoders.Optimizations;
 using ARMeilleure.Instructions;
 using ARMeilleure.Memory;
 using ARMeilleure.State;
@@ -15,6 +16,9 @@ namespace ARMeilleure.Decoders
         // take too long to compile and use too much memory.
         private const int MaxInstsPerFunction = 5000;
 
+        // For lower code quality (lowCq) translation, we set a lower limit, since it runs synchronously and blocks execution.
+        private const int MaxInstsPerFunctionLowCq = 500;
+
         private delegate object MakeOp(InstDescriptor inst, ulong address, int opCode);
 
         private static ConcurrentDictionary<Type, MakeOp> _opActivators;
@@ -33,7 +37,7 @@ namespace ARMeilleure.Decoders
             return new Block[] { block };
         }
 
-        public static Block[] DecodeFunction(MemoryManager memory, ulong address, ExecutionMode mode)
+        public static Block[] DecodeFunction(MemoryManager memory, ulong address, ExecutionMode mode, bool highCq)
         {
             List<Block> blocks = new List<Block>();
 
@@ -43,11 +47,13 @@ namespace ARMeilleure.Decoders
 
             int opsCount = 0;
 
+            int instructionLimit = highCq ? MaxInstsPerFunction : MaxInstsPerFunctionLowCq;
+
             Block GetBlock(ulong blkAddress)
             {
                 if (!visited.TryGetValue(blkAddress, out Block block))
                 {
-                    if (opsCount > MaxInstsPerFunction || !memory.IsMapped((long)blkAddress))
+                    if (opsCount > instructionLimit || !memory.IsMapped((long)blkAddress))
                     {
                         return null;
                     }
@@ -121,7 +127,7 @@ namespace ARMeilleure.Decoders
                         currBlock.Branch = GetBlock((ulong)op.Immediate);
                     }
 
-                    if (!IsUnconditionalBranch(lastOp) /*|| isCall*/)
+                    if (!IsUnconditionalBranch(lastOp) || isCall)
                     {
                         currBlock.Next = GetBlock(currBlock.EndAddress);
                     }
@@ -140,10 +146,12 @@ namespace ARMeilleure.Decoders
                 }
             }
 
+            TailCallRemover.RunPass(address, blocks);
+
             return blocks.ToArray();
         }
 
-        private static bool BinarySearch(List<Block> blocks, ulong address, out int index)
+        public static bool BinarySearch(List<Block> blocks, ulong address, out int index)
         {
             index = 0;
 
diff --git a/ARMeilleure/Decoders/Optimizations/TailCallRemover.cs b/ARMeilleure/Decoders/Optimizations/TailCallRemover.cs
new file mode 100644
index 0000000000..2d6439bac0
--- /dev/null
+++ b/ARMeilleure/Decoders/Optimizations/TailCallRemover.cs
@@ -0,0 +1,74 @@
+using System;
+using System.Collections.Generic;
+
+namespace ARMeilleure.Decoders.Optimizations
+{
+    static class TailCallRemover
+    {
+        public static void RunPass(ulong entryAddress, List<Block> blocks)
+        {
+            // Detect tail calls:
+            // - Assume the function spans the contiguous code blocks surrounding the entry address.
+            // - Treat an unconditional branch to a target outside that contiguous region as a tail call.
+            // - Allow a small gap (the allowance below) when deciding whether blocks are contiguous.
+
+            if (!Decoder.BinarySearch(blocks, entryAddress, out int entryBlockId))
+            {
+                throw new InvalidOperationException("Function entry point is not contained in a block.");
+            }
+
+            const ulong allowance = 4;
+            Block entryBlock = blocks[entryBlockId];
+            int startBlockIndex = entryBlockId;
+            Block startBlock = entryBlock;
+            int endBlockIndex = entryBlockId;
+            Block endBlock = entryBlock;
+
+            for (int i = entryBlockId + 1; i < blocks.Count; i++) // Search forwards.
+            {
+                Block block = blocks[i];
+                if (endBlock.EndAddress < block.Address - allowance)
+                {
+                    break; // End of contiguous function.
+                }
+
+                endBlock = block;
+                endBlockIndex = i;
+            }
+
+            for (int i = entryBlockId - 1; i >= 0; i--) // Search backwards.
+            {
+                Block block = blocks[i];
+                if (startBlock.Address > block.EndAddress + allowance)
+                {
+                    break; // End of contiguous function.
+                }
+
+                startBlock = block;
+                startBlockIndex = i;
+            }
+
+            if (startBlockIndex == 0 && endBlockIndex == blocks.Count - 1)
+            {
+                return; // Nothing to do here.
+            }
+
+            // Replace all branches to blocks outside the range with null, and force a tail call.
+
+            for (int i = startBlockIndex; i <= endBlockIndex; i++)
+            {
+                Block block = blocks[i];
+                if (block.Branch != null && (block.Branch.Address > endBlock.EndAddress || block.Branch.EndAddress < startBlock.Address))
+                {
+                    block.Branch = null;
+                    block.TailCall = true;
+                }
+            }
+
+            // Finally, delete all blocks outside the contiguous range.
+
+            blocks.RemoveRange(endBlockIndex + 1, (blocks.Count - endBlockIndex) - 1);
+            blocks.RemoveRange(0, startBlockIndex);
+        }
+    }
+}
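
A worked example of the scan above, with Block reduced to an address range (all values illustrative). Blocks are sorted by address; the run grows while the gap to the neighbouring block stays within the allowance, and branches to anything outside the final run become tail calls:

    using System;
    using System.Collections.Generic;

    class TailCallRangeDemo
    {
        static void Main()
        {
            var blocks = new List<(ulong Start, ulong End)>
            {
                (0x1000, 0x1010), // 0
                (0x1010, 0x1030), // 1: entry block.
                (0x1032, 0x1040), // 2: 2-byte gap to block 1, within the allowance.
                (0x2000, 0x2020), // 3: far away; branches here become tail calls.
            };

            const ulong allowance = 4;
            int start = 1, end = 1; // Start from the entry block.

            // Same conditions as the forward/backward loops in RunPass.
            while (end + 1 < blocks.Count && blocks[end].End + allowance >= blocks[end + 1].Start) end++;
            while (start > 0 && blocks[start].Start <= blocks[start - 1].End + allowance) start--;

            Console.WriteLine($"Function spans blocks {start}..{end}"); // 0..2
        }
    }
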
diff --git a/ARMeilleure/Instructions/DelegateTypes.cs b/ARMeilleure/Instructions/DelegateTypes.cs
index b65149cb81..41614f88e5 100644
--- a/ARMeilleure/Instructions/DelegateTypes.cs
+++ b/ARMeilleure/Instructions/DelegateTypes.cs
@@ -3,6 +3,8 @@ using System;
 
 namespace ARMeilleure.Instructions
 {
+    delegate bool _Bool();
+
     delegate double _F64_F64(double a1);
     delegate double _F64_F64_Bool(double a1, bool a2);
     delegate double _F64_F64_F64(double a1, double a2);
diff --git a/ARMeilleure/Instructions/InstEmitAluHelper.cs b/ARMeilleure/Instructions/InstEmitAluHelper.cs
index 916a1da5a9..12fa1bf1b5 100644
--- a/ARMeilleure/Instructions/InstEmitAluHelper.cs
+++ b/ARMeilleure/Instructions/InstEmitAluHelper.cs
@@ -116,12 +116,14 @@ namespace ARMeilleure.Instructions
         {
             Debug.Assert(value.Type == OperandType.I32);
 
-            context.StoreToContext();
-
             if (IsThumb(context.CurrOp))
             {
-                // Make this count as a call, the translator will ignore the low bit for the address.
-                context.Return(context.ZeroExtend32(OperandType.I64, context.BitwiseOr(value, Const(1))));
+                context.StoreToContext();
+                bool isReturn = IsA32Return(context);
+
+                Operand addr = context.BitwiseOr(value, Const(1));
+
+                InstEmitFlowHelper.EmitVirtualJump(context, addr, isReturn);
             }
             else
             {
@@ -138,18 +140,8 @@ namespace ARMeilleure.Instructions
                 if (setFlags)
                 {
                     // TODO: Load SPSR etc.
-                    Operand isThumb = GetFlag(PState.TFlag);
 
-                    Operand lblThumb = Label();
-
-                    context.BranchIfTrue(lblThumb, isThumb);
-
-                    // Make this count as a call, the translator will ignore the low bit for the address.
-                    context.Return(context.ZeroExtend32(OperandType.I64, context.BitwiseOr(context.BitwiseAnd(value, Const(~3)), Const(1))));
-
-                    context.MarkLabel(lblThumb);
-
-                    context.Return(context.ZeroExtend32(OperandType.I64, context.BitwiseOr(value, Const(1))));
+                    EmitBxWritePc(context, value);
                 }
                 else
                 {
diff --git a/ARMeilleure/Instructions/InstEmitException.cs b/ARMeilleure/Instructions/InstEmitException.cs
index 6f7b6fd51f..f0bde242a6 100644
--- a/ARMeilleure/Instructions/InstEmitException.cs
+++ b/ARMeilleure/Instructions/InstEmitException.cs
@@ -2,6 +2,7 @@ using ARMeilleure.Decoders;
 using ARMeilleure.Translation;
 using System;
 
+using static ARMeilleure.Instructions.InstEmitFlowHelper;
 using static ARMeilleure.IntermediateRepresentation.OperandHelper;
 
 namespace ARMeilleure.Instructions
@@ -30,7 +31,7 @@ namespace ARMeilleure.Instructions
 
             if (context.CurrBlock.Next == null)
             {
-                context.Return(Const(op.Address + 4));
+                EmitTailContinue(context, Const(op.Address + 4));
             }
         }
 
@@ -48,7 +49,7 @@ namespace ARMeilleure.Instructions
 
             if (context.CurrBlock.Next == null)
             {
-                context.Return(Const(op.Address + 4));
+                EmitTailContinue(context, Const(op.Address + 4));
             }
         }
     }
diff --git a/ARMeilleure/Instructions/InstEmitException32.cs b/ARMeilleure/Instructions/InstEmitException32.cs
index a73f0dec77..8ffad1d1fc 100644
--- a/ARMeilleure/Instructions/InstEmitException32.cs
+++ b/ARMeilleure/Instructions/InstEmitException32.cs
@@ -1,6 +1,7 @@
 using ARMeilleure.Decoders;
 using ARMeilleure.Translation;
 
+using static ARMeilleure.Instructions.InstEmitFlowHelper;
 using static ARMeilleure.IntermediateRepresentation.OperandHelper;
 
 namespace ARMeilleure.Instructions
@@ -29,7 +30,7 @@ namespace ARMeilleure.Instructions
 
             if (context.CurrBlock.Next == null)
             {
-                context.Return(Const(op.Address + 4));
+                EmitTailContinue(context, Const(op.Address + 4));
             }
         }
     }
diff --git a/ARMeilleure/Instructions/InstEmitFlow.cs b/ARMeilleure/Instructions/InstEmitFlow.cs
index 93d36e1b94..bac9ec588c 100644
--- a/ARMeilleure/Instructions/InstEmitFlow.cs
+++ b/ARMeilleure/Instructions/InstEmitFlow.cs
@@ -21,7 +21,7 @@ namespace ARMeilleure.Instructions
             }
             else
             {
-                context.Return(Const(op.Immediate));
+                EmitTailContinue(context, Const(op.Immediate), context.CurrBlock.TailCall);
             }
         }
 
@@ -56,7 +56,7 @@ namespace ARMeilleure.Instructions
         {
             OpCodeBReg op = (OpCodeBReg)context.CurrOp;
 
-            EmitVirtualJump(context, GetIntOrZR(context, op.Rn));
+            EmitVirtualJump(context, GetIntOrZR(context, op.Rn), op.Rn == RegisterAlias.Lr);
         }
 
         public static void Cbnz(ArmEmitterContext context) => EmitCb(context, onNotZero: true);
@@ -71,7 +71,7 @@ namespace ARMeilleure.Instructions
 
         public static void Ret(ArmEmitterContext context)
         {
-            context.Return(context.BitwiseOr(GetIntOrZR(context, RegisterAlias.Lr), Const(CallFlag)));
+            context.Return(GetIntOrZR(context, RegisterAlias.Lr));
         }
 
         public static void Tbnz(ArmEmitterContext context) => EmitTb(context, onNotZero: true);
@@ -96,7 +96,7 @@ namespace ARMeilleure.Instructions
 
                 if (context.CurrBlock.Next == null)
                 {
-                    context.Return(Const(op.Address + 4));
+                    EmitTailContinue(context, Const(op.Address + 4));
                 }
             }
             else
@@ -105,11 +105,11 @@ namespace ARMeilleure.Instructions
 
                 EmitCondBranch(context, lblTaken, cond);
 
-                context.Return(Const(op.Address + 4));
+                EmitTailContinue(context, Const(op.Address + 4));
 
                 context.MarkLabel(lblTaken);
 
-                context.Return(Const(op.Immediate));
+                EmitTailContinue(context, Const(op.Immediate));
             }
         }
 
@@ -132,7 +132,7 @@ namespace ARMeilleure.Instructions
 
                 if (context.CurrBlock.Next == null)
                 {
-                    context.Return(Const(op.Address + 4));
+                    EmitTailContinue(context, Const(op.Address + 4));
                 }
             }
             else
@@ -148,11 +148,11 @@ namespace ARMeilleure.Instructions
                     context.BranchIfFalse(lblTaken, value);
                 }
 
-                context.Return(Const(op.Address + 4));
+                EmitTailContinue(context, Const(op.Address + 4));
 
                 context.MarkLabel(lblTaken);
 
-                context.Return(Const(op.Immediate));
+                EmitTailContinue(context, Const(op.Immediate));
             }
         }
     }
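
What the switch from context.Return to EmitTailContinue means for control flow, sketched standalone (all names here are hypothetical): previously every block boundary returned the next guest address to a managed dispatcher loop, which looked up and called the next function; now the generated code resolves its successor itself and jumps straight to it.

    using System;
    using System.Collections.Generic;

    class DispatchShapes
    {
        static readonly Dictionary<ulong, Func<ulong>> Translated = new Dictionary<ulong, Func<ulong>>();

        // Old shape: one managed round trip per translated block boundary.
        static void DispatcherLoop(ulong address)
        {
            while (address != 0)
            {
                address = Translated[address]();
            }
        }

        // New shape, conceptually: each function looks up its successor and
        // transfers control to it directly. Written as a C# call this would grow
        // the stack by one frame per guest block, which is exactly why the
        // generated code performs it as a real tail call (epilogue plus jmp).

        static void Main()
        {
            Translated[0x1000] = () => 0x2000; // Block at 0x1000 falls through to 0x2000.
            Translated[0x2000] = () => 0;      // Block at 0x2000 exits.

            DispatcherLoop(0x1000);
        }
    }
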
diff --git a/ARMeilleure/Instructions/InstEmitFlow32.cs b/ARMeilleure/Instructions/InstEmitFlow32.cs
index cbb9ad5b26..47233eb99a 100644
--- a/ARMeilleure/Instructions/InstEmitFlow32.cs
+++ b/ARMeilleure/Instructions/InstEmitFlow32.cs
@@ -21,8 +21,7 @@ namespace ARMeilleure.Instructions
             }
             else
             {
-                context.StoreToContext();
-                context.Return(Const(op.Immediate));
+                EmitTailContinue(context, Const(op.Immediate));
             }
         }
 
@@ -57,7 +56,7 @@ namespace ARMeilleure.Instructions
                 SetFlag(context, PState.TFlag, Const(isThumb ? 0 : 1));
             }
 
-            InstEmitFlowHelper.EmitCall(context, (ulong)op.Immediate);
+            EmitCall(context, (ulong)op.Immediate);
         }
 
         public static void Blxr(ArmEmitterContext context)
@@ -66,9 +65,8 @@ namespace ARMeilleure.Instructions
 
             uint pc = op.GetPc();
 
-            Operand addr = GetIntA32(context, op.Rm);
+            Operand addr = context.Copy(GetIntA32(context, op.Rm));
             Operand bitOne = context.BitwiseAnd(addr, Const(1));
-            addr = context.BitwiseOr(addr, Const((int)CallFlag)); // Set call flag.
 
             bool isThumb = IsThumb(context.CurrOp);
 
@@ -80,16 +78,14 @@ namespace ARMeilleure.Instructions
 
             SetFlag(context, PState.TFlag, bitOne);
 
-            context.Return(addr); // Call.
+            EmitVirtualCall(context, addr);
         }
 
         public static void Bx(ArmEmitterContext context)
         {
             IOpCode32BReg op = (IOpCode32BReg)context.CurrOp;
 
-            context.StoreToContext();
-
-            EmitBxWritePc(context, GetIntA32(context, op.Rm));
+            EmitBxWritePc(context, GetIntA32(context, op.Rm), op.Rm);
         }
     }
 }
\ No newline at end of file
diff --git a/ARMeilleure/Instructions/InstEmitFlowHelper.cs b/ARMeilleure/Instructions/InstEmitFlowHelper.cs
index a8eb21d33f..f0a81e8557 100644
--- a/ARMeilleure/Instructions/InstEmitFlowHelper.cs
+++ b/ARMeilleure/Instructions/InstEmitFlowHelper.cs
@@ -2,6 +2,7 @@ using ARMeilleure.Decoders;
 using ARMeilleure.IntermediateRepresentation;
 using ARMeilleure.State;
 using ARMeilleure.Translation;
+using System;
 
 using static ARMeilleure.Instructions.InstEmitHelper;
 using static ARMeilleure.IntermediateRepresentation.OperandHelper;
@@ -142,7 +143,29 @@ namespace ARMeilleure.Instructions
 
         public static void EmitCall(ArmEmitterContext context, ulong immediate)
         {
-            context.Return(Const(immediate | CallFlag));
+            EmitJumpTableBranch(context, Const(immediate));
+        }
+
+        private static void EmitNativeCall(ArmEmitterContext context, Operand nativeContextPtr, Operand funcAddr, bool isJump = false)
+        {
+            context.StoreToContext();
+            Operand returnAddress;
+            if (isJump)
+            {
+                context.Tailcall(funcAddr, nativeContextPtr);
+            }
+            else
+            {
+                returnAddress = context.Call(funcAddr, OperandType.I64, nativeContextPtr);
+                context.LoadFromContext();
+
+                EmitContinueOrReturnCheck(context, returnAddress);
+            }
+        }
+
+        private static void EmitNativeCall(ArmEmitterContext context, Operand funcAddr, bool isJump = false)
+        {
+            EmitNativeCall(context, context.LoadArgument(OperandType.I64, 0), funcAddr, isJump);
         }
 
         public static void EmitVirtualCall(ArmEmitterContext context, Operand target)
@@ -150,37 +173,45 @@ namespace ARMeilleure.Instructions
             EmitVirtualCallOrJump(context, target, isJump: false);
         }
 
-        public static void EmitVirtualJump(ArmEmitterContext context, Operand target)
+        public static void EmitVirtualJump(ArmEmitterContext context, Operand target, bool isReturn)
         {
-            EmitVirtualCallOrJump(context, target, isJump: true);
+            EmitVirtualCallOrJump(context, target, isJump: true, isReturn: isReturn);
         }
 
-        private static void EmitVirtualCallOrJump(ArmEmitterContext context, Operand target, bool isJump)
+        private static void EmitVirtualCallOrJump(ArmEmitterContext context, Operand target, bool isJump, bool isReturn = false)
         {
-            context.Return(context.BitwiseOr(target, Const(target.Type, (long)CallFlag)));
-        }
-
-        private static void EmitContinueOrReturnCheck(ArmEmitterContext context, Operand retVal)
-        {
-            // Note: The return value of the called method will be placed
-            // at the Stack, the return value is always a Int64 with the
-            // return address of the function. We check if the address is
-            // correct, if it isn't we keep returning until we reach the dispatcher.
-            ulong nextAddr = GetNextOpAddress(context.CurrOp);
-
-            if (context.CurrBlock.Next != null)
+            if (isReturn)
             {
-                Operand lblContinue = Label();
-
-                context.BranchIfTrue(lblContinue, context.ICompareEqual(retVal, Const(nextAddr)));
-
-                context.Return(Const(nextAddr));
-
-                context.MarkLabel(lblContinue);
+                context.Return(target);
             }
             else
             {
-                context.Return(Const(nextAddr));
+                EmitJumpTableBranch(context, target, isJump);
+            }
+        }
+
+        private static void EmitContinueOrReturnCheck(ArmEmitterContext context, Operand returnAddress)
+        {
+            // Note: The return value of a translated function is always an Int64 holding the
+            // address execution has returned to. We expect this address to be immediately after
+            // the current instruction; if it isn't, we keep returning until we reach the dispatcher.
+            Operand nextAddr = Const(GetNextOpAddress(context.CurrOp));
+
+            // Try to continue within this block.
+            // If the return address isn't that of our next instruction, we need to return so the JIT can figure out what to do.
+            Operand lblContinue = Label();
+
+            // We need to clear out the call flag for the return address before comparing it.
+            context.BranchIfTrue(lblContinue, context.ICompareEqual(context.BitwiseAnd(returnAddress, Const(~CallFlag)), nextAddr));
+
+            context.Return(returnAddress);
+
+            context.MarkLabel(lblContinue);
+
+            if (context.CurrBlock.Next == null)
+            {
+                // No code follows this instruction; try to find the next block and jump to it.
+                EmitTailContinue(context, nextAddr);
             }
         }
 
@@ -188,5 +219,134 @@ namespace ARMeilleure.Instructions
         {
             return op.Address + (ulong)op.OpCodeSizeInBytes;
         }
+
+        public static void EmitTailContinue(ArmEmitterContext context, Operand address, bool allowRejit = false)
+        {
+            bool useTailContinue = true; // Left as an option, as it may be useful if we need to return to managed code rather than tail call in the future (e.g. for debugging).
+            if (useTailContinue)
+            {
+                if (allowRejit)
+                {
+                    address = context.BitwiseOr(address, Const(1L));
+                }
+
+                Operand fallbackAddr = context.Call(new _U64_U64(NativeInterface.GetFunctionAddress), address);
+
+                EmitNativeCall(context, fallbackAddr, true);
+            }
+            else
+            {
+                context.Return(address);
+            }
+        }
+
+        private static void EmitNativeCallWithGuestAddress(ArmEmitterContext context, Operand funcAddr, Operand guestAddress, bool isJump)
+        {
+            Operand nativeContextPtr = context.LoadArgument(OperandType.I64, 0);
+            context.Store(context.Add(nativeContextPtr, Const(NativeContext.GetCallAddressOffset())), guestAddress);
+
+            EmitNativeCall(context, nativeContextPtr, funcAddr, isJump);
+        }
+
+        private static void EmitBranchFallback(ArmEmitterContext context, Operand address, bool isJump)
+        {
+            address = context.BitwiseOr(address, Const(address.Type, (long)CallFlag)); // Set call flag.
+            Operand fallbackAddr = context.Call(new _U64_U64(NativeInterface.GetFunctionAddress), address);
+            EmitNativeCall(context, fallbackAddr, isJump);
+        }
+
+        public static void EmitDynamicTableCall(ArmEmitterContext context, Operand tableAddress, Operand address, bool isJump)
+        {
+            // Loop over elements of the dynamic table. Unrolled loop.
+
+            Operand endLabel = Label();
+            Operand fallbackLabel = Label();
+
+            Action<Operand> emitTableEntry = (Operand entrySkipLabel) =>
+            {
+                // Try to take this entry in the table if its guest address equals 0.
+                Operand gotResult = context.CompareAndSwap(tableAddress, Const(0L), address);
+
+                // Is the entry ours? (either we claimed it just now via CompareAndSwap, which returned 0, or it already held our address)
+                context.BranchIfFalse(entrySkipLabel, context.BitwiseOr(context.ICompareEqual(gotResult, address), context.ICompareEqual(gotResult, Const(0L))));
+
+                // It's ours, so what function is it pointing to?
+                Operand targetFunctionPtr = context.Add(tableAddress, Const(8L));
+                Operand targetFunction = context.Load(OperandType.I64, targetFunctionPtr);
+
+                // Call the function.
+                // We pass in the entry address as the guest address, as the entry may need to be updated by the indirect call stub.
+                EmitNativeCallWithGuestAddress(context, targetFunction, tableAddress, isJump);
+                context.Branch(endLabel);
+            };
+
+            // DynamicTableElems is currently 1, as larger tables inflate code size for no real benefit.
+            for (int i = 0; i < JumpTable.DynamicTableElems; i++)
+            {
+                if (i == JumpTable.DynamicTableElems - 1)
+                {
+                    emitTableEntry(fallbackLabel); // If this is the last entry, avoid emitting the additional label and add.
+                } 
+                else
+                {
+                    Operand nextLabel = Label();
+
+                    emitTableEntry(nextLabel);
+
+                    context.MarkLabel(nextLabel);
+                    tableAddress = context.Add(tableAddress, Const((long)JumpTable.JumpTableStride)); // Move to the next table entry.
+                }
+            }
+
+            context.MarkLabel(fallbackLabel);
+
+            EmitBranchFallback(context, address, isJump);
+
+            context.MarkLabel(endLabel);
+        }
+
+        public static void EmitJumpTableBranch(ArmEmitterContext context, Operand address, bool isJump = false)
+        {
+            if (address.Type == OperandType.I32)
+            {
+                address = context.ZeroExtend32(OperandType.I64, address);
+            }
+
+            // TODO: Constant folding. Indirect calls are slower in the best case and emit more code, so we want to avoid them when possible.
+            bool isConst = address.Kind == OperandKind.Constant;
+            long constAddr = (long)address.Value;
+
+            if (!context.HighCq)
+            {
+                // Don't emit indirect calls or jumps if we're compiling in lowCq mode.
+                // This avoids wasting space on the jump and indirect tables.
+                // Just ask the translator for the function address.
+
+                EmitBranchFallback(context, address, isJump);
+            }
+            else if (!isConst)
+            {
+                // Virtual branch/call - store first used addresses on a small table for fast lookup.
+                int entry = context.JumpTable.ReserveDynamicEntry(isJump);
+
+                int jumpOffset = entry * JumpTable.JumpTableStride * JumpTable.DynamicTableElems;
+                Operand dynTablePtr = Const(context.JumpTable.DynamicPointer.ToInt64() + jumpOffset);
+
+                EmitDynamicTableCall(context, dynTablePtr, address, isJump);
+            }
+            else
+            {
+                int entry = context.JumpTable.ReserveTableEntry(context.BaseAddress & (~3L), constAddr, isJump);
+
+                int jumpOffset = entry * JumpTable.JumpTableStride + 8; // Offset directly to the host address.
+
+                // TODO: Relocatable jump table ptr for AOT. Would prefer a solution that patches this constant into functions as they are loaded, rather than calculating it at runtime.
+                Operand tableEntryPtr = Const(context.JumpTable.JumpPointer.ToInt64() + jumpOffset);
+
+                Operand funcAddr = context.Load(OperandType.I64, tableEntryPtr);
+
+                EmitNativeCallWithGuestAddress(context, funcAddr, address, isJump); // Call the function directly. If it's not present yet, this will call the direct call stub.
+            }
+        }
     }
 }
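
The probe that EmitDynamicTableCall unrolls, modeled standalone with Interlocked. The 16-byte entry layout (8-byte guest address, then 8-byte host pointer) is taken from the offsets used above; the class itself is illustrative:

    using System.Threading;

    class DynamicTableEntry
    {
        public long GuestAddress; // 0 while the entry is unclaimed.
        public long HostFunction; // Points at the indirect-call stub until patched.

        public bool TryUse(long address, out long hostFunction)
        {
            // Claim the entry if it is free; CompareExchange returns the previous value.
            long seen = Interlocked.CompareExchange(ref GuestAddress, address, 0);

            if (seen == 0 || seen == address)
            {
                hostFunction = HostFunction;
                return true;
            }

            hostFunction = 0; // Entry belongs to another target: fall back to GetFunctionAddress.
            return false;
        }
    }
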
diff --git a/ARMeilleure/Instructions/InstEmitHelper.cs b/ARMeilleure/Instructions/InstEmitHelper.cs
index f5495c6600..a4227543fa 100644
--- a/ARMeilleure/Instructions/InstEmitHelper.cs
+++ b/ARMeilleure/Instructions/InstEmitHelper.cs
@@ -144,22 +144,34 @@ namespace ARMeilleure.Instructions
             }
         }
 
-        public static void EmitBxWritePc(ArmEmitterContext context, Operand pc)
+        public static bool IsA32Return(ArmEmitterContext context)
         {
+            switch (context.CurrOp)
+            {
+                case IOpCode32MemMult op:
+                    return true; // Setting PC using LDM is nearly always a return.
+                case OpCode32AluRsImm op:
+                    return op.Rm == RegisterAlias.Aarch32Lr;
+                case OpCode32AluRsReg op:
+                    return op.Rm == RegisterAlias.Aarch32Lr;
+                case OpCode32AluReg op:
+                    return op.Rm == RegisterAlias.Aarch32Lr;
+                case OpCode32Mem op:
+                    return op.Rn == RegisterAlias.Aarch32Sp && op.WBack && !op.Index; // Setting PC to an address stored on the stack is nearly always a return.
+            }
+            return false;
+        }
+
+        public static void EmitBxWritePc(ArmEmitterContext context, Operand pc, int sourceRegister = 0)
+        {
+            bool isReturn = sourceRegister == RegisterAlias.Aarch32Lr || IsA32Return(context);
             Operand mode = context.BitwiseAnd(pc, Const(1));
 
             SetFlag(context, PState.TFlag, mode);
 
-            Operand lblArmMode = Label();
+            Operand addr = context.ConditionalSelect(mode, context.BitwiseOr(pc, Const((int)InstEmitFlowHelper.CallFlag)), context.BitwiseAnd(pc, Const(~3)));
 
-            context.BranchIfTrue(lblArmMode, mode);
-
-            // Make this count as a call, the translator will ignore the low bit for the address.
-            context.Return(context.ZeroExtend32(OperandType.I64, context.BitwiseOr(pc, Const((int)InstEmitFlowHelper.CallFlag))));
-
-            context.MarkLabel(lblArmMode);
-
-            context.Return(context.ZeroExtend32(OperandType.I64, context.BitwiseOr(context.BitwiseAnd(pc, Const(~3)), Const((int)InstEmitFlowHelper.CallFlag))));
+            InstEmitFlowHelper.EmitVirtualJump(context, addr, isReturn);
         }
 
         public static Operand GetIntOrZR(ArmEmitterContext context, int regIndex)
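
The rewritten EmitBxWritePc computes the interworking target branchlessly instead of with a label and two returns. Its ConditionalSelect is equivalent to the following model (model only; the real code operates on IR operands, and CallFlag is the low bit the translator masks off call addresses):

    static class BxTargetModel
    {
        public static uint BxTarget(uint pc)
        {
            bool thumb = (pc & 1) != 0; // The low bit selects the Thumb state.
            return thumb
                ? pc | 1u               // Thumb: keep the marker bit set.
                : pc & ~3u;             // ARM: force 4-byte alignment.
        }
    }
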
diff --git a/ARMeilleure/Instructions/InstEmitMemoryHelper.cs b/ARMeilleure/Instructions/InstEmitMemoryHelper.cs
index 70861d1634..e1dec3313a 100644
--- a/ARMeilleure/Instructions/InstEmitMemoryHelper.cs
+++ b/ARMeilleure/Instructions/InstEmitMemoryHelper.cs
@@ -51,7 +51,7 @@ namespace ARMeilleure.Instructions
                 EmitReadInt(context, address, rt, size);
             }
 
-            if (!isSimd)
+            if (!isSimd && !(context.CurrOp is OpCode32 && rt == State.RegisterAlias.Aarch32Pc))
             {
                 Operand value = GetInt(context, rt);
 
diff --git a/ARMeilleure/Instructions/NativeInterface.cs b/ARMeilleure/Instructions/NativeInterface.cs
index 988e86bd77..4514c0da49 100644
--- a/ARMeilleure/Instructions/NativeInterface.cs
+++ b/ARMeilleure/Instructions/NativeInterface.cs
@@ -1,6 +1,8 @@
 using ARMeilleure.Memory;
 using ARMeilleure.State;
+using ARMeilleure.Translation;
 using System;
+using System.Runtime.InteropServices;
 
 namespace ARMeilleure.Instructions
 {
@@ -10,17 +12,19 @@ namespace ARMeilleure.Instructions
 
         private class ThreadContext
         {
-            public ExecutionContext Context { get; }
-            public MemoryManager    Memory  { get; }
+            public ExecutionContext Context    { get; }
+            public MemoryManager    Memory     { get; }
+            public Translator       Translator { get; }
 
             public ulong ExclusiveAddress   { get; set; }
             public ulong ExclusiveValueLow  { get; set; }
             public ulong ExclusiveValueHigh { get; set; }
 
-            public ThreadContext(ExecutionContext context, MemoryManager memory)
+            public ThreadContext(ExecutionContext context, MemoryManager memory, Translator translator)
             {
-                Context = context;
-                Memory  = memory;
+                Context    = context;
+                Memory     = memory;
+                Translator = translator;
 
                 ExclusiveAddress = ulong.MaxValue;
             }
@@ -29,9 +33,9 @@ namespace ARMeilleure.Instructions
         [ThreadStatic]
         private static ThreadContext _context;
 
-        public static void RegisterThread(ExecutionContext context, MemoryManager memory)
+        public static void RegisterThread(ExecutionContext context, MemoryManager memory, Translator translator)
         {
-            _context = new ThreadContext(context, memory);
+            _context = new ThreadContext(context, memory, translator);
         }
 
         public static void UnregisterThread()
@@ -381,18 +385,39 @@ namespace ARMeilleure.Instructions
             return address & ~((4UL << ErgSizeLog2) - 1);
         }
 
+        public static ulong GetFunctionAddress(ulong address)
+        {
+            TranslatedFunction function = _context.Translator.GetOrTranslate(address, GetContext().ExecutionMode);
+            return (ulong)function.GetPointer().ToInt64();
+        }
+
+        public static ulong GetIndirectFunctionAddress(ulong address, ulong entryAddress)
+        {
+            TranslatedFunction function = _context.Translator.GetOrTranslate(address, GetContext().ExecutionMode);
+            ulong ptr = (ulong)function.GetPointer().ToInt64();
+            if (function.HighCq)
+            {
+                // Rewrite the host function address in the table to point to the highCq function.
+                Marshal.WriteInt64((IntPtr)entryAddress, 8, (long)ptr);
+            }
+            return ptr;
+        }
+
         public static void ClearExclusive()
         {
             _context.ExclusiveAddress = ulong.MaxValue;
         }
 
-        public static void CheckSynchronization()
+        public static bool CheckSynchronization()
         {
             Statistics.PauseTimer();
 
-            GetContext().CheckInterrupt();
+            ExecutionContext context = GetContext();
+            context.CheckInterrupt();
 
             Statistics.ResumeTimer();
+
+            return context.Running;
         }
 
         public static ExecutionContext GetContext()
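
How the patch in GetIndirectFunctionAddress relates to the table layout: offset 8 is the host-pointer half of a 16-byte entry (guest address first, host pointer second), so once a highCq translation exists, later branches through that entry bypass the stub entirely. A toy demonstration; the allocation and the pointer values are illustrative:

    using System;
    using System.Runtime.InteropServices;

    class TablePatchDemo
    {
        static void Main()
        {
            IntPtr entry = Marshal.AllocHGlobal(16);
            try
            {
                Marshal.WriteInt64(entry, 0, 0x1000);     // Guest address half.
                Marshal.WriteInt64(entry, 8, 0x71000000); // Host half: stub/lowCq pointer.

                // What GetIndirectFunctionAddress does once function.HighCq is true:
                Marshal.WriteInt64(entry, 8, 0x72000000);

                Console.WriteLine($"0x{Marshal.ReadInt64(entry, 8):X}"); // 0x72000000
            }
            finally
            {
                Marshal.FreeHGlobal(entry);
            }
        }
    }
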
diff --git a/ARMeilleure/IntermediateRepresentation/Instruction.cs b/ARMeilleure/IntermediateRepresentation/Instruction.cs
index 4c4ecb8f2d..d1ce1aa379 100644
--- a/ARMeilleure/IntermediateRepresentation/Instruction.cs
+++ b/ARMeilleure/IntermediateRepresentation/Instruction.cs
@@ -12,7 +12,7 @@ namespace ARMeilleure.IntermediateRepresentation
         BranchIfTrue,
         ByteSwap,
         Call,
-        CompareAndSwap128,
+        CompareAndSwap,
         CompareEqual,
         CompareGreater,
         CompareGreaterOrEqual,
@@ -52,6 +52,7 @@ namespace ARMeilleure.IntermediateRepresentation
         Store16,
         Store8,
         Subtract,
+        Tailcall,
         VectorCreateScalar,
         VectorExtract,
         VectorExtract16,
diff --git a/ARMeilleure/Memory/MemoryManagement.cs b/ARMeilleure/Memory/MemoryManagement.cs
index e299ae49da..ba62f8e73f 100644
--- a/ARMeilleure/Memory/MemoryManagement.cs
+++ b/ARMeilleure/Memory/MemoryManagement.cs
@@ -44,6 +44,25 @@ namespace ARMeilleure.Memory
             }
         }
 
+        public static bool Commit(IntPtr address, ulong size)
+        {
+            if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
+            {
+                IntPtr sizeNint = new IntPtr((long)size);
+
+                return MemoryManagementWindows.Commit(address, sizeNint);
+            }
+            else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux) ||
+                     RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
+            {
+                return MemoryManagementUnix.Commit(address, size);
+            }
+            else
+            {
+                throw new PlatformNotSupportedException();
+            }
+        }
+
         public static void Reprotect(IntPtr address, ulong size, MemoryProtection permission)
         {
             bool result;
@@ -70,6 +89,25 @@ namespace ARMeilleure.Memory
             }
         }
 
+        public static IntPtr Reserve(ulong size)
+        {
+            if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
+            {
+                IntPtr sizeNint = new IntPtr((long)size);
+
+                return MemoryManagementWindows.Reserve(sizeNint);
+            }
+            else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux) ||
+                     RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
+            {
+                return MemoryManagementUnix.Reserve(size);
+            }
+            else
+            {
+                throw new PlatformNotSupportedException();
+            }
+        }
+
         public static bool Free(IntPtr address)
         {
             if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
diff --git a/ARMeilleure/Memory/MemoryManagementUnix.cs b/ARMeilleure/Memory/MemoryManagementUnix.cs
index 3331fb428f..e9b296081e 100644
--- a/ARMeilleure/Memory/MemoryManagementUnix.cs
+++ b/ARMeilleure/Memory/MemoryManagementUnix.cs
@@ -30,6 +30,11 @@ namespace ARMeilleure.Memory
             return ptr;
         }
 
+        public static bool Commit(IntPtr address, ulong size)
+        {
+            return Syscall.mprotect(address, size, MmapProts.PROT_READ | MmapProts.PROT_WRITE) == 0;
+        }
+
         public static bool Reprotect(IntPtr address, ulong size, Memory.MemoryProtection protection)
         {
             MmapProts prot = GetProtection(protection);
@@ -37,6 +42,24 @@ namespace ARMeilleure.Memory
             return Syscall.mprotect(address, size, prot) == 0;
         }
 
+        public static IntPtr Reserve(ulong size)
+        {
+            ulong pageSize = (ulong)Syscall.sysconf(SysconfName._SC_PAGESIZE);
+
+            const MmapProts prot = MmapProts.PROT_NONE;
+
+            const MmapFlags flags = MmapFlags.MAP_PRIVATE | MmapFlags.MAP_ANONYMOUS;
+
+            IntPtr ptr = Syscall.mmap(IntPtr.Zero, size + pageSize, prot, flags, -1, 0);
+
+            if (ptr == new IntPtr(-1L)) // Syscall.mmap signals failure with MAP_FAILED (-1), not a null pointer.
+            {
+                throw new OutOfMemoryException();
+            }
+
+            return ptr;
+        }
+
         private static MmapProts GetProtection(Memory.MemoryProtection protection)
         {
             switch (protection)
diff --git a/ARMeilleure/Memory/MemoryManagementWindows.cs b/ARMeilleure/Memory/MemoryManagementWindows.cs
index ae64b5c62b..a945506317 100644
--- a/ARMeilleure/Memory/MemoryManagementWindows.cs
+++ b/ARMeilleure/Memory/MemoryManagementWindows.cs
@@ -89,6 +89,15 @@ namespace ARMeilleure.Memory
             return ptr;
         }
 
+        public static bool Commit(IntPtr location, IntPtr size)
+        {
+            const AllocationType flags = AllocationType.Commit;
+
+            IntPtr ptr = VirtualAlloc(location, size, flags, MemoryProtection.ReadWrite);
+
+            return ptr != IntPtr.Zero;
+        }
+
         public static bool Reprotect(IntPtr address, IntPtr size, Memory.MemoryProtection protection)
         {
             MemoryProtection prot = GetProtection(protection);
@@ -96,6 +105,20 @@ namespace ARMeilleure.Memory
             return VirtualProtect(address, size, prot, out _);
         }
 
+        public static IntPtr Reserve(IntPtr size)
+        {
+            const AllocationType flags = AllocationType.Reserve;
+
+            IntPtr ptr = VirtualAlloc(IntPtr.Zero, size, flags, MemoryProtection.ReadWrite);
+
+            if (ptr == IntPtr.Zero)
+            {
+                throw new OutOfMemoryException();
+            }
+
+            return ptr;
+        }
+
         private static MemoryProtection GetProtection(Memory.MemoryProtection protection)
         {
             switch (protection)
diff --git a/ARMeilleure/Memory/MemoryManagerPal.cs b/ARMeilleure/Memory/MemoryManagerPal.cs
index 64191a0acb..66c436424a 100644
--- a/ARMeilleure/Memory/MemoryManagerPal.cs
+++ b/ARMeilleure/Memory/MemoryManagerPal.cs
@@ -53,7 +53,7 @@ namespace ARMeilleure.Memory
                 Operand expected = context.LoadArgument(OperandType.V128, 1);
                 Operand desired  = context.LoadArgument(OperandType.V128, 2);
 
-                Operand result = context.CompareAndSwap128(address, expected, desired);
+                Operand result = context.CompareAndSwap(address, expected, desired);
 
                 context.Return(result);
 
diff --git a/ARMeilleure/Memory/ReservedRegion.cs b/ARMeilleure/Memory/ReservedRegion.cs
new file mode 100644
index 0000000000..521019adeb
--- /dev/null
+++ b/ARMeilleure/Memory/ReservedRegion.cs
@@ -0,0 +1,53 @@
+using System;
+
+namespace ARMeilleure.Memory
+{
+    class ReservedRegion
+    {
+        private const int DefaultGranularity = 65536; // Mapping granularity in Windows.
+
+        public IntPtr Pointer { get; }
+
+        private ulong _maxSize;
+        private ulong _sizeGranularity;
+        private ulong _currentSize;
+
+        public ReservedRegion(ulong maxSize, ulong granularity = 0)
+        {
+            if (granularity == 0)
+            {
+                granularity = DefaultGranularity;
+            }
+
+            Pointer = MemoryManagement.Reserve(maxSize);
+            _maxSize = maxSize;
+            _sizeGranularity = granularity;
+            _currentSize = 0;
+        }
+
+        public void ExpandIfNeeded(ulong desiredSize)
+        {
+            if (desiredSize > _maxSize)
+            {
+                throw new OutOfMemoryException();
+            }
+
+            if (desiredSize > _currentSize)
+            {
+                // Lock, and then check again. We only want to commit once.
+                lock (this)
+                {
+                    if (desiredSize > _currentSize)
+                    {
+                        ulong overflowBytes = desiredSize - _currentSize;
+                        ulong moreToCommit = (((_sizeGranularity - 1) + overflowBytes) / _sizeGranularity) * _sizeGranularity; // Round up.
+                        MemoryManagement.Commit(new IntPtr((long)Pointer + (long)_currentSize), moreToCommit);
+                        _currentSize += moreToCommit;
+                    }
+                }
+            }
+        }
+    }
+}
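
The round-up expression in ExpandIfNeeded is worth a worked example. A stand-alone sketch of the arithmetic (numbers invented for illustration):

    using System;

    static class GranularityRounding
    {
        // Same round-up used by ExpandIfNeeded, isolated for clarity.
        static ulong RoundUpToGranularity(ulong overflowBytes, ulong granularity)
        {
            return ((granularity - 1 + overflowBytes) / granularity) * granularity;
        }

        static void Main()
        {
            const ulong granularity = 65536; // Windows allocation granularity.

            // Committed 64 KiB so far, caller now needs 70 000 bytes:
            // the 4 464-byte overflow rounds up to one more 64 KiB chunk.
            Console.WriteLine(RoundUpToGranularity(70000 - 65536, granularity)); // 65536

            // Already-aligned requests commit exactly what is asked for.
            Console.WriteLine(RoundUpToGranularity(131072 - 65536, granularity)); // 65536
        }
    }
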
diff --git a/ARMeilleure/State/ExecutionContext.cs b/ARMeilleure/State/ExecutionContext.cs
index 482665dbfe..57a05dbfd0 100644
--- a/ARMeilleure/State/ExecutionContext.cs
+++ b/ARMeilleure/State/ExecutionContext.cs
@@ -5,7 +5,7 @@ namespace ARMeilleure.State
 {
     public class ExecutionContext
     {
-        private const int MinCountForCheck = 40000;
+        private const int MinCountForCheck = 4000;
 
         private NativeContext _nativeContext;
 
@@ -57,7 +57,7 @@ namespace ARMeilleure.State
             }
         }
 
-        public bool Running { get; set; }
+        internal bool Running { get; private set; }
 
         public event EventHandler<EventArgs>              Interrupt;
         public event EventHandler<InstExceptionEventArgs> Break;
@@ -126,6 +126,12 @@ namespace ARMeilleure.State
             Undefined?.Invoke(this, new InstUndefinedEventArgs(address, opCode));
         }
 
+        public void StopRunning()
+        {
+            Running = false;
+            _nativeContext.SetCounter(0);
+        }
+
         public void Dispose()
         {
             _nativeContext.Dispose();
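
StopRunning pairs the Running flag with zeroing the native counter; the counter is what the translated code actually polls, so without zeroing it a thread could execute up to MinCountForCheck more ticks before noticing. A toy model of the handshake (names and loop shape hypothetical, not from this diff):

    using System.Threading;

    class ShutdownModel
    {
        volatile bool _running = true;
        int _counter = 4000;

        // Scheduler side: clear the flag and force the next check to fire.
        public void StopRunning()
        {
            _running = false;
            Interlocked.Exchange(ref _counter, 0);
        }

        // Stand-in for the loop the translated code effectively executes.
        public void GuestLoop()
        {
            while (true)
            {
                if (Interlocked.Decrement(ref _counter) <= 0)
                {
                    if (!_running) return; // CheckSynchronization() == false
                    Interlocked.Exchange(ref _counter, 4000);
                }
                // ... execute guest instructions ...
            }
        }
    }
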
diff --git a/ARMeilleure/State/NativeContext.cs b/ARMeilleure/State/NativeContext.cs
index eb54505c6a..0ab9a3fd2c 100644
--- a/ARMeilleure/State/NativeContext.cs
+++ b/ARMeilleure/State/NativeContext.cs
@@ -10,7 +10,7 @@ namespace ARMeilleure.State
         private const int IntSize   = 8;
         private const int VecSize   = 16;
         private const int FlagSize  = 4;
-        private const int ExtraSize = 4;
+        private const int ExtraSize = 8;
 
         private const int TotalSize = RegisterConsts.IntRegsCount * IntSize  +
                                       RegisterConsts.VecRegsCount * VecSize  +
@@ -183,6 +183,14 @@ namespace ARMeilleure.State
                    RegisterConsts.FpFlagsCount * FlagSize;
         }
 
+        public static int GetCallAddressOffset()
+        {
+            return RegisterConsts.IntRegsCount * IntSize  +
+                   RegisterConsts.VecRegsCount * VecSize  +
+                   RegisterConsts.FlagsCount   * FlagSize +
+                   RegisterConsts.FpFlagsCount * FlagSize + 4;
+        }
+
         public void Dispose()
         {
             MemoryManagement.Free(BasePtr);
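
GetCallAddressOffset lands 4 bytes past the counter, inside the ExtraSize area that just grew from 4 to 8 bytes. A worked layout computation (the register counts are an assumption about RegisterConsts, not taken from this diff):

    using System;

    static class NativeContextLayout
    {
        // Illustrative only: counts assumed to mirror AArch64 state
        // (32 int regs, 32 vector regs, 32 flags, 32 FP flags).
        const int IntRegs = 32, VecRegs = 32, Flags = 32, FpFlags = 32;
        const int IntSize = 8, VecSize = 16, FlagSize = 4;

        static void Main()
        {
            int counterOffset = IntRegs * IntSize + VecRegs * VecSize
                              + Flags * FlagSize + FpFlags * FlagSize;  // 1024

            // The call-address slot sits right after the 4-byte counter,
            // inside the 8 bytes now reserved by ExtraSize.
            int callAddressOffset = counterOffset + 4;                  // 1028

            Console.WriteLine($"counter @ {counterOffset}, call address @ {callAddressOffset}");
        }
    }
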
diff --git a/ARMeilleure/Translation/ArmEmitterContext.cs b/ARMeilleure/Translation/ArmEmitterContext.cs
index d35e985e6c..d1a2c92db5 100644
--- a/ARMeilleure/Translation/ArmEmitterContext.cs
+++ b/ARMeilleure/Translation/ArmEmitterContext.cs
@@ -41,10 +41,19 @@ namespace ARMeilleure.Translation
 
         public Aarch32Mode Mode { get; }
 
-        public ArmEmitterContext(MemoryManager memory, Aarch32Mode mode)
+        public JumpTable JumpTable { get; }
+
+        public long BaseAddress { get; }
+
+        public bool HighCq { get; }
+
+        public ArmEmitterContext(MemoryManager memory, JumpTable jumpTable, long baseAddress, bool highCq, Aarch32Mode mode)
         {
-            Memory = memory;
-            Mode   = mode;
+            Memory      = memory;
+            JumpTable   = jumpTable;
+            BaseAddress = baseAddress;
+            HighCq      = highCq;
+            Mode        = mode;
 
             _labels = new Dictionary<ulong, Operand>();
         }
diff --git a/ARMeilleure/Translation/DirectCallStubs.cs b/ARMeilleure/Translation/DirectCallStubs.cs
new file mode 100644
index 0000000000..e6e87b2b61
--- /dev/null
+++ b/ARMeilleure/Translation/DirectCallStubs.cs
@@ -0,0 +1,131 @@
+using ARMeilleure.Instructions;
+using ARMeilleure.IntermediateRepresentation;
+using ARMeilleure.State;
+using System;
+using System.Runtime.InteropServices;
+
+using static ARMeilleure.IntermediateRepresentation.OperandHelper;
+
+namespace ARMeilleure.Translation
+{
+    static class DirectCallStubs
+    {
+        private delegate long GuestFunction(IntPtr nativeContextPtr);
+
+        private static GuestFunction _directCallStub;
+        private static GuestFunction _directTailCallStub;
+        private static GuestFunction _indirectCallStub;
+        private static GuestFunction _indirectTailCallStub;
+
+        private static object _lock;
+        private static volatile bool _initialized; // Volatile: read outside the lock in InitializeStubs.
+
+        static DirectCallStubs()
+        {
+            _lock = new object();
+        }
+
+        public static void InitializeStubs()
+        {
+            if (_initialized) return;
+            lock (_lock)
+            {
+                if (_initialized) return;
+                _directCallStub = GenerateDirectCallStub(false);
+                _directTailCallStub = GenerateDirectCallStub(true);
+                _indirectCallStub = GenerateIndirectCallStub(false);
+                _indirectTailCallStub = GenerateIndirectCallStub(true);
+                _initialized = true;
+            }
+        }
+
+        public static IntPtr DirectCallStub(bool tailCall)
+        {
+            return Marshal.GetFunctionPointerForDelegate(tailCall ? _directTailCallStub : _directCallStub);
+        }
+
+        public static IntPtr IndirectCallStub(bool tailCall)
+        {
+            return Marshal.GetFunctionPointerForDelegate(tailCall ? _indirectTailCallStub : _indirectCallStub);
+        }
+
+        private static void EmitCall(EmitterContext context, Operand address, bool tailCall)
+        {
+            if (tailCall)
+            {
+                context.Tailcall(address, context.LoadArgument(OperandType.I64, 0));
+            }
+            else
+            {
+                context.Return(context.Call(address, OperandType.I64, context.LoadArgument(OperandType.I64, 0)));
+            }
+        }
+
+        /// <summary>
+        /// Generates a stub that is used to find function addresses. Used for direct calls when their jump table does not have the host address yet.
+        /// Takes a NativeContext like a translated guest function, and extracts the target address from the NativeContext.
+        /// When the target function is compiled in highCq, all table entries are updated to point to that function instead of this stub by the translator.
+        /// </summary>
+        private static GuestFunction GenerateDirectCallStub(bool tailCall)
+        {
+            EmitterContext context = new EmitterContext();
+
+            Operand nativeContextPtr = context.LoadArgument(OperandType.I64, 0);
+
+            Operand address = context.Load(OperandType.I64, context.Add(nativeContextPtr, Const((long)NativeContext.GetCallAddressOffset())));
+
+            address = context.BitwiseOr(address, Const(address.Type, 1)); // Set call flag.
+            Operand functionAddr = context.Call(new _U64_U64(NativeInterface.GetFunctionAddress), address);
+            EmitCall(context, functionAddr, tailCall);
+
+            ControlFlowGraph cfg = context.GetControlFlowGraph();
+
+            OperandType[] argTypes = new OperandType[]
+            {
+                OperandType.I64
+            };
+
+            return Compiler.Compile<GuestFunction>(
+                cfg,
+                argTypes,
+                OperandType.I64,
+                CompilerOptions.HighCq);
+        }
+
+        /// <summary>
+        /// Generates a stub that is used to find function addresses and write them into an indirect table entry.
+        /// Used for indirect call entries (already claimed) whose table slot does not have a host address yet.
+        /// Takes a NativeContext like a translated guest function, and extracts the target indirect table entry from the NativeContext.
+        /// If the function we find is highCq, the table entry is updated to point to that function rather than this stub.
+        /// </summary>
+        private static GuestFunction GenerateIndirectCallStub(bool tailCall)
+        {
+            EmitterContext context = new EmitterContext();
+
+            Operand nativeContextPtr = context.LoadArgument(OperandType.I64, 0);
+
+            Operand entryAddress = context.Load(OperandType.I64, context.Add(nativeContextPtr, Const((long)NativeContext.GetCallAddressOffset())));
+            Operand address = context.Load(OperandType.I64, entryAddress);
+
+            // We need to find the missing function. If the function is HighCq, then it replaces this stub in the indirect table.
+            // Either way, we call it afterwards.
+            Operand functionAddr = context.Call(new _U64_U64_U64(NativeInterface.GetIndirectFunctionAddress), address, entryAddress);
+
+            // Call and save the function.
+            EmitCall(context, functionAddr, tailCall);
+
+            ControlFlowGraph cfg = context.GetControlFlowGraph();
+
+            OperandType[] argTypes = new OperandType[]
+            {
+                OperandType.I64
+            };
+
+            return Compiler.Compile<GuestFunction>(
+                cfg,
+                argTypes,
+                OperandType.I64,
+                CompilerOptions.HighCq);
+        }
+    }
+}
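
To make the stub lifecycle concrete: a table entry starts out pointing at one of these stubs and is later patched to the translated function. A sketch of the write side (entry layout per JumpTable below; the helper name is hypothetical):

    using System;
    using System.Runtime.InteropServices;
    using ARMeilleure.Translation;

    static class StubPatchingSketch
    {
        // Assumes the 16-byte (guestAddress, hostAddress) entry layout
        // defined by JumpTable.JumpTableStride.
        static void WriteEntry(IntPtr tableBase, int entry, long guestAddress)
        {
            IntPtr addr = tableBase + entry * 16;

            // Until the target is translated, the host slot points at the
            // stub, which resolves the real address via NativeInterface.
            Marshal.WriteInt64(addr, 0, guestAddress);
            Marshal.WriteInt64(addr, 8, (long)DirectCallStubs.DirectCallStub(tailCall: false));
        }
    }
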
diff --git a/ARMeilleure/Translation/EmitterContext.cs b/ARMeilleure/Translation/EmitterContext.cs
index a125a715da..a11d25a6db 100644
--- a/ARMeilleure/Translation/EmitterContext.cs
+++ b/ARMeilleure/Translation/EmitterContext.cs
@@ -143,9 +143,22 @@ namespace ARMeilleure.Translation
             }
         }
 
-        public Operand CompareAndSwap128(Operand address, Operand expected, Operand desired)
+        public void Tailcall(Operand address, params Operand[] callArgs)
         {
-            return Add(Instruction.CompareAndSwap128, Local(OperandType.V128), address, expected, desired);
+            Operand[] args = new Operand[callArgs.Length + 1];
+
+            args[0] = address;
+
+            Array.Copy(callArgs, 0, args, 1, callArgs.Length);
+
+            Add(Instruction.Tailcall, null, args);
+
+            _needsNewBlock = true;
+        }
+
+        public Operand CompareAndSwap(Operand address, Operand expected, Operand desired)
+        {
+            return Add(Instruction.CompareAndSwap, Local(desired.Type), address, expected, desired);
         }
 
         public Operand ConditionalSelect(Operand op1, Operand op2, Operand op3)
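
Tailcall mirrors Call's operand layout (target address as argument 0) but allocates no result local and ends the block. A hedged sketch of the intent; the stack observation is general tail-call behaviour rather than something this diff states:

    using ARMeilleure.IntermediateRepresentation;
    using ARMeilleure.Translation;

    static class TailcallSketch
    {
        // With a plain Call, each guest-to-guest transfer adds a host stack
        // frame; with Tailcall the callee reuses the caller's frame, so long
        // guest branch chains no longer grow the host stack.
        public static void EmitTransfer(EmitterContext context, Operand target, Operand nativeContextPtr)
        {
            context.Tailcall(target, nativeContextPtr);
            // Nothing can follow here: Tailcall set _needsNewBlock, so any
            // further emission would start a fresh, unreachable block.
        }
    }
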
diff --git a/ARMeilleure/Translation/JitCache.cs b/ARMeilleure/Translation/JitCache.cs
index 73f04a966d..b004cc22aa 100644
--- a/ARMeilleure/Translation/JitCache.cs
+++ b/ARMeilleure/Translation/JitCache.cs
@@ -13,9 +13,11 @@ namespace ARMeilleure.Translation
 
         private const int CodeAlignment = 4; // Bytes
 
-        private const int CacheSize = 512 * 1024 * 1024;
+        private const int CacheSize = 2047 * 1024 * 1024;
 
-        private static IntPtr _basePointer;
+        private static ReservedRegion _jitRegion;
+
+        private static IntPtr _basePointer => _jitRegion.Pointer;
 
         private static int _offset;
 
@@ -25,10 +27,11 @@ namespace ARMeilleure.Translation
 
         static JitCache()
         {
-            _basePointer = MemoryManagement.Allocate(CacheSize);
+            _jitRegion = new ReservedRegion(CacheSize);
 
             if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
             {
+                _jitRegion.ExpandIfNeeded(PageSize);
                 JitUnwindWindows.InstallFunctionTableHandler(_basePointer, CacheSize);
 
                 // The first page is used for the table based SEH structs.
@@ -97,6 +100,8 @@ namespace ARMeilleure.Translation
 
             _offset += codeSize;
 
+            _jitRegion.ExpandIfNeeded((ulong)_offset);
+
             if ((ulong)(uint)_offset > CacheSize)
             {
                 throw new OutOfMemoryException();
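
Net effect of the JitCache change, as a minimal model (names hypothetical, inside ARMeilleure where ReservedRegion is visible): the full 2047 MiB window is reserved once, and physical commit lazily follows the allocation offset.

    using System;
    using ARMeilleure.Memory;

    static class LazyCacheSketch
    {
        const int CacheSize = 2047 * 1024 * 1024;

        static readonly ReservedRegion _region = new ReservedRegion((ulong)CacheSize);
        static int _offset;

        public static IntPtr Allocate(int codeSize)
        {
            int start = _offset;
            _offset += codeSize;

            // Commit up to the new high-water mark, in 64 KiB granules;
            // ExpandIfNeeded throws once the reserved window is exceeded.
            _region.ExpandIfNeeded((ulong)_offset);

            if ((ulong)(uint)_offset > (ulong)CacheSize)
            {
                throw new OutOfMemoryException();
            }

            return _region.Pointer + start;
        }
    }
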
diff --git a/ARMeilleure/Translation/JumpTable.cs b/ARMeilleure/Translation/JumpTable.cs
new file mode 100644
index 0000000000..5cad294480
--- /dev/null
+++ b/ARMeilleure/Translation/JumpTable.cs
@@ -0,0 +1,149 @@
+using ARMeilleure.Memory;
+using System;
+using System.Collections.Concurrent;
+using System.Collections.Generic;
+using System.Runtime.InteropServices;
+using System.Threading;
+
+namespace ARMeilleure.Translation
+{
+    class JumpTable
+    {
+        public static JumpTable Instance { get; }
+
+        static JumpTable()
+        {
+            Instance = new JumpTable();
+        }
+
+        // The jump table is a block of (guestAddress, hostAddress) function mappings.
+        // Each entry corresponds to one branch in a JIT compiled function, and is
+        // reserved for that specific call site.
+        // The _dependants dictionary is used to update the hostAddress of entries
+        // whose target function is retranslated.
+
+        public const int JumpTableStride = 16; // 8 byte guest address, 8 byte host address
+
+        private const int JumpTableSize = 1048576;
+
+        private const int JumpTableByteSize = JumpTableSize * JumpTableStride;
+
+        // The dynamic table is also a block of (guestAddress, hostAddress) function mappings.
+        // The main difference is that indirect calls and jumps reserve _multiple_ entries on the table.
+        // These start out as all 0. When an indirect call is made, it tries to find the guest address on the table.
+
+        // If we get to an empty entry, its guestAddress is set to the address we want to call.
+
+        // If we get to a guestAddress that matches our own (or we just claimed it), the hostAddress is read.
+        // If it is non-zero, we immediately branch or call the host function.
+        // If it is 0, NativeInterface is called to find the rejitted address of the call.
+        // If none is found, the hostAddress entry stays at 0. Otherwise, the new address is placed in the entry.
+
+        // If the table size is exhausted and we didn't find our desired address, we fall back to requesting
+        // the function from the JIT.
+
+        private const int DynamicTableSize = 1048576;
+
+        public const int DynamicTableElems = 1;
+
+        public const int DynamicTableStride = DynamicTableElems * JumpTableStride;
+
+        private const int DynamicTableByteSize = DynamicTableSize * JumpTableStride * DynamicTableElems;
+
+        private int _tableEnd = 0;
+        private int _dynTableEnd = 0;
+
+        private ConcurrentDictionary<ulong, TranslatedFunction> _targets;
+        private ConcurrentDictionary<ulong, LinkedList<int>> _dependants; // TODO: Attach to TranslatedFunction or a wrapper class.
+
+        private ReservedRegion _jumpRegion;
+        private ReservedRegion _dynamicRegion;
+        public IntPtr JumpPointer => _jumpRegion.Pointer;
+        public IntPtr DynamicPointer => _dynamicRegion.Pointer;
+
+        public JumpTable()
+        {
+            _jumpRegion = new ReservedRegion(JumpTableByteSize);
+            _dynamicRegion = new ReservedRegion(DynamicTableByteSize);
+
+            _targets = new ConcurrentDictionary<ulong, TranslatedFunction>();
+            _dependants = new ConcurrentDictionary<ulong, LinkedList<int>>();
+        }
+
+        public void RegisterFunction(ulong address, TranslatedFunction func)
+        {
+            address &= ~3UL;
+            _targets.AddOrUpdate(address, func, (key, oldFunc) => func);
+            long funcPtr = func.GetPointer().ToInt64();
+
+            // Update all jump table entries that target this address.
+            if (_dependants.TryGetValue(address, out LinkedList<int> myDependants))
+            {
+                lock (myDependants)
+                {
+                    foreach (var entry in myDependants)
+                    {
+                        IntPtr addr = _jumpRegion.Pointer + entry * JumpTableStride;
+                        Marshal.WriteInt64(addr, 8, funcPtr);
+                    }
+                }
+            }
+        }
+
+        public int ReserveDynamicEntry(bool isJump)
+        {
+            int entry = Interlocked.Increment(ref _dynTableEnd);
+            if (entry >= DynamicTableSize)
+            {
+                throw new OutOfMemoryException("JIT Dynamic Jump Table exhausted.");
+            }
+
+            _dynamicRegion.ExpandIfNeeded((ulong)((entry + 1) * DynamicTableStride));
+
+            // Initialize all host function pointers to the indirect call stub.
+
+            IntPtr addr = _dynamicRegion.Pointer + entry * DynamicTableStride;
+            long stubPtr = (long)DirectCallStubs.IndirectCallStub(isJump);
+
+            for (int i = 0; i < DynamicTableElems; i++)
+            {
+                Marshal.WriteInt64(addr, i * JumpTableStride + 8, stubPtr);
+            }
+
+            return entry;
+        }
+
+        public int ReserveTableEntry(long ownerAddress, long address, bool isJump)
+        {
+            int entry = Interlocked.Increment(ref _tableEnd);
+            if (entry >= JumpTableSize)
+            {
+                throw new OutOfMemoryException("JIT Direct Jump Table exhausted.");
+            }
+
+            _jumpRegion.ExpandIfNeeded((ulong)((entry + 1) * JumpTableStride));
+
+            // Has the target address already been translated? If so, put the translated
+            // function's address in the jump table. If not, the entry points to the direct call stub.
+            long value = (long)DirectCallStubs.DirectCallStub(isJump);
+            if (_targets.TryGetValue((ulong)address, out TranslatedFunction func))
+            {
+                value = func.GetPointer().ToInt64();
+            }
+
+            // Make sure changes to the function at the target address update this jump table entry.
+            LinkedList<int> targetDependants = _dependants.GetOrAdd((ulong)address, (addr) => new LinkedList<int>());
+            lock (targetDependants)
+            {
+                targetDependants.AddLast(entry);
+            }
+
+            IntPtr addr = _jumpRegion.Pointer + entry * JumpTableStride;
+
+            Marshal.WriteInt64(addr, 0, address);
+            Marshal.WriteInt64(addr, 8, value);
+
+            return entry;
+        }
+    }
+}
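
For reference, what a consumer of a direct-table entry sees, given the 16-byte stride defined above (a sketch; the real reads happen in emitted code, not managed code):

    using System;
    using System.Runtime.InteropServices;

    static class JumpTableEntrySketch
    {
        static (long Guest, long Host) ReadEntry(IntPtr tableBase, int entry)
        {
            IntPtr addr = tableBase + entry * 16;

            return (Marshal.ReadInt64(addr, 0),  // guest address, written at reserve time
                    Marshal.ReadInt64(addr, 8)); // host address: stub first, patched by RegisterFunction
        }
    }
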
diff --git a/ARMeilleure/Translation/TranslatedFunction.cs b/ARMeilleure/Translation/TranslatedFunction.cs
index 06069cf8fe..af01aaab31 100644
--- a/ARMeilleure/Translation/TranslatedFunction.cs
+++ b/ARMeilleure/Translation/TranslatedFunction.cs
@@ -1,3 +1,5 @@
+using System;
+using System.Runtime.InteropServices;
 using System.Threading;
 
 namespace ARMeilleure.Translation
@@ -11,6 +13,8 @@ namespace ARMeilleure.Translation
         private bool _rejit;
         private int  _callCount;
 
+        public bool HighCq => !_rejit;
+
         public TranslatedFunction(GuestFunction func, bool rejit)
         {
             _func  = func;
@@ -26,5 +30,10 @@ namespace ARMeilleure.Translation
         {
             return _rejit && Interlocked.Increment(ref _callCount) == MinCallsForRejit;
         }
+
+        public IntPtr GetPointer()
+        {
+            return Marshal.GetFunctionPointerForDelegate(_func);
+        }
     }
 }
\ No newline at end of file
diff --git a/ARMeilleure/Translation/Translator.cs b/ARMeilleure/Translation/Translator.cs
index 3008303e76..9d534d58dd 100644
--- a/ARMeilleure/Translation/Translator.cs
+++ b/ARMeilleure/Translation/Translator.cs
@@ -16,10 +16,14 @@ namespace ARMeilleure.Translation
     {
         private const ulong CallFlag = InstEmitFlowHelper.CallFlag;
 
+        private const bool AlwaysTranslateFunctions = true; // If false, only translates a single block for lowCq.
+
         private MemoryManager _memory;
 
         private ConcurrentDictionary<ulong, TranslatedFunction> _funcs;
 
+        private JumpTable _jumpTable;
+
         private PriorityQueue<RejitRequest> _backgroundQueue;
 
         private AutoResetEvent _backgroundTranslatorEvent;
@@ -32,9 +36,13 @@ namespace ARMeilleure.Translation
 
             _funcs = new ConcurrentDictionary<ulong, TranslatedFunction>();
 
+            _jumpTable = JumpTable.Instance;
+
             _backgroundQueue = new PriorityQueue<RejitRequest>(2);
 
             _backgroundTranslatorEvent = new AutoResetEvent(false);
+
+            DirectCallStubs.InitializeStubs();
         }
 
         private void TranslateQueuedSubs()
@@ -46,30 +54,42 @@ namespace ARMeilleure.Translation
                     TranslatedFunction func = Translate(request.Address, request.Mode, highCq: true);
 
                     _funcs.AddOrUpdate(request.Address, func, (key, oldFunc) => func);
+                    _jumpTable.RegisterFunction(request.Address, func);
                 }
                 else
                 {
                     _backgroundTranslatorEvent.WaitOne();
                 }
             }
+            _backgroundTranslatorEvent.Set(); // Wake up any other background translator threads, to encourage them to exit.
         }
 
         public void Execute(State.ExecutionContext context, ulong address)
         {
             if (Interlocked.Increment(ref _threadCount) == 1)
             {
-                Thread backgroundTranslatorThread = new Thread(TranslateQueuedSubs)
+                // Simple heuristic, should be user configurable in the future (1 for 4 cores/HT or fewer, 2 for 6 cores + HT, etc).
+                // For example, 16 logical cores gives Min(3, Max(1, (16 - 6) / 3)) = 3 translation threads.
+                // All threads run at normal priority except for the last, which fills as much of the last core as the OS allows at low priority.
+                // If we only have one rejit thread, it should be normal priority, as highCq code is performance critical.
+                // TODO: Use physical cores rather than logical. This only really makes sense for processors with hyperthreading. Requires OS specific code.
+                int unboundedThreadCount = Math.Max(1, (Environment.ProcessorCount - 6) / 3);
+                int threadCount = Math.Min(3, unboundedThreadCount);
+                for (int i = 0; i < threadCount; i++)
                 {
-                    Name     = "CPU.BackgroundTranslatorThread",
-                    Priority = ThreadPriority.Lowest
-                };
+                    bool last = i != 0 && i == unboundedThreadCount - 1;
+                    Thread backgroundTranslatorThread = new Thread(TranslateQueuedSubs)
+                    {
+                        Name = "CPU.BackgroundTranslatorThread." + i,
+                        Priority = last ? ThreadPriority.Lowest : ThreadPriority.Normal
+                    };
 
-                backgroundTranslatorThread.Start();
+                    backgroundTranslatorThread.Start();
+                }
             }
 
             Statistics.InitializeTimer();
 
-            NativeInterface.RegisterThread(context, _memory);
+            NativeInterface.RegisterThread(context, _memory, this);
 
             do
             {
@@ -98,7 +118,7 @@ namespace ARMeilleure.Translation
             return nextAddr;
         }
 
-        private TranslatedFunction GetOrTranslate(ulong address, ExecutionMode mode)
+        internal TranslatedFunction GetOrTranslate(ulong address, ExecutionMode mode)
         {
             // TODO: Investigate how we should handle code at unaligned addresses.
             // Currently, those low bits are used to store special flags.
@@ -124,12 +144,12 @@ namespace ARMeilleure.Translation
 
         private TranslatedFunction Translate(ulong address, ExecutionMode mode, bool highCq)
         {
-            ArmEmitterContext context = new ArmEmitterContext(_memory, Aarch32Mode.User);
+            ArmEmitterContext context = new ArmEmitterContext(_memory, _jumpTable, (long)address, highCq, Aarch32Mode.User);
 
             Logger.StartPass(PassName.Decoding);
 
-            Block[] blocks = highCq
-                ? Decoder.DecodeFunction  (_memory, address, mode)
+            Block[] blocks = AlwaysTranslateFunctions
+                ? Decoder.DecodeFunction  (_memory, address, mode, highCq)
                 : Decoder.DecodeBasicBlock(_memory, address, mode);
 
             Logger.EndPass(PassName.Decoding);
@@ -216,7 +236,7 @@ namespace ARMeilleure.Translation
                         // with some kind of branch).
                         if (isLastOp && block.Next == null)
                         {
-                            context.Return(Const(opCode.Address + (ulong)opCode.OpCodeSizeInBytes));
+                            InstEmitFlowHelper.EmitTailContinue(context, Const(opCode.Address + (ulong)opCode.OpCodeSizeInBytes));
                         }
                     }
                 }
@@ -238,7 +258,11 @@ namespace ARMeilleure.Translation
 
             context.BranchIfTrue(lblNonZero, count);
 
-            context.Call(new _Void(NativeInterface.CheckSynchronization));
+            Operand running = context.Call(new _Bool(NativeInterface.CheckSynchronization));
+
+            context.BranchIfTrue(lblExit, running);
+
+            context.Return(Const(0L));
 
             context.Branch(lblExit);
 
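
The new Return(Const(0L)) path only makes sense together with the dispatch loop that consumes it. A toy model (the loop shape is assumed, not part of this diff): translated code returns the next guest address, and 0 means CheckSynchronization observed Running == false.

    using System;

    static class DispatchLoopModel
    {
        static bool _running = true;

        // Stand-in for a translated function: returns the next guest address,
        // or 0 when the emitted synchronization check sees Running == false.
        static ulong ExecuteSingle(ulong address)
        {
            return _running ? address + 4 : 0;
        }

        static void Main()
        {
            ulong address = 0x1000;

            do
            {
                if (address == 0x1008) _running = false; // StopRunning() elsewhere

                address = ExecuteSingle(address);
            }
            while (_running && address != 0);

            Console.WriteLine($"exited at 0x{address:X}");
        }
    }
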
diff --git a/Ryujinx.HLE/HOS/Kernel/Threading/HleScheduler.cs b/Ryujinx.HLE/HOS/Kernel/Threading/HleScheduler.cs
index 1a213b924f..c4161d5420 100644
--- a/Ryujinx.HLE/HOS/Kernel/Threading/HleScheduler.cs
+++ b/Ryujinx.HLE/HOS/Kernel/Threading/HleScheduler.cs
@@ -137,7 +137,7 @@ namespace Ryujinx.HLE.HOS.Kernel.Threading
 
         public void ExitThread(KThread thread)
         {
-            thread.Context.Running = false;
+            thread.Context.StopRunning();
 
             CoreManager.Exit(thread.HostThread);
         }
diff --git a/Ryujinx.HLE/HOS/Kernel/Threading/KThread.cs b/Ryujinx.HLE/HOS/Kernel/Threading/KThread.cs
index 53eb5bdc9a..cd60c95504 100644
--- a/Ryujinx.HLE/HOS/Kernel/Threading/KThread.cs
+++ b/Ryujinx.HLE/HOS/Kernel/Threading/KThread.cs
@@ -1141,9 +1141,9 @@ namespace Ryujinx.HLE.HOS.Kernel.Threading
         {
             Owner.Translator.Execute(Context, entrypoint);
 
-            Context.Dispose();
-
             ThreadExit();
+
+            Context.Dispose();
         }
 
         private void ThreadExit()