shader_jit_a64: Compact host executable memory (#230)

* common/aarch64: Allow generic code generator types. Use the templated `BasicCodeGenerator` type rather than the specialized `CodeGenerator` type. This allows `VectorCodeGenerator` to work with these functions.
* common/aarch64: Add `VectorCodeGenerator` to `CallFarFunction`. `VectorCodeGenerator` will always do far-calls since we cannot resolve any absolute addresses here.
* shader_jit_a64: Implement position-independent VectorCodeGenerator. Generates more position-independent assembly so that code can be generated within a resizable vector before being copied into executable memory, allowing for more compact memory allocations and usage rather than a statically defined worst-case size for all cases. `VectorCodeGenerator` needs to generate position-independent code rather than use absolute addresses. All far function calls made through `VectorCodeGenerator` are assumed to use absolute addresses rather than potentially using a relative `BL` branch after memory relocation.
2024-11-24 00:14:16 +00:00 · 2024-09-01 03:24:13 -07:00 · 2024-09-01 03:24:13 -07:00 · 3e5bbac5a1
commit 3e5bbac5a1
parent 82faf2e557
4 changed files with 74 additions and 41 deletions
--- a/src/common/aarch64/oaknut_abi.h
+++ b/src/common/aarch64/oaknut_abi.h
@ -78,7 +78,8 @@ inline ABIFrameInfo ABI_CalculateFrameSize(std::bitset<64> regs, std::size_t fra
    return ABIFrameInfo{static_cast<u32>(total_size), static_cast<u32>(fprs_base_subtraction)};
 }

-inline void ABI_PushRegisters(oaknut::CodeGenerator& code, std::bitset<64> regs,
+template <typename Policy>
+inline void ABI_PushRegisters(oaknut::BasicCodeGenerator<Policy>& code, std::bitset<64> regs,
                              std::size_t frame_size = 0) {
    using namespace oaknut;
    using namespace oaknut::util;
@ -137,7 +138,8 @@ inline void ABI_PushRegisters(oaknut::CodeGenerator& code, std::bitset<64> regs,
    }
 }

-inline void ABI_PopRegisters(oaknut::CodeGenerator& code, std::bitset<64> regs,
+template <typename Policy>
+inline void ABI_PopRegisters(oaknut::BasicCodeGenerator<Policy>& code, std::bitset<64> regs,
                             std::size_t frame_size = 0) {
    using namespace oaknut;
    using namespace oaknut::util;
--- a/src/common/aarch64/oaknut_util.h
+++ b/src/common/aarch64/oaknut_util.h
@ -38,6 +38,16 @@ inline void CallFarFunction(oaknut::CodeGenerator& code, const T f) {
    }
 }

+template <typename T>
+inline void CallFarFunction(oaknut::VectorCodeGenerator& code, const T f) {
+    static_assert(std::is_pointer_v<T>, "Argument must be a (function) pointer.");
+    // X16 (IP0) and X17 (IP1) are the standard veneer registers
+    // LR is also available as an intermediate register
+    // https://developer.arm.com/documentation/102374/0101/Procedure-Call-Standard
+    code.MOVP2R(oaknut::util::X16, reinterpret_cast<const void*>(f));
+    code.BLR(oaknut::util::X16);
+}
+
 } // namespace Common::A64

 #endif // CITRA_ARCH(arm64)
--- a/src/video_core/shader/shader_jit_a64_compiler.cpp
+++ b/src/video_core/shader/shader_jit_a64_compiler.cpp
@ -942,7 +942,7 @@ void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_
    swizzle_data = swizzle_data_;

    // Reset flow control state
-    program = xptr<CompiledShader*>();
+    const std::uintptr_t program_offset = offset();
    program_counter = 0;
    loop_depth = 0;
    instruction_labels.fill(Label());
@ -984,18 +984,28 @@ void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_
    return_offsets.clear();
    return_offsets.shrink_to_fit();

+    // Allocate executable memory sized to fit the generated code
+    const size_t code_size = code_vec.size() * sizeof(u32);
+
+    code_mem = std::make_unique<oaknut::CodeBlock>(code_size);
+    code_mem->unprotect();
+
+    program = reinterpret_cast<CompiledShader*>(reinterpret_cast<std::byte*>(code_mem->ptr()) +
+                                                program_offset);
+
+    // Copy to executable memory
+    std::memcpy(code_mem->ptr(), code_vec.data(), code_vec.size() * sizeof(u32));
+
    // Memory is ready to execute
-    protect();
-    invalidate_all();
+    code_mem->protect();
+    code_mem->invalidate_all();

-    const std::size_t code_size = static_cast<std::size_t>(offset());
-
-    ASSERT_MSG(code_size <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!");
-    LOG_DEBUG(HW_GPU, "Compiled shader size={}", code_size);
+    // code_vec is no longer needed
+    code_vec.clear();
+    code_vec.shrink_to_fit();
 }

-JitShader::JitShader() : CodeBlock(MAX_SHADER_SIZE), CodeGenerator(CodeBlock::ptr()) {
-    unprotect();
+JitShader::JitShader() : oaknut::VectorCodeGenerator(code_vec) {
    CompilePrelude();
 }

@ -1013,19 +1023,22 @@ Label JitShader::CompilePrelude_Log2() {
    // range. Coefficients for the minimax polynomial.
    // f(x) computes approximately log2(x) / (x - 1).
    // f(x) = c4 + x * (c3 + x * (c2 + x * (c1 + x * c0)).
-    align(16);
-    const void* c0 = xptr<const void*>();
+    oaknut::Label c0;
+    // align(16);
+    l(c0);
    dw(0x3d74552f);

-    align(16);
-    const void* c14 = xptr<const void*>();
+    // align(16);
+    oaknut::Label c14;
+    l(c14);
    dw(0xbeee7397);
    dw(0x3fbd96dd);
    dw(0xc02153f6);
    dw(0x4038d96c);

-    align(16);
-    const void* negative_infinity_vector = xptr<const void*>();
+    // align(16);
+    oaknut::Label negative_infinity_vector;
+    l(negative_infinity_vector);
    dw(0xff800000);
    dw(0xff800000);
    dw(0xff800000);
@ -1038,19 +1051,19 @@ Label JitShader::CompilePrelude_Log2() {

    Label input_is_nan, input_is_zero, input_out_of_range;

-    align(16);
+    // align(16);
    l(input_out_of_range);
    B(Cond::EQ, input_is_zero);
-    MOVP2R(XSCRATCH0, default_qnan_vector);
+    ADR(XSCRATCH0, default_qnan_vector);
    LDR(SRC1, XSCRATCH0);
    RET();

    l(input_is_zero);
-    MOVP2R(XSCRATCH0, negative_infinity_vector);
+    ADR(XSCRATCH0, negative_infinity_vector);
    LDR(SRC1, XSCRATCH0);
    RET();

-    align(16);
+    // align(16);
    l(subroutine);

    // Here we handle edge cases: input in {NaN, 0, -Inf, Negative}.
@ -1078,14 +1091,14 @@ Label JitShader::CompilePrelude_Log2() {
    UCVTF(VSCRATCH1.toS(), VSCRATCH1.toS());
    // VSCRATCH1 now contains the exponent of the input.

-    MOVP2R(XSCRATCH0, c0);
+    ADR(XSCRATCH0, c0);
    LDR(XSCRATCH0.toW(), XSCRATCH0);
    MOV(VSCRATCH0.Selem()[0], XSCRATCH0.toW());

    // Complete computation of polynomial
    // Load C1,C2,C3,C4 into a single scratch register
    const QReg C14 = SRC2;
-    MOVP2R(XSCRATCH0, c14);
+    ADR(XSCRATCH0, c14);
    LDR(C14, XSCRATCH0);
    FMUL(VSCRATCH0.toS(), VSCRATCH0.toS(), SRC1.toS());
    FMLA(VSCRATCH0.toS(), ONE.toS(), C14.Selem()[0]);
@ -1118,27 +1131,35 @@ Label JitShader::CompilePrelude_Exp2() {
    // polynomial which was fit for the function exp2(x) is then evaluated. We then restore the
    // result into the appropriate range.

-    align(16);
-    const void* input_max = xptr<const void*>();
+    // align(16);
+    Label input_max;
+    l(input_max);
    dw(0x43010000);
-    const void* input_min = xptr<const void*>();
+    Label input_min;
+    l(input_min);
    dw(0xc2fdffff);
-    const void* c0 = xptr<const void*>();
+    Label c0;
+    l(c0);
    dw(0x3c5dbe69);
-    const void* half = xptr<const void*>();
+    Label half;
+    l(half);
    dw(0x3f000000);
-    const void* c1 = xptr<const void*>();
+    Label c1;
+    l(c1);
    dw(0x3d5509f9);
-    const void* c2 = xptr<const void*>();
+    Label c2;
+    l(c2);
    dw(0x3e773cc5);
-    const void* c3 = xptr<const void*>();
+    Label c3;
+    l(c3);
    dw(0x3f3168b3);
-    const void* c4 = xptr<const void*>();
+    Label c4;
+    l(c4);
    dw(0x3f800016);

    Label ret_label;

-    align(16);
+    // align(16);
    l(subroutine);

    // Handle edge cases
@ -1149,15 +1170,15 @@ Label JitShader::CompilePrelude_Exp2() {
    // VSCRATCH0=2^round(input)
    // SRC1=input-round(input) [-0.5, 0.5)
    // Clamp to maximum range since we shift the value directly into the exponent.
-    MOVP2R(XSCRATCH0, input_max);
+    ADR(XSCRATCH0, input_max);
    LDR(VSCRATCH0.toS(), XSCRATCH0);
    FMIN(SRC1.toS(), SRC1.toS(), VSCRATCH0.toS());

-    MOVP2R(XSCRATCH0, input_min);
+    ADR(XSCRATCH0, input_min);
    LDR(VSCRATCH0.toS(), XSCRATCH0);
    FMAX(SRC1.toS(), SRC1.toS(), VSCRATCH0.toS());

-    MOVP2R(XSCRATCH0, half);
+    ADR(XSCRATCH0, half);
    LDR(VSCRATCH0.toS(), XSCRATCH0);
    FSUB(VSCRATCH0.toS(), SRC1.toS(), VSCRATCH0.toS());

--- a/src/video_core/shader/shader_jit_a64_compiler.h
+++ b/src/video_core/shader/shader_jit_a64_compiler.h
@ -30,20 +30,17 @@ struct ShaderUnit;

 namespace Pica::Shader {

-/// Memory allocated for each compiled shader
-constexpr std::size_t MAX_SHADER_SIZE = MAX_PROGRAM_CODE_LENGTH * 256;
-
 /**
 * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64
 * code that can be executed on the host machine directly.
 */
-class JitShader : private oaknut::CodeBlock, private oaknut::CodeGenerator {
+class JitShader : public oaknut::VectorCodeGenerator {
 public:
    JitShader();

    void Run(const ShaderSetup& setup, ShaderUnit& state, u32 offset) const {
        program(&setup.uniforms, &state,
-                reinterpret_cast<std::byte*>(oaknut::CodeBlock::ptr()) +
+                reinterpret_cast<const std::byte*>(code_mem->ptr()) +
                    instruction_labels[offset].offset());
    }

@ -81,6 +78,9 @@ public:
    void Compile_SETE(Instruction instr);

 private:
+    std::vector<u32> code_vec;
+    std::unique_ptr<oaknut::CodeBlock> code_mem;
+
    void Compile_Block(u32 end);
    void Compile_NextInstr();