From e6b59cb5d8de2b622a7011beac2c23102484b557 Mon Sep 17 00:00:00 2001
From: BreadFish64 <mohror64@gmail.com>
Date: Tue, 7 Apr 2020 17:09:05 -0500
Subject: [PATCH] video_core: implement optimized D24S8->RGBA8 reinterpreters

---
 .../gl_format_reinterpreter.cpp               | 189 +++++++++++++++++-
 1 file changed, 187 insertions(+), 2 deletions(-)

diff --git a/src/video_core/renderer_opengl/gl_format_reinterpreter.cpp b/src/video_core/renderer_opengl/gl_format_reinterpreter.cpp
index 2175c62bd..ee842a859 100644
--- a/src/video_core/renderer_opengl/gl_format_reinterpreter.cpp
+++ b/src/video_core/renderer_opengl/gl_format_reinterpreter.cpp
@@ -220,9 +220,194 @@ private:
     GLint d24s8_abgr_viewport_u_id;
 };
 
+// Reinterprets D24S8 depth-stencil texels as RGBA8 color by drawing with a fragment shader.
+// The shader samples depth from texture unit 0 and stencil from texture unit 1; unit 1 is given
+// a second texture object: a view of the source, or a copy of it when views are unavailable.
+class ShaderD24S8toRGBA8 final : public FormatReinterpreterBase {
+public:
+    ShaderD24S8toRGBA8() {
+        constexpr std::string_view vs_source = R"(
+out vec2 dst_coord;
+
+uniform mediump ivec2 dst_size;
+
+const vec2 vertices[4] =
+    vec2[4](vec2(-1.0, -1.0), vec2(1.0, -1.0), vec2(-1.0, 1.0), vec2(1.0, 1.0));
+
+void main() {
+    gl_Position = vec4(vertices[gl_VertexID], 0.0, 1.0);
+    dst_coord = (vertices[gl_VertexID] / 2.0 + 0.5) * vec2(dst_size);
+}
+)";
+
+        // NOTE(review): the shader subtracts src_offset from the computed texel coordinate;
+        // confirm this is intended for sources with a non-zero left/bottom offset
+        constexpr std::string_view fs_source = R"(
+in mediump vec2 dst_coord;
+
+out lowp vec4 frag_color;
+
+uniform highp sampler2D depth;
+uniform lowp usampler2D stencil;
+uniform mediump ivec2 dst_size;
+uniform mediump ivec2 src_size;
+uniform mediump ivec2 src_offset;
+
+void main() {
+    mediump ivec2 tex_coord;
+    if (src_size == dst_size) {
+        tex_coord = ivec2(dst_coord);
+    } else {
+        highp int tex_index = int(dst_coord.y) * dst_size.x + int(dst_coord.x);
+        mediump int y = tex_index / src_size.x;
+        tex_coord = ivec2(tex_index - y * src_size.x, y);
+    }
+    tex_coord -= src_offset;
+
+    highp uint depth_val =
+        uint(texelFetch(depth, tex_coord, 0).x * (exp2(32.0) - 1.0));
+    lowp uint stencil_val = texelFetch(stencil, tex_coord, 0).x;
+    highp uvec4 components =
+        uvec4(stencil_val, (uvec3(depth_val) >> uvec3(24u, 16u, 8u)) & 0x000000FFu);
+    frag_color = vec4(components) / (exp2(8.0) - 1.0);
+}
+)";
+
+        program.Create(vs_source.data(), fs_source.data());
+        dst_size_loc = glGetUniformLocation(program.handle, "dst_size");
+        src_size_loc = glGetUniformLocation(program.handle, "src_size");
+        src_offset_loc = glGetUniformLocation(program.handle, "src_offset");
+        vao.Create();
+
+        // Set the "stencil" sampler to texture unit 1, then restore the previously bound program
+        auto state = OpenGLState::GetCurState();
+        auto cur_program = state.draw.shader_program;
+        state.draw.shader_program = program.handle;
+        state.Apply();
+        glUniform1i(glGetUniformLocation(program.handle, "stencil"), 1);
+        state.draw.shader_program = cur_program;
+        state.Apply();
+
+        // OES_texture_view doesn't seem to support D24S8 views, at least on adreno
+        // so instead it will do an intermediate copy before running through the shader
+        if (GLAD_GL_ARB_texture_view) {
+            texture_view_func = glTextureView;
+        } else {
+            LOG_INFO(Render_OpenGL,
+                     "Texture views are unsupported, reinterpretation will do intermediate copy");
+            temp_tex.Create();
+        }
+    }
+
+    // Attaches dst_tex as the color attachment of draw_fb_handle and draws over dst_rect,
+    // producing each output texel from the depth/stencil data of src_tex. The OpenGL state
+    // that was current on entry is re-applied on return. read_fb_handle is not used.
+    void Reinterpret(GLuint src_tex, const Common::Rectangle<u32>& src_rect, GLuint read_fb_handle,
+                     GLuint dst_tex, const Common::Rectangle<u32>& dst_rect,
+                     GLuint draw_fb_handle) override {
+        OpenGLState prev_state = OpenGLState::GetCurState();
+        SCOPE_EXIT({ prev_state.Apply(); });
+
+        OpenGLState state;
+        state.texture_units[0].texture_2d = src_tex;
+
+        if (texture_view_func) {
+            // Creating the view does not bind it, so its sampling parameters are set further
+            // down, once state.Apply() has bound it to texture unit 1
+            temp_tex.Create();
+            texture_view_func(temp_tex.handle, GL_TEXTURE_2D, src_tex, GL_DEPTH24_STENCIL8, 0, 1, 0,
+                              1);
+        } else if (src_rect.top > temp_rect.top || src_rect.right > temp_rect.right) {
+            // The cached copy target does not cover src_rect; reallocate it to fit
+            temp_tex.Release();
+            temp_tex.Create();
+            state.texture_units[1].texture_2d = temp_tex.handle;
+            state.Apply();
+            glActiveTexture(GL_TEXTURE1);
+            glTexStorage2D(GL_TEXTURE_2D, 1, GL_DEPTH24_STENCIL8, src_rect.right, src_rect.top);
+            glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+            glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+            temp_rect = src_rect;
+        }
+
+        state.texture_units[1].texture_2d = temp_tex.handle;
+        state.draw.draw_framebuffer = draw_fb_handle;
+        state.draw.shader_program = program.handle;
+        state.draw.vertex_array = vao.handle;
+        state.viewport = {static_cast<GLint>(dst_rect.left), static_cast<GLint>(dst_rect.bottom),
+                          static_cast<GLsizei>(dst_rect.GetWidth()),
+                          static_cast<GLsizei>(dst_rect.GetHeight())};
+        state.Apply();
+
+        glActiveTexture(GL_TEXTURE1);
+        if (texture_view_func) {
+            glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+            glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+        } else {
+            glCopyImageSubData(src_tex, GL_TEXTURE_2D, 0, src_rect.left, src_rect.bottom, 0,
+                               temp_tex.handle, GL_TEXTURE_2D, 0, src_rect.left, src_rect.bottom, 0,
+                               src_rect.GetWidth(), src_rect.GetHeight(), 1);
+        }
+        // Sample the stencil component (not depth) from the texture bound to unit 1
+        glTexParameteri(GL_TEXTURE_2D, GL_DEPTH_STENCIL_TEXTURE_MODE, GL_STENCIL_INDEX);
+
+        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, dst_tex,
+                               0);
+        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0,
+                               0);
+
+        glUniform2i(dst_size_loc, dst_rect.GetWidth(), dst_rect.GetHeight());
+        glUniform2i(src_size_loc, src_rect.GetWidth(), src_rect.GetHeight());
+        glUniform2i(src_offset_loc, src_rect.left, src_rect.bottom);
+        glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+
+        if (texture_view_func) {
+            temp_tex.Release();
+        }
+    }
+
+private:
+    decltype(glTextureView) texture_view_func = nullptr;
+    OGLProgram program{};
+    GLint dst_size_loc{-1}, src_size_loc{-1}, src_offset_loc{-1};
+    OGLVertexArray vao{};
+    OGLTexture temp_tex{};
+    Common::Rectangle<u32> temp_rect{0, 0, 0, 0};
+};
+
+// Reinterprets by copying the source region straight into the destination texture with
+// glCopyImageSubData; neither framebuffer handle is used.
+class CopyImageSubData final : public FormatReinterpreterBase {
+public:
+    void Reinterpret(GLuint src_tex, const Common::Rectangle<u32>& src_rect, GLuint read_fb_handle,
+                     GLuint dst_tex, const Common::Rectangle<u32>& dst_rect,
+                     GLuint draw_fb_handle) override {
+        glCopyImageSubData(src_tex, GL_TEXTURE_2D, 0, src_rect.left, src_rect.bottom, 0, dst_tex,
+                           GL_TEXTURE_2D, 0, dst_rect.left, dst_rect.bottom, 0, src_rect.GetWidth(),
+                           src_rect.GetHeight(), 1);
+    }
+};
+
 FormatReinterpreterOpenGL::FormatReinterpreterOpenGL() {
-    reinterpreters.emplace(PixelFormatPair{PixelFormat::RGBA8, PixelFormat::D24S8},
-                           std::make_unique<PixelBufferD24S8toABGR>());
+    // glGetString returns NULL on error, and constructing a string_view from a null pointer is
+    // undefined behavior, so fall back to an empty vendor string
+    const char* vendor_cstr = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
+    const std::string_view vendor{vendor_cstr != nullptr ? vendor_cstr : ""};
+    if (vendor.find("NVIDIA") != vendor.npos) {
+        reinterpreters.emplace(PixelFormatPair{PixelFormat::RGBA8, PixelFormat::D24S8},
+                               std::make_unique<CopyImageSubData>());
+        // Nvidia bends the spec and allows direct copies between color and depth formats
+        // might as well take advantage of it
+        LOG_INFO(Render_OpenGL, "Using glCopyImageSubData for D24S8 to RGBA8 reinterpretation");
+    } else if ((GLAD_GL_ARB_stencil_texturing && GLAD_GL_ARB_texture_storage) || GLES) {
+        reinterpreters.emplace(PixelFormatPair{PixelFormat::RGBA8, PixelFormat::D24S8},
+                               std::make_unique<ShaderD24S8toRGBA8>());
+        LOG_INFO(Render_OpenGL, "Using shader for D24S8 to RGBA8 reinterpretation");
+    } else {
+        reinterpreters.emplace(PixelFormatPair{PixelFormat::RGBA8, PixelFormat::D24S8},
+                               std::make_unique<PixelBufferD24S8toABGR>());
+        LOG_INFO(Render_OpenGL, "Using pbo for D24S8 to RGBA8 reinterpretation");
+    }
     reinterpreters.emplace(PixelFormatPair{PixelFormat::RGB5A1, PixelFormat::RGBA4},
                            std::make_unique<RGBA4toRGB5A1>());
 }