RyuKen/Ryujinx.Graphics.Vulkan/PipelineFull.cs
riperiperi c3c41fa4bb
Periodically Flush Commands for Vulkan (#3689)
* Periodically Flush Commands for Vulkan

NVIDIA's OpenGL driver has a built-in mechanism to automatically flush commands to GPU when a lot have been queued. It's also pretty inconsistent, but we'll ignore that for now.

Our Vulkan implementation only submits a command buffer (flush equivalent) when it needs to. This is typically when another command buffer needs to be sequenced after it, presenting a frame, or an edge case where we flush around GPU queries to get results sooner.

This difference in flush behaviour causes a notable difference between Vulkan and OpenGL when we have to wait for commands. In the worst case, we will wait for a sync point that has just been created. In Vulkan, this sync point is created by flushing the command buffer, and storing a waitable fence that signals its completion. Our command buffer contains _every command that we queued since the last submit_, which could be an entire frame's worth of draws.

This has a huge effect on CPU <-> GPU latency. The more commands in a command buffer, the longer we have to wait for it to complete, which results in wasted time. Because we don't know when the guest will force us to wait, we always want the smallest possible latency.

By periodically flushing, we ensure that each command buffer takes a more consistent, smaller amount of time to execute, and that the back of the GPU queue isn't as far away when we need to wait for something to happen. This also might reduce time that the GPU is left inactive while commands are being built.

The main affected game is Pokemon Sword, which got significantly faster in overworld areas due to reduced waiting time when it flushes a shadow map from the main GPU thread.

Another affected game is BOTW, which gets faster depending on the area. This game flushes textures/buffers from its game thread, which is the bottleneck.

Flush latency and throughput may be improved on other games that are inexplicably slower than OpenGL. It's possible that certain games could have their performance _decreased_ slightly due to flushes not being free, but it is unlikely.

Also, flushing to get query results sooner has been tweaked to improve the number of full draw skips that can be done. (tested in SMO)

* Remove unused variable

* Fix possible issue with early query flush
2022-09-14 13:48:31 -03:00

297 lines
9.5 KiB
C#

using Ryujinx.Graphics.GAL;
using Ryujinx.Graphics.Vulkan.Queries;
using Silk.NET.Vulkan;
using System;
using System.Collections.Generic;
namespace Ryujinx.Graphics.Vulkan
{
class PipelineFull : PipelineBase, IPipeline
{
private const ulong MinByteWeightForFlush = 256 * 1024 * 1024; // MB
private readonly List<QueryPool> _activeQueries;
private CounterQueueEvent _activeConditionalRender;
private readonly List<BufferedQuery> _pendingQueryCopies;
private readonly List<BufferedQuery> _pendingQueryResets;
private ulong _byteWeight;
public PipelineFull(VulkanRenderer gd, Device device) : base(gd, device)
{
_activeQueries = new List<QueryPool>();
_pendingQueryCopies = new();
_pendingQueryResets = new List<BufferedQuery>();
CommandBuffer = (Cbs = gd.CommandBufferPool.Rent()).CommandBuffer;
}
private void CopyPendingQuery()
{
foreach (var query in _pendingQueryCopies)
{
query.PoolCopy(Cbs);
}
lock (_pendingQueryResets)
{
foreach (var query in _pendingQueryResets)
{
query.PoolReset(CommandBuffer);
}
_pendingQueryResets.Clear();
}
_pendingQueryCopies.Clear();
}
public void ClearRenderTargetColor(int index, int layer, int layerCount, uint componentMask, ColorF color)
{
if (FramebufferParams == null)
{
return;
}
if (componentMask != 0xf)
{
// We can't use CmdClearAttachments if not writing all components,
// because on Vulkan, the pipeline state does not affect clears.
var dstTexture = FramebufferParams.GetAttachment(index);
if (dstTexture == null)
{
return;
}
Span<float> clearColor = stackalloc float[4];
clearColor[0] = color.Red;
clearColor[1] = color.Green;
clearColor[2] = color.Blue;
clearColor[3] = color.Alpha;
// TODO: Clear only the specified layer.
Gd.HelperShader.Clear(
Gd,
dstTexture,
clearColor,
componentMask,
(int)FramebufferParams.Width,
(int)FramebufferParams.Height,
FramebufferParams.AttachmentFormats[index],
ClearScissor);
}
else
{
ClearRenderTargetColor(index, layer, layerCount, color);
}
}
public void EndHostConditionalRendering()
{
if (Gd.Capabilities.SupportsConditionalRendering)
{
// Gd.ConditionalRenderingApi.CmdEndConditionalRendering(CommandBuffer);
}
else
{
// throw new NotSupportedException();
}
_activeConditionalRender?.ReleaseHostAccess();
_activeConditionalRender = null;
}
public bool TryHostConditionalRendering(ICounterEvent value, ulong compare, bool isEqual)
{
// Compare an event and a constant value.
if (value is CounterQueueEvent evt)
{
// Easy host conditional rendering when the check matches what GL can do:
// - Event is of type samples passed.
// - Result is not a combination of multiple queries.
// - Comparing against 0.
// - Event has not already been flushed.
if (compare == 0 && evt.Type == CounterType.SamplesPassed && evt.ClearCounter)
{
if (!value.ReserveForHostAccess())
{
// If the event has been flushed, then just use the values on the CPU.
// The query object may already be repurposed for another draw (eg. begin + end).
return false;
}
if (Gd.Capabilities.SupportsConditionalRendering)
{
var buffer = evt.GetBuffer().Get(Cbs, 0, sizeof(long)).Value;
var flags = isEqual ? ConditionalRenderingFlagsEXT.ConditionalRenderingInvertedBitExt : 0;
var conditionalRenderingBeginInfo = new ConditionalRenderingBeginInfoEXT()
{
SType = StructureType.ConditionalRenderingBeginInfoExt,
Buffer = buffer,
Flags = flags
};
// Gd.ConditionalRenderingApi.CmdBeginConditionalRendering(CommandBuffer, conditionalRenderingBeginInfo);
}
_activeConditionalRender = evt;
return true;
}
}
// The GPU will flush the queries to CPU and evaluate the condition there instead.
FlushPendingQuery(); // The thread will be stalled manually flushing the counter, so flush commands now.
return false;
}
public bool TryHostConditionalRendering(ICounterEvent value, ICounterEvent compare, bool isEqual)
{
FlushPendingQuery(); // The thread will be stalled manually flushing the counter, so flush commands now.
return false;
}
private void FlushPendingQuery()
{
if (AutoFlush.ShouldFlushQuery())
{
FlushCommandsImpl();
}
}
public CommandBufferScoped GetPreloadCommandBuffer()
{
if (PreloadCbs == null)
{
PreloadCbs = Gd.CommandBufferPool.Rent();
}
return PreloadCbs.Value;
}
public void FlushCommandsIfWeightExceeding(IAuto disposedResource, ulong byteWeight)
{
bool usedByCurrentCb = disposedResource.HasCommandBufferDependency(Cbs);
if (PreloadCbs != null && !usedByCurrentCb)
{
usedByCurrentCb = disposedResource.HasCommandBufferDependency(PreloadCbs.Value);
}
if (usedByCurrentCb)
{
// Since we can only free memory after the command buffer that uses a given resource was executed,
// keeping the command buffer might cause a high amount of memory to be in use.
// To prevent that, we force submit command buffers if the memory usage by resources
// in use by the current command buffer is above a given limit, and those resources were disposed.
_byteWeight += byteWeight;
if (_byteWeight >= MinByteWeightForFlush)
{
FlushCommandsImpl();
}
}
}
public void Restore()
{
if (Pipeline != null)
{
Gd.Api.CmdBindPipeline(CommandBuffer, Pbp, Pipeline.Get(Cbs).Value);
}
SignalCommandBufferChange();
}
public void FlushCommandsImpl()
{
AutoFlush.RegisterFlush(DrawCount);
EndRenderPass();
foreach (var queryPool in _activeQueries)
{
Gd.Api.CmdEndQuery(CommandBuffer, queryPool, 0);
}
_byteWeight = 0;
if (PreloadCbs != null)
{
PreloadCbs.Value.Dispose();
PreloadCbs = null;
}
CommandBuffer = (Cbs = Gd.CommandBufferPool.ReturnAndRent(Cbs)).CommandBuffer;
// Restore per-command buffer state.
foreach (var queryPool in _activeQueries)
{
Gd.Api.CmdResetQueryPool(CommandBuffer, queryPool, 0, 1);
Gd.Api.CmdBeginQuery(CommandBuffer, queryPool, 0, 0);
}
Restore();
}
public void BeginQuery(BufferedQuery query, QueryPool pool, bool needsReset)
{
if (needsReset)
{
EndRenderPass();
Gd.Api.CmdResetQueryPool(CommandBuffer, pool, 0, 1);
lock (_pendingQueryResets)
{
_pendingQueryResets.Remove(query); // Might be present on here.
}
}
Gd.Api.CmdBeginQuery(CommandBuffer, pool, 0, 0);
_activeQueries.Add(pool);
}
public void EndQuery(QueryPool pool)
{
Gd.Api.CmdEndQuery(CommandBuffer, pool, 0);
_activeQueries.Remove(pool);
}
public void ResetQuery(BufferedQuery query)
{
lock (_pendingQueryResets)
{
_pendingQueryResets.Add(query);
}
}
public void CopyQueryResults(BufferedQuery query)
{
_pendingQueryCopies.Add(query);
if (AutoFlush.RegisterPendingQuery())
{
FlushCommandsImpl();
}
}
protected override void SignalAttachmentChange()
{
if (AutoFlush.ShouldFlush(DrawCount))
{
FlushCommandsImpl();
}
}
protected override void SignalRenderPassEnd()
{
CopyPendingQuery();
}
}
}