rjx-mirror/Ryujinx.Graphics.Gpu/Engine/InlineToMemory/InlineToMemoryClass.cs
riperiperi d92fff541b
Replace CacheResourceWrite with more general "precise" write (#2684)
* Replace CacheResourceWrite with more general "precise" write

The goal of CacheResourceWrite was to notify GPU resources when they were modified directly, by looking up the modified address/size in a structure and calling a method on each resource. The downside of this is that each resource cache has to be queried individually, they all have to implement their own way to do this, and it can only signal to resources using the same PhysicalMemory instance.

This PR adds the ability to signal a write as "precise" on the tracking, which signals a special handler (if present) which can be used to avoid unnecessary flush actions, or maybe even more. For buffers, precise writes specifically do not flush, and instead punch a hole in the modified range list to indicate that the data on GPU has been replaced.

The downside is that precise actions must ignore the page protection bits and always signal - as they need to notify the target resource to ignore the sequence number optimization.

I had to reintroduce the sequence number increment after I2M, as removing it was causing issues in rabbids kingdom battle. However - all resources modified by I2M are notified directly to lower their sequence number, so the problem is likely that another unrelated resource is not being properly updated. Thankfully, doing this does not affect performance in the games I tested.

This should fix regressions from #2624. Test any games that were broken by that. (RF4, rabbids kingdom battle)

I've also added a sequence number increment to ThreedClass.IncrementSyncpoint, as it seems to fix buffer corruption in OpenGL homebrew. (this was a regression from removing sequence number increment from constant buffer update - another unrelated resource thing)

* Add tests.

* Add XML docs for GpuRegionHandle

* Skip UpdateProtection if only precise actions were called

This allows precise actions to skip reprotection costs.
2021-09-29 02:27:03 +02:00

235 lines
7.7 KiB
C#

using Ryujinx.Common;
using Ryujinx.Graphics.Device;
using Ryujinx.Graphics.Texture;
using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
namespace Ryujinx.Graphics.Gpu.Engine.InlineToMemory
{
/// <summary>
/// Represents a Inline-to-Memory engine class.
/// </summary>
class InlineToMemoryClass : IDeviceState
{
private readonly GpuContext _context;
private readonly GpuChannel _channel;
private readonly DeviceState<InlineToMemoryClassState> _state;
private bool _isLinear;
private int _offset;
private int _size;
private ulong _dstGpuVa;
private int _dstX;
private int _dstY;
private int _dstWidth;
private int _dstHeight;
private int _dstStride;
private int _dstGobBlocksInY;
private int _lineLengthIn;
private int _lineCount;
private bool _finished;
private int[] _buffer;
/// <summary>
/// Creates a new instance of the Inline-to-Memory engine class.
/// </summary>
/// <param name="context">GPU context</param>
/// <param name="channel">GPU channel</param>
/// <param name="initializeState">Indicates if the internal state should be initialized. Set to false if part of another engine</param>
public InlineToMemoryClass(GpuContext context, GpuChannel channel, bool initializeState)
{
_context = context;
_channel = channel;
if (initializeState)
{
_state = new DeviceState<InlineToMemoryClassState>(new Dictionary<string, RwCallback>
{
{ nameof(InlineToMemoryClassState.LaunchDma), new RwCallback(LaunchDma, null) },
{ nameof(InlineToMemoryClassState.LoadInlineData), new RwCallback(LoadInlineData, null) }
});
}
}
/// <summary>
/// Creates a new instance of the inline-to-memory engine class.
/// </summary>
/// <param name="context">GPU context</param>
/// <param name="channel">GPU channel</param>
public InlineToMemoryClass(GpuContext context, GpuChannel channel) : this(context, channel, true)
{
}
/// <summary>
/// Reads data from the class registers.
/// </summary>
/// <param name="offset">Register byte offset</param>
/// <returns>Data at the specified offset</returns>
public int Read(int offset) => _state.Read(offset);
/// <summary>
/// Writes data to the class registers.
/// </summary>
/// <param name="offset">Register byte offset</param>
/// <param name="data">Data to be written</param>
public void Write(int offset, int data) => _state.Write(offset, data);
/// <summary>
/// Launches Inline-to-Memory engine DMA copy.
/// </summary>
/// <param name="argument">Method call argument</param>
private void LaunchDma(int argument)
{
LaunchDma(ref _state.State, argument);
}
/// <summary>
/// Launches Inline-to-Memory engine DMA copy.
/// </summary>
/// <param name="state">Current class state</param>
/// <param name="argument">Method call argument</param>
public void LaunchDma(ref InlineToMemoryClassState state, int argument)
{
_isLinear = (argument & 1) != 0;
_offset = 0;
_size = (int)(state.LineLengthIn * state.LineCount);
int count = BitUtils.DivRoundUp(_size, 4);
if (_buffer == null || _buffer.Length < count)
{
_buffer = new int[count];
}
ulong dstGpuVa = ((ulong)state.OffsetOutUpperValue << 32) | state.OffsetOut;
_dstGpuVa = dstGpuVa;
_dstX = state.SetDstOriginBytesXV;
_dstY = state.SetDstOriginSamplesYV;
_dstWidth = (int)state.SetDstWidth;
_dstHeight = (int)state.SetDstHeight;
_dstStride = (int)state.PitchOut;
_dstGobBlocksInY = 1 << (int)state.SetDstBlockSizeHeight;
_lineLengthIn = (int)state.LineLengthIn;
_lineCount = (int)state.LineCount;
_finished = false;
}
/// <summary>
/// Pushes a block of data to the Inline-to-Memory engine.
/// </summary>
/// <param name="data">Data to push</param>
public void LoadInlineData(ReadOnlySpan<int> data)
{
if (!_finished)
{
int copySize = Math.Min(data.Length, _buffer.Length - _offset);
data.Slice(0, copySize).CopyTo(new Span<int>(_buffer).Slice(_offset, copySize));
_offset += copySize;
if (_offset * 4 >= _size)
{
FinishTransfer();
}
}
}
/// <summary>
/// Pushes a word of data to the Inline-to-Memory engine.
/// </summary>
/// <param name="argument">Method call argument</param>
public void LoadInlineData(int argument)
{
if (!_finished)
{
_buffer[_offset++] = argument;
if (_offset * 4 >= _size)
{
FinishTransfer();
}
}
}
/// <summary>
/// Performs actual copy of the inline data after the transfer is finished.
/// </summary>
private void FinishTransfer()
{
var memoryManager = _channel.MemoryManager;
var data = MemoryMarshal.Cast<int, byte>(_buffer).Slice(0, _size);
if (_isLinear && _lineCount == 1)
{
memoryManager.WriteTrackedResource(_dstGpuVa, data);
_context.AdvanceSequence();
}
else
{
var dstCalculator = new OffsetCalculator(
_dstWidth,
_dstHeight,
_dstStride,
_isLinear,
_dstGobBlocksInY,
1);
int srcOffset = 0;
for (int y = _dstY; y < _dstY + _lineCount; y++)
{
int x1 = _dstX;
int x2 = _dstX + _lineLengthIn;
int x1Round = BitUtils.AlignUp(_dstX, 16);
int x2Trunc = BitUtils.AlignDown(x2, 16);
int x = x1;
if (x1Round <= x2)
{
for (; x < x1Round; x++, srcOffset++)
{
int dstOffset = dstCalculator.GetOffset(x, y);
ulong dstAddress = _dstGpuVa + (uint)dstOffset;
memoryManager.Write(dstAddress, data[srcOffset]);
}
}
for (; x < x2Trunc; x += 16, srcOffset += 16)
{
int dstOffset = dstCalculator.GetOffset(x, y);
ulong dstAddress = _dstGpuVa + (uint)dstOffset;
memoryManager.Write(dstAddress, MemoryMarshal.Cast<byte, Vector128<byte>>(data.Slice(srcOffset, 16))[0]);
}
for (; x < x2; x++, srcOffset++)
{
int dstOffset = dstCalculator.GetOffset(x, y);
ulong dstAddress = _dstGpuVa + (uint)dstOffset;
memoryManager.Write(dstAddress, data[srcOffset]);
}
}
_context.AdvanceSequence();
}
_finished = true;
}
}
}