R/Ryujinx.Graphics.Gpu/Engine/InlineToMemory/InlineToMemoryClass.cs

using Ryujinx.Common;
using Ryujinx.Graphics.Device;
using Ryujinx.Graphics.Texture;
using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;

namespace Ryujinx.Graphics.Gpu.Engine.InlineToMemory
{
    /// <summary>
    /// Represents an Inline-to-Memory engine class.
    /// </summary>
    class InlineToMemoryClass : IDeviceState
    {
        private readonly GpuContext _context;
        private readonly GpuChannel _channel;
        private readonly DeviceState<InlineToMemoryClassState> _state;

        private bool _isLinear;

        private int _offset;
        private int _size;

        private ulong _dstGpuVa;
        private int _dstX;
        private int _dstY;
        private int _dstWidth;
        private int _dstHeight;
        private int _dstStride;
        private int _dstGobBlocksInY;
        private int _lineLengthIn;
        private int _lineCount;

        private bool _finished;

        private int[] _buffer;

        /// <summary>
        /// Initializes the Inline-to-Memory engine class, optionally creating its own register state.
        /// </summary>
        /// <param name="context">GPU context</param>
        /// <param name="channel">GPU channel</param>
        /// <param name="initializeState">
        /// True to create the register state with the LaunchDma and LoadInlineData callbacks registered;
        /// false to leave the state unset, for when this class is used as part of another engine
        /// </param>
        public InlineToMemoryClass(GpuContext context, GpuChannel channel, bool initializeState)
        {
            _context = context;
            _channel = channel;

            if (!initializeState)
            {
                // The register state stays null in this mode.
                return;
            }

            var callbacks = new Dictionary<string, RwCallback>
            {
                [nameof(InlineToMemoryClassState.LaunchDma)] = new RwCallback(LaunchDma, null),
                [nameof(InlineToMemoryClassState.LoadInlineData)] = new RwCallback(LoadInlineData, null)
            };

            _state = new DeviceState<InlineToMemoryClassState>(callbacks);
        }

        /// <summary>
        /// Initializes an Inline-to-Memory engine class that creates its own register state.
        /// </summary>
        /// <param name="context">GPU context</param>
        /// <param name="channel">GPU channel</param>
        public InlineToMemoryClass(GpuContext context, GpuChannel channel)
            : this(context, channel, initializeState: true)
        {
        }

        /// <summary>
        /// Reads a value from the class registers by forwarding to the register state.
        /// </summary>
        /// <param name="offset">Register byte offset</param>
        /// <returns>Data at the specified offset</returns>
        public int Read(int offset)
        {
            return _state.Read(offset);
        }

        /// <summary>
        /// Writes a value to the class registers by forwarding to the register state.
        /// </summary>
        /// <param name="offset">Register byte offset</param>
        /// <param name="data">Data to be written</param>
        public void Write(int offset, int data)
        {
            _state.Write(offset, data);
        }

        /// <summary>
        /// Launches an Inline-to-Memory DMA copy using this instance's own register state.
        /// </summary>
        /// <param name="argument">Method call argument</param>
        private void LaunchDma(int argument) => LaunchDma(ref _state.State, argument);

        /// <summary>
        /// Prepares a new Inline-to-Memory DMA copy: resets the write position, sizes the staging buffer
        /// and captures the destination parameters from the supplied register state.
        /// </summary>
        /// <param name="state">Current class state</param>
        /// <param name="argument">Method call argument; bit 0 selects a linear destination</param>
        public void LaunchDma(ref InlineToMemoryClassState state, int argument)
        {
            _isLinear = (argument & 1) == 1;

            // Start a fresh transfer of LineLengthIn * LineCount bytes.
            _offset = 0;
            _size = (int)(state.LineLengthIn * state.LineCount);

            // The staging buffer holds 32-bit words; it is only replaced when too small for this transfer.
            int wordsNeeded = BitUtils.DivRoundUp(_size, 4);
            bool bufferTooSmall = _buffer == null || _buffer.Length < wordsNeeded;

            if (bufferTooSmall)
            {
                _buffer = new int[wordsNeeded];
            }

            // Source line geometry.
            _lineLengthIn = (int)state.LineLengthIn;
            _lineCount = (int)state.LineCount;

            // Destination address is assembled from its upper and lower 32-bit halves.
            _dstGpuVa = ((ulong)state.OffsetOutUpperValue << 32) | state.OffsetOut;

            // Destination origin and surface description.
            _dstX = state.SetDstOriginBytesXV;
            _dstY = state.SetDstOriginSamplesYV;
            _dstWidth = (int)state.SetDstWidth;
            _dstHeight = (int)state.SetDstHeight;
            _dstStride = (int)state.PitchOut;
            _dstGobBlocksInY = 1 << (int)state.SetDstBlockSizeHeight;

            _finished = false;
        }

        /// <summary>
        /// Appends a block of words to the staging buffer and completes the transfer once
        /// enough bytes have been received. Does nothing if the transfer already finished.
        /// </summary>
        /// <param name="data">Data to push</param>
        public void LoadInlineData(ReadOnlySpan<int> data)
        {
            if (_finished)
            {
                return;
            }

            // Never copy past the end of the staging buffer; surplus input words are dropped.
            int wordsToCopy = Math.Min(data.Length, _buffer.Length - _offset);

            data.Slice(0, wordsToCopy).CopyTo(_buffer.AsSpan(_offset, wordsToCopy));
            _offset += wordsToCopy;

            // _offset counts 32-bit words while _size is in bytes.
            bool allBytesReceived = _offset * 4 >= _size;

            if (allBytesReceived)
            {
                FinishTransfer();
            }
        }

        /// <summary>
        /// Pushes a word of data to the Inline-to-Memory engine and completes the transfer once
        /// enough bytes have been received. Does nothing if the transfer already finished.
        /// </summary>
        /// <param name="argument">Method call argument</param>
        public void LoadInlineData(int argument)
        {
            if (_finished)
            {
                return;
            }

            // Clamp the store to the staging buffer, as the ReadOnlySpan<int> overload does.
            // A zero-byte transfer can leave a zero-length buffer; an unconditional store
            // would then throw IndexOutOfRangeException instead of completing the transfer.
            if (_offset < _buffer.Length)
            {
                _buffer[_offset++] = argument;
            }

            // _offset counts 32-bit words while _size is in bytes.
            if (_offset * 4 >= _size)
            {
                FinishTransfer();
            }
        }

        /// <summary>
        /// Writes the staged inline data to the destination and marks the transfer as finished.
        /// </summary>
        /// <remarks>
        /// A linear transfer of exactly one line is written with a single call. Every other transfer
        /// is written line by line through an offset calculator, after which the context sequence
        /// is advanced.
        /// </remarks>
        private void FinishTransfer()
        {
            var gpuMemory = _channel.MemoryManager;

            // Byte view over the word staging buffer, trimmed to the transfer size.
            var payload = MemoryMarshal.Cast<int, byte>(_buffer).Slice(0, _size);

            bool needsPerLineCopy = !_isLinear || _lineCount != 1;

            if (needsPerLineCopy)
            {
                var layout = new OffsetCalculator(
                    _dstWidth,
                    _dstHeight,
                    _dstStride,
                    _isLinear,
                    _dstGobBlocksInY,
                    1);

                // Column bounds are the same for every line, so compute them once.
                int rowStart = _dstX;
                int rowEnd = rowStart + _lineLengthIn;
                int alignedStart = BitUtils.AlignUp(rowStart, 16);
                int alignedEnd = BitUtils.AlignDown(rowEnd, 16);

                // If the line ends before the first 16-byte boundary there is no head pass;
                // the trailing byte pass then covers the whole line.
                int headEnd = alignedStart <= rowEnd ? alignedStart : rowStart;

                int lastRow = _dstY + _lineCount;
                int read = 0;

                for (int row = _dstY; row < lastRow; row++)
                {
                    int col = rowStart;

                    // Head: single bytes up to the first 16-byte aligned column.
                    while (col < headEnd)
                    {
                        ulong target = _dstGpuVa + (uint)layout.GetOffset(col, row);

                        gpuMemory.Write(target, payload[read]);

                        col++;
                        read++;
                    }

                    // Body: 16 bytes at a time, starting from an aligned column.
                    while (col < alignedEnd)
                    {
                        ulong target = _dstGpuVa + (uint)layout.GetOffset(col, row);

                        gpuMemory.Write(target, MemoryMarshal.Cast<byte, Vector128<byte>>(payload.Slice(read, 16))[0]);

                        col += 16;
                        read += 16;
                    }

                    // Tail: remaining single bytes up to the end of the line.
                    while (col < rowEnd)
                    {
                        ulong target = _dstGpuVa + (uint)layout.GetOffset(col, row);

                        gpuMemory.Write(target, payload[read]);

                        col++;
                        read++;
                    }
                }

                _context.AdvanceSequence();
            }
            else
            {
                // Single linear line: hand the whole payload over in one write at the base address.
                gpuMemory.Physical.CacheResourceWrite(gpuMemory, _dstGpuVa, payload);
            }

            _finished = true;
        }
    }
}
-												Separate GPU engines and make state follow official docs (part 1/2) (#2422)

* Use DeviceState for compute and i2m

* Migrate 2D class, more comments

* Migrate DMA copy engine

* Remove now unused code

* Replace GpuState by GpuAccessorState on GpuAcessor, since compute no longer has a GpuState

* More comments

* Add logging (disabled)

* Add back i2m on 3D engine
											
										
										
											2021-07-07 23:56:06 +00:00
+								using Ryujinx.Common;
 								using Ryujinx.Graphics.Device;
 								using Ryujinx.Graphics.Texture;
 								using System;
 								using System.Collections.Generic;
 								using System.Runtime.InteropServices;
-												Support non-contiguous copies on I2M and DMA engines (#2473)

* Support non-contiguous copies on I2M and DMA engines

* Vector copy should start aligned on I2M

* Nits

* Zero extend the offset
											
										
										
											2021-08-04 20:20:58 +00:00
+								using System.Runtime.Intrinsics;
-												Separate GPU engines and make state follow official docs (part 1/2) (#2422)

* Use DeviceState for compute and i2m

* Migrate 2D class, more comments

* Migrate DMA copy engine

* Remove now unused code

* Replace GpuState by GpuAccessorState on GpuAcessor, since compute no longer has a GpuState

* More comments

* Add logging (disabled)

* Add back i2m on 3D engine
											
										
										
											2021-07-07 23:56:06 +00:00
 								namespace Ryujinx.Graphics.Gpu.Engine.InlineToMemory
 								{
 								    /// <summary>
 								    /// Represents a Inline-to-Memory engine class.
 								    /// </summary>
 								    class InlineToMemoryClass : IDeviceState
 								    {
 								        private readonly GpuContext _context;
 								        private readonly GpuChannel _channel;
 								        private readonly DeviceState<InlineToMemoryClassState> _state;
 								        private bool _isLinear;
 								        private int _offset;
 								        private int _size;
 								        private ulong _dstGpuVa;
 								        private int _dstX;
 								        private int _dstY;
 								        private int _dstWidth;
 								        private int _dstHeight;
 								        private int _dstStride;
 								        private int _dstGobBlocksInY;
 								        private int _lineLengthIn;
 								        private int _lineCount;
 								        private bool _finished;
 								        private int[] _buffer;
 								        /// <summary>
 								        /// Creates a new instance of the Inline-to-Memory engine class.
 								        /// </summary>
 								        /// <param name="context">GPU context</param>
 								        /// <param name="channel">GPU channel</param>
 								        /// <param name="initializeState">Indicates if the internal state should be initialized. Set to false if part of another engine</param>
-												Separate GPU engines (part 2/2) (#2440)

* 3D engine now uses DeviceState too, plus new state modification tracking

* Remove old methods code

* Remove GpuState and friends

* Optimize DeviceState, force inline some functions

* This change was not supposed to go in

* Proper channel initialization

* Optimize state read/write methods even more

* Fix debug build

* Do not dirty state if the write is redundant

* The YControl register should dirty either the viewport or front face state too, to update the host origin

* Avoid redundant vertex buffer updates

* Move state and get rid of the Ryujinx.Graphics.Gpu.State namespace

* Comments and nits

* Fix rebase

* PR feedback

* Move changed = false to improve codegen

* PR feedback

* Carry RyuJIT a bit more
											
										
										
											2021-07-11 20:20:40 +00:00
+								        public InlineToMemoryClass(GpuContext context, GpuChannel channel, bool initializeState)
-												Separate GPU engines and make state follow official docs (part 1/2) (#2422)

* Use DeviceState for compute and i2m

* Migrate 2D class, more comments

* Migrate DMA copy engine

* Remove now unused code

* Replace GpuState by GpuAccessorState on GpuAcessor, since compute no longer has a GpuState

* More comments

* Add logging (disabled)

* Add back i2m on 3D engine
											
										
										
											2021-07-07 23:56:06 +00:00
+								        {
 								            _context = context;
 								            _channel = channel;
 								            if (initializeState)
 								            {
 								                _state = new DeviceState<InlineToMemoryClassState>(new Dictionary<string, RwCallback>
 								                {
 								                    { nameof(InlineToMemoryClassState.LaunchDma), new RwCallback(LaunchDma, null) },
 								                    { nameof(InlineToMemoryClassState.LoadInlineData), new RwCallback(LoadInlineData, null) }
 								                });
 								            }
 								        }
 								        /// <summary>
 								        /// Creates a new instance of the inline-to-memory engine class.
 								        /// </summary>
 								        /// <param name="context">GPU context</param>
 								        /// <param name="channel">GPU channel</param>
 								        public InlineToMemoryClass(GpuContext context, GpuChannel channel) : this(context, channel, true)
 								        {
 								        }
 								        /// <summary>
 								        /// Reads data from the class registers.
 								        /// </summary>
 								        /// <param name="offset">Register byte offset</param>
 								        /// <returns>Data at the specified offset</returns>
-												Separate GPU engines (part 2/2) (#2440)

* 3D engine now uses DeviceState too, plus new state modification tracking

* Remove old methods code

* Remove GpuState and friends

* Optimize DeviceState, force inline some functions

* This change was not supposed to go in

* Proper channel initialization

* Optimize state read/write methods even more

* Fix debug build

* Do not dirty state if the write is redundant

* The YControl register should dirty either the viewport or front face state too, to update the host origin

* Avoid redundant vertex buffer updates

* Move state and get rid of the Ryujinx.Graphics.Gpu.State namespace

* Comments and nits

* Fix rebase

* PR feedback

* Move changed = false to improve codegen

* PR feedback

* Carry RyuJIT a bit more
											
										
										
											2021-07-11 20:20:40 +00:00
+								        public int Read(int offset) => _state.Read(offset);
-												Separate GPU engines and make state follow official docs (part 1/2) (#2422)

* Use DeviceState for compute and i2m

* Migrate 2D class, more comments

* Migrate DMA copy engine

* Remove now unused code

* Replace GpuState by GpuAccessorState on GpuAcessor, since compute no longer has a GpuState

* More comments

* Add logging (disabled)

* Add back i2m on 3D engine
											
										
										
											2021-07-07 23:56:06 +00:00
 								        /// <summary>
 								        /// Writes data to the class registers.
 								        /// </summary>
 								        /// <param name="offset">Register byte offset</param>
 								        /// <param name="data">Data to be written</param>
-												Separate GPU engines (part 2/2) (#2440)

* 3D engine now uses DeviceState too, plus new state modification tracking

* Remove old methods code

* Remove GpuState and friends

* Optimize DeviceState, force inline some functions

* This change was not supposed to go in

* Proper channel initialization

* Optimize state read/write methods even more

* Fix debug build

* Do not dirty state if the write is redundant

* The YControl register should dirty either the viewport or front face state too, to update the host origin

* Avoid redundant vertex buffer updates

* Move state and get rid of the Ryujinx.Graphics.Gpu.State namespace

* Comments and nits

* Fix rebase

* PR feedback

* Move changed = false to improve codegen

* PR feedback

* Carry RyuJIT a bit more
											
										
										
											2021-07-11 20:20:40 +00:00
+								        public void Write(int offset, int data) => _state.Write(offset, data);
-												Separate GPU engines and make state follow official docs (part 1/2) (#2422)

* Use DeviceState for compute and i2m

* Migrate 2D class, more comments

* Migrate DMA copy engine

* Remove now unused code

* Replace GpuState by GpuAccessorState on GpuAcessor, since compute no longer has a GpuState

* More comments

* Add logging (disabled)

* Add back i2m on 3D engine
											
										
										
											2021-07-07 23:56:06 +00:00
 								        /// <summary>
 								        /// Launches Inline-to-Memory engine DMA copy.
 								        /// </summary>
 								        /// <param name="argument">Method call argument</param>
-												Separate GPU engines (part 2/2) (#2440)

* 3D engine now uses DeviceState too, plus new state modification tracking

* Remove old methods code

* Remove GpuState and friends

* Optimize DeviceState, force inline some functions

* This change was not supposed to go in

* Proper channel initialization

* Optimize state read/write methods even more

* Fix debug build

* Do not dirty state if the write is redundant

* The YControl register should dirty either the viewport or front face state too, to update the host origin

* Avoid redundant vertex buffer updates

* Move state and get rid of the Ryujinx.Graphics.Gpu.State namespace

* Comments and nits

* Fix rebase

* PR feedback

* Move changed = false to improve codegen

* PR feedback

* Carry RyuJIT a bit more
											
										
										
											2021-07-11 20:20:40 +00:00
+								        private void LaunchDma(int argument)
-												Separate GPU engines and make state follow official docs (part 1/2) (#2422)

* Use DeviceState for compute and i2m

* Migrate 2D class, more comments

* Migrate DMA copy engine

* Remove now unused code

* Replace GpuState by GpuAccessorState on GpuAcessor, since compute no longer has a GpuState

* More comments

* Add logging (disabled)

* Add back i2m on 3D engine
											
										
										
											2021-07-07 23:56:06 +00:00
+								        {
 								            LaunchDma(ref _state.State, argument);
 								        }
 								        /// <summary>
 								        /// Launches Inline-to-Memory engine DMA copy.
 								        /// </summary>
 								        /// <param name="state">Current class state</param>
 								        /// <param name="argument">Method call argument</param>
-												Separate GPU engines (part 2/2) (#2440)

* 3D engine now uses DeviceState too, plus new state modification tracking

* Remove old methods code

* Remove GpuState and friends

* Optimize DeviceState, force inline some functions

* This change was not supposed to go in

* Proper channel initialization

* Optimize state read/write methods even more

* Fix debug build

* Do not dirty state if the write is redundant

* The YControl register should dirty either the viewport or front face state too, to update the host origin

* Avoid redundant vertex buffer updates

* Move state and get rid of the Ryujinx.Graphics.Gpu.State namespace

* Comments and nits

* Fix rebase

* PR feedback

* Move changed = false to improve codegen

* PR feedback

* Carry RyuJIT a bit more
											
										
										
											2021-07-11 20:20:40 +00:00
+								        public void LaunchDma(ref InlineToMemoryClassState state, int argument)
-												Separate GPU engines and make state follow official docs (part 1/2) (#2422)

* Use DeviceState for compute and i2m

* Migrate 2D class, more comments

* Migrate DMA copy engine

* Remove now unused code

* Replace GpuState by GpuAccessorState on GpuAcessor, since compute no longer has a GpuState

* More comments

* Add logging (disabled)

* Add back i2m on 3D engine
											
										
										
											2021-07-07 23:56:06 +00:00
+								        {
 								            _isLinear = (argument & 1) != 0;
 								            _offset = 0;
 								            _size = (int)(state.LineLengthIn * state.LineCount);
 								            int count = BitUtils.DivRoundUp(_size, 4);
 								            if (_buffer == null || _buffer.Length < count)
 								            {
 								                _buffer = new int[count];
 								            }
 								            ulong dstGpuVa = ((ulong)state.OffsetOutUpperValue << 32) | state.OffsetOut;
 								            _dstGpuVa = dstGpuVa;
 								            _dstX = state.SetDstOriginBytesXV;
 								            _dstY = state.SetDstOriginSamplesYV;
 								            _dstWidth = (int)state.SetDstWidth;
 								            _dstHeight = (int)state.SetDstHeight;
 								            _dstStride = (int)state.PitchOut;
 								            _dstGobBlocksInY = 1 << (int)state.SetDstBlockSizeHeight;
 								            _lineLengthIn = (int)state.LineLengthIn;
 								            _lineCount = (int)state.LineCount;
 								            _finished = false;
 								        }
-												Implement a fast path for I2M transfers (#2467)


											
										
										
											2021-07-12 19:48:57 +00:00
+								        /// <summary>
 								        /// Pushes a block of data to the Inline-to-Memory engine.
 								        /// </summary>
 								        /// <param name="data">Data to push</param>
 								        public void LoadInlineData(ReadOnlySpan<int> data)
 								        {
 								            if (!_finished)
 								            {
 								                int copySize = Math.Min(data.Length, _buffer.Length - _offset);
 								                data.Slice(0, copySize).CopyTo(new Span<int>(_buffer).Slice(_offset, copySize));
 								                _offset += copySize;
 								                if (_offset * 4 >= _size)
 								                {
 								                    FinishTransfer();
 								                }
 								            }
 								        }
-												Separate GPU engines and make state follow official docs (part 1/2) (#2422)

* Use DeviceState for compute and i2m

* Migrate 2D class, more comments

* Migrate DMA copy engine

* Remove now unused code

* Replace GpuState by GpuAccessorState on GpuAcessor, since compute no longer has a GpuState

* More comments

* Add logging (disabled)

* Add back i2m on 3D engine
											
										
										
											2021-07-07 23:56:06 +00:00
+								        /// <summary>
 								        /// Pushes a word of data to the Inline-to-Memory engine.
 								        /// </summary>
 								        /// <param name="argument">Method call argument</param>
-												Separate GPU engines (part 2/2) (#2440)

* 3D engine now uses DeviceState too, plus new state modification tracking

* Remove old methods code

* Remove GpuState and friends

* Optimize DeviceState, force inline some functions

* This change was not supposed to go in

* Proper channel initialization

* Optimize state read/write methods even more

* Fix debug build

* Do not dirty state if the write is redundant

* The YControl register should dirty either the viewport or front face state too, to update the host origin

* Avoid redundant vertex buffer updates

* Move state and get rid of the Ryujinx.Graphics.Gpu.State namespace

* Comments and nits

* Fix rebase

* PR feedback

* Move changed = false to improve codegen

* PR feedback

* Carry RyuJIT a bit more
											
										
										
											2021-07-11 20:20:40 +00:00
+								        public void LoadInlineData(int argument)
-												Separate GPU engines and make state follow official docs (part 1/2) (#2422)

* Use DeviceState for compute and i2m

* Migrate 2D class, more comments

* Migrate DMA copy engine

* Remove now unused code

* Replace GpuState by GpuAccessorState on GpuAcessor, since compute no longer has a GpuState

* More comments

* Add logging (disabled)

* Add back i2m on 3D engine
											
										
										
											2021-07-07 23:56:06 +00:00
+								        {
 								            if (!_finished)
 								            {
 								                _buffer[_offset++] = argument;
 								                if (_offset * 4 >= _size)
 								                {
 								                    FinishTransfer();
 								                }
 								            }
 								        }
 								        /// <summary>
 								        /// Performs actual copy of the inline data after the transfer is finished.
 								        /// </summary>
 								        private void FinishTransfer()
 								        {
-												Support non-contiguous copies on I2M and DMA engines (#2473)

* Support non-contiguous copies on I2M and DMA engines

* Vector copy should start aligned on I2M

* Nits

* Zero extend the offset
											
										
										
											2021-08-04 20:20:58 +00:00
+								            var memoryManager = _channel.MemoryManager;
 								            var data = MemoryMarshal.Cast<int, byte>(_buffer).Slice(0, _size);
-												Separate GPU engines and make state follow official docs (part 1/2) (#2422)

* Use DeviceState for compute and i2m

* Migrate 2D class, more comments

* Migrate DMA copy engine

* Remove now unused code

* Replace GpuState by GpuAccessorState on GpuAcessor, since compute no longer has a GpuState

* More comments

* Add logging (disabled)

* Add back i2m on 3D engine
											
										
										
											2021-07-07 23:56:06 +00:00
 								            if (_isLinear && _lineCount == 1)
 								            {
-												Fast path for Inline2Memory buffer write that skips write tracking (#2624)

* Fast path for Inline2Memory buffer write

This PR adds a method to PhysicalMemory that attempts to write all cached resources directly, so that memory tracking can be avoided. The goal of this is both to avoid flushing buffer data, and to avoid raising the sequence number when data is written, which causes buffer and texture handles to be re-checked.

This currently only targets buffers, with a side check on textures that falls back to a tracked write if any exist within the target range. It's not expected to write textures from here - this is just a mechanism to protect us if someone does decide to do that. It's possible to add a fast path for this in future (and for ShaderCache, once that starts using tracking)

The forced read before inline2memory begins has been skipped, as the data is fully written when the transfer is completed anyways. This allows us to flush on read in emergency situations, but still write the new data over the flushed data.

Improves performance on Xenoblade 2 and DE, which was flushing buffer data on the GPU thread when trying to write compute data. May improve performance in other games that write SSBOs from compute, and update data in the same/nearby pages often.

Super Smash Bros Ultimate should probably be tested to make sure the vertex explosions haven't returned, as I think that's what this AdvanceSequence was for.

* ForceDirty before write, to make sure data does not flush over the new write
											
										
										
											2021-09-19 13:09:53 +00:00
+								                memoryManager.Physical.CacheResourceWrite(memoryManager, _dstGpuVa, data);
-												Separate GPU engines and make state follow official docs (part 1/2) (#2422)

* Use DeviceState for compute and i2m

* Migrate 2D class, more comments

* Migrate DMA copy engine

* Remove now unused code

* Replace GpuState by GpuAccessorState on GpuAcessor, since compute no longer has a GpuState

* More comments

* Add logging (disabled)

* Add back i2m on 3D engine
											
										
										
											2021-07-07 23:56:06 +00:00
+								            }
 								            else
 								            {
 								                var dstCalculator = new OffsetCalculator(
 								                    _dstWidth,
 								                    _dstHeight,
 								                    _dstStride,
 								                    _isLinear,
 								                    _dstGobBlocksInY,
 );
 								                int srcOffset = 0;
 								                for (int y = _dstY; y < _dstY + _lineCount; y++)
 								                {
 								                    int x1 = _dstX;
 								                    int x2 = _dstX + _lineLengthIn;
-												Support non-contiguous copies on I2M and DMA engines (#2473)

* Support non-contiguous copies on I2M and DMA engines

* Vector copy should start aligned on I2M

* Nits

* Zero extend the offset
											
										
										
											2021-08-04 20:20:58 +00:00
+								                    int x1Round = BitUtils.AlignUp(_dstX, 16);
 								                    int x2Trunc = BitUtils.AlignDown(x2, 16);
-												Separate GPU engines and make state follow official docs (part 1/2) (#2422)

* Use DeviceState for compute and i2m

* Migrate 2D class, more comments

* Migrate DMA copy engine

* Remove now unused code

* Replace GpuState by GpuAccessorState on GpuAcessor, since compute no longer has a GpuState

* More comments

* Add logging (disabled)

* Add back i2m on 3D engine
											
										
										
											2021-07-07 23:56:06 +00:00
-												Support non-contiguous copies on I2M and DMA engines (#2473)

* Support non-contiguous copies on I2M and DMA engines

* Vector copy should start aligned on I2M

* Nits

* Zero extend the offset
											
										
										
											2021-08-04 20:20:58 +00:00
+								                    int x = x1;
-												Separate GPU engines and make state follow official docs (part 1/2) (#2422)

* Use DeviceState for compute and i2m

* Migrate 2D class, more comments

* Migrate DMA copy engine

* Remove now unused code

* Replace GpuState by GpuAccessorState on GpuAcessor, since compute no longer has a GpuState

* More comments

* Add logging (disabled)

* Add back i2m on 3D engine
											
										
										
											2021-07-07 23:56:06 +00:00
-												Support non-contiguous copies on I2M and DMA engines (#2473)

* Support non-contiguous copies on I2M and DMA engines

* Vector copy should start aligned on I2M

* Nits

* Zero extend the offset
											
										
										
											2021-08-04 20:20:58 +00:00
+								                    if (x1Round <= x2)
-												Separate GPU engines and make state follow official docs (part 1/2) (#2422)

* Use DeviceState for compute and i2m

* Migrate 2D class, more comments

* Migrate DMA copy engine

* Remove now unused code

* Replace GpuState by GpuAccessorState on GpuAcessor, since compute no longer has a GpuState

* More comments

* Add logging (disabled)

* Add back i2m on 3D engine
											
										
										
											2021-07-07 23:56:06 +00:00
+								                    {
-												Support non-contiguous copies on I2M and DMA engines (#2473)

* Support non-contiguous copies on I2M and DMA engines

* Vector copy should start aligned on I2M

* Nits

* Zero extend the offset
											
										
										
											2021-08-04 20:20:58 +00:00
+								                        for (; x < x1Round; x++, srcOffset++)
 								                        {
 								                            int dstOffset = dstCalculator.GetOffset(x, y);
 								                            ulong dstAddress = _dstGpuVa + (uint)dstOffset;
-												Separate GPU engines and make state follow official docs (part 1/2) (#2422)

* Use DeviceState for compute and i2m

* Migrate 2D class, more comments

* Migrate DMA copy engine

* Remove now unused code

* Replace GpuState by GpuAccessorState on GpuAcessor, since compute no longer has a GpuState

* More comments

* Add logging (disabled)

* Add back i2m on 3D engine
											
										
										
											2021-07-07 23:56:06 +00:00
-												Support non-contiguous copies on I2M and DMA engines (#2473)

* Support non-contiguous copies on I2M and DMA engines

* Vector copy should start aligned on I2M

* Nits

* Zero extend the offset
											
										
										
											2021-08-04 20:20:58 +00:00
+								                            memoryManager.Write(dstAddress, data[srcOffset]);
 								                        }
 								                    }
-												Separate GPU engines and make state follow official docs (part 1/2) (#2422)

* Use DeviceState for compute and i2m

* Migrate 2D class, more comments

* Migrate DMA copy engine

* Remove now unused code

* Replace GpuState by GpuAccessorState on GpuAccessor, since compute no longer has a GpuState

* More comments

* Add logging (disabled)

* Add back i2m on 3D engine
											
										
										
											2021-07-07 23:56:06 +00:00
-												Support non-contiguous copies on I2M and DMA engines (#2473)

* Support non-contiguous copies on I2M and DMA engines

* Vector copy should start aligned on I2M

* Nits

* Zero extend the offset
											
										
										
											2021-08-04 20:20:58 +00:00
+								                    for (; x < x2Trunc; x += 16, srcOffset += 16)
 								                    {
 								                        int dstOffset = dstCalculator.GetOffset(x, y);
-												Separate GPU engines and make state follow official docs (part 1/2) (#2422)

* Use DeviceState for compute and i2m

* Migrate 2D class, more comments

* Migrate DMA copy engine

* Remove now unused code

* Replace GpuState by GpuAccessorState on GpuAccessor, since compute no longer has a GpuState

* More comments

* Add logging (disabled)

* Add back i2m on 3D engine
											
										
										
											2021-07-07 23:56:06 +00:00
-												Support non-contiguous copies on I2M and DMA engines (#2473)

* Support non-contiguous copies on I2M and DMA engines

* Vector copy should start aligned on I2M

* Nits

* Zero extend the offset
											
										
										
											2021-08-04 20:20:58 +00:00
+								                        ulong dstAddress = _dstGpuVa + (uint)dstOffset;
 								                        memoryManager.Write(dstAddress, MemoryMarshal.Cast<byte, Vector128<byte>>(data.Slice(srcOffset, 16))[0]);
-												Separate GPU engines and make state follow official docs (part 1/2) (#2422)

* Use DeviceState for compute and i2m

* Migrate 2D class, more comments

* Migrate DMA copy engine

* Remove now unused code

* Replace GpuState by GpuAccessorState on GpuAccessor, since compute no longer has a GpuState

* More comments

* Add logging (disabled)

* Add back i2m on 3D engine
											
										
										
											2021-07-07 23:56:06 +00:00
+								                    }
 								                    for (; x < x2; x++, srcOffset++)
 								                    {
 								                        int dstOffset = dstCalculator.GetOffset(x, y);
-												Support non-contiguous copies on I2M and DMA engines (#2473)

* Support non-contiguous copies on I2M and DMA engines

* Vector copy should start aligned on I2M

* Nits

* Zero extend the offset
											
										
										
											2021-08-04 20:20:58 +00:00
+								                        ulong dstAddress = _dstGpuVa + (uint)dstOffset;
-												Separate GPU engines and make state follow official docs (part 1/2) (#2422)

* Use DeviceState for compute and i2m

* Migrate 2D class, more comments

* Migrate DMA copy engine

* Remove now unused code

* Replace GpuState by GpuAccessorState on GpuAccessor, since compute no longer has a GpuState

* More comments

* Add logging (disabled)

* Add back i2m on 3D engine
											
										
										
											2021-07-07 23:56:06 +00:00
-												Support non-contiguous copies on I2M and DMA engines (#2473)

* Support non-contiguous copies on I2M and DMA engines

* Vector copy should start aligned on I2M

* Nits

* Zero extend the offset
											
										
										
											2021-08-04 20:20:58 +00:00
+								                        memoryManager.Write(dstAddress, data[srcOffset]);
-												Separate GPU engines and make state follow official docs (part 1/2) (#2422)

* Use DeviceState for compute and i2m

* Migrate 2D class, more comments

* Migrate DMA copy engine

* Remove now unused code

* Replace GpuState by GpuAccessorState on GpuAccessor, since compute no longer has a GpuState

* More comments

* Add logging (disabled)

* Add back i2m on 3D engine
											
										
										
											2021-07-07 23:56:06 +00:00
+								                    }
 								                }
-												Fast path for Inline2Memory buffer write that skips write tracking (#2624)

* Fast path for Inline2Memory buffer write

This PR adds a method to PhysicalMemory that attempts to write all cached resources directly, so that memory tracking can be avoided. The goal of this is both to avoid flushing buffer data, and to avoid raising the sequence number when data is written, which causes buffer and texture handles to be re-checked.

This currently only targets buffers, with a side check on textures that falls back to a tracked write if any exist within the target range. It's not expected to write textures from here - this is just a mechanism to protect us if someone does decide to do that. It's possible to add a fast path for this in the future (and for ShaderCache, once that starts using tracking).

The forced read before inline2memory begins has been skipped, as the data is fully written when the transfer is completed anyway. This allows us to flush on read in emergency situations, but still write the new data over the flushed data.

Improves performance on Xenoblade 2 and DE, which were flushing buffer data on the GPU thread when trying to write compute data. May improve performance in other games that write SSBOs from compute, and update data in the same/nearby pages often.

Super Smash Bros Ultimate should probably be tested to make sure the vertex explosions haven't returned, as I think that's what this AdvanceSequence was for.

* ForceDirty before write, to make sure data does not flush over the new write
											
										
										
											2021-09-19 13:09:53 +00:00
 								                _context.AdvanceSequence();
-												Separate GPU engines and make state follow official docs (part 1/2) (#2422)

* Use DeviceState for compute and i2m

* Migrate 2D class, more comments

* Migrate DMA copy engine

* Remove now unused code

* Replace GpuState by GpuAccessorState on GpuAccessor, since compute no longer has a GpuState

* More comments

* Add logging (disabled)

* Add back i2m on 3D engine
											
										
										
											2021-07-07 23:56:06 +00:00
+								            }
 								            _finished = true;
 								        }
 								    }
 								}