2019-10-13 06:02:07 +00:00
|
|
|
using Ryujinx.Common;
|
|
|
|
using System;
|
2020-06-13 22:31:06 +00:00
|
|
|
using System.Runtime.Intrinsics;
|
2019-10-13 06:02:07 +00:00
|
|
|
using static Ryujinx.Graphics.Texture.BlockLinearConstants;
|
|
|
|
|
|
|
|
namespace Ryujinx.Graphics.Texture
|
|
|
|
{
|
|
|
|
public static class LayoutConverter
|
|
|
|
{
|
2022-12-26 18:50:27 +00:00
|
|
|
/// <summary>
/// Byte alignment applied to the row stride of the linear data produced and consumed by this class.
/// </summary>
public const int HostStrideAlignment = 4;
|
2019-10-13 06:02:07 +00:00
|
|
|
|
2020-07-12 03:07:01 +00:00
|
|
|
/// <summary>
/// Converts a single 2D image from block linear layout to linear layout, writing into a caller supplied buffer.
/// </summary>
/// <param name="dst">Buffer that receives the linear data, one row every <paramref name="stride"/> bytes</param>
/// <param name="width">Row width, in elements of <paramref name="bytesPerPixel"/> bytes</param>
/// <param name="height">Number of rows to convert</param>
/// <param name="stride">Distance in bytes between the start of consecutive rows in <paramref name="dst"/></param>
/// <param name="bytesPerPixel">Size in bytes of one element; must be 1, 2, 4, 8, 12 or 16</param>
/// <param name="gobBlocksInY">GOB blocks in Y value forwarded to <see cref="BlockLinearLayout"/></param>
/// <param name="data">Block linear source data</param>
/// <exception cref="NotSupportedException"><paramref name="bytesPerPixel"/> is not one of the supported sizes</exception>
/// <remarks>
/// The copy is done through raw pointers and no bounds checks are performed here, so both spans
/// must be large enough for the requested dimensions.
/// </remarks>
public static void ConvertBlockLinearToLinear(
    Span<byte> dst,
    int width,
    int height,
    int stride,
    int bytesPerPixel,
    int gobBlocksInY,
    ReadOnlySpan<byte> data)
{
    // Portion of each row (in bytes) that can be moved with 16 byte and 64 byte vector copies.
    int strideTrunc = BitUtils.AlignDown(width * bytesPerPixel, 16);
    int strideTrunc64 = BitUtils.AlignDown(width * bytesPerPixel, 64);

    // First element left over for the element-by-element tail loop.
    // NOTE(review): for bytesPerPixel == 12, strideTrunc is not necessarily a multiple of the element
    // size, so this division rounds down while the output pointer has already advanced strideTrunc
    // bytes - confirm rows whose byte size is not a multiple of 16 are handled correctly.
    int xStart = strideTrunc / bytesPerPixel;

    // Padding bytes that follow the pixel data on every destination row.
    int outStrideGap = stride - width * bytesPerPixel;

    int alignment = GobStride / bytesPerPixel;

    int wAligned = BitUtils.AlignUp(width, alignment);

    BlockLinearLayout layoutConverter = new BlockLinearLayout(wAligned, height, gobBlocksInY, 1, bytesPerPixel);

    // T is only used by the tail loop, as the type with the same size as one element.
    unsafe bool Convert<T>(Span<byte> output, ReadOnlySpan<byte> data) where T : unmanaged
    {
        fixed (byte* outputPtr = output, dataPtr = data)
        {
            byte* outPtr = outputPtr;

            for (int y = 0; y < height; y++)
            {
                layoutConverter.SetY(y);

                // 64 bytes of a linear row are gathered from four 16 byte pieces located at
                // +0x00, +0x20, +0x100 and +0x120 from the block linear offset.
                for (int x = 0; x < strideTrunc64; x += 64, outPtr += 64)
                {
                    byte* offset = dataPtr + layoutConverter.GetOffsetWithLineOffset64(x);
                    byte* offset2 = offset + 0x20;
                    byte* offset3 = offset + 0x100;
                    byte* offset4 = offset + 0x120;

                    Vector128<byte> value = *(Vector128<byte>*)offset;
                    Vector128<byte> value2 = *(Vector128<byte>*)offset2;
                    Vector128<byte> value3 = *(Vector128<byte>*)offset3;
                    Vector128<byte> value4 = *(Vector128<byte>*)offset4;

                    *(Vector128<byte>*)outPtr = value;
                    *(Vector128<byte>*)(outPtr + 16) = value2;
                    *(Vector128<byte>*)(outPtr + 32) = value3;
                    *(Vector128<byte>*)(outPtr + 48) = value4;
                }

                // Remaining 16 byte pieces that did not fill a whole 64 byte run.
                for (int x = strideTrunc64; x < strideTrunc; x += 16, outPtr += 16)
                {
                    byte* offset = dataPtr + layoutConverter.GetOffsetWithLineOffset16(x);

                    *(Vector128<byte>*)outPtr = *(Vector128<byte>*)offset;
                }

                // Remaining elements, copied one at a time.
                for (int x = xStart; x < width; x++, outPtr += bytesPerPixel)
                {
                    byte* offset = dataPtr + layoutConverter.GetOffset(x);

                    *(T*)outPtr = *(T*)offset;
                }

                outPtr += outStrideGap;
            }
        }
        return true;
    }

    // The switch expression needs a value, so Convert returns a dummy bool that is discarded.
    bool _ = bytesPerPixel switch
    {
        1 => Convert<byte>(dst, data),
        2 => Convert<ushort>(dst, data),
        4 => Convert<uint>(dst, data),
        8 => Convert<ulong>(dst, data),
        12 => Convert<Bpp12Pixel>(dst, data),
        16 => Convert<Vector128<byte>>(dst, data),
        _ => throw new NotSupportedException($"Unable to convert {bytesPerPixel} bpp pixel format.")
    };
}
|
|
|
|
|
GPU: Pass SpanOrArray for Texture SetData to avoid copy (#3745)
* GPU: Pass SpanOrArray for Texture SetData to avoid copy
Texture data is often converted before upload, meaning that an array was allocated to perform the conversion into. However, the backend SetData methods were being passed a Span of that data, and the Multithreaded layer does `ToArray()` on it so that it can be stored for later! This method can't extract the original array, so it creates a copy.
This PR changes the type passed for textures to a new ref struct called SpanOrArray, which is backed by either a ReadOnlySpan or an array. The benefit here is that we can have a ToArray method that doesn't copy if it is originally backed by an array.
This will also avoid a copy when running the ASTC decoder.
On NieR this was taking 38% of texture upload time, which it does a _lot_ of when you move between areas, so there should be a 1.6x performance boost when strictly uploading textures. No doubt this will also improve texture streaming performance in UE4 games, and maybe a small reduction with video playback.
From the numbers, it's probably possible to improve the upload rate by a further 1.6x by performing layout conversion on GPU. I'm not sure if we could improve it further than that - multithreading conversion on CPU would probably result in memory bottleneck.
This doesn't extend to buffers, since we don't convert their data on the GPU emulator side.
* Remove implicit cast to array.
2022-10-08 15:04:47 +00:00
|
|
|
/// <summary>
/// Converts a texture from block linear layout to linear layout, for every requested mip level, layer and depth slice.
/// </summary>
/// <param name="width">Width of the base level; each level uses this value shifted right by the level index, then divided (rounding up) by <paramref name="blockWidth"/></param>
/// <param name="height">Height of the base level; reduced per level like <paramref name="width"/>, using <paramref name="blockHeight"/></param>
/// <param name="depth">Depth of the base level; shifted right by the level index for each level</param>
/// <param name="sliceDepth">Number of depth slices to convert at the base level; shifted right by the level index for each level</param>
/// <param name="levels">Number of mip levels to convert</param>
/// <param name="layers">Number of layers to convert</param>
/// <param name="blockWidth">Divisor applied to the level width</param>
/// <param name="blockHeight">Divisor applied to the level height</param>
/// <param name="bytesPerPixel">Size in bytes of one element; must be 1, 2, 4, 8, 12 or 16</param>
/// <param name="gobBlocksInY">GOB blocks in Y for the base level</param>
/// <param name="gobBlocksInZ">GOB blocks in Z for the base level</param>
/// <param name="gobBlocksInTileX">Multiplier applied to the GOB width to get the width alignment of large levels</param>
/// <param name="sizeInfo">Supplies the layer size and per level offsets used to locate each image inside <paramref name="data"/></param>
/// <param name="data">Block linear source data</param>
/// <returns>A newly allocated array holding the linear data</returns>
/// <exception cref="NotSupportedException"><paramref name="bytesPerPixel"/> is not one of the supported sizes (only raised when <paramref name="levels"/> is greater than zero)</exception>
public static byte[] ConvertBlockLinearToLinear(
    int width,
    int height,
    int depth,
    int sliceDepth,
    int levels,
    int layers,
    int blockWidth,
    int blockHeight,
    int bytesPerPixel,
    int gobBlocksInY,
    int gobBlocksInZ,
    int gobBlocksInTileX,
    SizeInfo sizeInfo,
    ReadOnlySpan<byte> data)
{
    int outSize = GetTextureSize(
        width,
        height,
        sliceDepth,
        levels,
        layers,
        blockWidth,
        blockHeight,
        bytesPerPixel);

    byte[] output = new byte[outSize];

    int outOffs = 0;

    int mipGobBlocksInY = gobBlocksInY;
    int mipGobBlocksInZ = gobBlocksInZ;

    int gobWidth = (GobStride / bytesPerPixel) * gobBlocksInTileX;
    int gobHeight = gobBlocksInY * GobHeight;

    for (int level = 0; level < levels; level++)
    {
        int w = Math.Max(1, width >> level);
        int h = Math.Max(1, height >> level);
        int d = Math.Max(1, depth >> level);

        w = BitUtils.DivRoundUp(w, blockWidth);
        h = BitUtils.DivRoundUp(h, blockHeight);

        // Shrink the block height until the level no longer fits in half of it.
        while (h <= (mipGobBlocksInY >> 1) * GobHeight && mipGobBlocksInY != 1)
        {
            mipGobBlocksInY >>= 1;
        }

        // Unlike the Y reduction above, gobs in Z are halved at most once per level and never for
        // level 0. Per the change history of this file, a depth 1 slice of a larger 3D texture must
        // keep the gob size of the full texture, so it can't be "corrected" at the base level.
        if (level > 0 && d <= (mipGobBlocksInZ >> 1) && mipGobBlocksInZ != 1)
        {
            mipGobBlocksInZ >>= 1;
        }

        // Portion of each row (in bytes) that can be moved with 16 byte and 64 byte vector copies.
        int strideTrunc = BitUtils.AlignDown(w * bytesPerPixel, 16);
        int strideTrunc64 = BitUtils.AlignDown(w * bytesPerPixel, 64);

        // First element left over for the element-by-element tail loop.
        // NOTE(review): for bytesPerPixel == 12, strideTrunc is not necessarily a multiple of the
        // element size, so this division rounds down while the output pointer has already advanced
        // strideTrunc bytes - confirm rows whose byte size is not a multiple of 16 are handled correctly.
        int xStart = strideTrunc / bytesPerPixel;

        int stride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment);

        // Padding bytes that follow the pixel data on every output row.
        int outStrideGap = stride - w * bytesPerPixel;

        int alignment = gobWidth;

        // Small levels are only aligned to the width of a single GOB row.
        if (d < gobBlocksInZ || w <= gobWidth || h <= gobHeight)
        {
            alignment = GobStride / bytesPerPixel;
        }

        int wAligned = BitUtils.AlignUp(w, alignment);

        BlockLinearLayout layoutConverter = new BlockLinearLayout(
            wAligned,
            h,
            mipGobBlocksInY,
            mipGobBlocksInZ,
            bytesPerPixel);

        int sd = Math.Max(1, sliceDepth >> level);

        // T is only used by the tail loop, as the type with the same size as one element.
        unsafe bool Convert<T>(Span<byte> output, ReadOnlySpan<byte> data) where T : unmanaged
        {
            fixed (byte* outputPtr = output, dataPtr = data)
            {
                // Output is written contiguously: layer after layer, slice after slice, row after row.
                byte* outPtr = outputPtr + outOffs;
                for (int layer = 0; layer < layers; layer++)
                {
                    byte* inBaseOffset = dataPtr + (layer * sizeInfo.LayerSize + sizeInfo.GetMipOffset(level));

                    for (int z = 0; z < sd; z++)
                    {
                        layoutConverter.SetZ(z);
                        for (int y = 0; y < h; y++)
                        {
                            layoutConverter.SetY(y);

                            // 64 bytes of a linear row are gathered from four 16 byte pieces located at
                            // +0x00, +0x20, +0x100 and +0x120 from the block linear offset.
                            for (int x = 0; x < strideTrunc64; x += 64, outPtr += 64)
                            {
                                byte* offset = inBaseOffset + layoutConverter.GetOffsetWithLineOffset64(x);
                                byte* offset2 = offset + 0x20;
                                byte* offset3 = offset + 0x100;
                                byte* offset4 = offset + 0x120;

                                Vector128<byte> value = *(Vector128<byte>*)offset;
                                Vector128<byte> value2 = *(Vector128<byte>*)offset2;
                                Vector128<byte> value3 = *(Vector128<byte>*)offset3;
                                Vector128<byte> value4 = *(Vector128<byte>*)offset4;

                                *(Vector128<byte>*)outPtr = value;
                                *(Vector128<byte>*)(outPtr + 16) = value2;
                                *(Vector128<byte>*)(outPtr + 32) = value3;
                                *(Vector128<byte>*)(outPtr + 48) = value4;
                            }

                            // Remaining 16 byte pieces that did not fill a whole 64 byte run.
                            for (int x = strideTrunc64; x < strideTrunc; x += 16, outPtr += 16)
                            {
                                byte* offset = inBaseOffset + layoutConverter.GetOffsetWithLineOffset16(x);

                                *(Vector128<byte>*)outPtr = *(Vector128<byte>*)offset;
                            }

                            // Remaining elements, copied one at a time.
                            for (int x = xStart; x < w; x++, outPtr += bytesPerPixel)
                            {
                                byte* offset = inBaseOffset + layoutConverter.GetOffset(x);

                                *(T*)outPtr = *(T*)offset;
                            }

                            outPtr += outStrideGap;
                        }
                    }
                }

                // NOTE(review): this advances by the full level depth (d) although only sd slices
                // were written per layer, and the output array is sized from sliceDepth - confirm
                // this is intended when sliceDepth differs from depth and levels > 1.
                outOffs += stride * h * d * layers;
            }
            return true;
        }

        // The switch expression needs a value, so Convert returns a dummy bool that is discarded.
        bool _ = bytesPerPixel switch
        {
            1 => Convert<byte>(output, data),
            2 => Convert<ushort>(output, data),
            4 => Convert<uint>(output, data),
            8 => Convert<ulong>(output, data),
            12 => Convert<Bpp12Pixel>(output, data),
            16 => Convert<Vector128<byte>>(output, data),
            _ => throw new NotSupportedException($"Unable to convert {bytesPerPixel} bpp pixel format.")
        };
    }

    return output;
}
|
|
|
|
|
GPU: Pass SpanOrArray for Texture SetData to avoid copy (#3745)
* GPU: Pass SpanOrArray for Texture SetData to avoid copy
Texture data is often converted before upload, meaning that an array was allocated to perform the conversion into. However, the backend SetData methods were being passed a Span of that data, and the Multithreaded layer does `ToArray()` on it so that it can be stored for later! This method can't extract the original array, so it creates a copy.
This PR changes the type passed for textures to a new ref struct called SpanOrArray, which is backed by either a ReadOnlySpan or an array. The benefit here is that we can have a ToArray method that doesn't copy if it is originally backed by an array.
This will also avoid a copy when running the ASTC decoder.
On NieR this was taking 38% of texture upload time, which it does a _lot_ of when you move between areas, so there should be a 1.6x performance boost when strictly uploading textures. No doubt this will also improve texture streaming performance in UE4 games, and maybe a small reduction with video playback.
From the numbers, it's probably possible to improve the upload rate by a further 1.6x by performing layout conversion on GPU. I'm not sure if we could improve it further than that - multithreading conversion on CPU would probably result in memory bottleneck.
This doesn't extend to buffers, since we don't convert their data on the GPU emulator side.
* Remove implicit cast to array.
2022-10-08 15:04:47 +00:00
|
|
|
/// <summary>
/// Repacks linear data whose rows are <paramref name="stride"/> bytes apart into a new array
/// whose rows are spaced by the row size aligned up to <see cref="HostStrideAlignment"/>.
/// </summary>
/// <param name="width">Image width; divided (rounding up) by <paramref name="blockWidth"/> to get the elements per row</param>
/// <param name="height">Image height; divided (rounding up) by <paramref name="blockHeight"/> to get the number of rows</param>
/// <param name="blockWidth">Divisor applied to <paramref name="width"/></param>
/// <param name="blockHeight">Divisor applied to <paramref name="height"/></param>
/// <param name="lineSize">Number of bytes to copy per row; clamped to the output row stride</param>
/// <param name="stride">Distance in bytes between the start of consecutive rows in <paramref name="data"/></param>
/// <param name="bytesPerPixel">Size in bytes of one element</param>
/// <param name="data">Strided linear source data</param>
/// <returns>A newly allocated array holding the repacked rows</returns>
public static byte[] ConvertLinearStridedToLinear(
    int width,
    int height,
    int blockWidth,
    int blockHeight,
    int lineSize,
    int stride,
    int bytesPerPixel,
    ReadOnlySpan<byte> data)
{
    int elementsPerRow = BitUtils.DivRoundUp(width, blockWidth);
    int rowCount = BitUtils.DivRoundUp(height, blockHeight);

    int dstStride = BitUtils.AlignUp(elementsPerRow * bytesPerPixel, HostStrideAlignment);

    // Never copy more than one output row can hold.
    int bytesPerRow = Math.Min(lineSize, dstStride);

    byte[] output = new byte[rowCount * dstStride];
    Span<byte> destination = output;

    // Source and destination positions advance by their own stride on every row.
    for (int row = 0, srcPos = 0, dstPos = 0; row < rowCount; row++, srcPos += stride, dstPos += dstStride)
    {
        ReadOnlySpan<byte> sourceRow = data.Slice(srcPos, bytesPerRow);
        Span<byte> destinationRow = destination.Slice(dstPos, bytesPerRow);

        sourceRow.CopyTo(destinationRow);
    }

    return output;
}
|
|
|
|
|
2020-07-12 03:07:01 +00:00
|
|
|
/// <summary>
/// Converts a single 2D image from linear layout to block linear layout, writing into a caller supplied buffer.
/// </summary>
/// <param name="dst">Buffer that receives the block linear data</param>
/// <param name="width">Row width, in elements of <paramref name="bytesPerPixel"/> bytes</param>
/// <param name="height">Number of rows to convert</param>
/// <param name="stride">Distance in bytes between the start of consecutive rows in <paramref name="data"/></param>
/// <param name="bytesPerPixel">Size in bytes of one element; must be 1, 2, 4, 8, 12 or 16</param>
/// <param name="gobBlocksInY">GOB blocks in Y value forwarded to <see cref="BlockLinearLayout"/></param>
/// <param name="data">Linear source data</param>
/// <exception cref="NotSupportedException"><paramref name="bytesPerPixel"/> is not one of the supported sizes</exception>
/// <remarks>
/// The copy is done through raw pointers and no bounds checks are performed here, so both spans
/// must be large enough for the requested dimensions.
/// </remarks>
public static void ConvertLinearToBlockLinear(
    Span<byte> dst,
    int width,
    int height,
    int stride,
    int bytesPerPixel,
    int gobBlocksInY,
    ReadOnlySpan<byte> data)
{
    // Portion of each row (in bytes) that can be moved with 16 byte and 64 byte vector copies.
    int strideTrunc = BitUtils.AlignDown(width * bytesPerPixel, 16);
    int strideTrunc64 = BitUtils.AlignDown(width * bytesPerPixel, 64);

    // First element left over for the element-by-element tail loop.
    // NOTE(review): for bytesPerPixel == 12, strideTrunc is not necessarily a multiple of the element
    // size, so this division rounds down while the input pointer has already advanced strideTrunc
    // bytes - confirm rows whose byte size is not a multiple of 16 are handled correctly.
    int xStart = strideTrunc / bytesPerPixel;

    // Padding bytes that follow the pixel data on every source row.
    int inStrideGap = stride - width * bytesPerPixel;

    int alignment = GobStride / bytesPerPixel;

    int wAligned = BitUtils.AlignUp(width, alignment);

    BlockLinearLayout layoutConverter = new BlockLinearLayout(wAligned, height, gobBlocksInY, 1, bytesPerPixel);

    // T is only used by the tail loop, as the type with the same size as one element.
    unsafe bool Convert<T>(Span<byte> output, ReadOnlySpan<byte> data) where T : unmanaged
    {
        fixed (byte* outputPtr = output, dataPtr = data)
        {
            byte* inPtr = dataPtr;

            for (int y = 0; y < height; y++)
            {
                layoutConverter.SetY(y);

                // 64 bytes of a linear row are scattered as four 16 byte pieces to
                // +0x00, +0x20, +0x100 and +0x120 from the block linear offset.
                for (int x = 0; x < strideTrunc64; x += 64, inPtr += 64)
                {
                    byte* offset = outputPtr + layoutConverter.GetOffsetWithLineOffset64(x);
                    byte* offset2 = offset + 0x20;
                    byte* offset3 = offset + 0x100;
                    byte* offset4 = offset + 0x120;

                    Vector128<byte> value = *(Vector128<byte>*)inPtr;
                    Vector128<byte> value2 = *(Vector128<byte>*)(inPtr + 16);
                    Vector128<byte> value3 = *(Vector128<byte>*)(inPtr + 32);
                    Vector128<byte> value4 = *(Vector128<byte>*)(inPtr + 48);

                    *(Vector128<byte>*)offset = value;
                    *(Vector128<byte>*)offset2 = value2;
                    *(Vector128<byte>*)offset3 = value3;
                    *(Vector128<byte>*)offset4 = value4;
                }

                // Remaining 16 byte pieces that did not fill a whole 64 byte run.
                for (int x = strideTrunc64; x < strideTrunc; x += 16, inPtr += 16)
                {
                    byte* offset = outputPtr + layoutConverter.GetOffsetWithLineOffset16(x);

                    *(Vector128<byte>*)offset = *(Vector128<byte>*)inPtr;
                }

                // Remaining elements, copied one at a time.
                for (int x = xStart; x < width; x++, inPtr += bytesPerPixel)
                {
                    byte* offset = outputPtr + layoutConverter.GetOffset(x);

                    *(T*)offset = *(T*)inPtr;
                }

                inPtr += inStrideGap;
            }
        }
        return true;
    }

    // The switch expression needs a value, so Convert returns a dummy bool that is discarded.
    bool _ = bytesPerPixel switch
    {
        1 => Convert<byte>(dst, data),
        2 => Convert<ushort>(dst, data),
        4 => Convert<uint>(dst, data),
        8 => Convert<ulong>(dst, data),
        12 => Convert<Bpp12Pixel>(dst, data),
        16 => Convert<Vector128<byte>>(dst, data),
        _ => throw new NotSupportedException($"Unable to convert {bytesPerPixel} bpp pixel format.")
    };
}
|
|
|
|
|
2021-07-16 21:10:20 +00:00
|
|
|
/// <summary>
/// Converts a texture from linear layout to block linear layout, for every requested mip level, layer and depth slice.
/// </summary>
/// <param name="output">Buffer that receives the block linear data; when empty, a new array of <c>sizeInfo.TotalSize</c> bytes is allocated instead</param>
/// <param name="width">Width of the base level; each level uses this value shifted right by the level index, then divided (rounding up) by <paramref name="blockWidth"/></param>
/// <param name="height">Height of the base level; reduced per level like <paramref name="width"/>, using <paramref name="blockHeight"/></param>
/// <param name="depth">Depth of the base level; shifted right by the level index for each level</param>
/// <param name="sliceDepth">Number of depth slices to convert at the base level; shifted right by the level index for each level</param>
/// <param name="levels">Number of mip levels to convert</param>
/// <param name="layers">Number of layers to convert</param>
/// <param name="blockWidth">Divisor applied to the level width</param>
/// <param name="blockHeight">Divisor applied to the level height</param>
/// <param name="bytesPerPixel">Size in bytes of one element; must be 1, 2, 4, 8, 12 or 16</param>
/// <param name="gobBlocksInY">GOB blocks in Y for the base level</param>
/// <param name="gobBlocksInZ">GOB blocks in Z for the base level</param>
/// <param name="gobBlocksInTileX">Multiplier applied to the GOB width to get the width alignment of large levels</param>
/// <param name="sizeInfo">Supplies the total size, layer size and per level offsets used to place each image inside the output</param>
/// <param name="data">Linear source data</param>
/// <returns>The buffer that was written: <paramref name="output"/>, or the newly allocated array when it was empty</returns>
/// <exception cref="NotSupportedException"><paramref name="bytesPerPixel"/> is not one of the supported sizes (only raised when <paramref name="levels"/> is greater than zero)</exception>
public static ReadOnlySpan<byte> ConvertLinearToBlockLinear(
    Span<byte> output,
    int width,
    int height,
    int depth,
    int sliceDepth,
    int levels,
    int layers,
    int blockWidth,
    int blockHeight,
    int bytesPerPixel,
    int gobBlocksInY,
    int gobBlocksInZ,
    int gobBlocksInTileX,
    SizeInfo sizeInfo,
    ReadOnlySpan<byte> data)
{
    // An empty span means the caller did not provide a destination.
    if (output.Length == 0)
    {
        output = new byte[sizeInfo.TotalSize];
    }

    int inOffs = 0;

    int mipGobBlocksInY = gobBlocksInY;
    int mipGobBlocksInZ = gobBlocksInZ;

    int gobWidth = (GobStride / bytesPerPixel) * gobBlocksInTileX;
    int gobHeight = gobBlocksInY * GobHeight;

    for (int level = 0; level < levels; level++)
    {
        int w = Math.Max(1, width >> level);
        int h = Math.Max(1, height >> level);
        int d = Math.Max(1, depth >> level);

        w = BitUtils.DivRoundUp(w, blockWidth);
        h = BitUtils.DivRoundUp(h, blockHeight);

        // Shrink the block height until the level no longer fits in half of it.
        while (h <= (mipGobBlocksInY >> 1) * GobHeight && mipGobBlocksInY != 1)
        {
            mipGobBlocksInY >>= 1;
        }

        // Unlike the Y reduction above, gobs in Z are halved at most once per level and never for
        // level 0. Per the change history of this file, a depth 1 slice of a larger 3D texture must
        // keep the gob size of the full texture, so it can't be "corrected" at the base level.
        if (level > 0 && d <= (mipGobBlocksInZ >> 1) && mipGobBlocksInZ != 1)
        {
            mipGobBlocksInZ >>= 1;
        }

        // Portion of each row (in bytes) that can be moved with 16 byte and 64 byte vector copies.
        int strideTrunc = BitUtils.AlignDown(w * bytesPerPixel, 16);
        int strideTrunc64 = BitUtils.AlignDown(w * bytesPerPixel, 64);

        // First element left over for the element-by-element tail loop.
        // NOTE(review): for bytesPerPixel == 12, strideTrunc is not necessarily a multiple of the
        // element size, so this division rounds down while the input pointer has already advanced
        // strideTrunc bytes - confirm rows whose byte size is not a multiple of 16 are handled correctly.
        int xStart = strideTrunc / bytesPerPixel;

        int stride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment);

        // Padding bytes that follow the pixel data on every input row.
        int inStrideGap = stride - w * bytesPerPixel;

        int alignment = gobWidth;

        // Small levels are only aligned to the width of a single GOB row.
        if (d < gobBlocksInZ || w <= gobWidth || h <= gobHeight)
        {
            alignment = GobStride / bytesPerPixel;
        }

        int wAligned = BitUtils.AlignUp(w, alignment);

        BlockLinearLayout layoutConverter = new BlockLinearLayout(
            wAligned,
            h,
            mipGobBlocksInY,
            mipGobBlocksInZ,
            bytesPerPixel);

        int sd = Math.Max(1, sliceDepth >> level);

        // T is only used by the tail loop, as the type with the same size as one element.
        unsafe bool Convert<T>(Span<byte> output, ReadOnlySpan<byte> data) where T : unmanaged
        {
            fixed (byte* outputPtr = output, dataPtr = data)
            {
                // Input is read contiguously: layer after layer, slice after slice, row after row.
                byte* inPtr = dataPtr + inOffs;
                for (int layer = 0; layer < layers; layer++)
                {
                    byte* outBaseOffset = outputPtr + (layer * sizeInfo.LayerSize + sizeInfo.GetMipOffset(level));

                    for (int z = 0; z < sd; z++)
                    {
                        layoutConverter.SetZ(z);
                        for (int y = 0; y < h; y++)
                        {
                            layoutConverter.SetY(y);

                            // 64 bytes of a linear row are scattered as four 16 byte pieces to
                            // +0x00, +0x20, +0x100 and +0x120 from the block linear offset.
                            for (int x = 0; x < strideTrunc64; x += 64, inPtr += 64)
                            {
                                byte* offset = outBaseOffset + layoutConverter.GetOffsetWithLineOffset64(x);
                                byte* offset2 = offset + 0x20;
                                byte* offset3 = offset + 0x100;
                                byte* offset4 = offset + 0x120;

                                Vector128<byte> value = *(Vector128<byte>*)inPtr;
                                Vector128<byte> value2 = *(Vector128<byte>*)(inPtr + 16);
                                Vector128<byte> value3 = *(Vector128<byte>*)(inPtr + 32);
                                Vector128<byte> value4 = *(Vector128<byte>*)(inPtr + 48);

                                *(Vector128<byte>*)offset = value;
                                *(Vector128<byte>*)offset2 = value2;
                                *(Vector128<byte>*)offset3 = value3;
                                *(Vector128<byte>*)offset4 = value4;
                            }

                            // Remaining 16 byte pieces that did not fill a whole 64 byte run.
                            for (int x = strideTrunc64; x < strideTrunc; x += 16, inPtr += 16)
                            {
                                byte* offset = outBaseOffset + layoutConverter.GetOffsetWithLineOffset16(x);

                                *(Vector128<byte>*)offset = *(Vector128<byte>*)inPtr;
                            }

                            // Remaining elements, copied one at a time.
                            for (int x = xStart; x < w; x++, inPtr += bytesPerPixel)
                            {
                                byte* offset = outBaseOffset + layoutConverter.GetOffset(x);

                                *(T*)offset = *(T*)inPtr;
                            }

                            inPtr += inStrideGap;
                        }
                    }
                }

                // NOTE(review): this advances by the full level depth (d) although only sd slices
                // were read per layer - confirm this is intended when sliceDepth differs from depth
                // and levels > 1.
                inOffs += stride * h * d * layers;
            }
            return true;
        }

        // The switch expression needs a value, so Convert returns a dummy bool that is discarded.
        bool _ = bytesPerPixel switch
        {
            1 => Convert<byte>(output, data),
            2 => Convert<ushort>(output, data),
            4 => Convert<uint>(output, data),
            8 => Convert<ulong>(output, data),
            12 => Convert<Bpp12Pixel>(output, data),
            16 => Convert<Vector128<byte>>(output, data),
            _ => throw new NotSupportedException($"Unable to convert {bytesPerPixel} bpp pixel format.")
        };
    }

    return output;
}
|
|
|
|
|
2021-07-16 21:10:20 +00:00
|
|
|
/// <summary>
/// Copies linear texture data whose rows use the host stride into a layout that uses the
/// caller-supplied <paramref name="stride"/>.
/// </summary>
/// <param name="output">
/// Destination buffer. When it is empty, either <paramref name="data"/> is returned as-is
/// (strides already match) or a new buffer of <c>h * stride</c> bytes is allocated, where
/// <c>h</c> is the number of block rows.
/// </param>
/// <param name="width">Texture width, divided (rounding up) by <paramref name="blockWidth"/> to get blocks per row.</param>
/// <param name="height">Texture height, divided (rounding up) by <paramref name="blockHeight"/> to get the row count.</param>
/// <param name="blockWidth">Width of one block.</param>
/// <param name="blockHeight">Height of one block.</param>
/// <param name="stride">Distance in bytes between the start of consecutive rows in the result.</param>
/// <param name="bytesPerPixel">Bytes per element of a row (presumably bytes per block for block formats; confirm against callers).</param>
/// <param name="data">Source data; rows are read at a stride of the row size aligned up to <see cref="HostStrideAlignment"/>.</param>
/// <returns>
/// <paramref name="output"/> (or the newly allocated buffer) holding the re-strided rows, or
/// <paramref name="data"/> itself when no conversion is needed and no output buffer was supplied.
/// </returns>
public static ReadOnlySpan<byte> ConvertLinearToLinearStrided(
    Span<byte> output,
    int width,
    int height,
    int blockWidth,
    int blockHeight,
    int stride,
    int bytesPerPixel,
    ReadOnlySpan<byte> data)
{
    int w = BitUtils.DivRoundUp(width, blockWidth);
    int h = BitUtils.DivRoundUp(height, blockHeight);

    int inStride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment);

    // A row contains w elements, so its payload is w * bytesPerPixel bytes. Using the raw
    // width here would make each slice longer than a source row whenever blockWidth > 1,
    // reading into following rows and running past the end of the buffers.
    int lineSize = w * bytesPerPixel;

    if (inStride == stride)
    {
        // Layouts already agree: at most a straight copy is required.
        if (output.Length != 0)
        {
            data.CopyTo(output);
            return output;
        }
        else
        {
            return data;
        }
    }

    if (output.Length == 0)
    {
        output = new byte[h * stride];
    }

    int inOffs = 0;
    int outOffs = 0;

    for (int y = 0; y < h; y++)
    {
        // Copy only the payload of the row; bytes between lineSize and stride are left untouched.
        data.Slice(inOffs, lineSize).CopyTo(output.Slice(outOffs, lineSize));

        inOffs += inStride;
        outOffs += stride;
    }

    return output;
}
|
|
|
|
|
2019-10-13 06:02:07 +00:00
|
|
|
/// <summary>
/// Computes the total size in bytes of a texture stored with host-aligned row strides,
/// summed over all mip levels and multiplied by the layer count.
/// </summary>
/// <param name="width">Width of level 0.</param>
/// <param name="height">Height of level 0.</param>
/// <param name="depth">Depth of level 0.</param>
/// <param name="levels">Number of mip levels to accumulate.</param>
/// <param name="layers">Number of layers; the per-layer size is multiplied by this value.</param>
/// <param name="blockWidth">Width of one block; level widths are divided by it, rounding up.</param>
/// <param name="blockHeight">Height of one block; level heights are divided by it, rounding up.</param>
/// <param name="bytesPerPixel">Bytes per element of a row.</param>
/// <returns>The size of one layer (all levels) times <paramref name="layers"/>.</returns>
private static int GetTextureSize(
    int width,
    int height,
    int depth,
    int levels,
    int layers,
    int blockWidth,
    int blockHeight,
    int bytesPerPixel)
{
    int bytesPerLayer = 0;
    int mip = 0;

    while (mip < levels)
    {
        // Each level halves every dimension, never dropping below one, before
        // width and height are converted to block counts.
        int blocksX = BitUtils.DivRoundUp(Math.Max(1, width >> mip), blockWidth);
        int blocksY = BitUtils.DivRoundUp(Math.Max(1, height >> mip), blockHeight);
        int slices = Math.Max(1, depth >> mip);

        // Rows are padded up to the host stride alignment.
        int rowBytes = BitUtils.AlignUp(blocksX * bytesPerPixel, HostStrideAlignment);

        bytesPerLayer += rowBytes * blocksY * slices;

        mip++;
    }

    return bytesPerLayer * layers;
}
|
|
|
|
}
|
|
|
|
}
|