From 729ff5337cab8d1fd4cc66d7792d410172f25f62 Mon Sep 17 00:00:00 2001
From: gdkchan <gab.dark.100@gmail.com>
Date: Tue, 13 Sep 2022 03:24:09 -0300
Subject: [PATCH] Fix increment on Arm32 NEON VLDn/VSTn instructions with regs
 > 1 (#3695)

* Fix increment on Arm32 NEON VLDn/VSTn instructions with regs > 1

* PPTC version bump

* PR feedback
---
 ARMeilleure/Decoders/OpCode32SimdMemPair.cs   |  7 +-
 ARMeilleure/Decoders/OpCodeTable.cs           | 64 ++++++++++++-------
 .../Instructions/InstEmitSimdMemory32.cs      |  8 +--
 ARMeilleure/Translation/PTC/Ptc.cs            |  2 +-
 Ryujinx.Tests/Cpu/CpuTestSimdMemory32.cs      | 29 ++++++---
 5 files changed, 71 insertions(+), 39 deletions(-)

diff --git a/ARMeilleure/Decoders/OpCode32SimdMemPair.cs b/ARMeilleure/Decoders/OpCode32SimdMemPair.cs
index 4cee05e826..2da0dea3c5 100644
--- a/ARMeilleure/Decoders/OpCode32SimdMemPair.cs
+++ b/ARMeilleure/Decoders/OpCode32SimdMemPair.cs
@@ -1,11 +1,10 @@
 using ARMeilleure.State;
-using System;
 
 namespace ARMeilleure.Decoders
 {
     class OpCode32SimdMemPair : OpCode32, IOpCode32Simd
     {
-        private static int[] RegsMap =
+        private static int[] _regsMap =
         {
             1, 1, 4, 2,
             1, 1, 3, 1,
@@ -40,9 +39,9 @@ namespace ARMeilleure.Decoders
             WBack = Rm != RegisterAlias.Aarch32Pc;
             RegisterIndex = Rm != RegisterAlias.Aarch32Pc && Rm != RegisterAlias.Aarch32Sp;
 
-            Regs = RegsMap[(opCode >> 8) & 0xf];
+            Regs = _regsMap[(opCode >> 8) & 0xf];
 
-            Increment = Math.Min(Regs, ((opCode >> 8) & 0x1) + 1);
+            Increment = ((opCode >> 8) & 0x1) + 1;
         }
     }
 }
diff --git a/ARMeilleure/Decoders/OpCodeTable.cs b/ARMeilleure/Decoders/OpCodeTable.cs
index c5f86712f3..b222a21fe7 100644
--- a/ARMeilleure/Decoders/OpCodeTable.cs
+++ b/ARMeilleure/Decoders/OpCodeTable.cs
@@ -888,18 +888,31 @@ namespace ARMeilleure.Decoders
             SetA32("111100100x00xxxxxxxx1100xxx1xxxx", InstName.Vfma,        InstEmit32.Vfma_V,      OpCode32SimdReg.Create);
             SetA32("111100100x10xxxxxxxx1100xxx1xxxx", InstName.Vfms,        InstEmit32.Vfms_V,      OpCode32SimdReg.Create);
             SetA32("1111001x0x<<xxxxxxxx0000xxx0xxxx", InstName.Vhadd,       InstEmit32.Vhadd,       OpCode32SimdReg.Create);
-            SetA32("111101001x10xxxxxxxxxx00xxxxxxxx", InstName.Vld1,        InstEmit32.Vld1,        OpCode32SimdMemSingle.Create);
-            SetA32("111101000x10xxxxxxxx0111xxxxxxxx", InstName.Vld1,        InstEmit32.Vld1,        OpCode32SimdMemPair.Create); // Regs = 1.
-            SetA32("111101000x10xxxxxxxx1010xxxxxxxx", InstName.Vld1,        InstEmit32.Vld1,        OpCode32SimdMemPair.Create); // Regs = 2.
-            SetA32("111101000x10xxxxxxxx0110xxxxxxxx", InstName.Vld1,        InstEmit32.Vld1,        OpCode32SimdMemPair.Create); // Regs = 3.
+            SetA32("111101001x10xxxxxxxx0000xxx0xxxx", InstName.Vld1,        InstEmit32.Vld1,        OpCode32SimdMemSingle.Create);
+            SetA32("111101001x10xxxxxxxx0100xx0xxxxx", InstName.Vld1,        InstEmit32.Vld1,        OpCode32SimdMemSingle.Create);
+            SetA32("111101001x10xxxxxxxx1000x000xxxx", InstName.Vld1,        InstEmit32.Vld1,        OpCode32SimdMemSingle.Create);
+            SetA32("111101001x10xxxxxxxx1000x011xxxx", InstName.Vld1,        InstEmit32.Vld1,        OpCode32SimdMemSingle.Create);
+            SetA32("111101001x10xxxxxxxx110000x0xxxx", InstName.Vld1,        InstEmit32.Vld1,        OpCode32SimdMemSingle.Create);
+            SetA32("111101001x10xxxxxxxx110001xxxxxx", InstName.Vld1,        InstEmit32.Vld1,        OpCode32SimdMemSingle.Create);
+            SetA32("111101001x10xxxxxxxx110010xxxxxx", InstName.Vld1,        InstEmit32.Vld1,        OpCode32SimdMemSingle.Create);
+            SetA32("111101000x10xxxxxxxx0111xx0xxxxx", InstName.Vld1,        InstEmit32.Vld1,        OpCode32SimdMemPair.Create); // Regs = 1.
+            SetA32("111101000x10xxxxxxxx1010xx<<xxxx", InstName.Vld1,        InstEmit32.Vld1,        OpCode32SimdMemPair.Create); // Regs = 2.
+            SetA32("111101000x10xxxxxxxx0110xx0xxxxx", InstName.Vld1,        InstEmit32.Vld1,        OpCode32SimdMemPair.Create); // Regs = 3.
             SetA32("111101000x10xxxxxxxx0010xxxxxxxx", InstName.Vld1,        InstEmit32.Vld1,        OpCode32SimdMemPair.Create); // Regs = 4.
-            SetA32("111101001x10xxxxxxxxxx01xxxxxxxx", InstName.Vld2,        InstEmit32.Vld2,        OpCode32SimdMemSingle.Create);
-            SetA32("111101000x10xxxxxxxx100xxxxxxxxx", InstName.Vld2,        InstEmit32.Vld2,        OpCode32SimdMemPair.Create); // Regs = 1, inc = 1/2 (itype).
-            SetA32("111101000x10xxxxxxxx0011xxxxxxxx", InstName.Vld2,        InstEmit32.Vld2,        OpCode32SimdMemPair.Create); // Regs = 2, inc = 2.
-            SetA32("111101001x10xxxxxxxxxx10xxxxxxxx", InstName.Vld3,        InstEmit32.Vld3,        OpCode32SimdMemSingle.Create);
-            SetA32("111101000x10xxxxxxxx010xxxxxxxxx", InstName.Vld3,        InstEmit32.Vld3,        OpCode32SimdMemPair.Create); // Inc = 1/2 (itype).
-            SetA32("111101001x10xxxxxxxxxx11xxxxxxxx", InstName.Vld4,        InstEmit32.Vld4,        OpCode32SimdMemSingle.Create);
-            SetA32("111101000x10xxxxxxxx000xxxxxxxxx", InstName.Vld4,        InstEmit32.Vld4,        OpCode32SimdMemPair.Create); // Inc = 1/2 (itype).
+            SetA32("111101001x10xxxxxxxx0x01xxxxxxxx", InstName.Vld2,        InstEmit32.Vld2,        OpCode32SimdMemSingle.Create);
+            SetA32("111101001x10xxxxxxxx1001xx0xxxxx", InstName.Vld2,        InstEmit32.Vld2,        OpCode32SimdMemSingle.Create);
+            SetA32("111101001x10xxxxxxxx1101<<xxxxxx", InstName.Vld2,        InstEmit32.Vld2,        OpCode32SimdMemSingle.Create);
+            SetA32("111101000x10xxxxxxxx100x<<0xxxxx", InstName.Vld2,        InstEmit32.Vld2,        OpCode32SimdMemPair.Create); // Regs = 1, inc = 1/2 (itype).
+            SetA32("111101000x10xxxxxxxx100x<<10xxxx", InstName.Vld2,        InstEmit32.Vld2,        OpCode32SimdMemPair.Create); // Regs = 1, inc = 1/2 (itype).
+            SetA32("111101000x10xxxxxxxx0011<<xxxxxx", InstName.Vld2,        InstEmit32.Vld2,        OpCode32SimdMemPair.Create); // Regs = 2, inc = 2.
+            SetA32("111101001x10xxxxxxxx0x10xxx0xxxx", InstName.Vld3,        InstEmit32.Vld3,        OpCode32SimdMemSingle.Create);
+            SetA32("111101001x10xxxxxxxx1010xx00xxxx", InstName.Vld3,        InstEmit32.Vld3,        OpCode32SimdMemSingle.Create);
+            SetA32("111101001x10xxxxxxxx1110<<x0xxxx", InstName.Vld3,        InstEmit32.Vld3,        OpCode32SimdMemSingle.Create);
+            SetA32("111101000x10xxxxxxxx010x<<0xxxxx", InstName.Vld3,        InstEmit32.Vld3,        OpCode32SimdMemPair.Create); // Inc = 1/2 (itype).
+            SetA32("111101001x10xxxxxxxx0x11xxxxxxxx", InstName.Vld4,        InstEmit32.Vld4,        OpCode32SimdMemSingle.Create);
+            SetA32("111101001x10xxxxxxxx1011xx<<xxxx", InstName.Vld4,        InstEmit32.Vld4,        OpCode32SimdMemSingle.Create);
+            SetA32("111101001x10xxxxxxxx1111<<x>xxxx", InstName.Vld4,        InstEmit32.Vld4,        OpCode32SimdMemSingle.Create);
+            SetA32("111101000x10xxxxxxxx000x<<xxxxxx", InstName.Vld4,        InstEmit32.Vld4,        OpCode32SimdMemPair.Create); // Inc = 1/2 (itype).
             SetA32("1111001x0x<<xxxxxxxx0110xxx0xxxx", InstName.Vmax,        InstEmit32.Vmax_I,      OpCode32SimdReg.Create);
             SetA32("111100100x00xxxxxxxx1111xxx0xxxx", InstName.Vmax,        InstEmit32.Vmax_V,      OpCode32SimdReg.Create);
             SetA32("1111001x0x<<xxxxxxxx0110xxx1xxxx", InstName.Vmin,        InstEmit32.Vmin_I,      OpCode32SimdReg.Create);
@@ -974,18 +987,25 @@ namespace ARMeilleure.Decoders
             SetA32("1111001x1x>>>xxxxxxx0000>xx1xxxx", InstName.Vshr,        InstEmit32.Vshr,        OpCode32SimdShImm.Create);
             SetA32("111100101x>>>xxxxxxx100000x1xxx0", InstName.Vshrn,       InstEmit32.Vshrn,       OpCode32SimdShImmNarrow.Create);
             SetA32("1111001x1x>>>xxxxxxx0001>xx1xxxx", InstName.Vsra,        InstEmit32.Vsra,        OpCode32SimdShImm.Create);
-            SetA32("111101001x00xxxxxxxx<<00xxxxxxxx", InstName.Vst1,        InstEmit32.Vst1,        OpCode32SimdMemSingle.Create);
-            SetA32("111101000x00xxxxxxxx0111xxxxxxxx", InstName.Vst1,        InstEmit32.Vst1,        OpCode32SimdMemPair.Create); // Regs = 1.
-            SetA32("111101000x00xxxxxxxx1010xxxxxxxx", InstName.Vst1,        InstEmit32.Vst1,        OpCode32SimdMemPair.Create); // Regs = 2.
-            SetA32("111101000x00xxxxxxxx0110xxxxxxxx", InstName.Vst1,        InstEmit32.Vst1,        OpCode32SimdMemPair.Create); // Regs = 3.
+            SetA32("111101001x00xxxxxxxx0000xxx0xxxx", InstName.Vst1,        InstEmit32.Vst1,        OpCode32SimdMemSingle.Create);
+            SetA32("111101001x00xxxxxxxx0100xx0xxxxx", InstName.Vst1,        InstEmit32.Vst1,        OpCode32SimdMemSingle.Create);
+            SetA32("111101001x00xxxxxxxx1000x000xxxx", InstName.Vst1,        InstEmit32.Vst1,        OpCode32SimdMemSingle.Create);
+            SetA32("111101001x00xxxxxxxx1000x011xxxx", InstName.Vst1,        InstEmit32.Vst1,        OpCode32SimdMemSingle.Create);
+            SetA32("111101000x00xxxxxxxx0111xx0xxxxx", InstName.Vst1,        InstEmit32.Vst1,        OpCode32SimdMemPair.Create); // Regs = 1.
+            SetA32("111101000x00xxxxxxxx1010xx<<xxxx", InstName.Vst1,        InstEmit32.Vst1,        OpCode32SimdMemPair.Create); // Regs = 2.
+            SetA32("111101000x00xxxxxxxx0110xx0xxxxx", InstName.Vst1,        InstEmit32.Vst1,        OpCode32SimdMemPair.Create); // Regs = 3.
             SetA32("111101000x00xxxxxxxx0010xxxxxxxx", InstName.Vst1,        InstEmit32.Vst1,        OpCode32SimdMemPair.Create); // Regs = 4.
-            SetA32("111101001x00xxxxxxxx<<01xxxxxxxx", InstName.Vst2,        InstEmit32.Vst2,        OpCode32SimdMemSingle.Create);
-            SetA32("111101000x00xxxxxxxx100xxxxxxxxx", InstName.Vst2,        InstEmit32.Vst2,        OpCode32SimdMemPair.Create); // Regs = 1, inc = 1/2 (itype).
-            SetA32("111101000x00xxxxxxxx0011xxxxxxxx", InstName.Vst2,        InstEmit32.Vst2,        OpCode32SimdMemPair.Create); // Regs = 2, inc = 2.
-            SetA32("111101001x00xxxxxxxx<<10xxxxxxxx", InstName.Vst3,        InstEmit32.Vst3,        OpCode32SimdMemSingle.Create);
-            SetA32("111101000x00xxxxxxxx010xxxxxxxxx", InstName.Vst3,        InstEmit32.Vst3,        OpCode32SimdMemPair.Create); // Inc = 1/2 (itype).
-            SetA32("111101001x00xxxxxxxx<<11xxxxxxxx", InstName.Vst4,        InstEmit32.Vst4,        OpCode32SimdMemSingle.Create);
-            SetA32("111101000x00xxxxxxxx000xxxxxxxxx", InstName.Vst4,        InstEmit32.Vst4,        OpCode32SimdMemPair.Create); // Inc = 1/2 (itype).
+            SetA32("111101001x00xxxxxxxx0x01xxxxxxxx", InstName.Vst2,        InstEmit32.Vst2,        OpCode32SimdMemSingle.Create);
+            SetA32("111101001x00xxxxxxxx1001xx0xxxxx", InstName.Vst2,        InstEmit32.Vst2,        OpCode32SimdMemSingle.Create);
+            SetA32("111101000x00xxxxxxxx100x<<0xxxxx", InstName.Vst2,        InstEmit32.Vst2,        OpCode32SimdMemPair.Create); // Regs = 1, inc = 1/2 (itype).
+            SetA32("111101000x00xxxxxxxx100x<<10xxxx", InstName.Vst2,        InstEmit32.Vst2,        OpCode32SimdMemPair.Create); // Regs = 1, inc = 1/2 (itype).
+            SetA32("111101000x00xxxxxxxx0011<<xxxxxx", InstName.Vst2,        InstEmit32.Vst2,        OpCode32SimdMemPair.Create); // Regs = 2, inc = 2.
+            SetA32("111101001x00xxxxxxxx0x10xxx0xxxx", InstName.Vst3,        InstEmit32.Vst3,        OpCode32SimdMemSingle.Create);
+            SetA32("111101001x00xxxxxxxx1010xx00xxxx", InstName.Vst3,        InstEmit32.Vst3,        OpCode32SimdMemSingle.Create);
+            SetA32("111101000x00xxxxxxxx010x<<0xxxxx", InstName.Vst3,        InstEmit32.Vst3,        OpCode32SimdMemPair.Create); // Inc = 1/2 (itype).
+            SetA32("111101001x00xxxxxxxx0x11xxxxxxxx", InstName.Vst4,        InstEmit32.Vst4,        OpCode32SimdMemSingle.Create);
+            SetA32("111101001x00xxxxxxxx1011xx<<xxxx", InstName.Vst4,        InstEmit32.Vst4,        OpCode32SimdMemSingle.Create);
+            SetA32("111101000x00xxxxxxxx000x<<xxxxxx", InstName.Vst4,        InstEmit32.Vst4,        OpCode32SimdMemPair.Create); // Inc = 1/2 (itype).
             SetA32("111100110xxxxxxxxxxx1000xxx0xxxx", InstName.Vsub,        InstEmit32.Vsub_I,      OpCode32SimdReg.Create);
             SetA32("111100100x10xxxxxxxx1101xxx0xxxx", InstName.Vsub,        InstEmit32.Vsub_V,      OpCode32SimdReg.Create);
             SetA32("1111001x1x<<xxxxxxx00010x0x0xxxx", InstName.Vsubl,       InstEmit32.Vsubl_I,     OpCode32SimdRegLong.Create);
diff --git a/ARMeilleure/Instructions/InstEmitSimdMemory32.cs b/ARMeilleure/Instructions/InstEmitSimdMemory32.cs
index 72474ee3e9..b774bd0611 100644
--- a/ARMeilleure/Instructions/InstEmitSimdMemory32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdMemory32.cs
@@ -67,7 +67,7 @@ namespace ARMeilleure.Instructions
 
                 for (int i = 0; i < count; i++)
                 {
-                    // Write an element from a double simd register.
+                    // Accesses an element from a double simd register.
                     Operand address = context.Add(n, Const(offset));
                     if (eBytes == 8)
                     {
@@ -131,6 +131,7 @@ namespace ARMeilleure.Instructions
             {
                 OpCode32SimdMemPair op = (OpCode32SimdMemPair)context.CurrOp;
 
+                int increment = count > 1 ? op.Increment : 1;
                 int eBytes = 1 << op.Size;
 
                 Operand n = context.Copy(GetIntA32(context, op.Rn));
@@ -144,7 +145,7 @@ namespace ARMeilleure.Instructions
                         int elemD = d + reg;
                         for (int i = 0; i < count; i++)
                         {
-                            // Write an element from a double simd register
+                            // Accesses an element from a double simd register,
                             // add ebytes for each element.
                             Operand address = context.Add(n, Const(offset));
                             int index = ((elemD & 1) << (3 - op.Size)) + elem;
@@ -161,7 +162,6 @@ namespace ARMeilleure.Instructions
                             }
                             else
                             {
-
                                 if (load)
                                 {
                                     EmitLoadSimd(context, address, GetVecA32(elemD >> 1), elemD >> 1, index, op.Size);
@@ -173,7 +173,7 @@ namespace ARMeilleure.Instructions
                             }
 
                             offset += eBytes;
-                            elemD += op.Increment;
+                            elemD += increment;
                         }
                     }
                 }
diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs
index 3a950354e8..71f69ef7cc 100644
--- a/ARMeilleure/Translation/PTC/Ptc.cs
+++ b/ARMeilleure/Translation/PTC/Ptc.cs
@@ -27,7 +27,7 @@ namespace ARMeilleure.Translation.PTC
         private const string OuterHeaderMagicString = "PTCohd\0\0";
         private const string InnerHeaderMagicString = "PTCihd\0\0";
 
-        private const uint InternalVersion = 3683; //! To be incremented manually for each change to the ARMeilleure project.
+        private const uint InternalVersion = 3695; //! To be incremented manually for each change to the ARMeilleure project.
 
         private const string ActualDir = "0";
         private const string BackupDir = "1";
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdMemory32.cs b/Ryujinx.Tests/Cpu/CpuTestSimdMemory32.cs
index ec2d53a424..b14fdcd559 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimdMemory32.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdMemory32.cs
@@ -12,7 +12,7 @@ namespace Ryujinx.Tests.Cpu
 #if SimdMemory32
         private const int RndCntImm = 2;
 
-        private uint[] LDSTModes =
+        private uint[] _ldStModes =
         {
             // LD1
             0b0111,
@@ -96,7 +96,7 @@ namespace Ryujinx.Tests.Cpu
                               [Values(0u, 13u)] uint rn,
                               [Values(1u, 13u, 15u)] uint rm,
                               [Values(0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u)] uint vd,
-                              [Range(0u, 3u)] uint mode,
+                              [Range(0u, 10u)] uint mode,
                               [Values(0x0u)] [Random(0u, 0xffu, RndCntImm)] uint offset)
         {
             var data = GenerateVectorSequence(0x1000);
@@ -104,7 +104,13 @@ namespace Ryujinx.Tests.Cpu
 
             uint opcode = 0xf4200000u; // VLD4.8 {D0, D1, D2, D3}, [R0], R0
 
-            opcode |= ((size & 3) << 6) | ((rn & 15) << 16) | (rm & 15) | (LDSTModes[mode] << 8);
+            if (mode > 3 && size == 3)
+            {
+                // A size of 3 is only valid for VLD1.
+                size = 2;
+            }
+
+            opcode |= ((size & 3) << 6) | ((rn & 15) << 16) | (rm & 15) | (_ldStModes[mode] << 8);
 
             opcode |= ((vd & 0x10) << 18);
             opcode |= ((vd & 0xf) << 12);
@@ -151,17 +157,23 @@ namespace Ryujinx.Tests.Cpu
                               [Values(0u, 13u)] uint rn,
                               [Values(1u, 13u, 15u)] uint rm,
                               [Values(0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u)] uint vd,
-                              [Range(0u, 3u)] uint mode,
+                              [Range(0u, 10u)] uint mode,
                               [Values(0x0u)] [Random(0u, 0xffu, RndCntImm)] uint offset)
         {
             var data = GenerateVectorSequence(0x1000);
             SetWorkingMemory(0, data);
 
             (V128 vec1, V128 vec2, V128 vec3, V128 vec4) = GenerateTestVectors();
-            
+
             uint opcode = 0xf4000000u; // VST4.8 {D0, D1, D2, D3}, [R0], R0
 
-            opcode |= ((size & 3) << 6) | ((rn & 15) << 16) | (rm & 15) | (LDSTModes[mode] << 8);
+            if (mode > 3 && size == 3)
+            {
+                // A size of 3 is only valid for VST1.
+                size = 2;
+            }
+
+            opcode |= ((size & 3) << 6) | ((rn & 15) << 16) | (rm & 15) | (_ldStModes[mode] << 8);
 
             opcode |= ((vd & 0x10) << 18);
             opcode |= ((vd & 0xf) << 12);
@@ -183,7 +195,8 @@ namespace Ryujinx.Tests.Cpu
 
             uint opcode = 0xec100a00u; // VST4.8 {D0, D1, D2, D3}, [R0], R0
 
-            uint[] vldmModes = {
+            uint[] vldmModes =
+            {
                 // Note: 3rd 0 leaves a space for "D".
                 0b0100, // Increment after.
                 0b0101, // Increment after. (!)
@@ -240,7 +253,7 @@ namespace Ryujinx.Tests.Cpu
             {
                 opcode |= ((sd & 0x1) << 22);
                 opcode |= ((sd & 0x1e) << 11);
-            } 
+            }
             else
             {
                 opcode |= ((sd & 0x10) << 18);