From 17620d18db8d4a67e4b917596c760107d26fadc5 Mon Sep 17 00:00:00 2001
From: Wunk <wunkolo@gmail.com>
Date: Mon, 20 Mar 2023 12:09:24 -0700
Subject: [PATCH] ARMeilleure: Add initial support for AVX512 (EVEX encoding)
 (cont) (#4147)

* ARMeilleure: Add AVX512{F,VL,DQ,BW} detection

Add `UseAvx512Ortho` and `UseAvx512OrthoFloat` optimization flags as
short-hands for `F+VL` and `F+VL+DQ`.

* ARMeilleure: Add initial support for EVEX instruction encoding

Does not implement rounding or exception controls.

* ARMeilleure: Add `X86Vpternlogd`

Accelerates the vector-`Not` instruction.

* ARMeilleure: Add check for `OSXSAVE` for AVX{2,512}

* ARMeilleure: Add check for `XCR0` flags

Add XCR0 register checks for AVX and AVX512F, following the guidelines
in sections 14.3 and 15.2 of the Intel Architecture Software
Developer's Manual.

* ARMeilleure: Remove redundant `ReProtect` and `Dispose`, formatting

* ARMeilleure: Move XCR0 procedure to GetXcr0Eax

* ARMeilleure: Add `XCR0` to `FeatureInfo` structure

* ARMeilleure: Utilize `ReadOnlySpan` for Xcr0 assembly

Avoids an additional allocation

* ARMeilleure: Formatting fixes

* ARMeilleure: Fix EVEX encoding src2 register index

> Just like in VEX prefix, vvvv is provided in inverted form.

* ARMeilleure: Add `X86Vpternlogd` acceleration to `Vmvn_I`

Passes unit tests, verified instruction utilization

* ARMeilleure: Fix EVEX register operand designations

Operand 2 was being sourced improperly.

EVEX encoded instructions source their operands like so:
Operand 1: ModRM:reg
Operand 2: EVEX.vvvvv
Operand 3: ModRM:r/m
Operand 4: Imm

This fixes the improper register designations when emitting vpternlog.
Now "dest", "src1", "src2" arguments emit in the proper order in EVEX instructions.

* ARMeilleure: Add `X86Vpternlogd` acceleration to `Orn_V`

* ARMeilleure: PTC version bump

* ARMeilleure: Update EVEX encoding Debug.Assert to Debug.Fail

* ARMeilleure: Update EVEX encoding comment capitalization
---
 ARMeilleure/ARMeilleure.csproj                |   1 +
 ARMeilleure/CodeGen/X86/Assembler.cs          | 105 +++++++++++++++++-
 ARMeilleure/CodeGen/X86/AssemblerTable.cs     |   2 +
 .../CodeGen/X86/HardwareCapabilities.cs       |  52 ++++++++-
 ARMeilleure/CodeGen/X86/IntrinsicTable.cs     |   1 +
 ARMeilleure/CodeGen/X86/X86Instruction.cs     |   1 +
 .../Instructions/InstEmitSimdLogical.cs       |  33 +++++-
 .../Instructions/InstEmitSimdLogical32.cs     |   7 ++
 .../Instructions/InstEmitSimdMove32.cs        |   9 +-
 .../IntermediateRepresentation/Intrinsic.cs   |   1 +
 ARMeilleure/Optimizations.cs                  |  11 ++
 ARMeilleure/Translation/PTC/Ptc.cs            |  14 ++-
 12 files changed, 226 insertions(+), 11 deletions(-)

diff --git a/ARMeilleure/ARMeilleure.csproj b/ARMeilleure/ARMeilleure.csproj
index 1c2135ed54..fa55511541 100644
--- a/ARMeilleure/ARMeilleure.csproj
+++ b/ARMeilleure/ARMeilleure.csproj
@@ -7,6 +7,7 @@
 
   <ItemGroup>
     <ProjectReference Include="..\Ryujinx.Common\Ryujinx.Common.csproj" />
+    <ProjectReference Include="..\Ryujinx.Memory\Ryujinx.Memory.csproj" />
   </ItemGroup>
 
   <ItemGroup>
diff --git a/ARMeilleure/CodeGen/X86/Assembler.cs b/ARMeilleure/CodeGen/X86/Assembler.cs
index 2ea4208b33..67736a31f2 100644
--- a/ARMeilleure/CodeGen/X86/Assembler.cs
+++ b/ARMeilleure/CodeGen/X86/Assembler.cs
@@ -1034,7 +1034,13 @@ namespace ARMeilleure.CodeGen.X86
 
             Debug.Assert(opCode != BadOp, "Invalid opcode value.");
 
-            if ((flags & InstructionFlags.Vex) != 0 && HardwareCapabilities.SupportsVexEncoding)
+            if ((flags & InstructionFlags.Evex) != 0 && HardwareCapabilities.SupportsEvexEncoding)
+            {
+                WriteEvexInst(dest, src1, src2, type, flags, opCode);
+
+                opCode &= 0xff;
+            }
+            else if ((flags & InstructionFlags.Vex) != 0 && HardwareCapabilities.SupportsVexEncoding)
             {
                 // In a vex encoding, only one prefix can be active at a time. The active prefix is encoded in the second byte using two bits.
 
@@ -1153,6 +1159,103 @@ namespace ARMeilleure.CodeGen.X86
             }
         }
 
+        /// <summary>
+        /// Writes the four-byte EVEX prefix (0x62, P0, P1, P2) for an instruction whose three operands are registers.
+        /// Operand 1 (dest) goes to ModRM:reg, operand 2 (src1) to EVEX.vvvv and operand 3 (src2) to ModRM:r/m.
+        /// Only the prefix is written here; rounding and exception controls are not implemented.
+        /// The optional parameters default to a 128-bit width, mask register 0, no broadcast and no zeroing.
+        /// </summary>
+        private void WriteEvexInst(
+            Operand dest,
+            Operand src1,
+            Operand src2,
+            OperandType type,
+            InstructionFlags flags,
+            int opCode,
+            bool broadcast = false,
+            int registerWidth = 128,
+            int maskRegisterIdx = 0,
+            bool zeroElements = false)
+        {
+            int op1Idx = dest.GetRegister().Index;
+            int op2Idx = src1.GetRegister().Index;
+            int op3Idx = src2.GetRegister().Index;
+
+            WriteByte(0x62);
+
+            // P0: R, X, B and R' are stored inverted (set when the matching index bit is clear).
+            // R/R' extend operand 1 (index bits 3 and 4); B/X extend operand 3 (index bits 3 and 4).
+            bool r = (op1Idx & 8) == 0;
+            bool x = (op3Idx & 16) == 0;
+            bool b = (op3Idx & 8) == 0;
+            bool rp = (op1Idx & 16) == 0;
+            // Opcode map, selected by the escape bytes stored above the low opcode byte (0x0f is 0xf after the shift)
+            byte mm = 0b00;
+
+            switch ((ushort)(opCode >> 8))
+            {
+                case 0xf: mm = 0b01; break;
+                case 0xf38: mm = 0b10; break;
+                case 0xf3a: mm = 0b11; break;
+
+                default: Debug.Fail($"Failed to EVEX encode opcode 0x{opCode:X}."); break;
+            }
+
+            WriteByte(
+                (byte)(
+                    (r ? 0x80 : 0) |
+                    (x ? 0x40 : 0) |
+                    (b ? 0x20 : 0) |
+                    (rp ? 0x10 : 0) |
+                    mm));
+
+            // P1: W specifies 64-bit lane mode
+            bool w = Is64Bits(type);
+            // Low four bits of the operand 2 register index, stored inverted
+            byte vvvv = (byte)(~op2Idx & 0b1111);
+            // Opcode prefix
+            byte pp = (flags & InstructionFlags.PrefixMask) switch
+            {
+                InstructionFlags.Prefix66 => 0b01,
+                InstructionFlags.PrefixF3 => 0b10,
+                InstructionFlags.PrefixF2 => 0b11,
+                _ => 0
+            };
+            // Bit 2 of P1 is always written as 1
+            WriteByte(
+                (byte)(
+                    (w ? 0x80 : 0) |
+                    (vvvv << 3) |
+                    0b100 |
+                    pp));
+
+            // P2
+            // Specifies register-width
+            byte ll = 0b00;
+            switch (registerWidth)
+            {
+                case 128: ll = 0b00; break;
+                case 256: ll = 0b01; break;
+                case 512: ll = 0b10; break;
+
+                default: Debug.Fail($"Invalid EVEX vector register width {registerWidth}."); break;
+            }
+            // Extend operand 2 register (index bit 4, stored inverted like vvvv)
+            bool vp = (op2Idx & 16) == 0;
+            // Mask register index
+            Debug.Assert(maskRegisterIdx >= 0 && maskRegisterIdx < 8, $"Invalid mask register index {maskRegisterIdx}.");
+            byte aaa = (byte)(maskRegisterIdx & 0b111);
+
+            // zeroElements makes the mask register select elements to zero rather than elements to merge;
+            // broadcast is the embedded-broadcast bit, which only applies to a memory operand.
+            WriteByte(
+                (byte)(
+                    (zeroElements ? 0x80 : 0) |
+                    (ll << 5) |
+                    (broadcast ? 0x10 : 0) |
+                    (vp ? 8 : 0) |
+                    aaa));
+        }
+
         private void WriteCompactInst(Operand operand, int opCode)
         {
             int regIndex = operand.GetRegister().Index;
diff --git a/ARMeilleure/CodeGen/X86/AssemblerTable.cs b/ARMeilleure/CodeGen/X86/AssemblerTable.cs
index ecdc029f96..b47b3ecd1a 100644
--- a/ARMeilleure/CodeGen/X86/AssemblerTable.cs
+++ b/ARMeilleure/CodeGen/X86/AssemblerTable.cs
@@ -20,6 +20,7 @@ namespace ARMeilleure.CodeGen.X86
             Reg8Dest = 1 << 2,
             RexW     = 1 << 3,
             Vex      = 1 << 4,
+            Evex     = 1 << 5,
 
             PrefixBit  = 16,
             PrefixMask = 7 << PrefixBit,
@@ -278,6 +279,7 @@ namespace ARMeilleure.CodeGen.X86
             Add(X86Instruction.Vfnmsub231sd,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38bf, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
             Add(X86Instruction.Vfnmsub231ss,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38bf, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vpblendvb,     new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f3a4c, InstructionFlags.Vex | InstructionFlags.Prefix66));
+            Add(X86Instruction.Vpternlogd,    new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f3a25, InstructionFlags.Evex | InstructionFlags.Prefix66));
             Add(X86Instruction.Xor,           new InstructionInfo(0x00000031, 0x06000083, 0x06000081, BadOp,      0x00000033, InstructionFlags.None));
             Add(X86Instruction.Xorpd,         new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000f57, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Xorps,         new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000f57, InstructionFlags.Vex));
diff --git a/ARMeilleure/CodeGen/X86/HardwareCapabilities.cs b/ARMeilleure/CodeGen/X86/HardwareCapabilities.cs
index c12a4e28b7..63a9e46a24 100644
--- a/ARMeilleure/CodeGen/X86/HardwareCapabilities.cs
+++ b/ARMeilleure/CodeGen/X86/HardwareCapabilities.cs
@@ -1,10 +1,14 @@
+using Ryujinx.Memory;
 using System;
+using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics.X86;
 
 namespace ARMeilleure.CodeGen.X86
 {
     static class HardwareCapabilities
     {
+        private delegate uint GetXcr0();
+
         static HardwareCapabilities()
         {
             if (!X86Base.IsSupported)
@@ -24,6 +28,28 @@ namespace ARMeilleure.CodeGen.X86
                 FeatureInfo7Ebx = (FeatureFlags7Ebx)ebx7;
                 FeatureInfo7Ecx = (FeatureFlags7Ecx)ecx7;
             }
+
+            Xcr0InfoEax = (Xcr0FlagsEax)GetXcr0Eax();
+        }
+
+        // Returns the low 32 bits of XCR0 via an XGETBV stub, or 0 when CPUID.1:ECX.OSXSAVE is clear.
+        // NOTE(review): relies on FeatureInfo1Ecx being assigned before this is called - confirm in the static constructor.
+        private static uint GetXcr0Eax()
+        {
+            if (!FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.Osxsave))
+            {
+                return 0; // XGETBV raises #UD unless the OS has enabled XSAVE.
+            }
+            ReadOnlySpan<byte> asmGetXcr0 = new byte[]
+            {
+                0x31, 0xc9, // xor ecx, ecx
+                0xf, 0x01, 0xd0, // xgetbv
+                0xc3, // ret
+            };
+            using MemoryBlock memGetXcr0 = new MemoryBlock((ulong)asmGetXcr0.Length);
+            memGetXcr0.Write(0, asmGetXcr0);
+            memGetXcr0.Reprotect(0, (ulong)asmGetXcr0.Length, MemoryPermission.ReadAndExecute);
+            return Marshal.GetDelegateForFunctionPointer<GetXcr0>(memGetXcr0.Pointer)();
         }
 
         [Flags]
@@ -44,6 +70,7 @@ namespace ARMeilleure.CodeGen.X86
             Sse42 = 1 << 20,
             Popcnt = 1 << 23,
             Aes = 1 << 25,
+            Osxsave = 1 << 27,
             Avx = 1 << 28,
             F16c = 1 << 29
         }
@@ -52,7 +79,11 @@ namespace ARMeilleure.CodeGen.X86
         public enum FeatureFlags7Ebx
         {
             Avx2 = 1 << 5,
-            Sha = 1 << 29
+            Avx512f = 1 << 16,
+            Avx512dq = 1 << 17,
+            Sha = 1 << 29,
+            Avx512bw = 1 << 30,
+            Avx512vl = 1 << 31
         }
 
         [Flags]
@@ -61,10 +92,21 @@ namespace ARMeilleure.CodeGen.X86
             Gfni = 1 << 8,
         }
 
+        [Flags]
+        public enum Xcr0FlagsEax // State-component bits taken from the value returned by GetXcr0Eax (low 32 bits of XCR0)
+        {
+            Sse = 1 << 1, // Bit 1: SSE (XMM register) state
+            YmmHi128 = 1 << 2, // Bit 2: upper 128 bits of the YMM registers (AVX state)
+            Opmask = 1 << 5, // Bit 5: AVX-512 opmask register state
+            ZmmHi256 = 1 << 6, // Bit 6: upper 256 bits of the lower ZMM registers
+            Hi16Zmm = 1 << 7 // Bit 7: the upper sixteen ZMM registers
+        }
+
         public static FeatureFlags1Edx FeatureInfo1Edx { get; }
         public static FeatureFlags1Ecx FeatureInfo1Ecx { get; }
         public static FeatureFlags7Ebx FeatureInfo7Ebx { get; } = 0;
         public static FeatureFlags7Ecx FeatureInfo7Ecx { get; } = 0;
+        public static Xcr0FlagsEax Xcr0InfoEax { get; } = 0;
 
         public static bool SupportsSse => FeatureInfo1Edx.HasFlag(FeatureFlags1Edx.Sse);
         public static bool SupportsSse2 => FeatureInfo1Edx.HasFlag(FeatureFlags1Edx.Sse2);
@@ -76,8 +118,13 @@ namespace ARMeilleure.CodeGen.X86
         public static bool SupportsSse42 => FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.Sse42);
         public static bool SupportsPopcnt => FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.Popcnt);
         public static bool SupportsAesni => FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.Aes);
-        public static bool SupportsAvx => FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.Avx);
+        public static bool SupportsAvx => FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.Avx | FeatureFlags1Ecx.Osxsave) && Xcr0InfoEax.HasFlag(Xcr0FlagsEax.Sse | Xcr0FlagsEax.YmmHi128);
         public static bool SupportsAvx2 => FeatureInfo7Ebx.HasFlag(FeatureFlags7Ebx.Avx2) && SupportsAvx;
+        public static bool SupportsAvx512F => FeatureInfo7Ebx.HasFlag(FeatureFlags7Ebx.Avx512f) && FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.Osxsave)
+            && Xcr0InfoEax.HasFlag(Xcr0FlagsEax.Sse | Xcr0FlagsEax.YmmHi128 | Xcr0FlagsEax.Opmask | Xcr0FlagsEax.ZmmHi256 | Xcr0FlagsEax.Hi16Zmm);
+        public static bool SupportsAvx512Vl => FeatureInfo7Ebx.HasFlag(FeatureFlags7Ebx.Avx512vl) && SupportsAvx512F;
+        public static bool SupportsAvx512Bw => FeatureInfo7Ebx.HasFlag(FeatureFlags7Ebx.Avx512bw) && SupportsAvx512F;
+        public static bool SupportsAvx512Dq => FeatureInfo7Ebx.HasFlag(FeatureFlags7Ebx.Avx512dq) && SupportsAvx512F;
         public static bool SupportsF16c => FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.F16c);
         public static bool SupportsSha => FeatureInfo7Ebx.HasFlag(FeatureFlags7Ebx.Sha);
         public static bool SupportsGfni => FeatureInfo7Ecx.HasFlag(FeatureFlags7Ecx.Gfni);
@@ -85,5 +132,6 @@ namespace ARMeilleure.CodeGen.X86
         public static bool ForceLegacySse { get; set; }
 
         public static bool SupportsVexEncoding => SupportsAvx && !ForceLegacySse;
+        public static bool SupportsEvexEncoding => SupportsAvx512F && !ForceLegacySse;
     }
 }
\ No newline at end of file
diff --git a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
index 8c909ac13d..c788fa4424 100644
--- a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
+++ b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
@@ -180,6 +180,7 @@ namespace ARMeilleure.CodeGen.X86
             Add(Intrinsic.X86Vfnmadd231ss,  new IntrinsicInfo(X86Instruction.Vfnmadd231ss,  IntrinsicType.Fma));
             Add(Intrinsic.X86Vfnmsub231sd,  new IntrinsicInfo(X86Instruction.Vfnmsub231sd,  IntrinsicType.Fma));
             Add(Intrinsic.X86Vfnmsub231ss,  new IntrinsicInfo(X86Instruction.Vfnmsub231ss,  IntrinsicType.Fma));
+            Add(Intrinsic.X86Vpternlogd,    new IntrinsicInfo(X86Instruction.Vpternlogd,    IntrinsicType.TernaryImm));
             Add(Intrinsic.X86Xorpd,         new IntrinsicInfo(X86Instruction.Xorpd,         IntrinsicType.Binary));
             Add(Intrinsic.X86Xorps,         new IntrinsicInfo(X86Instruction.Xorps,         IntrinsicType.Binary));
         }
diff --git a/ARMeilleure/CodeGen/X86/X86Instruction.cs b/ARMeilleure/CodeGen/X86/X86Instruction.cs
index b024394e16..ecfc432d70 100644
--- a/ARMeilleure/CodeGen/X86/X86Instruction.cs
+++ b/ARMeilleure/CodeGen/X86/X86Instruction.cs
@@ -219,6 +219,7 @@ namespace ARMeilleure.CodeGen.X86
         Vfnmsub231sd,
         Vfnmsub231ss,
         Vpblendvb,
+        Vpternlogd,
         Xor,
         Xorpd,
         Xorps,
diff --git a/ARMeilleure/Instructions/InstEmitSimdLogical.cs b/ARMeilleure/Instructions/InstEmitSimdLogical.cs
index 8ca815801a..2bf531e6c7 100644
--- a/ARMeilleure/Instructions/InstEmitSimdLogical.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdLogical.cs
@@ -254,7 +254,22 @@ namespace ARMeilleure.Instructions
 
         public static void Not_V(ArmEmitterContext context)
         {
-            if (Optimizations.UseSse2)
+            if (Optimizations.UseAvx512Ortho)
+            {
+                OpCodeSimd op = (OpCodeSimd)context.CurrOp;
+
+                Operand n = GetVec(op.Rn);
+
+                Operand res = context.AddIntrinsic(Intrinsic.X86Vpternlogd, n, n, Const(~0b10101010));
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    res = context.VectorZeroUpper64(res);
+                }
+
+                context.Copy(GetVec(op.Rd), res);
+            }
+            else if (Optimizations.UseSse2)
             {
                 OpCodeSimd op = (OpCodeSimd)context.CurrOp;
 
@@ -283,6 +298,22 @@ namespace ARMeilleure.Instructions
             {
                 InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64OrnV);
             }
+            else if (Optimizations.UseAvx512Ortho)
+            {
+                OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
+
+                Operand n = GetVec(op.Rn);
+                Operand m = GetVec(op.Rm);
+
+                Operand res = context.AddIntrinsic(Intrinsic.X86Vpternlogd, n, m, Const(0b11001100 | ~0b10101010));
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    res = context.VectorZeroUpper64(res);
+                }
+
+                context.Copy(GetVec(op.Rd), res);
+            }
             else if (Optimizations.UseSse2)
             {
                 OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
diff --git a/ARMeilleure/Instructions/InstEmitSimdLogical32.cs b/ARMeilleure/Instructions/InstEmitSimdLogical32.cs
index c2a04778bb..68ef4ed175 100644
--- a/ARMeilleure/Instructions/InstEmitSimdLogical32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdLogical32.cs
@@ -151,6 +151,13 @@ namespace ARMeilleure.Instructions
             {
                 InstEmitSimdHelper32Arm64.EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(Intrinsic.Arm64OrnV | Intrinsic.Arm64V128, n, m));
             }
+            else if (Optimizations.UseAvx512Ortho)
+            {
+                EmitVectorBinaryOpSimd32(context, (n, m) =>
+                {
+                    return context.AddIntrinsic(Intrinsic.X86Vpternlogd, n, m, Const(0b11001100 | ~0b10101010));
+                });
+            }
             else if (Optimizations.UseSse2)
             {
                 Operand mask = context.VectorOne();
diff --git a/ARMeilleure/Instructions/InstEmitSimdMove32.cs b/ARMeilleure/Instructions/InstEmitSimdMove32.cs
index 17100eb9c8..b8b91b31d2 100644
--- a/ARMeilleure/Instructions/InstEmitSimdMove32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdMove32.cs
@@ -34,7 +34,14 @@ namespace ARMeilleure.Instructions
 
         public static void Vmvn_I(ArmEmitterContext context)
         {
-            if (Optimizations.UseSse2)
+            if (Optimizations.UseAvx512Ortho)
+            {
+                EmitVectorUnaryOpSimd32(context, (op1) =>
+                {
+                    return context.AddIntrinsic(Intrinsic.X86Vpternlogd, op1, op1, Const(0b01010101));
+                });
+            }
+            else if (Optimizations.UseSse2)
             {
                 EmitVectorUnaryOpSimd32(context, (op1) =>
                 {
diff --git a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
index a665e4b7a1..b629345ee4 100644
--- a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
+++ b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
@@ -173,6 +173,7 @@ namespace ARMeilleure.IntermediateRepresentation
         X86Vfnmadd231ss,
         X86Vfnmsub231sd,
         X86Vfnmsub231ss,
+        X86Vpternlogd,
         X86Xorpd,
         X86Xorps,
 
diff --git a/ARMeilleure/Optimizations.cs b/ARMeilleure/Optimizations.cs
index 9044314f60..a84a4dc4f5 100644
--- a/ARMeilleure/Optimizations.cs
+++ b/ARMeilleure/Optimizations.cs
@@ -23,6 +23,10 @@ namespace ARMeilleure
         public static bool UseSse42IfAvailable     { get; set; } = true;
         public static bool UsePopCntIfAvailable    { get; set; } = true;
         public static bool UseAvxIfAvailable       { get; set; } = true;
+        public static bool UseAvx512FIfAvailable   { get; set; } = true;
+        public static bool UseAvx512VlIfAvailable  { get; set; } = true;
+        public static bool UseAvx512BwIfAvailable  { get; set; } = true;
+        public static bool UseAvx512DqIfAvailable  { get; set; } = true;
         public static bool UseF16cIfAvailable      { get; set; } = true;
         public static bool UseFmaIfAvailable       { get; set; } = true;
         public static bool UseAesniIfAvailable     { get; set; } = true;
@@ -47,11 +51,18 @@ namespace ARMeilleure
         internal static bool UseSse42     => UseSse42IfAvailable     && X86HardwareCapabilities.SupportsSse42;
         internal static bool UsePopCnt    => UsePopCntIfAvailable    && X86HardwareCapabilities.SupportsPopcnt;
         internal static bool UseAvx       => UseAvxIfAvailable       && X86HardwareCapabilities.SupportsAvx && !ForceLegacySse;
+        internal static bool UseAvx512F   => UseAvx512FIfAvailable   && X86HardwareCapabilities.SupportsAvx512F && !ForceLegacySse;
+        internal static bool UseAvx512Vl  => UseAvx512VlIfAvailable  && X86HardwareCapabilities.SupportsAvx512Vl && !ForceLegacySse;
+        internal static bool UseAvx512Bw  => UseAvx512BwIfAvailable  && X86HardwareCapabilities.SupportsAvx512Bw && !ForceLegacySse;
+        internal static bool UseAvx512Dq  => UseAvx512DqIfAvailable  && X86HardwareCapabilities.SupportsAvx512Dq && !ForceLegacySse;
         internal static bool UseF16c      => UseF16cIfAvailable      && X86HardwareCapabilities.SupportsF16c;
         internal static bool UseFma       => UseFmaIfAvailable       && X86HardwareCapabilities.SupportsFma;
         internal static bool UseAesni     => UseAesniIfAvailable     && X86HardwareCapabilities.SupportsAesni;
         internal static bool UsePclmulqdq => UsePclmulqdqIfAvailable && X86HardwareCapabilities.SupportsPclmulqdq;
         internal static bool UseSha       => UseShaIfAvailable       && X86HardwareCapabilities.SupportsSha;
         internal static bool UseGfni      => UseGfniIfAvailable      && X86HardwareCapabilities.SupportsGfni;
+
+        internal static bool UseAvx512Ortho => UseAvx512F && UseAvx512Vl;
+        internal static bool UseAvx512OrthoFloat => UseAvx512Ortho && UseAvx512Dq;
     }
 }
diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs
index 0b23fd043b..17f6870623 100644
--- a/ARMeilleure/Translation/PTC/Ptc.cs
+++ b/ARMeilleure/Translation/PTC/Ptc.cs
@@ -30,7 +30,7 @@ namespace ARMeilleure.Translation.PTC
         private const string OuterHeaderMagicString = "PTCohd\0\0";
         private const string InnerHeaderMagicString = "PTCihd\0\0";
 
-        private const uint InternalVersion = 4484; //! To be incremented manually for each change to the ARMeilleure project.
+        private const uint InternalVersion = 4485; //! To be incremented manually for each change to the ARMeilleure project.
 
         private const string ActualDir = "0";
         private const string BackupDir = "1";
@@ -969,6 +969,7 @@ namespace ARMeilleure.Translation.PTC
                     (ulong)Arm64HardwareCapabilities.LinuxFeatureInfoHwCap,
                     (ulong)Arm64HardwareCapabilities.LinuxFeatureInfoHwCap2,
                     (ulong)Arm64HardwareCapabilities.MacOsFeatureInfo,
+                    0,
                     0);
             }
             else if (RuntimeInformation.ProcessArchitecture == Architecture.X64)
@@ -977,11 +978,12 @@ namespace ARMeilleure.Translation.PTC
                     (ulong)X86HardwareCapabilities.FeatureInfo1Ecx,
                     (ulong)X86HardwareCapabilities.FeatureInfo1Edx,
                     (ulong)X86HardwareCapabilities.FeatureInfo7Ebx,
-                    (ulong)X86HardwareCapabilities.FeatureInfo7Ecx);
+                    (ulong)X86HardwareCapabilities.FeatureInfo7Ecx,
+                    (ulong)X86HardwareCapabilities.Xcr0InfoEax);
             }
             else
             {
-                return new FeatureInfo(0, 0, 0, 0);
+                return new FeatureInfo(0, 0, 0, 0, 0);
             }
         }
 
@@ -1002,7 +1004,7 @@ namespace ARMeilleure.Translation.PTC
             return osPlatform;
         }
 
-        [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 78*/)]
+        [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 86*/)]
         private struct OuterHeader
         {
             public ulong Magic;
@@ -1034,8 +1036,8 @@ namespace ARMeilleure.Translation.PTC
             }
         }
 
-        [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 32*/)]
-        private record struct FeatureInfo(ulong FeatureInfo0, ulong FeatureInfo1, ulong FeatureInfo2, ulong FeatureInfo3);
+        [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 40*/)]
+        private record struct FeatureInfo(ulong FeatureInfo0, ulong FeatureInfo1, ulong FeatureInfo2, ulong FeatureInfo3, ulong FeatureInfo4);
 
         [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 128*/)]
         private struct InnerHeader